/*
 * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "assembler_x86.inline.hpp"
#include "gc_interface/collectedHeap.inline.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/cardTableModRefBS.hpp"
#include "memory/resourceArea.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/interfaceSupport.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/os.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubRoutines.hpp"
#ifndef SERIALGC
#include "gc_implementation/g1/g1CollectedHeap.inline.hpp"
#include "gc_implementation/g1/g1SATBCardTableModRefBS.hpp"
#include "gc_implementation/g1/heapRegion.hpp"
#endif

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Implementation of AddressLiteral

AddressLiteral::AddressLiteral(address target, relocInfo::relocType rtype) {
  _is_lval = false;
  _target = target;
  switch (rtype) {
  case relocInfo::oop_type:
  case relocInfo::metadata_type:
    // Oops are a special case. Normally they would be their own section
    // but in cases like icBuffer they are literals in the code stream that
    // we don't have a section for. We use none so that we get a literal address
    // which is always patchable.
    break;
  case relocInfo::external_word_type:
    _rspec = external_word_Relocation::spec(target);
    break;
  case relocInfo::internal_word_type:
    _rspec = internal_word_Relocation::spec(target);
    break;
  case relocInfo::opt_virtual_call_type:
    _rspec = opt_virtual_call_Relocation::spec();
    break;
  case relocInfo::static_call_type:
    _rspec = static_call_Relocation::spec();
    break;
  case relocInfo::runtime_call_type:
    _rspec = runtime_call_Relocation::spec();
    break;
  case relocInfo::poll_type:
  case relocInfo::poll_return_type:
    _rspec = Relocation::spec_simple(rtype);
    break;
  case relocInfo::none:
    break;
  default:
    ShouldNotReachHere();
    break;
  }
}
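
// Illustrative sketch (not part of the original file): how call sites
// typically build an AddressLiteral. The target below is just an example;
// any external address is handled the same way.
//
//   AddressLiteral fe(StubRoutines::forward_exception_entry(),
//                     relocInfo::runtime_call_type);
//   // fe now carries a runtime_call_Relocation, so the address emitted
//   // through it can later be found and patched by the relocation
//   // machinery.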

// Implementation of Address

#ifdef _LP64

Address Address::make_array(ArrayAddress adr) {
  // Not implementable on 64bit machines
  // Should have been handled higher up the call chain.
  ShouldNotReachHere();
  return Address();
}

// exceedingly dangerous constructor
Address::Address(int disp, address loc, relocInfo::relocType rtype) {
  _base  = noreg;
  _index = noreg;
  _scale = no_scale;
  _disp  = disp;
  switch (rtype) {
    case relocInfo::external_word_type:
      _rspec = external_word_Relocation::spec(loc);
      break;
    case relocInfo::internal_word_type:
      _rspec = internal_word_Relocation::spec(loc);
      break;
    case relocInfo::runtime_call_type:
      // HMM
      _rspec = runtime_call_Relocation::spec();
      break;
    case relocInfo::poll_type:
    case relocInfo::poll_return_type:
      _rspec = Relocation::spec_simple(rtype);
      break;
    case relocInfo::none:
      break;
    default:
      ShouldNotReachHere();
  }
}
#else // LP64

Address Address::make_array(ArrayAddress adr) {
  AddressLiteral base = adr.base();
  Address index = adr.index();
  assert(index._disp == 0, "must not have disp"); // maybe it can?
  Address array(index._base, index._index, index._scale, (intptr_t) base.target());
  array._rspec = base._rspec;
  return array;
}

// exceedingly dangerous constructor
Address::Address(address loc, RelocationHolder spec) {
  _base  = noreg;
  _index = noreg;
  _scale = no_scale;
  _disp  = (intptr_t) loc;
  _rspec = spec;
}

#endif // _LP64



// Convert the raw encoding form into the form expected by the constructor for
// Address.  An index of 4 (rsp) corresponds to having no index, so convert
// that to noreg for the Address constructor.
Address Address::make_raw(int base, int index, int scale, int disp, relocInfo::relocType disp_reloc) {
  RelocationHolder rspec;
  if (disp_reloc != relocInfo::none) {
    rspec = Relocation::spec_simple(disp_reloc);
  }
  bool valid_index = index != rsp->encoding();
  if (valid_index) {
    Address madr(as_Register(base), as_Register(index), (Address::ScaleFactor)scale, in_ByteSize(disp));
    madr._rspec = rspec;
    return madr;
  } else {
    Address madr(as_Register(base), noreg, Address::no_scale, in_ByteSize(disp));
    madr._rspec = rspec;
    return madr;
  }
}
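
// Illustrative sketch (assumed call sites, e.g. ADLC-generated code): an
// index encoding of 4 (rsp) is the "no index" marker, so the second call
// below yields a plain [rbx + 8] operand with no index register at all.
//
//   Address a1 = Address::make_raw(rbx->encoding(), rcx->encoding(),
//                                  Address::times_4, 8, relocInfo::none);
//   Address a2 = Address::make_raw(rbx->encoding(), rsp->encoding(),
//                                  Address::times_1, 8, relocInfo::none);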

// Implementation of Assembler

int AbstractAssembler::code_fill_byte() {
  return (u_char)'\xF4'; // hlt
}

// make this go away someday
void Assembler::emit_data(jint data, relocInfo::relocType rtype, int format) {
  if (rtype == relocInfo::none)
        emit_long(data);
  else  emit_data(data, Relocation::spec_simple(rtype), format);
}

void Assembler::emit_data(jint data, RelocationHolder const& rspec, int format) {
  assert(imm_operand == 0, "default format must be immediate in this file");
  assert(inst_mark() != NULL, "must be inside InstructionMark");
  if (rspec.type() !=  relocInfo::none) {
    #ifdef ASSERT
      check_relocation(rspec, format);
    #endif
    // Do not use AbstractAssembler::relocate, which is not intended for
    // embedded words.  Instead, relocate to the enclosing instruction.

    // hack. call32 is too wide for mask so use disp32
    if (format == call32_operand)
      code_section()->relocate(inst_mark(), rspec, disp32_operand);
    else
      code_section()->relocate(inst_mark(), rspec, format);
  }
  emit_long(data);
}

static int encode(Register r) {
  int enc = r->encoding();
  if (enc >= 8) {
    enc -= 8;
  }
  return enc;
}

static int encode(XMMRegister r) {
  int enc = r->encoding();
  if (enc >= 8) {
    enc -= 8;
  }
  return enc;
}
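
// The encode() helpers above fold the extended registers (r8-r15 and
// xmm8-xmm15 on 64-bit) back into the 3-bit ModRM/SIB register fields;
// the dropped high bit travels in a REX prefix bit instead. Illustrative
// values only:
//
//   encode(rax) == 0   // fits in 3 bits as-is
//   encode(r9)  == 1   // 9 - 8; the prefix code supplies REX.B/REX.R/REX.X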

void Assembler::emit_arith_b(int op1, int op2, Register dst, int imm8) {
  assert(dst->has_byte_register(), "must have byte register");
  assert(isByte(op1) && isByte(op2), "wrong opcode");
  assert(isByte(imm8), "not a byte");
  assert((op1 & 0x01) == 0, "should be 8bit operation");
  emit_byte(op1);
  emit_byte(op2 | encode(dst));
  emit_byte(imm8);
}


void Assembler::emit_arith(int op1, int op2, Register dst, int32_t imm32) {
  assert(isByte(op1) && isByte(op2), "wrong opcode");
  assert((op1 & 0x01) == 1, "should be 32bit operation");
  assert((op1 & 0x02) == 0, "sign-extension bit should not be set");
  if (is8bit(imm32)) {
    emit_byte(op1 | 0x02); // set sign bit
    emit_byte(op2 | encode(dst));
    emit_byte(imm32 & 0xFF);
  } else {
    emit_byte(op1);
    emit_byte(op2 | encode(dst));
    emit_long(imm32);
  }
}
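
// Worked example (illustrative): addl(rbx, 16) reaches the emit_arith()
// above with op1 == 0x81, op2 == 0xC0. Since 16 fits in a signed byte,
// the short form is chosen and three bytes come out:
//
//   0x83   // 0x81 | 0x02, the sign-extended-imm8 form
//   0xC3   // 0xC0 | encode(rbx)
//   0x10   // the imm8
//
// addl(rbx, 0x12345) would instead emit 0x81 0xC3 plus a 4-byte imm32.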

// Force generation of a 4 byte immediate value even if it fits into 8bit
void Assembler::emit_arith_imm32(int op1, int op2, Register dst, int32_t imm32) {
  assert(isByte(op1) && isByte(op2), "wrong opcode");
  assert((op1 & 0x01) == 1, "should be 32bit operation");
  assert((op1 & 0x02) == 0, "sign-extension bit should not be set");
  emit_byte(op1);
  emit_byte(op2 | encode(dst));
  emit_long(imm32);
}

// immediate-to-memory forms
void Assembler::emit_arith_operand(int op1, Register rm, Address adr, int32_t imm32) {
  assert((op1 & 0x01) == 1, "should be 32bit operation");
  assert((op1 & 0x02) == 0, "sign-extension bit should not be set");
  if (is8bit(imm32)) {
    emit_byte(op1 | 0x02); // set sign bit
    emit_operand(rm, adr, 1);
    emit_byte(imm32 & 0xFF);
  } else {
    emit_byte(op1);
    emit_operand(rm, adr, 4);
    emit_long(imm32);
  }
}


void Assembler::emit_arith(int op1, int op2, Register dst, Register src) {
  assert(isByte(op1) && isByte(op2), "wrong opcode");
  emit_byte(op1);
  emit_byte(op2 | encode(dst) << 3 | encode(src));
}


void Assembler::emit_operand(Register reg, Register base, Register index,
                             Address::ScaleFactor scale, int disp,
                             RelocationHolder const& rspec,
                             int rip_relative_correction) {
  relocInfo::relocType rtype = (relocInfo::relocType) rspec.type();

  // Encode the registers as needed in the fields they are used in

  int regenc = encode(reg) << 3;
  int indexenc = index->is_valid() ? encode(index) << 3 : 0;
  int baseenc = base->is_valid() ? encode(base) : 0;

  if (base->is_valid()) {
    if (index->is_valid()) {
      assert(scale != Address::no_scale, "inconsistent address");
      // [base + index*scale + disp]
      if (disp == 0 && rtype == relocInfo::none  &&
          base != rbp LP64_ONLY(&& base != r13)) {
        // [base + index*scale]
        // [00 reg 100][ss index base]
        assert(index != rsp, "illegal addressing mode");
        emit_byte(0x04 | regenc);
        emit_byte(scale << 6 | indexenc | baseenc);
      } else if (is8bit(disp) && rtype == relocInfo::none) {
        // [base + index*scale + imm8]
        // [01 reg 100][ss index base] imm8
        assert(index != rsp, "illegal addressing mode");
        emit_byte(0x44 | regenc);
        emit_byte(scale << 6 | indexenc | baseenc);
        emit_byte(disp & 0xFF);
      } else {
        // [base + index*scale + disp32]
        // [10 reg 100][ss index base] disp32
        assert(index != rsp, "illegal addressing mode");
        emit_byte(0x84 | regenc);
        emit_byte(scale << 6 | indexenc | baseenc);
        emit_data(disp, rspec, disp32_operand);
      }
    } else if (base == rsp LP64_ONLY(|| base == r12)) {
      // [rsp + disp]
      if (disp == 0 && rtype == relocInfo::none) {
        // [rsp]
        // [00 reg 100][00 100 100]
        emit_byte(0x04 | regenc);
        emit_byte(0x24);
      } else if (is8bit(disp) && rtype == relocInfo::none) {
        // [rsp + imm8]
        // [01 reg 100][00 100 100] disp8
        emit_byte(0x44 | regenc);
        emit_byte(0x24);
        emit_byte(disp & 0xFF);
      } else {
        // [rsp + imm32]
        // [10 reg 100][00 100 100] disp32
        emit_byte(0x84 | regenc);
        emit_byte(0x24);
        emit_data(disp, rspec, disp32_operand);
      }
    } else {
      // [base + disp]
      assert(base != rsp LP64_ONLY(&& base != r12), "illegal addressing mode");
      if (disp == 0 && rtype == relocInfo::none &&
          base != rbp LP64_ONLY(&& base != r13)) {
        // [base]
        // [00 reg base]
        emit_byte(0x00 | regenc | baseenc);
      } else if (is8bit(disp) && rtype == relocInfo::none) {
        // [base + disp8]
        // [01 reg base] disp8
        emit_byte(0x40 | regenc | baseenc);
        emit_byte(disp & 0xFF);
      } else {
        // [base + disp32]
        // [10 reg base] disp32
        emit_byte(0x80 | regenc | baseenc);
        emit_data(disp, rspec, disp32_operand);
      }
    }
  } else {
    if (index->is_valid()) {
      assert(scale != Address::no_scale, "inconsistent address");
      // [index*scale + disp]
      // [00 reg 100][ss index 101] disp32
      assert(index != rsp, "illegal addressing mode");
      emit_byte(0x04 | regenc);
      emit_byte(scale << 6 | indexenc | 0x05);
      emit_data(disp, rspec, disp32_operand);
    } else if (rtype != relocInfo::none ) {
      // [disp] (64bit) RIP-RELATIVE (32bit) abs
      // [00 000 101] disp32

      emit_byte(0x05 | regenc);
      // Note that the RIP-rel. correction applies to the generated
      // disp field, but _not_ to the target address in the rspec.

      // disp was created by converting the target address minus the pc
      // at the start of the instruction. That needs more correction here.
      // intptr_t disp = target - next_ip;
      assert(inst_mark() != NULL, "must be inside InstructionMark");
      address next_ip = pc() + sizeof(int32_t) + rip_relative_correction;
      int64_t adjusted = disp;
      // Do rip-rel adjustment for 64bit
      LP64_ONLY(adjusted -=  (next_ip - inst_mark()));
      assert(is_simm32(adjusted),
             "must be 32bit offset (RIP relative address)");
      emit_data((int32_t) adjusted, rspec, disp32_operand);

    } else {
      // 32bit never did this, did everything as the rip-rel/disp code above
      // [disp] ABSOLUTE
      // [00 reg 100][00 100 101] disp32
      emit_byte(0x04 | regenc);
      emit_byte(0x25);
      emit_data(disp, rspec, disp32_operand);
    }
  }
}
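
// Worked example (illustrative): movl(rax, Address(rbx, rcx, Address::times_4, 8))
// takes the [base + index*scale + imm8] branch above, so after the 0x8B
// opcode emitted by movl() the operand bytes are
//
//   0x44   // ModRM [01 reg 100]: mod=01 (disp8), reg=rax, rm=100 (SIB follows)
//   0x8B   // SIB [ss index base]: ss=10 (times_4), index=rcx, base=rbx
//   0x08   // the disp8
//
// i.e. mov eax, [ebx+ecx*4+8] in 32-bit mode (on 64-bit, extended registers
// would add a REX prefix in front).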

void Assembler::emit_operand(XMMRegister reg, Register base, Register index,
                             Address::ScaleFactor scale, int disp,
                             RelocationHolder const& rspec) {
  emit_operand((Register)reg, base, index, scale, disp, rspec);
}

// Secret local extension to Assembler::WhichOperand:
#define end_pc_operand (_WhichOperand_limit)

address Assembler::locate_operand(address inst, WhichOperand which) {
  // Decode the given instruction, and return the address of
  // an embedded 32-bit operand word.

  // If "which" is disp32_operand, selects the displacement portion
  // of an effective address specifier.
  // If "which" is imm64_operand, selects the trailing immediate constant.
  // If "which" is call32_operand, selects the displacement of a call or jump.
  // Caller is responsible for ensuring that there is such an operand,
  // and that it is 32/64 bits wide.

  // If "which" is end_pc_operand, find the end of the instruction.

  address ip = inst;
  bool is_64bit = false;

  debug_only(bool has_disp32 = false);
  int tail_size = 0; // other random bytes (#32, #16, etc.) at end of insn

  again_after_prefix:
  switch (0xFF & *ip++) {

  // These convenience macros generate groups of "case" labels for the switch.
#define REP4(x) (x)+0: case (x)+1: case (x)+2: case (x)+3
#define REP8(x) (x)+0: case (x)+1: case (x)+2: case (x)+3: \
             case (x)+4: case (x)+5: case (x)+6: case (x)+7
#define REP16(x) REP8((x)+0): \
              case REP8((x)+8)

  case CS_segment:
  case SS_segment:
  case DS_segment:
  case ES_segment:
  case FS_segment:
  case GS_segment:
    // Seems dubious
    LP64_ONLY(assert(false, "shouldn't have that prefix"));
    assert(ip == inst+1, "only one prefix allowed");
    goto again_after_prefix;

  case 0x67:
  case REX:
  case REX_B:
  case REX_X:
  case REX_XB:
  case REX_R:
  case REX_RB:
  case REX_RX:
  case REX_RXB:
    NOT_LP64(assert(false, "64bit prefixes"));
    goto again_after_prefix;

  case REX_W:
  case REX_WB:
  case REX_WX:
  case REX_WXB:
  case REX_WR:
  case REX_WRB:
  case REX_WRX:
  case REX_WRXB:
    NOT_LP64(assert(false, "64bit prefixes"));
    is_64bit = true;
    goto again_after_prefix;

  case 0xFF: // pushq a; decl a; incl a; call a; jmp a
  case 0x88: // movb a, r
  case 0x89: // movl a, r
  case 0x8A: // movb r, a
  case 0x8B: // movl r, a
  case 0x8F: // popl a
    debug_only(has_disp32 = true);
    break;

  case 0x68: // pushq #32
    if (which == end_pc_operand) {
      return ip + 4;
    }
    assert(which == imm_operand && !is_64bit, "pushl has no disp32 or 64bit immediate");
    return ip;                  // not produced by emit_operand

  case 0x66: // movw ... (size prefix)
    again_after_size_prefix2:
    switch (0xFF & *ip++) {
    case REX:
    case REX_B:
    case REX_X:
    case REX_XB:
    case REX_R:
    case REX_RB:
    case REX_RX:
    case REX_RXB:
    case REX_W:
    case REX_WB:
    case REX_WX:
    case REX_WXB:
    case REX_WR:
    case REX_WRB:
    case REX_WRX:
    case REX_WRXB:
      NOT_LP64(assert(false, "64bit prefix found"));
      goto again_after_size_prefix2;
    case 0x8B: // movw r, a
    case 0x89: // movw a, r
      debug_only(has_disp32 = true);
      break;
    case 0xC7: // movw a, #16
      debug_only(has_disp32 = true);
      tail_size = 2;  // the imm16
      break;
    case 0x0F: // several SSE/SSE2 variants
      ip--;    // reparse the 0x0F
      goto again_after_prefix;
    default:
      ShouldNotReachHere();
    }
    break;

  case REP8(0xB8): // movl/q r, #32/#64(oop?)
    if (which == end_pc_operand)  return ip + (is_64bit ? 8 : 4);
    // these asserts are somewhat nonsensical
#ifndef _LP64
    assert(which == imm_operand || which == disp32_operand,
           err_msg("which %d is_64_bit %d ip " INTPTR_FORMAT, which, is_64bit, ip));
#else
    assert((which == call32_operand || which == imm_operand) && is_64bit ||
           which == narrow_oop_operand && !is_64bit,
           err_msg("which %d is_64_bit %d ip " INTPTR_FORMAT, which, is_64bit, ip));
#endif // _LP64
    return ip;

  case 0x69: // imul r, a, #32
  case 0xC7: // movl a, #32(oop?)
    tail_size = 4;
    debug_only(has_disp32 = true); // has both kinds of operands!
    break;

  case 0x0F: // movx..., etc.
    switch (0xFF & *ip++) {
    case 0x3A: // pcmpestri
      tail_size = 1;
    case 0x38: // ptest, pmovzxbw
      ip++; // skip opcode
      debug_only(has_disp32 = true); // has both kinds of operands!
      break;

    case 0x70: // pshufd r, r/a, #8
      debug_only(has_disp32 = true); // has both kinds of operands!
    case 0x73: // psrldq r, #8
      tail_size = 1;
      break;

    case 0x12: // movlps
    case 0x28: // movaps
    case 0x2E: // ucomiss
    case 0x2F: // comiss
    case 0x54: // andps
    case 0x55: // andnps
    case 0x56: // orps
    case 0x57: // xorps
    case 0x6E: // movd
    case 0x7E: // movd
    case 0xAE: // ldmxcsr, stmxcsr, fxrstor, fxsave, clflush
      debug_only(has_disp32 = true);
      break;

    case 0xAD: // shrd r, a, %cl
    case 0xAF: // imul r, a
    case 0xBE: // movsbl r, a (movsxb)
    case 0xBF: // movswl r, a (movsxw)
    case 0xB6: // movzbl r, a (movzxb)
    case 0xB7: // movzwl r, a (movzxw)
    case REP16(0x40): // cmovl cc, r, a
    case 0xB0: // cmpxchgb
    case 0xB1: // cmpxchg
    case 0xC1: // xaddl
    case 0xC7: // cmpxchg8
    case REP16(0x90): // setcc a
      debug_only(has_disp32 = true);
      // fall out of the switch to decode the address
      break;

    case 0xC4: // pinsrw r, a, #8
      debug_only(has_disp32 = true);
    case 0xC5: // pextrw r, r, #8
      tail_size = 1;  // the imm8
      break;

    case 0xAC: // shrd r, a, #8
      debug_only(has_disp32 = true);
      tail_size = 1;  // the imm8
      break;

    case REP16(0x80): // jcc rdisp32
      if (which == end_pc_operand)  return ip + 4;
      assert(which == call32_operand, "jcc has no disp32 or imm");
      return ip;
    default:
      ShouldNotReachHere();
    }
    break;

  case 0x81: // addl a, #32; addl r, #32
    // also: orl, adcl, sbbl, andl, subl, xorl, cmpl
    // on 32bit in the case of cmpl, the imm might be an oop
    tail_size = 4;
    debug_only(has_disp32 = true); // has both kinds of operands!
    break;

  case 0x83: // addl a, #8; addl r, #8
    // also: orl, adcl, sbbl, andl, subl, xorl, cmpl
    debug_only(has_disp32 = true); // has both kinds of operands!
    tail_size = 1;
    break;

  case 0x9B:
    switch (0xFF & *ip++) {
    case 0xD9: // fnstcw a
      debug_only(has_disp32 = true);
      break;
    default:
      ShouldNotReachHere();
    }
    break;

  case REP4(0x00): // addb a, r; addl a, r; addb r, a; addl r, a
  case REP4(0x10): // adc...
  case REP4(0x20): // and...
  case REP4(0x30): // xor...
  case REP4(0x08): // or...
  case REP4(0x18): // sbb...
  case REP4(0x28): // sub...
  case 0xF7: // mull a
  case 0x8D: // lea r, a
  case 0x87: // xchg r, a
  case REP4(0x38): // cmp...
  case 0x85: // test r, a
    debug_only(has_disp32 = true); // has both kinds of operands!
    break;

  case 0xC1: // sal a, #8; sar a, #8; shl a, #8; shr a, #8
  case 0xC6: // movb a, #8
  case 0x80: // cmpb a, #8
  case 0x6B: // imul r, a, #8
    debug_only(has_disp32 = true); // has both kinds of operands!
    tail_size = 1; // the imm8
    break;

  case 0xC4: // VEX_3bytes
  case 0xC5: // VEX_2bytes
    assert((UseAVX > 0), "shouldn't have VEX prefix");
    assert(ip == inst+1, "no prefixes allowed");
    // C4 and C5 are also used as opcodes for the PINSRW and PEXTRW
    // instructions, but those carry a 0x0F prefix and are handled in the
    // 0x0F case above.
    //
    // In 32-bit mode the VEX first bytes C4 and C5 alias onto the LDS and
    // LES instructions (which are not supported in 64-bit mode). To
    // distinguish them, bits [7:6] are set in the VEX second byte, since a
    // ModRM byte cannot be of the form 11xxxxxx in 32-bit mode. To set
    // those bits, the VEX REX and vvvv fields are stored inverted.
    //
    // Fortunately C2 doesn't generate these instructions, so we don't need
    // to check for them in the product build.

    // Check second byte
    NOT_LP64(assert((0xC0 & *ip) == 0xC0, "shouldn't have LDS and LES instructions"));

    // First byte
    if ((0xFF & *inst) == VEX_3bytes) {
      ip++; // third byte
      is_64bit = ((VEX_W & *ip) == VEX_W);
    }
    ip++; // opcode
    // To find the end of instruction (which == end_pc_operand).
    switch (0xFF & *ip) {
    case 0x61: // pcmpestri r, r/a, #8
    case 0x70: // pshufd r, r/a, #8
    case 0x73: // psrldq r, #8
      tail_size = 1;  // the imm8
      break;
    default:
      break;
    }
    ip++; // skip opcode
    debug_only(has_disp32 = true); // has both kinds of operands!
    break;

  case 0xD1: // sal a, 1; sar a, 1; shl a, 1; shr a, 1
  case 0xD3: // sal a, %cl; sar a, %cl; shl a, %cl; shr a, %cl
  case 0xD9: // fld_s a; fst_s a; fstp_s a; fldcw a
  case 0xDD: // fld_d a; fst_d a; fstp_d a
  case 0xDB: // fild_s a; fistp_s a; fld_x a; fstp_x a
  case 0xDF: // fild_d a; fistp_d a
  case 0xD8: // fadd_s a; fsubr_s a; fmul_s a; fdivr_s a; fcomp_s a
  case 0xDC: // fadd_d a; fsubr_d a; fmul_d a; fdivr_d a; fcomp_d a
  case 0xDE: // faddp_d a; fsubrp_d a; fmulp_d a; fdivrp_d a; fcompp_d a
    debug_only(has_disp32 = true);
    break;

  case 0xE8: // call rdisp32
  case 0xE9: // jmp  rdisp32
    if (which == end_pc_operand)  return ip + 4;
    assert(which == call32_operand, "call has no disp32 or imm");
    return ip;

  case 0xF0:                    // Lock
    assert(os::is_MP(), "only on MP");
    goto again_after_prefix;

  case 0xF3:                    // For SSE
  case 0xF2:                    // For SSE2
    switch (0xFF & *ip++) {
    case REX:
    case REX_B:
    case REX_X:
    case REX_XB:
    case REX_R:
    case REX_RB:
    case REX_RX:
    case REX_RXB:
    case REX_W:
    case REX_WB:
    case REX_WX:
    case REX_WXB:
    case REX_WR:
    case REX_WRB:
    case REX_WRX:
    case REX_WRXB:
      NOT_LP64(assert(false, "found 64bit prefix"));
      ip++;
    default:
      ip++;
    }
    debug_only(has_disp32 = true); // has both kinds of operands!
    break;

  default:
    ShouldNotReachHere();

#undef REP8
#undef REP16
  }

  assert(which != call32_operand, "instruction is not a call, jmp, or jcc");
#ifdef _LP64
  assert(which != imm_operand, "instruction is not a movq reg, imm64");
#else
  // assert(which != imm_operand || has_imm32, "instruction has no imm32 field");
  assert(which != imm_operand || has_disp32, "instruction has no imm32 field");
#endif // LP64
  assert(which != disp32_operand || has_disp32, "instruction has no disp32 field");

  // parse the output of emit_operand
  int op2 = 0xFF & *ip++;
  int base = op2 & 0x07;
  int op3 = -1;
  const int b100 = 4;
  const int b101 = 5;
  if (base == b100 && (op2 >> 6) != 3) {
    op3 = 0xFF & *ip++;
    base = op3 & 0x07;   // refetch the base
  }
  // now ip points at the disp (if any)

  switch (op2 >> 6) {
  case 0:
    // [00 reg  100][ss index base]
    // [00 reg  100][00   100  esp]
    // [00 reg base]
    // [00 reg  100][ss index  101][disp32]
    // [00 reg  101]               [disp32]

    if (base == b101) {
      if (which == disp32_operand)
        return ip;              // caller wants the disp32
      ip += 4;                  // skip the disp32
    }
    break;

  case 1:
    // [01 reg  100][ss index base][disp8]
    // [01 reg  100][00   100  esp][disp8]
    // [01 reg base]               [disp8]
    ip += 1;                    // skip the disp8
    break;

  case 2:
    // [10 reg  100][ss index base][disp32]
    // [10 reg  100][00   100  esp][disp32]
    // [10 reg base]               [disp32]
    if (which == disp32_operand)
      return ip;                // caller wants the disp32
    ip += 4;                    // skip the disp32
    break;

  case 3:
    // [11 reg base]  (not a memory addressing mode)
    break;
  }

  if (which == end_pc_operand) {
    return ip + tail_size;
  }

#ifdef _LP64
  assert(which == narrow_oop_operand && !is_64bit, "instruction is not a movl adr, imm32");
#else
  assert(which == imm_operand, "instruction has only an imm field");
#endif // LP64
  return ip;
}

address Assembler::locate_next_instruction(address inst) {
  // Secretly share code with locate_operand:
  return locate_operand(inst, end_pc_operand);
}
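
// Illustrative sketch: the decoder above can be used to step over whole
// instructions in generated code, assuming 'begin' points at the first
// byte of an instruction.
//
//   address ip = begin;
//   for (int i = 0; i < 3; i++) {
//     ip = Assembler::locate_next_instruction(ip);  // skip one instruction
//   }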


#ifdef ASSERT
void Assembler::check_relocation(RelocationHolder const& rspec, int format) {
  address inst = inst_mark();
  assert(inst != NULL && inst < pc(), "must point to beginning of instruction");
  address opnd;

  Relocation* r = rspec.reloc();
  if (r->type() == relocInfo::none) {
    return;
  } else if (r->is_call() || format == call32_operand) {
    // assert(format == imm32_operand, "cannot specify a nonzero format");
    opnd = locate_operand(inst, call32_operand);
  } else if (r->is_data()) {
    assert(format == imm_operand || format == disp32_operand
           LP64_ONLY(|| format == narrow_oop_operand), "format ok");
    opnd = locate_operand(inst, (WhichOperand)format);
  } else {
    assert(format == imm_operand, "cannot specify a format");
    return;
  }
  assert(opnd == pc(), "must put operand where relocs can find it");
}
#endif // ASSERT

void Assembler::emit_operand32(Register reg, Address adr) {
  assert(reg->encoding() < 8, "no extended registers");
  assert(!adr.base_needs_rex() && !adr.index_needs_rex(), "no extended registers");
  emit_operand(reg, adr._base, adr._index, adr._scale, adr._disp,
               adr._rspec);
}

void Assembler::emit_operand(Register reg, Address adr,
                             int rip_relative_correction) {
  emit_operand(reg, adr._base, adr._index, adr._scale, adr._disp,
               adr._rspec,
               rip_relative_correction);
}

void Assembler::emit_operand(XMMRegister reg, Address adr) {
  emit_operand(reg, adr._base, adr._index, adr._scale, adr._disp,
               adr._rspec);
}

// MMX operations
void Assembler::emit_operand(MMXRegister reg, Address adr) {
  assert(!adr.base_needs_rex() && !adr.index_needs_rex(), "no extended registers");
  emit_operand((Register)reg, adr._base, adr._index, adr._scale, adr._disp, adr._rspec);
}

// work around gcc (3.2.1-7a) bug
void Assembler::emit_operand(Address adr, MMXRegister reg) {
  assert(!adr.base_needs_rex() && !adr.index_needs_rex(), "no extended registers");
  emit_operand((Register)reg, adr._base, adr._index, adr._scale, adr._disp, adr._rspec);
}


void Assembler::emit_farith(int b1, int b2, int i) {
  assert(isByte(b1) && isByte(b2), "wrong opcode");
  assert(0 <= i &&  i < 8, "illegal stack offset");
  emit_byte(b1);
  emit_byte(b2 + i);
}


// Now the Assembler instructions (identical for 32/64 bits)

void Assembler::adcl(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefix(dst);
  emit_arith_operand(0x81, rdx, dst, imm32);
}

void Assembler::adcl(Address dst, Register src) {
  InstructionMark im(this);
  prefix(dst, src);
  emit_byte(0x11);
  emit_operand(src, dst);
}

void Assembler::adcl(Register dst, int32_t imm32) {
  prefix(dst);
  emit_arith(0x81, 0xD0, dst, imm32);
}

void Assembler::adcl(Register dst, Address src) {
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x13);
  emit_operand(dst, src);
}

void Assembler::adcl(Register dst, Register src) {
  (void) prefix_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x13, 0xC0, dst, src);
}

void Assembler::addl(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefix(dst);
  emit_arith_operand(0x81, rax, dst, imm32);
}

void Assembler::addl(Address dst, Register src) {
  InstructionMark im(this);
  prefix(dst, src);
  emit_byte(0x01);
  emit_operand(src, dst);
}

void Assembler::addl(Register dst, int32_t imm32) {
  prefix(dst);
  emit_arith(0x81, 0xC0, dst, imm32);
}

void Assembler::addl(Register dst, Address src) {
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x03);
  emit_operand(dst, src);
}

void Assembler::addl(Register dst, Register src) {
  (void) prefix_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x03, 0xC0, dst, src);
}

void Assembler::addr_nop_4() {
  assert(UseAddressNop, "no CPU support");
  // 4 bytes: NOP DWORD PTR [EAX+0]
  emit_byte(0x0F);
  emit_byte(0x1F);
  emit_byte(0x40); // emit_rm(cbuf, 0x1, EAX_enc, EAX_enc);
  emit_byte(0);    // 8-bits offset (1 byte)
}

void Assembler::addr_nop_5() {
  assert(UseAddressNop, "no CPU support");
  // 5 bytes: NOP DWORD PTR [EAX+EAX*0+0] 8-bits offset
  emit_byte(0x0F);
  emit_byte(0x1F);
  emit_byte(0x44); // emit_rm(cbuf, 0x1, EAX_enc, 0x4);
  emit_byte(0x00); // emit_rm(cbuf, 0x0, EAX_enc, EAX_enc);
  emit_byte(0);    // 8-bits offset (1 byte)
}

void Assembler::addr_nop_7() {
  assert(UseAddressNop, "no CPU support");
  // 7 bytes: NOP DWORD PTR [EAX+0] 32-bits offset
  emit_byte(0x0F);
  emit_byte(0x1F);
  emit_byte(0x80); // emit_rm(cbuf, 0x2, EAX_enc, EAX_enc);
  emit_long(0);    // 32-bits offset (4 bytes)
}

void Assembler::addr_nop_8() {
  assert(UseAddressNop, "no CPU support");
  // 8 bytes: NOP DWORD PTR [EAX+EAX*0+0] 32-bits offset
  emit_byte(0x0F);
  emit_byte(0x1F);
  emit_byte(0x84); // emit_rm(cbuf, 0x2, EAX_enc, 0x4);
  emit_byte(0x00); // emit_rm(cbuf, 0x0, EAX_enc, EAX_enc);
  emit_long(0);    // 32-bits offset (4 bytes)
}

void Assembler::addsd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x58, dst, src, VEX_SIMD_F2);
}

void Assembler::addsd(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x58, dst, src, VEX_SIMD_F2);
}

void Assembler::addss(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  emit_simd_arith(0x58, dst, src, VEX_SIMD_F3);
}

void Assembler::addss(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  emit_simd_arith(0x58, dst, src, VEX_SIMD_F3);
}

void Assembler::andl(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefix(dst);
  emit_byte(0x81);
  emit_operand(rsp, dst, 4);
  emit_long(imm32);
}

void Assembler::andl(Register dst, int32_t imm32) {
  prefix(dst);
  emit_arith(0x81, 0xE0, dst, imm32);
}

void Assembler::andl(Register dst, Address src) {
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x23);
  emit_operand(dst, src);
}

void Assembler::andl(Register dst, Register src) {
  (void) prefix_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x23, 0xC0, dst, src);
}

void Assembler::bsfl(Register dst, Register src) {
  int encode = prefix_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xBC);
  emit_byte(0xC0 | encode);
}

void Assembler::bsrl(Register dst, Register src) {
  assert(!VM_Version::supports_lzcnt(), "encoding is treated as LZCNT");
  int encode = prefix_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xBD);
  emit_byte(0xC0 | encode);
}

void Assembler::bswapl(Register reg) { // bswap
  int encode = prefix_and_encode(reg->encoding());
  emit_byte(0x0F);
  emit_byte(0xC8 | encode);
}

void Assembler::call(Label& L, relocInfo::relocType rtype) {
  // suspect disp32 is always good
  int operand = LP64_ONLY(disp32_operand) NOT_LP64(imm_operand);

  if (L.is_bound()) {
    const int long_size = 5;
    int offs = (int)( target(L) - pc() );
    assert(offs <= 0, "assembler error");
    InstructionMark im(this);
    // 1110 1000 #32-bit disp
    emit_byte(0xE8);
    emit_data(offs - long_size, rtype, operand);
  } else {
    InstructionMark im(this);
    // 1110 1000 #32-bit disp
    L.add_patch_at(code(), locator());

    emit_byte(0xE8);
    emit_data(int(0), rtype, operand);
  }
}
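
// Illustrative usage sketch (assuming the conventional '#define __ masm->'):
// calling forward to a label bound later. The unbound branch above records
// a patch site, and bind() later fixes up the zero disp32.
//
//   Label target;
//   __ call(target, relocInfo::none);  // emits E8 00 00 00 00 for now
//   ...
//   __ bind(target);                   // patches the displacement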

void Assembler::call(Register dst) {
  int encode = prefix_and_encode(dst->encoding());
  emit_byte(0xFF);
  emit_byte(0xD0 | encode);
}


void Assembler::call(Address adr) {
  InstructionMark im(this);
  prefix(adr);
  emit_byte(0xFF);
  emit_operand(rdx, adr);
}

void Assembler::call_literal(address entry, RelocationHolder const& rspec) {
  assert(entry != NULL, "call most probably wrong");
  InstructionMark im(this);
  emit_byte(0xE8);
  intptr_t disp = entry - (_code_pos + sizeof(int32_t));
  assert(is_simm32(disp), "must be 32bit offset (call2)");
  // Technically, should use call32_operand, but this format is
  // implied by the fact that we're emitting a call instruction.

  int operand = LP64_ONLY(disp32_operand) NOT_LP64(call32_operand);
  emit_data((int) disp, rspec, operand);
}

void Assembler::cdql() {
  emit_byte(0x99);
}

void Assembler::cmovl(Condition cc, Register dst, Register src) {
  NOT_LP64(guarantee(VM_Version::supports_cmov(), "illegal instruction"));
  int encode = prefix_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0x40 | cc);
  emit_byte(0xC0 | encode);
}


void Assembler::cmovl(Condition cc, Register dst, Address src) {
  NOT_LP64(guarantee(VM_Version::supports_cmov(), "illegal instruction"));
  prefix(src, dst);
  emit_byte(0x0F);
  emit_byte(0x40 | cc);
  emit_operand(dst, src);
}

void Assembler::cmpb(Address dst, int imm8) {
  InstructionMark im(this);
  prefix(dst);
  emit_byte(0x80);
  emit_operand(rdi, dst, 1);
  emit_byte(imm8);
}

void Assembler::cmpl(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefix(dst);
  emit_byte(0x81);
  emit_operand(rdi, dst, 4);
  emit_long(imm32);
}

void Assembler::cmpl(Register dst, int32_t imm32) {
  prefix(dst);
  emit_arith(0x81, 0xF8, dst, imm32);
}

void Assembler::cmpl(Register dst, Register src) {
  (void) prefix_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x3B, 0xC0, dst, src);
}


void Assembler::cmpl(Register dst, Address  src) {
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x3B);
  emit_operand(dst, src);
}

void Assembler::cmpw(Address dst, int imm16) {
  InstructionMark im(this);
  assert(!dst.base_needs_rex() && !dst.index_needs_rex(), "no extended registers");
  emit_byte(0x66);
  emit_byte(0x81);
  emit_operand(rdi, dst, 2);
  emit_word(imm16);
}

// The 32-bit cmpxchg compares the value at adr with the contents of rax,
// and stores reg into adr if the values are equal; otherwise, the value
// at adr is loaded into rax. The ZF is set if the compared values were
// equal, and cleared otherwise.
void Assembler::cmpxchgl(Register reg, Address adr) { // cmpxchg
  if (Atomics & 2) {
     // caveat: no instructionmark, so this isn't relocatable.
     // Emit a synthetic, non-atomic, CAS equivalent.
     // Beware.  The synthetic form sets all ICCs, not just ZF.
     // cmpxchg r,[m] is equivalent to rax = CAS(m, rax, r)
     cmpl(rax, adr);
     movl(rax, adr);
     if (reg != rax) {
        Label L;
        jcc(Assembler::notEqual, L);
        movl(adr, reg);
        bind(L);
     }
  } else {
     InstructionMark im(this);
     prefix(adr, reg);
     emit_byte(0x0F);
     emit_byte(0xB1);
     emit_operand(reg, adr);
  }
}
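
// Illustrative usage sketch (assuming '#define __ masm->' and an Address
// 'counter_addr' — both hypothetical here): the usual atomic-increment
// retry loop built on lock()/cmpxchgl(), with rax holding the expected
// old value as the instruction requires.
//
//   Label retry;
//   __ bind(retry);
//   __ movl(rax, counter_addr);          // expected value into rax
//   __ leal(rbx, Address(rax, 1));       // desired = expected + 1
//   __ lock();                           // make the cmpxchg atomic
//   __ cmpxchgl(rbx, counter_addr);      // ZF set iff the CAS succeeded
//   __ jcc(Assembler::notEqual, retry);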

void Assembler::comisd(XMMRegister dst, Address src) {
  // NOTE: dbx seems to decode this as comiss even though the
  // 0x66 is there. Strangely, ucomisd comes out correctly.
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith_nonds(0x2F, dst, src, VEX_SIMD_66);
}

void Assembler::comisd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith_nonds(0x2F, dst, src, VEX_SIMD_66);
}

void Assembler::comiss(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  emit_simd_arith_nonds(0x2F, dst, src, VEX_SIMD_NONE);
}

void Assembler::comiss(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  emit_simd_arith_nonds(0x2F, dst, src, VEX_SIMD_NONE);
}

void Assembler::cvtdq2pd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith_nonds(0xE6, dst, src, VEX_SIMD_F3);
}

void Assembler::cvtdq2ps(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith_nonds(0x5B, dst, src, VEX_SIMD_NONE);
}

void Assembler::cvtsd2ss(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x5A, dst, src, VEX_SIMD_F2);
}

void Assembler::cvtsd2ss(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x5A, dst, src, VEX_SIMD_F2);
}

void Assembler::cvtsi2sdl(XMMRegister dst, Register src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2);
  emit_byte(0x2A);
  emit_byte(0xC0 | encode);
}

void Assembler::cvtsi2sdl(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x2A, dst, src, VEX_SIMD_F2);
}

void Assembler::cvtsi2ssl(XMMRegister dst, Register src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3);
  emit_byte(0x2A);
  emit_byte(0xC0 | encode);
}

void Assembler::cvtsi2ssl(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  emit_simd_arith(0x2A, dst, src, VEX_SIMD_F3);
}

void Assembler::cvtss2sd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x5A, dst, src, VEX_SIMD_F3);
}

void Assembler::cvtss2sd(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x5A, dst, src, VEX_SIMD_F3);
}


void Assembler::cvttsd2sil(Register dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_F2);
  emit_byte(0x2C);
  emit_byte(0xC0 | encode);
}

void Assembler::cvttss2sil(Register dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_F3);
  emit_byte(0x2C);
  emit_byte(0xC0 | encode);
}

void Assembler::decl(Address dst) {
  // Don't use it directly. Use MacroAssembler::decrement() instead.
  InstructionMark im(this);
  prefix(dst);
  emit_byte(0xFF);
  emit_operand(rcx, dst);
}

void Assembler::divsd(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x5E, dst, src, VEX_SIMD_F2);
}

void Assembler::divsd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x5E, dst, src, VEX_SIMD_F2);
}

void Assembler::divss(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  emit_simd_arith(0x5E, dst, src, VEX_SIMD_F3);
}

void Assembler::divss(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  emit_simd_arith(0x5E, dst, src, VEX_SIMD_F3);
}

void Assembler::emms() {
  NOT_LP64(assert(VM_Version::supports_mmx(), ""));
  emit_byte(0x0F);
  emit_byte(0x77);
}

void Assembler::hlt() {
  emit_byte(0xF4);
}

void Assembler::idivl(Register src) {
  int encode = prefix_and_encode(src->encoding());
  emit_byte(0xF7);
  emit_byte(0xF8 | encode);
}

void Assembler::divl(Register src) { // Unsigned
  int encode = prefix_and_encode(src->encoding());
  emit_byte(0xF7);
  emit_byte(0xF0 | encode);
}

void Assembler::imull(Register dst, Register src) {
  int encode = prefix_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xAF);
  emit_byte(0xC0 | encode);
}


void Assembler::imull(Register dst, Register src, int value) {
  int encode = prefix_and_encode(dst->encoding(), src->encoding());
  if (is8bit(value)) {
    emit_byte(0x6B);
    emit_byte(0xC0 | encode);
    emit_byte(value & 0xFF);
  } else {
    emit_byte(0x69);
    emit_byte(0xC0 | encode);
    emit_long(value);
  }
}

void Assembler::incl(Address dst) {
  // Don't use it directly. Use MacroAssembler::increment() instead.
  InstructionMark im(this);
  prefix(dst);
  emit_byte(0xFF);
  emit_operand(rax, dst);
}

void Assembler::jcc(Condition cc, Label& L, bool maybe_short) {
  InstructionMark im(this);
  assert((0 <= cc) && (cc < 16), "illegal cc");
  if (L.is_bound()) {
    address dst = target(L);
    assert(dst != NULL, "jcc most probably wrong");

    const int short_size = 2;
    const int long_size = 6;
    intptr_t offs = (intptr_t)dst - (intptr_t)_code_pos;
    if (maybe_short && is8bit(offs - short_size)) {
      // 0111 tttn #8-bit disp
      emit_byte(0x70 | cc);
      emit_byte((offs - short_size) & 0xFF);
    } else {
      // 0000 1111 1000 tttn #32-bit disp
      assert(is_simm32(offs - long_size),
             "must be 32bit offset (call4)");
      emit_byte(0x0F);
      emit_byte(0x80 | cc);
      emit_long(offs - long_size);
    }
  } else {
    // Note: we could eliminate conditional jumps to this jump if the
    //       condition is the same; however, that seems to be a rather
    //       unlikely case.
    // Note: use jccb() if the label to be bound is very close, to get
    //       an 8-bit displacement.
    L.add_patch_at(code(), locator());
    emit_byte(0x0F);
    emit_byte(0x80 | cc);
    emit_long(0);
  }
}
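
// Illustrative: when the branch target is known to be within a byte's
// reach (e.g. a tight backward loop), jccb() below emits the two-byte
// short form instead:
//
//   Label loop;
//   __ bind(loop);
//   __ decl(rcx);
//   __ jccb(Assembler::notZero, loop);   // 0x75 disp8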

void Assembler::jccb(Condition cc, Label& L) {
  if (L.is_bound()) {
    const int short_size = 2;
    address entry = target(L);
#ifdef ASSERT
    intptr_t dist = (intptr_t)entry - ((intptr_t)_code_pos + short_size);
    intptr_t delta = short_branch_delta();
    if (delta != 0) {
      dist += (dist < 0 ? (-delta) : delta);
    }
    assert(is8bit(dist), "Displacement too large for a short jmp");
#endif
    intptr_t offs = (intptr_t)entry - (intptr_t)_code_pos;
    // 0111 tttn #8-bit disp
    emit_byte(0x70 | cc);
    emit_byte((offs - short_size) & 0xFF);
  } else {
    InstructionMark im(this);
    L.add_patch_at(code(), locator());
    emit_byte(0x70 | cc);
    emit_byte(0);
  }
}

void Assembler::jmp(Address adr) {
  InstructionMark im(this);
  prefix(adr);
  emit_byte(0xFF);
  emit_operand(rsp, adr);
}

void Assembler::jmp(Label& L, bool maybe_short) {
  if (L.is_bound()) {
    address entry = target(L);
    assert(entry != NULL, "jmp most probably wrong");
    InstructionMark im(this);
    const int short_size = 2;
    const int long_size = 5;
    intptr_t offs = entry - _code_pos;
    if (maybe_short && is8bit(offs - short_size)) {
      emit_byte(0xEB);
      emit_byte((offs - short_size) & 0xFF);
    } else {
      emit_byte(0xE9);
      emit_long(offs - long_size);
    }
  } else {
    // By default, forward jumps are always 32-bit displacements, since
    // we can't yet know where the label will be bound.  If you're sure that
    // the forward jump will not run beyond 256 bytes, use jmpb to
    // force an 8-bit displacement.
    InstructionMark im(this);
    L.add_patch_at(code(), locator());
    emit_byte(0xE9);
    emit_long(0);
  }
}

void Assembler::jmp(Register entry) {
  int encode = prefix_and_encode(entry->encoding());
  emit_byte(0xFF);
  emit_byte(0xE0 | encode);
}

void Assembler::jmp_literal(address dest, RelocationHolder const& rspec) {
  InstructionMark im(this);
  emit_byte(0xE9);
  assert(dest != NULL, "must have a target");
  intptr_t disp = dest - (_code_pos + sizeof(int32_t));
  assert(is_simm32(disp), "must be 32bit offset (jmp)");
  emit_data(disp, rspec.reloc(), call32_operand);
}

void Assembler::jmpb(Label& L) {
  if (L.is_bound()) {
    const int short_size = 2;
    address entry = target(L);
    assert(entry != NULL, "jmp most probably wrong");
#ifdef ASSERT
    intptr_t dist = (intptr_t)entry - ((intptr_t)_code_pos + short_size);
    intptr_t delta = short_branch_delta();
    if (delta != 0) {
      dist += (dist < 0 ? (-delta) : delta);
    }
    assert(is8bit(dist), "Displacement too large for a short jmp");
#endif
    intptr_t offs = entry - _code_pos;
    emit_byte(0xEB);
    emit_byte((offs - short_size) & 0xFF);
  } else {
    InstructionMark im(this);
    L.add_patch_at(code(), locator());
    emit_byte(0xEB);
    emit_byte(0);
  }
}

void Assembler::ldmxcsr( Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  InstructionMark im(this);
  prefix(src);
  emit_byte(0x0F);
  emit_byte(0xAE);
  emit_operand(as_Register(2), src);
}

void Assembler::leal(Register dst, Address src) {
  InstructionMark im(this);
#ifdef _LP64
  emit_byte(0x67); // addr32
  prefix(src, dst);
#endif // LP64
  emit_byte(0x8D);
  emit_operand(dst, src);
}

void Assembler::lock() {
  if (Atomics & 1) {
     // Elide the LOCK prefix: emit a plain NOP in its place.
     emit_byte(0x90);
1519   } else {
1520      emit_byte(0xF0);
1521   }
1522 }
1523 
1524 void Assembler::lzcntl(Register dst, Register src) {
1525   assert(VM_Version::supports_lzcnt(), "encoding is treated as BSR");
1526   emit_byte(0xF3);
1527   int encode = prefix_and_encode(dst->encoding(), src->encoding());
1528   emit_byte(0x0F);
1529   emit_byte(0xBD);
1530   emit_byte(0xC0 | encode);
1531 }
1532 
1533 // Emit mfence instruction
1534 void Assembler::mfence() {
1535   NOT_LP64(assert(VM_Version::supports_sse2(), "unsupported");)
1536   emit_byte( 0x0F );
1537   emit_byte( 0xAE );
1538   emit_byte( 0xF0 );
1539 }
1540 
1541 void Assembler::mov(Register dst, Register src) {
1542   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
1543 }
1544 
1545 void Assembler::movapd(XMMRegister dst, XMMRegister src) {
1546   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
1547   emit_simd_arith_nonds(0x28, dst, src, VEX_SIMD_66);
1548 }
1549 
1550 void Assembler::movaps(XMMRegister dst, XMMRegister src) {
1551   NOT_LP64(assert(VM_Version::supports_sse(), ""));
1552   emit_simd_arith_nonds(0x28, dst, src, VEX_SIMD_NONE);
1553 }
1554 
1555 void Assembler::movlhps(XMMRegister dst, XMMRegister src) {
1556   NOT_LP64(assert(VM_Version::supports_sse(), ""));
1557   int encode = simd_prefix_and_encode(dst, src, src, VEX_SIMD_NONE);
1558   emit_byte(0x16);
1559   emit_byte(0xC0 | encode);
1560 }
1561 
1562 void Assembler::movb(Register dst, Address src) {
1563   NOT_LP64(assert(dst->has_byte_register(), "must have byte register"));
1564   InstructionMark im(this);
1565   prefix(src, dst, true);
1566   emit_byte(0x8A);
1567   emit_operand(dst, src);
1568 }
1569 
1570 
1571 void Assembler::movb(Address dst, int imm8) {
1572   InstructionMark im(this);
1573    prefix(dst);
1574   emit_byte(0xC6);
1575   emit_operand(rax, dst, 1);
1576   emit_byte(imm8);
1577 }
1578 
1579 
1580 void Assembler::movb(Address dst, Register src) {
1581   assert(src->has_byte_register(), "must have byte register");
1582   InstructionMark im(this);
1583   prefix(dst, src, true);
1584   emit_byte(0x88);
1585   emit_operand(src, dst);
1586 }
1587 
1588 void Assembler::movdl(XMMRegister dst, Register src) {
1589   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
1590   int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66);
1591   emit_byte(0x6E);
1592   emit_byte(0xC0 | encode);
1593 }
1594 
1595 void Assembler::movdl(Register dst, XMMRegister src) {
1596   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
1597   // swap src/dst to get correct prefix
1598   int encode = simd_prefix_and_encode(src, dst, VEX_SIMD_66);
1599   emit_byte(0x7E);
1600   emit_byte(0xC0 | encode);
1601 }
1602 
1603 void Assembler::movdl(XMMRegister dst, Address src) {
1604   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
1605   InstructionMark im(this);
1606   simd_prefix(dst, src, VEX_SIMD_66);
1607   emit_byte(0x6E);
1608   emit_operand(dst, src);
1609 }
1610 
1611 void Assembler::movdl(Address dst, XMMRegister src) {
1612   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
1613   InstructionMark im(this);
1614   simd_prefix(dst, src, VEX_SIMD_66);
1615   emit_byte(0x7E);
1616   emit_operand(src, dst);
1617 }
1618 
1619 void Assembler::movdqa(XMMRegister dst, XMMRegister src) {
1620   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
1621   emit_simd_arith_nonds(0x6F, dst, src, VEX_SIMD_66);
1622 }
1623 
1624 void Assembler::movdqu(XMMRegister dst, Address src) {
1625   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
1626   emit_simd_arith_nonds(0x6F, dst, src, VEX_SIMD_F3);
1627 }
1628 
1629 void Assembler::movdqu(XMMRegister dst, XMMRegister src) {
1630   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
1631   emit_simd_arith_nonds(0x6F, dst, src, VEX_SIMD_F3);
1632 }
1633 
1634 void Assembler::movdqu(Address dst, XMMRegister src) {
1635   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
1636   InstructionMark im(this);
1637   simd_prefix(dst, src, VEX_SIMD_F3);
1638   emit_byte(0x7F);
1639   emit_operand(src, dst);
1640 }
1641 
1642 // Move unaligned 256-bit vector
1643 void Assembler::vmovdqu(XMMRegister dst, XMMRegister src) {
1644   assert(UseAVX, "");
1645   bool vector256 = true;
1646   int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_F3, vector256);
1647   emit_byte(0x6F);
1648   emit_byte(0xC0 | encode);
1649 }
1650 
1651 void Assembler::vmovdqu(XMMRegister dst, Address src) {
1652   assert(UseAVX, "");
1653   InstructionMark im(this);
1654   bool vector256 = true;
1655   vex_prefix(dst, xnoreg, src, VEX_SIMD_F3, vector256);
1656   emit_byte(0x6F);
1657   emit_operand(dst, src);
1658 }
1659 
1660 void Assembler::vmovdqu(Address dst, XMMRegister src) {
1661   assert(UseAVX, "");
1662   InstructionMark im(this);
1663   bool vector256 = true;
1664   // swap src<->dst for encoding
1665   assert(src != xnoreg, "sanity");
1666   vex_prefix(src, xnoreg, dst, VEX_SIMD_F3, vector256);
1667   emit_byte(0x7F);
1668   emit_operand(src, dst);
1669 }
1670 
1671 // Uses zero extension on 64bit
1672 
1673 void Assembler::movl(Register dst, int32_t imm32) {
1674   int encode = prefix_and_encode(dst->encoding());
1675   emit_byte(0xB8 | encode);
1676   emit_long(imm32);
1677 }
1678 
1679 void Assembler::movl(Register dst, Register src) {
1680   int encode = prefix_and_encode(dst->encoding(), src->encoding());
1681   emit_byte(0x8B);
1682   emit_byte(0xC0 | encode);
1683 }
1684 
1685 void Assembler::movl(Register dst, Address src) {
1686   InstructionMark im(this);
1687   prefix(src, dst);
1688   emit_byte(0x8B);
1689   emit_operand(dst, src);
1690 }
1691 
1692 void Assembler::movl(Address dst, int32_t imm32) {
1693   InstructionMark im(this);
1694   prefix(dst);
1695   emit_byte(0xC7);
1696   emit_operand(rax, dst, 4);
1697   emit_long(imm32);
1698 }
1699 
1700 void Assembler::movl(Address dst, Register src) {
1701   InstructionMark im(this);
1702   prefix(dst, src);
1703   emit_byte(0x89);
1704   emit_operand(src, dst);
1705 }
1706 
1707 // Newer CPUs require the use of movsd and movss to avoid a partial register
1708 // stall when loading from memory. But for old Opterons, use movlpd instead
1709 // of movsd. The selection is done in MacroAssembler::movdbl() and movflt().
1710 void Assembler::movlpd(XMMRegister dst, Address src) {
1711   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
1712   emit_simd_arith(0x12, dst, src, VEX_SIMD_66);
1713 }
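// A minimal sketch (illustration only, not part of this file) of the
// selection policy described above, roughly as MacroAssembler::movdbl()
// applies it; the UseXmmLoadAndClearUpper flag is assumed to be the
// selector distinguishing the two kinds of CPUs.
#if 0
void movdbl_sketch(MacroAssembler* masm, XMMRegister dst, Address src) {
  if (UseXmmLoadAndClearUpper) {
    masm->movsd(dst, src);   // newer CPUs: clears upper bits, avoids the stall
  } else {
    masm->movlpd(dst, src);  // old Opteron: merges into dst, movsd is slow
  }
}
#endif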
1714 
1715 void Assembler::movq( MMXRegister dst, Address src ) {
1716   assert( VM_Version::supports_mmx(), "" );
1717   emit_byte(0x0F);
1718   emit_byte(0x6F);
1719   emit_operand(dst, src);
1720 }
1721 
1722 void Assembler::movq( Address dst, MMXRegister src ) {
1723   assert( VM_Version::supports_mmx(), "" );
1724   emit_byte(0x0F);
1725   emit_byte(0x7F);
1726   // workaround for a gcc (3.2.1-7a) bug
1727   // In that version of gcc, with only an emit_operand(MMX, Address)
1728   // available, gcc will tail-jump and try to reverse the parameters,
1729   // completely obliterating dst in the process. Having a version
1730   // available that doesn't need to swap the args at the tail jump
1731   // avoids the bug.
1732   emit_operand(dst, src);
1733 }
1734 
1735 void Assembler::movq(XMMRegister dst, Address src) {
1736   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
1737   InstructionMark im(this);
1738   simd_prefix(dst, src, VEX_SIMD_F3);
1739   emit_byte(0x7E);
1740   emit_operand(dst, src);
1741 }
1742 
1743 void Assembler::movq(Address dst, XMMRegister src) {
1744   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
1745   InstructionMark im(this);
1746   simd_prefix(dst, src, VEX_SIMD_66);
1747   emit_byte(0xD6);
1748   emit_operand(src, dst);
1749 }
1750 
1751 void Assembler::movsbl(Register dst, Address src) { // movsxb
1752   InstructionMark im(this);
1753   prefix(src, dst);
1754   emit_byte(0x0F);
1755   emit_byte(0xBE);
1756   emit_operand(dst, src);
1757 }
1758 
1759 void Assembler::movsbl(Register dst, Register src) { // movsxb
1760   NOT_LP64(assert(src->has_byte_register(), "must have byte register"));
1761   int encode = prefix_and_encode(dst->encoding(), src->encoding(), true);
1762   emit_byte(0x0F);
1763   emit_byte(0xBE);
1764   emit_byte(0xC0 | encode);
1765 }
1766 
1767 void Assembler::movsd(XMMRegister dst, XMMRegister src) {
1768   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
1769   emit_simd_arith(0x10, dst, src, VEX_SIMD_F2);
1770 }
1771 
1772 void Assembler::movsd(XMMRegister dst, Address src) {
1773   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
1774   emit_simd_arith_nonds(0x10, dst, src, VEX_SIMD_F2);
1775 }
1776 
1777 void Assembler::movsd(Address dst, XMMRegister src) {
1778   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
1779   InstructionMark im(this);
1780   simd_prefix(dst, src, VEX_SIMD_F2);
1781   emit_byte(0x11);
1782   emit_operand(src, dst);
1783 }
1784 
1785 void Assembler::movss(XMMRegister dst, XMMRegister src) {
1786   NOT_LP64(assert(VM_Version::supports_sse(), ""));
1787   emit_simd_arith(0x10, dst, src, VEX_SIMD_F3);
1788 }
1789 
1790 void Assembler::movss(XMMRegister dst, Address src) {
1791   NOT_LP64(assert(VM_Version::supports_sse(), ""));
1792   emit_simd_arith_nonds(0x10, dst, src, VEX_SIMD_F3);
1793 }
1794 
1795 void Assembler::movss(Address dst, XMMRegister src) {
1796   NOT_LP64(assert(VM_Version::supports_sse(), ""));
1797   InstructionMark im(this);
1798   simd_prefix(dst, src, VEX_SIMD_F3);
1799   emit_byte(0x11);
1800   emit_operand(src, dst);
1801 }
1802 
1803 void Assembler::movswl(Register dst, Address src) { // movsxw
1804   InstructionMark im(this);
1805   prefix(src, dst);
1806   emit_byte(0x0F);
1807   emit_byte(0xBF);
1808   emit_operand(dst, src);
1809 }
1810 
1811 void Assembler::movswl(Register dst, Register src) { // movsxw
1812   int encode = prefix_and_encode(dst->encoding(), src->encoding());
1813   emit_byte(0x0F);
1814   emit_byte(0xBF);
1815   emit_byte(0xC0 | encode);
1816 }
1817 
1818 void Assembler::movw(Address dst, int imm16) {
1819   InstructionMark im(this);
1821   emit_byte(0x66); // switch to 16-bit mode
1822   prefix(dst);
1823   emit_byte(0xC7);
1824   emit_operand(rax, dst, 2);
1825   emit_word(imm16);
1826 }
1827 
1828 void Assembler::movw(Register dst, Address src) {
1829   InstructionMark im(this);
1830   emit_byte(0x66);
1831   prefix(src, dst);
1832   emit_byte(0x8B);
1833   emit_operand(dst, src);
1834 }
1835 
1836 void Assembler::movw(Address dst, Register src) {
1837   InstructionMark im(this);
1838   emit_byte(0x66);
1839   prefix(dst, src);
1840   emit_byte(0x89);
1841   emit_operand(src, dst);
1842 }
1843 
1844 void Assembler::movzbl(Register dst, Address src) { // movzxb
1845   InstructionMark im(this);
1846   prefix(src, dst);
1847   emit_byte(0x0F);
1848   emit_byte(0xB6);
1849   emit_operand(dst, src);
1850 }
1851 
1852 void Assembler::movzbl(Register dst, Register src) { // movzxb
1853   NOT_LP64(assert(src->has_byte_register(), "must have byte register"));
1854   int encode = prefix_and_encode(dst->encoding(), src->encoding(), true);
1855   emit_byte(0x0F);
1856   emit_byte(0xB6);
1857   emit_byte(0xC0 | encode);
1858 }
1859 
1860 void Assembler::movzwl(Register dst, Address src) { // movzxw
1861   InstructionMark im(this);
1862   prefix(src, dst);
1863   emit_byte(0x0F);
1864   emit_byte(0xB7);
1865   emit_operand(dst, src);
1866 }
1867 
1868 void Assembler::movzwl(Register dst, Register src) { // movzxw
1869   int encode = prefix_and_encode(dst->encoding(), src->encoding());
1870   emit_byte(0x0F);
1871   emit_byte(0xB7);
1872   emit_byte(0xC0 | encode);
1873 }
1874 
1875 void Assembler::mull(Address src) {
1876   InstructionMark im(this);
1877   prefix(src);
1878   emit_byte(0xF7);
1879   emit_operand(rsp, src);
1880 }
1881 
1882 void Assembler::mull(Register src) {
1883   int encode = prefix_and_encode(src->encoding());
1884   emit_byte(0xF7);
1885   emit_byte(0xE0 | encode);
1886 }
1887 
1888 void Assembler::mulsd(XMMRegister dst, Address src) {
1889   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
1890   emit_simd_arith(0x59, dst, src, VEX_SIMD_F2);
1891 }
1892 
1893 void Assembler::mulsd(XMMRegister dst, XMMRegister src) {
1894   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
1895   emit_simd_arith(0x59, dst, src, VEX_SIMD_F2);
1896 }
1897 
1898 void Assembler::mulss(XMMRegister dst, Address src) {
1899   NOT_LP64(assert(VM_Version::supports_sse(), ""));
1900   emit_simd_arith(0x59, dst, src, VEX_SIMD_F3);
1901 }
1902 
1903 void Assembler::mulss(XMMRegister dst, XMMRegister src) {
1904   NOT_LP64(assert(VM_Version::supports_sse(), ""));
1905   emit_simd_arith(0x59, dst, src, VEX_SIMD_F3);
1906 }
1907 
1908 void Assembler::negl(Register dst) {
1909   int encode = prefix_and_encode(dst->encoding());
1910   emit_byte(0xF7);
1911   emit_byte(0xD8 | encode);
1912 }
1913 
1914 void Assembler::nop(int i) {
1915 #ifdef ASSERT
1916   assert(i > 0, " ");
1917   // The fancy nops aren't currently recognized by debuggers, making it a
1918   // pain to disassemble code while debugging. If asserts are on, speed is
1919   // clearly not an issue, so simply use the traditional single-byte nop
1920   // for alignment.
1921 
1922   for (; i > 0 ; i--) emit_byte(0x90);
1923   return;
1924 
1925 #endif // ASSERT
1926 
1927   if (UseAddressNop && VM_Version::is_intel()) {
1928     //
1929     // Using multi-byte nops "0x0F 0x1F [address]" for Intel
1930     //  1: 0x90
1931     //  2: 0x66 0x90
1932     //  3: 0x66 0x66 0x90 (don't use "0x0F 0x1F 0x00" - need patching safe padding)
1933     //  4: 0x0F 0x1F 0x40 0x00
1934     //  5: 0x0F 0x1F 0x44 0x00 0x00
1935     //  6: 0x66 0x0F 0x1F 0x44 0x00 0x00
1936     //  7: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
1937     //  8: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
1938     //  9: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
1939     // 10: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
1940     // 11: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
1941 
1942     // The remaining encodings are Intel-specific - don't use consecutive address nops
1943 
1944     // 12: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
1945     // 13: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
1946     // 14: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
1947     // 15: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
1948 
1949     while (i >= 15) {
1950       // For Intel don't generate consecutive address nops (mix with regular nops)
1951       i -= 15;
1952       emit_byte(0x66);   // size prefix
1953       emit_byte(0x66);   // size prefix
1954       emit_byte(0x66);   // size prefix
1955       addr_nop_8();
1956       emit_byte(0x66);   // size prefix
1957       emit_byte(0x66);   // size prefix
1958       emit_byte(0x66);   // size prefix
1959       emit_byte(0x90);   // nop
1960     }
1961     switch (i) {
1962       case 14:
1963         emit_byte(0x66); // size prefix
1964       case 13:
1965         emit_byte(0x66); // size prefix
1966       case 12:
1967         addr_nop_8();
1968         emit_byte(0x66); // size prefix
1969         emit_byte(0x66); // size prefix
1970         emit_byte(0x66); // size prefix
1971         emit_byte(0x90); // nop
1972         break;
1973       case 11:
1974         emit_byte(0x66); // size prefix
1975       case 10:
1976         emit_byte(0x66); // size prefix
1977       case 9:
1978         emit_byte(0x66); // size prefix
1979       case 8:
1980         addr_nop_8();
1981         break;
1982       case 7:
1983         addr_nop_7();
1984         break;
1985       case 6:
1986         emit_byte(0x66); // size prefix
1987       case 5:
1988         addr_nop_5();
1989         break;
1990       case 4:
1991         addr_nop_4();
1992         break;
1993       case 3:
1994         // Don't use "0x0F 0x1F 0x00" - need patching safe padding
1995         emit_byte(0x66); // size prefix
1996       case 2:
1997         emit_byte(0x66); // size prefix
1998       case 1:
1999         emit_byte(0x90); // nop
2000         break;
2001       default:
2002         assert(i == 0, " ");
2003     }
2004     return;
2005   }
2006   if (UseAddressNop && VM_Version::is_amd()) {
2007     //
2008     // Using multi-byte nops "0x0F 0x1F [address]" for AMD.
2009     //  1: 0x90
2010     //  2: 0x66 0x90
2011     //  3: 0x66 0x66 0x90 (don't use "0x0F 0x1F 0x00" - need patching safe padding)
2012     //  4: 0x0F 0x1F 0x40 0x00
2013     //  5: 0x0F 0x1F 0x44 0x00 0x00
2014     //  6: 0x66 0x0F 0x1F 0x44 0x00 0x00
2015     //  7: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
2016     //  8: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
2017     //  9: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
2018     // 10: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
2019     // 11: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
2020 
2021     // The remaining encodings are AMD-specific - use consecutive address nops
2022 
2023     // 12: 0x66 0x0F 0x1F 0x44 0x00 0x00 0x66 0x0F 0x1F 0x44 0x00 0x00
2024     // 13: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00 0x66 0x0F 0x1F 0x44 0x00 0x00
2025     // 14: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
2026     // 15: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
2027     // 16: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
2028     //     Size prefixes (0x66) are added for larger sizes
2029 
2030     while (i >= 22) {
2031       i -= 11;
2032       emit_byte(0x66); // size prefix
2033       emit_byte(0x66); // size prefix
2034       emit_byte(0x66); // size prefix
2035       addr_nop_8();
2036     }
2037     // Generate the first nop for sizes 12 through 21
2038     switch (i) {
2039       case 21:
2040         i -= 1;
2041         emit_byte(0x66); // size prefix
2042       case 20:
2043       case 19:
2044         i -= 1;
2045         emit_byte(0x66); // size prefix
2046       case 18:
2047       case 17:
2048         i -= 1;
2049         emit_byte(0x66); // size prefix
2050       case 16:
2051       case 15:
2052         i -= 8;
2053         addr_nop_8();
2054         break;
2055       case 14:
2056       case 13:
2057         i -= 7;
2058         addr_nop_7();
2059         break;
2060       case 12:
2061         i -= 6;
2062         emit_byte(0x66); // size prefix
2063         addr_nop_5();
2064         break;
2065       default:
2066         assert(i < 12, " ");
2067     }
2068 
2069     // Generate the second nop for sizes 1 through 11
2070     switch (i) {
2071       case 11:
2072         emit_byte(0x66); // size prefix
2073       case 10:
2074         emit_byte(0x66); // size prefix
2075       case 9:
2076         emit_byte(0x66); // size prefix
2077       case 8:
2078         addr_nop_8();
2079         break;
2080       case 7:
2081         addr_nop_7();
2082         break;
2083       case 6:
2084         emit_byte(0x66); // size prefix
2085       case 5:
2086         addr_nop_5();
2087         break;
2088       case 4:
2089         addr_nop_4();
2090         break;
2091       case 3:
2092         // Don't use "0x0F 0x1F 0x00" - need patching safe padding
2093         emit_byte(0x66); // size prefix
2094       case 2:
2095         emit_byte(0x66); // size prefix
2096       case 1:
2097         emit_byte(0x90); // nop
2098         break;
2099       default:
2100         assert(i == 0, " ");
2101     }
2102     return;
2103   }
2104 
2105   // Using nops with size prefixes "0x66 0x90".
2106   // From AMD Optimization Guide:
2107   //  1: 0x90
2108   //  2: 0x66 0x90
2109   //  3: 0x66 0x66 0x90
2110   //  4: 0x66 0x66 0x66 0x90
2111   //  5: 0x66 0x66 0x90 0x66 0x90
2112   //  6: 0x66 0x66 0x90 0x66 0x66 0x90
2113   //  7: 0x66 0x66 0x66 0x90 0x66 0x66 0x90
2114   //  8: 0x66 0x66 0x66 0x90 0x66 0x66 0x66 0x90
2115   //  9: 0x66 0x66 0x90 0x66 0x66 0x90 0x66 0x66 0x90
2116   // 10: 0x66 0x66 0x66 0x90 0x66 0x66 0x90 0x66 0x66 0x90
2117   //
2118   while (i > 12) {
2119     i -= 4;
2120     emit_byte(0x66); // size prefix
2121     emit_byte(0x66);
2122     emit_byte(0x66);
2123     emit_byte(0x90); // nop
2124   }
2125   // 1 - 12 nops
2126   if (i > 8) {
2127     if (i > 9) {
2128       i -= 1;
2129       emit_byte(0x66);
2130     }
2131     i -= 3;
2132     emit_byte(0x66);
2133     emit_byte(0x66);
2134     emit_byte(0x90);
2135   }
2136   // 1 - 8 nops
2137   if (i > 4) {
2138     if (i > 6) {
2139       i -= 1;
2140       emit_byte(0x66);
2141     }
2142     i -= 3;
2143     emit_byte(0x66);
2144     emit_byte(0x66);
2145     emit_byte(0x90);
2146   }
2147   switch (i) {
2148     case 4:
2149       emit_byte(0x66);
2150     case 3:
2151       emit_byte(0x66);
2152     case 2:
2153       emit_byte(0x66);
2154     case 1:
2155       emit_byte(0x90);
2156       break;
2157     default:
2158       assert(i == 0, " ");
2159   }
2160 }
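// Worked example (illustrative): per the tables above, nop(7) emits
//   0x0F 0x1F 0x80 0x00 0x00 0x00 0x00   with UseAddressNop
//                                         (addr_nop_7, on Intel and AMD alike)
//   0x66 0x66 0x66 0x90 0x66 0x66 0x90   otherwise (a 4-byte and a 3-byte
//                                         prefixed nop from the fallback path)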
2161 
2162 void Assembler::notl(Register dst) {
2163   int encode = prefix_and_encode(dst->encoding());
2164   emit_byte(0xF7);
2165   emit_byte(0xD0 | encode);
2166 }
2167 
2168 void Assembler::orl(Address dst, int32_t imm32) {
2169   InstructionMark im(this);
2170   prefix(dst);
2171   emit_arith_operand(0x81, rcx, dst, imm32);
2172 }
2173 
2174 void Assembler::orl(Register dst, int32_t imm32) {
2175   prefix(dst);
2176   emit_arith(0x81, 0xC8, dst, imm32);
2177 }
2178 
2179 void Assembler::orl(Register dst, Address src) {
2180   InstructionMark im(this);
2181   prefix(src, dst);
2182   emit_byte(0x0B);
2183   emit_operand(dst, src);
2184 }
2185 
2186 void Assembler::orl(Register dst, Register src) {
2187   (void) prefix_and_encode(dst->encoding(), src->encoding());
2188   emit_arith(0x0B, 0xC0, dst, src);
2189 }
2190 
2191 void Assembler::packuswb(XMMRegister dst, Address src) {
2192   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
2193   assert((UseAVX > 0), "SSE form requires 16-byte address alignment");
2194   emit_simd_arith(0x67, dst, src, VEX_SIMD_66);
2195 }
2196 
2197 void Assembler::packuswb(XMMRegister dst, XMMRegister src) {
2198   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
2199   emit_simd_arith(0x67, dst, src, VEX_SIMD_66);
2200 }
2201 
2202 void Assembler::pcmpestri(XMMRegister dst, Address src, int imm8) {
2203   assert(VM_Version::supports_sse4_2(), "");
2204   InstructionMark im(this);
2205   simd_prefix(dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A);
2206   emit_byte(0x61);
2207   emit_operand(dst, src);
2208   emit_byte(imm8);
2209 }
2210 
2211 void Assembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) {
2212   assert(VM_Version::supports_sse4_2(), "");
2213   int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_3A);
2214   emit_byte(0x61);
2215   emit_byte(0xC0 | encode);
2216   emit_byte(imm8);
2217 }
2218 
2219 void Assembler::pmovzxbw(XMMRegister dst, Address src) {
2220   assert(VM_Version::supports_sse4_1(), "");
2221   InstructionMark im(this);
2222   simd_prefix(dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
2223   emit_byte(0x30);
2224   emit_operand(dst, src);
2225 }
2226 
2227 void Assembler::pmovzxbw(XMMRegister dst, XMMRegister src) {
2228   assert(VM_Version::supports_sse4_1(), "");
2229   int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
2230   emit_byte(0x30);
2231   emit_byte(0xC0 | encode);
2232 }
2233 
2234 // generic
2235 void Assembler::pop(Register dst) {
2236   int encode = prefix_and_encode(dst->encoding());
2237   emit_byte(0x58 | encode);
2238 }
2239 
2240 void Assembler::popcntl(Register dst, Address src) {
2241   assert(VM_Version::supports_popcnt(), "must support");
2242   InstructionMark im(this);
2243   emit_byte(0xF3);
2244   prefix(src, dst);
2245   emit_byte(0x0F);
2246   emit_byte(0xB8);
2247   emit_operand(dst, src);
2248 }
2249 
2250 void Assembler::popcntl(Register dst, Register src) {
2251   assert(VM_Version::supports_popcnt(), "must support");
2252   emit_byte(0xF3);
2253   int encode = prefix_and_encode(dst->encoding(), src->encoding());
2254   emit_byte(0x0F);
2255   emit_byte(0xB8);
2256   emit_byte(0xC0 | encode);
2257 }
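// Worked example (illustrative): popcntl(rax, rcx) computes
// encode = (0 << 3) | 1 = 1 and emits F3 0F B8 C1, i.e. popcnt eax, ecx,
// with the destination in the ModRM reg field.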
2258 
2259 void Assembler::popf() {
2260   emit_byte(0x9D);
2261 }
2262 
2263 #ifndef _LP64 // no 32bit push/pop on amd64
2264 void Assembler::popl(Address dst) {
2265   // NOTE: on 64-bit this would adjust the stack by 8 bytes
2266   InstructionMark im(this);
2267   prefix(dst);
2268   emit_byte(0x8F);
2269   emit_operand(rax, dst);
2270 }
2271 #endif
2272 
2273 void Assembler::prefetch_prefix(Address src) {
2274   prefix(src);
2275   emit_byte(0x0F);
2276 }
2277 
2278 void Assembler::prefetchnta(Address src) {
2279   NOT_LP64(assert(VM_Version::supports_sse(), "must support"));
2280   InstructionMark im(this);
2281   prefetch_prefix(src);
2282   emit_byte(0x18);
2283   emit_operand(rax, src); // 0, src
2284 }
2285 
2286 void Assembler::prefetchr(Address src) {
2287   assert(VM_Version::supports_3dnow_prefetch(), "must support");
2288   InstructionMark im(this);
2289   prefetch_prefix(src);
2290   emit_byte(0x0D);
2291   emit_operand(rax, src); // 0, src
2292 }
2293 
2294 void Assembler::prefetcht0(Address src) {
2295   NOT_LP64(assert(VM_Version::supports_sse(), "must support"));
2296   InstructionMark im(this);
2297   prefetch_prefix(src);
2298   emit_byte(0x18);
2299   emit_operand(rcx, src); // 1, src
2300 }
2301 
2302 void Assembler::prefetcht1(Address src) {
2303   NOT_LP64(assert(VM_Version::supports_sse(), "must support"));
2304   InstructionMark im(this);
2305   prefetch_prefix(src);
2306   emit_byte(0x18);
2307   emit_operand(rdx, src); // 2, src
2308 }
2309 
2310 void Assembler::prefetcht2(Address src) {
2311   NOT_LP64(assert(VM_Version::supports_sse(), "must support"));
2312   InstructionMark im(this);
2313   prefetch_prefix(src);
2314   emit_byte(0x18);
2315   emit_operand(rbx, src); // 3, src
2316 }
2317 
2318 void Assembler::prefetchw(Address src) {
2319   assert(VM_Version::supports_3dnow_prefetch(), "must support");
2320   InstructionMark im(this);
2321   prefetch_prefix(src);
2322   emit_byte(0x0D);
2323   emit_operand(rcx, src); // 1, src
2324 }
2325 
2326 void Assembler::prefix(Prefix p) {
2327   a_byte(p);
2328 }
2329 
2330 void Assembler::pshufd(XMMRegister dst, XMMRegister src, int mode) {
2331   assert(isByte(mode), "invalid value");
2332   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
2333   emit_simd_arith_nonds(0x70, dst, src, VEX_SIMD_66);
2334   emit_byte(mode & 0xFF);
2336 }
2337 
2338 void Assembler::pshufd(XMMRegister dst, Address src, int mode) {
2339   assert(isByte(mode), "invalid value");
2340   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
2341   assert((UseAVX > 0), "SSE form requires 16-byte address alignment");
2342   InstructionMark im(this);
2343   simd_prefix(dst, src, VEX_SIMD_66);
2344   emit_byte(0x70);
2345   emit_operand(dst, src);
2346   emit_byte(mode & 0xFF);
2347 }
2348 
2349 void Assembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
2350   assert(isByte(mode), "invalid value");
2351   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
2352   emit_simd_arith_nonds(0x70, dst, src, VEX_SIMD_F2);
2353   emit_byte(mode & 0xFF);
2354 }
2355 
2356 void Assembler::pshuflw(XMMRegister dst, Address src, int mode) {
2357   assert(isByte(mode), "invalid value");
2358   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
2359   assert((UseAVX > 0), "SSE form requires 16-byte address alignment");
2360   InstructionMark im(this);
2361   simd_prefix(dst, src, VEX_SIMD_F2);
2362   emit_byte(0x70);
2363   emit_operand(dst, src);
2364   emit_byte(mode & 0xFF);
2365 }
2366 
2367 void Assembler::psrldq(XMMRegister dst, int shift) {
2368   // Shift the 128-bit value in an xmm register right by the given number of bytes.
2369   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  // XMM3 is for /3 encoding: 66 0F 73 /3 ib
2370   int encode = simd_prefix_and_encode(xmm3, dst, dst, VEX_SIMD_66);
2371   emit_byte(0x73);
2372   emit_byte(0xC0 | encode);
2373   emit_byte(shift);
2374 }
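// Worked example (illustrative): psrldq(xmm1, 8) emits 66 0F 73 D9 08;
// the ModRM byte D9 encodes mod=11, reg=011 (the /3 opcode extension)
// and rm=001 (xmm1). Shifting right by 8 bytes moves the high quadword
// of xmm1 into the low quadword and zeroes the high half.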
2375 
2376 void Assembler::ptest(XMMRegister dst, Address src) {
2377   assert(VM_Version::supports_sse4_1(), "");
2378   assert((UseAVX > 0), "SSE form requires 16-byte address alignment");
2379   InstructionMark im(this);
2380   simd_prefix(dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
2381   emit_byte(0x17);
2382   emit_operand(dst, src);
2383 }
2384 
2385 void Assembler::ptest(XMMRegister dst, XMMRegister src) {
2386   assert(VM_Version::supports_sse4_1(), "");
2387   int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
2388   emit_byte(0x17);
2389   emit_byte(0xC0 | encode);
2390 }
2391 
2392 void Assembler::punpcklbw(XMMRegister dst, Address src) {
2393   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
2394   assert((UseAVX > 0), "SSE form requires 16-byte address alignment");
2395   emit_simd_arith(0x60, dst, src, VEX_SIMD_66);
2396 }
2397 
2398 void Assembler::punpcklbw(XMMRegister dst, XMMRegister src) {
2399   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
2400   emit_simd_arith(0x60, dst, src, VEX_SIMD_66);
2401 }
2402 
2403 void Assembler::punpckldq(XMMRegister dst, Address src) {
2404   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
2405   assert((UseAVX > 0), "SSE form requires 16-byte address alignment");
2406   emit_simd_arith(0x62, dst, src, VEX_SIMD_66);
2407 }
2408 
2409 void Assembler::punpckldq(XMMRegister dst, XMMRegister src) {
2410   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
2411   emit_simd_arith(0x62, dst, src, VEX_SIMD_66);
2412 }
2413 
2414 void Assembler::punpcklqdq(XMMRegister dst, XMMRegister src) {
2415   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
2416   emit_simd_arith(0x6C, dst, src, VEX_SIMD_66);
2417 }
2418 
2419 void Assembler::push(int32_t imm32) {
2420   // in 64-bit mode we push 64 bits onto the stack but take only
2421   // a 32-bit immediate, which is sign-extended
2422   emit_byte(0x68);
2423   emit_long(imm32);
2424 }
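// Worked example (illustrative): push(0x12345678) emits 68 78 56 34 12.
// On 64-bit rsp drops by 8; on 32-bit rsp drops by 4.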
2425 
2426 void Assembler::push(Register src) {
2427   int encode = prefix_and_encode(src->encoding());
2429   emit_byte(0x50 | encode);
2430 }
2431 
2432 void Assembler::pushf() {
2433   emit_byte(0x9C);
2434 }
2435 
2436 #ifndef _LP64 // no 32bit push/pop on amd64
2437 void Assembler::pushl(Address src) {
2438   // NOTE: on 64-bit this would push 64 bits
2439   InstructionMark im(this);
2440   prefix(src);
2441   emit_byte(0xFF);
2442   emit_operand(rsi, src);
2443 }
2444 #endif
2445 
2446 void Assembler::rcll(Register dst, int imm8) {
2447   assert(isShiftCount(imm8), "illegal shift count");
2448   int encode = prefix_and_encode(dst->encoding());
2449   if (imm8 == 1) {
2450     emit_byte(0xD1);
2451     emit_byte(0xD0 | encode);
2452   } else {
2453     emit_byte(0xC1);
2454     emit_byte(0xD0 | encode);
2455     emit_byte(imm8);
2456   }
2457 }
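// Worked example (illustrative): the imm8 == 1 branch exists because x86
// has a shorter form for shift/rotate by one. rcll(rax, 1) emits D1 D0,
// while rcll(rax, 3) emits C1 D0 03 (ModRM D0 = mod 11, /2 for RCL, rm eax).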
2458 
2459 // copies data from [esi] to [edi] using rcx pointer-sized words
2460 // generic
2461 void Assembler::rep_mov() {
2462   emit_byte(0xF3);
2463   // MOVSQ
2464   LP64_ONLY(prefix(REX_W));
2465   emit_byte(0xA5);
2466 }
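// Worked example (illustrative): rep_mov() emits F3 A5 (rep movsd) on
// 32-bit and F3 48 A5 (rep movsq) on 64-bit; the REX.W prefix widens the
// copy unit to the pointer size.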
2467 
2468 // stores the value in rax into rcx pointer-sized words starting at [edi]
2469 // generic
2470 void Assembler::rep_set() {
2471   emit_byte(0xF3);
2472   // STOSQ
2473   LP64_ONLY(prefix(REX_W));
2474   emit_byte(0xAB);
2475 }
2476 
2477 // scans rcx pointer-sized words at [edi] for an occurrence of rax
2478 // generic
2479 void Assembler::repne_scan() {
2480   emit_byte(0xF2);
2481   // SCASQ
2482   LP64_ONLY(prefix(REX_W));
2483   emit_byte(0xAF);
2484 }
2485 
2486 #ifdef _LP64
2487 // scans rcx 4-byte words at [edi] for an occurrence of rax
2488 // generic
2489 void Assembler::repne_scanl() {
2490   emit_byte(0xF2);
2491   // SCASL
2492   emit_byte(0xAF);
2493 }
2494 #endif
2495 
2496 void Assembler::ret(int imm16) {
2497   if (imm16 == 0) {
2498     emit_byte(0xC3);
2499   } else {
2500     emit_byte(0xC2);
2501     emit_word(imm16);
2502   }
2503 }
2504 
2505 void Assembler::sahf() {
2506 #ifdef _LP64
2507   // Not supported in 64bit mode
2508   ShouldNotReachHere();
2509 #endif
2510   emit_byte(0x9E);
2511 }
2512 
2513 void Assembler::sarl(Register dst, int imm8) {
2514   int encode = prefix_and_encode(dst->encoding());
2515   assert(isShiftCount(imm8), "illegal shift count");
2516   if (imm8 == 1) {
2517     emit_byte(0xD1);
2518     emit_byte(0xF8 | encode);
2519   } else {
2520     emit_byte(0xC1);
2521     emit_byte(0xF8 | encode);
2522     emit_byte(imm8);
2523   }
2524 }
2525 
2526 void Assembler::sarl(Register dst) {
2527   int encode = prefix_and_encode(dst->encoding());
2528   emit_byte(0xD3);
2529   emit_byte(0xF8 | encode);
2530 }
2531 
2532 void Assembler::sbbl(Address dst, int32_t imm32) {
2533   InstructionMark im(this);
2534   prefix(dst);
2535   emit_arith_operand(0x81, rbx, dst, imm32);
2536 }
2537 
2538 void Assembler::sbbl(Register dst, int32_t imm32) {
2539   prefix(dst);
2540   emit_arith(0x81, 0xD8, dst, imm32);
2541 }
2542 
2543 
2544 void Assembler::sbbl(Register dst, Address src) {
2545   InstructionMark im(this);
2546   prefix(src, dst);
2547   emit_byte(0x1B);
2548   emit_operand(dst, src);
2549 }
2550 
2551 void Assembler::sbbl(Register dst, Register src) {
2552   (void) prefix_and_encode(dst->encoding(), src->encoding());
2553   emit_arith(0x1B, 0xC0, dst, src);
2554 }
2555 
2556 void Assembler::setb(Condition cc, Register dst) {
2557   assert(0 <= cc && cc < 16, "illegal cc");
2558   int encode = prefix_and_encode(dst->encoding(), true);
2559   emit_byte(0x0F);
2560   emit_byte(0x90 | cc);
2561   emit_byte(0xC0 | encode);
2562 }
2563 
2564 void Assembler::shll(Register dst, int imm8) {
2565   assert(isShiftCount(imm8), "illegal shift count");
2566   int encode = prefix_and_encode(dst->encoding());
2567   if (imm8 == 1) {
2568     emit_byte(0xD1);
2569     emit_byte(0xE0 | encode);
2570   } else {
2571     emit_byte(0xC1);
2572     emit_byte(0xE0 | encode);
2573     emit_byte(imm8);
2574   }
2575 }
2576 
2577 void Assembler::shll(Register dst) {
2578   int encode = prefix_and_encode(dst->encoding());
2579   emit_byte(0xD3);
2580   emit_byte(0xE0 | encode);
2581 }
2582 
2583 void Assembler::shrl(Register dst, int imm8) {
2584   assert(isShiftCount(imm8), "illegal shift count");
2585   int encode = prefix_and_encode(dst->encoding());
2586   emit_byte(0xC1);
2587   emit_byte(0xE8 | encode);
2588   emit_byte(imm8);
2589 }
2590 
2591 void Assembler::shrl(Register dst) {
2592   int encode = prefix_and_encode(dst->encoding());
2593   emit_byte(0xD3);
2594   emit_byte(0xE8 | encode);
2595 }
2596 
2597 // copies a single 32-bit word from [esi] to [edi]
2598 void Assembler::smovl() {
2599   emit_byte(0xA5);
2600 }
2601 
2602 void Assembler::sqrtsd(XMMRegister dst, XMMRegister src) {
2603   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
2604   emit_simd_arith(0x51, dst, src, VEX_SIMD_F2);
2605 }
2606 
2607 void Assembler::sqrtsd(XMMRegister dst, Address src) {
2608   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
2609   emit_simd_arith(0x51, dst, src, VEX_SIMD_F2);
2610 }
2611 
2612 void Assembler::sqrtss(XMMRegister dst, XMMRegister src) {
2613   NOT_LP64(assert(VM_Version::supports_sse(), ""));
2614   emit_simd_arith(0x51, dst, src, VEX_SIMD_F3);
2615 }
2616 
2617 void Assembler::sqrtss(XMMRegister dst, Address src) {
2618   NOT_LP64(assert(VM_Version::supports_sse(), ""));
2619   emit_simd_arith(0x51, dst, src, VEX_SIMD_F3);
2620 }
2621 
2622 void Assembler::stmxcsr(Address dst) {
2623   NOT_LP64(assert(VM_Version::supports_sse(), ""));
2624   InstructionMark im(this);
2625   prefix(dst);
2626   emit_byte(0x0F);
2627   emit_byte(0xAE);
2628   emit_operand(as_Register(3), dst);
2629 }
2630 
2631 void Assembler::subl(Address dst, int32_t imm32) {
2632   InstructionMark im(this);
2633   prefix(dst);
2634   emit_arith_operand(0x81, rbp, dst, imm32);
2635 }
2636 
2637 void Assembler::subl(Address dst, Register src) {
2638   InstructionMark im(this);
2639   prefix(dst, src);
2640   emit_byte(0x29);
2641   emit_operand(src, dst);
2642 }
2643 
2644 void Assembler::subl(Register dst, int32_t imm32) {
2645   prefix(dst);
2646   emit_arith(0x81, 0xE8, dst, imm32);
2647 }
2648 
2649 // Force generation of a 4-byte immediate even if the value fits into 8 bits
2650 void Assembler::subl_imm32(Register dst, int32_t imm32) {
2651   prefix(dst);
2652   emit_arith_imm32(0x81, 0xE8, dst, imm32);
2653 }
2654 
2655 void Assembler::subl(Register dst, Address src) {
2656   InstructionMark im(this);
2657   prefix(src, dst);
2658   emit_byte(0x2B);
2659   emit_operand(dst, src);
2660 }
2661 
2662 void Assembler::subl(Register dst, Register src) {
2663   (void) prefix_and_encode(dst->encoding(), src->encoding());
2664   emit_arith(0x2B, 0xC0, dst, src);
2665 }
2666 
2667 void Assembler::subsd(XMMRegister dst, XMMRegister src) {
2668   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
2669   emit_simd_arith(0x5C, dst, src, VEX_SIMD_F2);
2670 }
2671 
2672 void Assembler::subsd(XMMRegister dst, Address src) {
2673   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
2674   emit_simd_arith(0x5C, dst, src, VEX_SIMD_F2);
2675 }
2676 
2677 void Assembler::subss(XMMRegister dst, XMMRegister src) {
2678   NOT_LP64(assert(VM_Version::supports_sse(), ""));
2679   emit_simd_arith(0x5C, dst, src, VEX_SIMD_F3);
2680 }
2681 
2682 void Assembler::subss(XMMRegister dst, Address src) {
2683   NOT_LP64(assert(VM_Version::supports_sse(), ""));
2684   emit_simd_arith(0x5C, dst, src, VEX_SIMD_F3);
2685 }
2686 
2687 void Assembler::testb(Register dst, int imm8) {
2688   NOT_LP64(assert(dst->has_byte_register(), "must have byte register"));
2689   (void) prefix_and_encode(dst->encoding(), true);
2690   emit_arith_b(0xF6, 0xC0, dst, imm8);
2691 }
2692 
2693 void Assembler::testl(Register dst, int32_t imm32) {
2694   // not using emit_arith because test
2695   // doesn't support sign-extension of
2696   // 8bit operands
2697   int encode = dst->encoding();
2698   if (encode == 0) {
2699     emit_byte(0xA9);
2700   } else {
2701     encode = prefix_and_encode(encode);
2702     emit_byte(0xF7);
2703     emit_byte(0xC0 | encode);
2704   }
2705   emit_long(imm32);
2706 }
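// Worked example (illustrative): the rax shortcut above saves a byte.
// testl(rax, 0x100) emits A9 00 01 00 00 (5 bytes), while
// testl(rcx, 0x100) emits F7 C1 00 01 00 00 (6 bytes).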
2707 
2708 void Assembler::testl(Register dst, Register src) {
2709   (void) prefix_and_encode(dst->encoding(), src->encoding());
2710   emit_arith(0x85, 0xC0, dst, src);
2711 }
2712 
2713 void Assembler::testl(Register dst, Address  src) {
2714   InstructionMark im(this);
2715   prefix(src, dst);
2716   emit_byte(0x85);
2717   emit_operand(dst, src);
2718 }
2719 
2720 void Assembler::ucomisd(XMMRegister dst, Address src) {
2721   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
2722   emit_simd_arith_nonds(0x2E, dst, src, VEX_SIMD_66);
2723 }
2724 
2725 void Assembler::ucomisd(XMMRegister dst, XMMRegister src) {
2726   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
2727   emit_simd_arith_nonds(0x2E, dst, src, VEX_SIMD_66);
2728 }
2729 
2730 void Assembler::ucomiss(XMMRegister dst, Address src) {
2731   NOT_LP64(assert(VM_Version::supports_sse(), ""));
2732   emit_simd_arith_nonds(0x2E, dst, src, VEX_SIMD_NONE);
2733 }
2734 
2735 void Assembler::ucomiss(XMMRegister dst, XMMRegister src) {
2736   NOT_LP64(assert(VM_Version::supports_sse(), ""));
2737   emit_simd_arith_nonds(0x2E, dst, src, VEX_SIMD_NONE);
2738 }
2739 
2740 
2741 void Assembler::xaddl(Address dst, Register src) {
2742   InstructionMark im(this);
2743   prefix(dst, src);
2744   emit_byte(0x0F);
2745   emit_byte(0xC1);
2746   emit_operand(src, dst);
2747 }
2748 
2749 void Assembler::xchgl(Register dst, Address src) { // xchg
2750   InstructionMark im(this);
2751   prefix(src, dst);
2752   emit_byte(0x87);
2753   emit_operand(dst, src);
2754 }
2755 
2756 void Assembler::xchgl(Register dst, Register src) {
2757   int encode = prefix_and_encode(dst->encoding(), src->encoding());
2758   emit_byte(0x87);
2759   emit_byte(0xC0 | encode);
2760 }
2761 
2762 void Assembler::xorl(Register dst, int32_t imm32) {
2763   prefix(dst);
2764   emit_arith(0x81, 0xF0, dst, imm32);
2765 }
2766 
2767 void Assembler::xorl(Register dst, Address src) {
2768   InstructionMark im(this);
2769   prefix(src, dst);
2770   emit_byte(0x33);
2771   emit_operand(dst, src);
2772 }
2773 
2774 void Assembler::xorl(Register dst, Register src) {
2775   (void) prefix_and_encode(dst->encoding(), src->encoding());
2776   emit_arith(0x33, 0xC0, dst, src);
2777 }
2778 
2779 
2780 // AVX 3-operand scalar floating-point arithmetic instructions
2781 
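// In the 3-operand form dst is written, nds supplies the untouched upper
// bits, and src is the second source; e.g. (illustrative)
// vaddsd(xmm0, xmm1, xmm2) assembles to vaddsd xmm0, xmm1, xmm2, computing
// xmm0[63:0] = xmm1[63:0] + xmm2[63:0] and copying xmm0[127:64] from xmm1.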
2782 void Assembler::vaddsd(XMMRegister dst, XMMRegister nds, Address src) {
2783   assert(VM_Version::supports_avx(), "");
2784   emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_F2, /* vector256 */ false);
2785 }
2786 
2787 void Assembler::vaddsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
2788   assert(VM_Version::supports_avx(), "");
2789   emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_F2, /* vector256 */ false);
2790 }
2791 
2792 void Assembler::vaddss(XMMRegister dst, XMMRegister nds, Address src) {
2793   assert(VM_Version::supports_avx(), "");
2794   emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_F3, /* vector256 */ false);
2795 }
2796 
2797 void Assembler::vaddss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
2798   assert(VM_Version::supports_avx(), "");
2799   emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_F3, /* vector256 */ false);
2800 }
2801 
2802 void Assembler::vdivsd(XMMRegister dst, XMMRegister nds, Address src) {
2803   assert(VM_Version::supports_avx(), "");
2804   emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_F2, /* vector256 */ false);
2805 }
2806 
2807 void Assembler::vdivsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
2808   assert(VM_Version::supports_avx(), "");
2809   emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_F2, /* vector256 */ false);
2810 }
2811 
2812 void Assembler::vdivss(XMMRegister dst, XMMRegister nds, Address src) {
2813   assert(VM_Version::supports_avx(), "");
2814   emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_F3, /* vector256 */ false);
2815 }
2816 
2817 void Assembler::vdivss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
2818   assert(VM_Version::supports_avx(), "");
2819   emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_F3, /* vector256 */ false);
2820 }
2821 
2822 void Assembler::vmulsd(XMMRegister dst, XMMRegister nds, Address src) {
2823   assert(VM_Version::supports_avx(), "");
2824   emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_F2, /* vector256 */ false);
2825 }
2826 
2827 void Assembler::vmulsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
2828   assert(VM_Version::supports_avx(), "");
2829   emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_F2, /* vector256 */ false);
2830 }
2831 
2832 void Assembler::vmulss(XMMRegister dst, XMMRegister nds, Address src) {
2833   assert(VM_Version::supports_avx(), "");
2834   emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_F3, /* vector256 */ false);
2835 }
2836 
2837 void Assembler::vmulss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
2838   assert(VM_Version::supports_avx(), "");
2839   emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_F3, /* vector256 */ false);
2840 }
2841 
2842 void Assembler::vsubsd(XMMRegister dst, XMMRegister nds, Address src) {
2843   assert(VM_Version::supports_avx(), "");
2844   emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_F2, /* vector256 */ false);
2845 }
2846 
2847 void Assembler::vsubsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
2848   assert(VM_Version::supports_avx(), "");
2849   emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_F2, /* vector256 */ false);
2850 }
2851 
2852 void Assembler::vsubss(XMMRegister dst, XMMRegister nds, Address src) {
2853   assert(VM_Version::supports_avx(), "");
2854   emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_F3, /* vector256 */ false);
2855 }
2856 
2857 void Assembler::vsubss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
2858   assert(VM_Version::supports_avx(), "");
2859   emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_F3, /* vector256 */ false);
2860 }
2861 
2862 //====================VECTOR ARITHMETIC=====================================
2863 
2864 // Floating-point vector arithmetic
2865 
2866 void Assembler::addpd(XMMRegister dst, XMMRegister src) {
2867   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
2868   emit_simd_arith(0x58, dst, src, VEX_SIMD_66);
2869 }
2870 
2871 void Assembler::addps(XMMRegister dst, XMMRegister src) {
2872   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
2873   emit_simd_arith(0x58, dst, src, VEX_SIMD_NONE);
2874 }
2875 
2876 void Assembler::vaddpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
2877   assert(VM_Version::supports_avx(), "");
2878   emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_66, vector256);
2879 }
2880 
2881 void Assembler::vaddps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
2882   assert(VM_Version::supports_avx(), "");
2883   emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_NONE, vector256);
2884 }
2885 
2886 void Assembler::vaddpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
2887   assert(VM_Version::supports_avx(), "");
2888   emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_66, vector256);
2889 }
2890 
2891 void Assembler::vaddps(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
2892   assert(VM_Version::supports_avx(), "");
2893   emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_NONE, vector256);
2894 }
2895 
2896 void Assembler::subpd(XMMRegister dst, XMMRegister src) {
2897   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
2898   emit_simd_arith(0x5C, dst, src, VEX_SIMD_66);
2899 }
2900 
2901 void Assembler::subps(XMMRegister dst, XMMRegister src) {
2902   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
2903   emit_simd_arith(0x5C, dst, src, VEX_SIMD_NONE);
2904 }
2905 
2906 void Assembler::vsubpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
2907   assert(VM_Version::supports_avx(), "");
2908   emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_66, vector256);
2909 }
2910 
2911 void Assembler::vsubps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
2912   assert(VM_Version::supports_avx(), "");
2913   emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_NONE, vector256);
2914 }
2915 
2916 void Assembler::vsubpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
2917   assert(VM_Version::supports_avx(), "");
2918   emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_66, vector256);
2919 }
2920 
2921 void Assembler::vsubps(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
2922   assert(VM_Version::supports_avx(), "");
2923   emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_NONE, vector256);
2924 }
2925 
2926 void Assembler::mulpd(XMMRegister dst, XMMRegister src) {
2927   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
2928   emit_simd_arith(0x59, dst, src, VEX_SIMD_66);
2929 }
2930 
2931 void Assembler::mulps(XMMRegister dst, XMMRegister src) {
2932   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
2933   emit_simd_arith(0x59, dst, src, VEX_SIMD_NONE);
2934 }
2935 
2936 void Assembler::vmulpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
2937   assert(VM_Version::supports_avx(), "");
2938   emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_66, vector256);
2939 }
2940 
2941 void Assembler::vmulps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
2942   assert(VM_Version::supports_avx(), "");
2943   emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_NONE, vector256);
2944 }
2945 
2946 void Assembler::vmulpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
2947   assert(VM_Version::supports_avx(), "");
2948   emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_66, vector256);
2949 }
2950 
2951 void Assembler::vmulps(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
2952   assert(VM_Version::supports_avx(), "");
2953   emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_NONE, vector256);
2954 }
2955 
2956 void Assembler::divpd(XMMRegister dst, XMMRegister src) {
2957   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
2958   emit_simd_arith(0x5E, dst, src, VEX_SIMD_66);
2959 }
2960 
2961 void Assembler::divps(XMMRegister dst, XMMRegister src) {
2962   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
2963   emit_simd_arith(0x5E, dst, src, VEX_SIMD_NONE);
2964 }
2965 
2966 void Assembler::vdivpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
2967   assert(VM_Version::supports_avx(), "");
2968   emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_66, vector256);
2969 }
2970 
2971 void Assembler::vdivps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
2972   assert(VM_Version::supports_avx(), "");
2973   emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_NONE, vector256);
2974 }
2975 
2976 void Assembler::vdivpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
2977   assert(VM_Version::supports_avx(), "");
2978   emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_66, vector256);
2979 }
2980 
2981 void Assembler::vdivps(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
2982   assert(VM_Version::supports_avx(), "");
2983   emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_NONE, vector256);
2984 }
2985 
2986 void Assembler::andpd(XMMRegister dst, XMMRegister src) {
2987   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
2988   emit_simd_arith(0x54, dst, src, VEX_SIMD_66);
2989 }
2990 
2991 void Assembler::andps(XMMRegister dst, XMMRegister src) {
2992   NOT_LP64(assert(VM_Version::supports_sse(), ""));
2993   emit_simd_arith(0x54, dst, src, VEX_SIMD_NONE);
2994 }
2995 
2996 void Assembler::andps(XMMRegister dst, Address src) {
2997   NOT_LP64(assert(VM_Version::supports_sse(), ""));
2998   emit_simd_arith(0x54, dst, src, VEX_SIMD_NONE);
2999 }
3000 
3001 void Assembler::andpd(XMMRegister dst, Address src) {
3002   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3003   emit_simd_arith(0x54, dst, src, VEX_SIMD_66);
3004 }
3005 
3006 void Assembler::vandpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
3007   assert(VM_Version::supports_avx(), "");
3008   emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_66, vector256);
3009 }
3010 
3011 void Assembler::vandps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
3012   assert(VM_Version::supports_avx(), "");
3013   emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_NONE, vector256);
3014 }
3015 
3016 void Assembler::vandpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
3017   assert(VM_Version::supports_avx(), "");
3018   emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_66, vector256);
3019 }
3020 
3021 void Assembler::vandps(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
3022   assert(VM_Version::supports_avx(), "");
3023   emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_NONE, vector256);
3024 }
3025 
3026 void Assembler::xorpd(XMMRegister dst, XMMRegister src) {
3027   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3028   emit_simd_arith(0x57, dst, src, VEX_SIMD_66);
3029 }
3030 
3031 void Assembler::xorps(XMMRegister dst, XMMRegister src) {
3032   NOT_LP64(assert(VM_Version::supports_sse(), ""));
3033   emit_simd_arith(0x57, dst, src, VEX_SIMD_NONE);
3034 }
3035 
3036 void Assembler::xorpd(XMMRegister dst, Address src) {
3037   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3038   emit_simd_arith(0x57, dst, src, VEX_SIMD_66);
3039 }
3040 
3041 void Assembler::xorps(XMMRegister dst, Address src) {
3042   NOT_LP64(assert(VM_Version::supports_sse(), ""));
3043   emit_simd_arith(0x57, dst, src, VEX_SIMD_NONE);
3044 }
3045 
3046 void Assembler::vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
3047   assert(VM_Version::supports_avx(), "");
3048   emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_66, vector256);
3049 }
3050 
3051 void Assembler::vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
3052   assert(VM_Version::supports_avx(), "");
3053   emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_NONE, vector256);
3054 }
3055 
3056 void Assembler::vxorpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
3057   assert(VM_Version::supports_avx(), "");
3058   emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_66, vector256);
3059 }
3060 
3061 void Assembler::vxorps(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
3062   assert(VM_Version::supports_avx(), "");
3063   emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_NONE, vector256);
3064 }
3065 
3066 
3067 // Integer vector arithmetic
3068 void Assembler::paddb(XMMRegister dst, XMMRegister src) {
3069   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3070   emit_simd_arith(0xFC, dst, src, VEX_SIMD_66);
3071 }
3072 
3073 void Assembler::paddw(XMMRegister dst, XMMRegister src) {
3074   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3075   emit_simd_arith(0xFD, dst, src, VEX_SIMD_66);
3076 }
3077 
3078 void Assembler::paddd(XMMRegister dst, XMMRegister src) {
3079   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3080   emit_simd_arith(0xFE, dst, src, VEX_SIMD_66);
3081 }
3082 
3083 void Assembler::paddq(XMMRegister dst, XMMRegister src) {
3084   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3085   emit_simd_arith(0xD4, dst, src, VEX_SIMD_66);
3086 }
3087 
3088 void Assembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
3089   assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
3090   emit_vex_arith(0xFC, dst, nds, src, VEX_SIMD_66, vector256);
3091 }
3092 
3093 void Assembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
3094   assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
3095   emit_vex_arith(0xFD, dst, nds, src, VEX_SIMD_66, vector256);
3096 }
3097 
3098 void Assembler::vpaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
3099   assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
3100   emit_vex_arith(0xFE, dst, nds, src, VEX_SIMD_66, vector256);
3101 }
3102 
3103 void Assembler::vpaddq(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
3104   assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
3105   emit_vex_arith(0xD4, dst, nds, src, VEX_SIMD_66, vector256);
3106 }
3107 
3108 void Assembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
3109   assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
3110   emit_vex_arith(0xFC, dst, nds, src, VEX_SIMD_66, vector256);
3111 }
3112 
3113 void Assembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
3114   assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
3115   emit_vex_arith(0xFD, dst, nds, src, VEX_SIMD_66, vector256);
3116 }
3117 
3118 void Assembler::vpaddd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
3119   assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
3120   emit_vex_arith(0xFE, dst, nds, src, VEX_SIMD_66, vector256);
3121 }
3122 
3123 void Assembler::vpaddq(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
3124   assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
3125   emit_vex_arith(0xD4, dst, nds, src, VEX_SIMD_66, vector256);
3126 }
3127 
3128 void Assembler::psubb(XMMRegister dst, XMMRegister src) {
3129   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3130   emit_simd_arith(0xF8, dst, src, VEX_SIMD_66);
3131 }
3132 
3133 void Assembler::psubw(XMMRegister dst, XMMRegister src) {
3134   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3135   emit_simd_arith(0xF9, dst, src, VEX_SIMD_66);
3136 }
3137 
3138 void Assembler::psubd(XMMRegister dst, XMMRegister src) {
3139   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3140   emit_simd_arith(0xFA, dst, src, VEX_SIMD_66);
3141 }
3142 
3143 void Assembler::psubq(XMMRegister dst, XMMRegister src) {
3144   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3145   emit_simd_arith(0xFB, dst, src, VEX_SIMD_66);
3146 }
3147 
3148 void Assembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
3149   assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
3150   emit_vex_arith(0xF8, dst, nds, src, VEX_SIMD_66, vector256);
3151 }
3152 
3153 void Assembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
3154   assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
3155   emit_vex_arith(0xF9, dst, nds, src, VEX_SIMD_66, vector256);
3156 }
3157 
3158 void Assembler::vpsubd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
3159   assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
3160   emit_vex_arith(0xFA, dst, nds, src, VEX_SIMD_66, vector256);
3161 }
3162 
3163 void Assembler::vpsubq(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
3164   assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
3165   emit_vex_arith(0xFB, dst, nds, src, VEX_SIMD_66, vector256);
3166 }
3167 
3168 void Assembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
3169   assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
3170   emit_vex_arith(0xF8, dst, nds, src, VEX_SIMD_66, vector256);
3171 }
3172 
3173 void Assembler::vpsubw(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
3174   assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
3175   emit_vex_arith(0xF9, dst, nds, src, VEX_SIMD_66, vector256);
3176 }
3177 
3178 void Assembler::vpsubd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
3179   assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
3180   emit_vex_arith(0xFA, dst, nds, src, VEX_SIMD_66, vector256);
3181 }
3182 
3183 void Assembler::vpsubq(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
3184   assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
3185   emit_vex_arith(0xFB, dst, nds, src, VEX_SIMD_66, vector256);
3186 }
3187 
3188 void Assembler::pmullw(XMMRegister dst, XMMRegister src) {
3189   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3190   emit_simd_arith(0xD5, dst, src, VEX_SIMD_66);
3191 }
3192 
3193 void Assembler::pmulld(XMMRegister dst, XMMRegister src) {
3194   assert(VM_Version::supports_sse4_1(), "");
3195   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
3196   emit_byte(0x40);
3197   emit_byte(0xC0 | encode);
3198 }
3199 
3200 void Assembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
3201   assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
3202   emit_vex_arith(0xD5, dst, nds, src, VEX_SIMD_66, vector256);
3203 }
3204 
3205 void Assembler::vpmulld(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
3206   assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
3207   int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector256, VEX_OPCODE_0F_38);
3208   emit_byte(0x40);
3209   emit_byte(0xC0 | encode);
3210 }
3211 
3212 void Assembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
3213   assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
3214   emit_vex_arith(0xD5, dst, nds, src, VEX_SIMD_66, vector256);
3215 }
3216 
3217 void Assembler::vpmulld(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
3218   assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
3219   InstructionMark im(this);
3220   int dst_enc = dst->encoding();
3221   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
3222   vex_prefix(src, nds_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, false, vector256);
3223   emit_byte(0x40);
3224   emit_operand(dst, src);
3225 }
3226 
3227 // Shift packed integers left by specified number of bits.
3228 void Assembler::psllw(XMMRegister dst, int shift) {
3229   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3230   // XMM6 is for /6 encoding: 66 0F 71 /6 ib
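  // e.g. with legacy SSE encoding, psllw(xmm1, 4) emits 66 0F 71 F1 04
  // (ModRM 0xF1 = mod 11, reg /6, rm = xmm1).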
3231   int encode = simd_prefix_and_encode(xmm6, dst, dst, VEX_SIMD_66);
3232   emit_byte(0x71);
3233   emit_byte(0xC0 | encode);
3234   emit_byte(shift & 0xFF);
3235 }
3236 
3237 void Assembler::pslld(XMMRegister dst, int shift) {
3238   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3239   // XMM6 is for /6 encoding: 66 0F 72 /6 ib
3240   int encode = simd_prefix_and_encode(xmm6, dst, dst, VEX_SIMD_66);
3241   emit_byte(0x72);
3242   emit_byte(0xC0 | encode);
3243   emit_byte(shift & 0xFF);
3244 }
3245 
3246 void Assembler::psllq(XMMRegister dst, int shift) {
3247   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3248   // XMM6 is for /6 encoding: 66 0F 73 /6 ib
3249   int encode = simd_prefix_and_encode(xmm6, dst, dst, VEX_SIMD_66);
3250   emit_byte(0x73);
3251   emit_byte(0xC0 | encode);
3252   emit_byte(shift & 0xFF);
3253 }
3254 
3255 void Assembler::psllw(XMMRegister dst, XMMRegister shift) {
3256   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3257   emit_simd_arith(0xF1, dst, shift, VEX_SIMD_66);
3258 }
3259 
3260 void Assembler::pslld(XMMRegister dst, XMMRegister shift) {
3261   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3262   emit_simd_arith(0xF2, dst, shift, VEX_SIMD_66);
3263 }
3264 
3265 void Assembler::psllq(XMMRegister dst, XMMRegister shift) {
3266   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3267   emit_simd_arith(0xF3, dst, shift, VEX_SIMD_66);
3268 }
3269 
3270 void Assembler::vpsllw(XMMRegister dst, XMMRegister src, int shift, bool vector256) {
  assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
3272   // XMM6 is for /6 encoding: 66 0F 71 /6 ib
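  // dst is the NDS (VEX.vvvv) operand and src goes in ModRM.rm; xmm6 merely
  // supplies the /6 opcode extension in the ModRM.reg field.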
3273   emit_vex_arith(0x71, xmm6, dst, src, VEX_SIMD_66, vector256);
3274   emit_byte(shift & 0xFF);
3275 }
3276 
3277 void Assembler::vpslld(XMMRegister dst, XMMRegister src, int shift, bool vector256) {
  assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
3279   // XMM6 is for /6 encoding: 66 0F 72 /6 ib
3280   emit_vex_arith(0x72, xmm6, dst, src, VEX_SIMD_66, vector256);
3281   emit_byte(shift & 0xFF);
3282 }
3283 
3284 void Assembler::vpsllq(XMMRegister dst, XMMRegister src, int shift, bool vector256) {
  assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
3286   // XMM6 is for /6 encoding: 66 0F 73 /6 ib
3287   emit_vex_arith(0x73, xmm6, dst, src, VEX_SIMD_66, vector256);
3288   emit_byte(shift & 0xFF);
3289 }
3290 
3291 void Assembler::vpsllw(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256) {
  assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
3293   emit_vex_arith(0xF1, dst, src, shift, VEX_SIMD_66, vector256);
3294 }
3295 
3296 void Assembler::vpslld(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256) {
  assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
3298   emit_vex_arith(0xF2, dst, src, shift, VEX_SIMD_66, vector256);
3299 }
3300 
3301 void Assembler::vpsllq(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256) {
  assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
3303   emit_vex_arith(0xF3, dst, src, shift, VEX_SIMD_66, vector256);
3304 }
3305 
3306 // Shift packed integers logically right by specified number of bits.
3307 void Assembler::psrlw(XMMRegister dst, int shift) {
3308   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3309   // XMM2 is for /2 encoding: 66 0F 71 /2 ib
3310   int encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66);
3311   emit_byte(0x71);
3312   emit_byte(0xC0 | encode);
3313   emit_byte(shift & 0xFF);
3314 }
3315 
3316 void Assembler::psrld(XMMRegister dst, int shift) {
3317   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3318   // XMM2 is for /2 encoding: 66 0F 72 /2 ib
3319   int encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66);
3320   emit_byte(0x72);
3321   emit_byte(0xC0 | encode);
3322   emit_byte(shift & 0xFF);
3323 }
3324 
3325 void Assembler::psrlq(XMMRegister dst, int shift) {
  // Do not confuse it with the psrldq SSE2 instruction, which shifts
  // the 128-bit value in an xmm register by a number of bytes.
3328   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3329   // XMM2 is for /2 encoding: 66 0F 73 /2 ib
3330   int encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66);
3331   emit_byte(0x73);
3332   emit_byte(0xC0 | encode);
3333   emit_byte(shift & 0xFF);
3334 }
3335 
3336 void Assembler::psrlw(XMMRegister dst, XMMRegister shift) {
3337   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3338   emit_simd_arith(0xD1, dst, shift, VEX_SIMD_66);
3339 }
3340 
3341 void Assembler::psrld(XMMRegister dst, XMMRegister shift) {
3342   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3343   emit_simd_arith(0xD2, dst, shift, VEX_SIMD_66);
3344 }
3345 
3346 void Assembler::psrlq(XMMRegister dst, XMMRegister shift) {
3347   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3348   emit_simd_arith(0xD3, dst, shift, VEX_SIMD_66);
3349 }
3350 
3351 void Assembler::vpsrlw(XMMRegister dst, XMMRegister src, int shift, bool vector256) {
  assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
  // XMM2 is for /2 encoding: 66 0F 71 /2 ib
3354   emit_vex_arith(0x71, xmm2, dst, src, VEX_SIMD_66, vector256);
3355   emit_byte(shift & 0xFF);
3356 }
3357 
3358 void Assembler::vpsrld(XMMRegister dst, XMMRegister src, int shift, bool vector256) {
  assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
  // XMM2 is for /2 encoding: 66 0F 72 /2 ib
3361   emit_vex_arith(0x72, xmm2, dst, src, VEX_SIMD_66, vector256);
3362   emit_byte(shift & 0xFF);
3363 }
3364 
3365 void Assembler::vpsrlq(XMMRegister dst, XMMRegister src, int shift, bool vector256) {
  assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
3367   // XMM2 is for /2 encoding: 66 0F 73 /2 ib
3368   emit_vex_arith(0x73, xmm2, dst, src, VEX_SIMD_66, vector256);
3369   emit_byte(shift & 0xFF);
3370 }
3371 
3372 void Assembler::vpsrlw(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256) {
  assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
3374   emit_vex_arith(0xD1, dst, src, shift, VEX_SIMD_66, vector256);
3375 }
3376 
3377 void Assembler::vpsrld(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256) {
  assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
3379   emit_vex_arith(0xD2, dst, src, shift, VEX_SIMD_66, vector256);
3380 }
3381 
3382 void Assembler::vpsrlq(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256) {
  assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
3384   emit_vex_arith(0xD3, dst, src, shift, VEX_SIMD_66, vector256);
3385 }
3386 
3387 // Shift packed integers arithmetically right by specified number of bits.
3388 void Assembler::psraw(XMMRegister dst, int shift) {
3389   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3390   // XMM4 is for /4 encoding: 66 0F 71 /4 ib
3391   int encode = simd_prefix_and_encode(xmm4, dst, dst, VEX_SIMD_66);
3392   emit_byte(0x71);
3393   emit_byte(0xC0 | encode);
3394   emit_byte(shift & 0xFF);
3395 }
3396 
3397 void Assembler::psrad(XMMRegister dst, int shift) {
3398   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3399   // XMM4 is for /4 encoding: 66 0F 72 /4 ib
3400   int encode = simd_prefix_and_encode(xmm4, dst, dst, VEX_SIMD_66);
3401   emit_byte(0x72);
3402   emit_byte(0xC0 | encode);
3403   emit_byte(shift & 0xFF);
3404 }
3405 
3406 void Assembler::psraw(XMMRegister dst, XMMRegister shift) {
3407   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3408   emit_simd_arith(0xE1, dst, shift, VEX_SIMD_66);
3409 }
3410 
3411 void Assembler::psrad(XMMRegister dst, XMMRegister shift) {
3412   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3413   emit_simd_arith(0xE2, dst, shift, VEX_SIMD_66);
3414 }
3415 
3416 void Assembler::vpsraw(XMMRegister dst, XMMRegister src, int shift, bool vector256) {
  assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
3418   // XMM4 is for /4 encoding: 66 0F 71 /4 ib
3419   emit_vex_arith(0x71, xmm4, dst, src, VEX_SIMD_66, vector256);
3420   emit_byte(shift & 0xFF);
3421 }
3422 
3423 void Assembler::vpsrad(XMMRegister dst, XMMRegister src, int shift, bool vector256) {
  assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
  // XMM4 is for /4 encoding: 66 0F 72 /4 ib
3426   emit_vex_arith(0x72, xmm4, dst, src, VEX_SIMD_66, vector256);
3427   emit_byte(shift & 0xFF);
3428 }
3429 
3430 void Assembler::vpsraw(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256) {
  assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
3432   emit_vex_arith(0xE1, dst, src, shift, VEX_SIMD_66, vector256);
3433 }
3434 
3435 void Assembler::vpsrad(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256) {
  assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
3437   emit_vex_arith(0xE2, dst, src, shift, VEX_SIMD_66, vector256);
3438 }
3439 
3440 
3441 // AND packed integers
3442 void Assembler::pand(XMMRegister dst, XMMRegister src) {
3443   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3444   emit_simd_arith(0xDB, dst, src, VEX_SIMD_66);
3445 }
3446 
3447 void Assembler::vpand(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
  assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
3449   emit_vex_arith(0xDB, dst, nds, src, VEX_SIMD_66, vector256);
3450 }
3451 
3452 void Assembler::vpand(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
  assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
3454   emit_vex_arith(0xDB, dst, nds, src, VEX_SIMD_66, vector256);
3455 }
3456 
3457 void Assembler::por(XMMRegister dst, XMMRegister src) {
3458   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3459   emit_simd_arith(0xEB, dst, src, VEX_SIMD_66);
3460 }
3461 
3462 void Assembler::vpor(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
  assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
3464   emit_vex_arith(0xEB, dst, nds, src, VEX_SIMD_66, vector256);
3465 }
3466 
3467 void Assembler::vpor(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
  assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
3469   emit_vex_arith(0xEB, dst, nds, src, VEX_SIMD_66, vector256);
3470 }
3471 
3472 void Assembler::pxor(XMMRegister dst, XMMRegister src) {
3473   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3474   emit_simd_arith(0xEF, dst, src, VEX_SIMD_66);
3475 }
3476 
3477 void Assembler::vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
  assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
3479   emit_vex_arith(0xEF, dst, nds, src, VEX_SIMD_66, vector256);
3480 }
3481 
3482 void Assembler::vpxor(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
  assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
3484   emit_vex_arith(0xEF, dst, nds, src, VEX_SIMD_66, vector256);
3485 }
3486 
3487 
3488 void Assembler::vinsertf128h(XMMRegister dst, XMMRegister nds, XMMRegister src) {
3489   assert(VM_Version::supports_avx(), "");
3490   bool vector256 = true;
3491   int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector256, VEX_OPCODE_0F_3A);
3492   emit_byte(0x18);
3493   emit_byte(0xC0 | encode);
3494   // 0x00 - insert into lower 128 bits
3495   // 0x01 - insert into upper 128 bits
3496   emit_byte(0x01);
3497 }
3498 
3499 void Assembler::vinsertf128h(XMMRegister dst, Address src) {
3500   assert(VM_Version::supports_avx(), "");
3501   InstructionMark im(this);
3502   bool vector256 = true;
3503   assert(dst != xnoreg, "sanity");
3504   int dst_enc = dst->encoding();
3505   // swap src<->dst for encoding
3506   vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, false, vector256);
3507   emit_byte(0x18);
3508   emit_operand(dst, src);
3509   // 0x01 - insert into upper 128 bits
3510   emit_byte(0x01);
3511 }
3512 
3513 void Assembler::vextractf128h(Address dst, XMMRegister src) {
3514   assert(VM_Version::supports_avx(), "");
3515   InstructionMark im(this);
3516   bool vector256 = true;
3517   assert(src != xnoreg, "sanity");
3518   int src_enc = src->encoding();
3519   vex_prefix(dst, 0, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, false, vector256);
3520   emit_byte(0x19);
3521   emit_operand(src, dst);
3522   // 0x01 - extract from upper 128 bits
3523   emit_byte(0x01);
3524 }
3525 
3526 void Assembler::vinserti128h(XMMRegister dst, XMMRegister nds, XMMRegister src) {
3527   assert(VM_Version::supports_avx2(), "");
3528   bool vector256 = true;
3529   int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector256, VEX_OPCODE_0F_3A);
3530   emit_byte(0x38);
3531   emit_byte(0xC0 | encode);
3532   // 0x00 - insert into lower 128 bits
3533   // 0x01 - insert into upper 128 bits
3534   emit_byte(0x01);
3535 }
3536 
3537 void Assembler::vinserti128h(XMMRegister dst, Address src) {
3538   assert(VM_Version::supports_avx2(), "");
3539   InstructionMark im(this);
3540   bool vector256 = true;
3541   assert(dst != xnoreg, "sanity");
3542   int dst_enc = dst->encoding();
3543   // swap src<->dst for encoding
3544   vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, false, vector256);
3545   emit_byte(0x38);
3546   emit_operand(dst, src);
3547   // 0x01 - insert into upper 128 bits
3548   emit_byte(0x01);
3549 }
3550 
3551 void Assembler::vextracti128h(Address dst, XMMRegister src) {
3552   assert(VM_Version::supports_avx2(), "");
3553   InstructionMark im(this);
3554   bool vector256 = true;
3555   assert(src != xnoreg, "sanity");
3556   int src_enc = src->encoding();
3557   vex_prefix(dst, 0, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, false, vector256);
3558   emit_byte(0x39);
3559   emit_operand(src, dst);
3560   // 0x01 - extract from upper 128 bits
3561   emit_byte(0x01);
3562 }
3563 
3564 void Assembler::vzeroupper() {
3565   assert(VM_Version::supports_avx(), "");
3566   (void)vex_prefix_and_encode(xmm0, xmm0, xmm0, VEX_SIMD_NONE);
3567   emit_byte(0x77);
3568 }
3569 
3570 
3571 #ifndef _LP64
3572 // 32bit only pieces of the assembler
3573 
3574 void Assembler::cmp_literal32(Register src1, int32_t imm32, RelocationHolder const& rspec) {
3575   // NO PREFIX AS NEVER 64BIT
3576   InstructionMark im(this);
3577   emit_byte(0x81);
3578   emit_byte(0xF8 | src1->encoding());
3579   emit_data(imm32, rspec, 0);
3580 }
3581 
3582 void Assembler::cmp_literal32(Address src1, int32_t imm32, RelocationHolder const& rspec) {
  // NO PREFIX AS NEVER 64BIT (not even 32bit versions of 64bit regs)
3584   InstructionMark im(this);
3585   emit_byte(0x81);
3586   emit_operand(rdi, src1);
3587   emit_data(imm32, rspec, 0);
3588 }
3589 
3590 // The 64-bit (32bit platform) cmpxchg compares the value at adr with the contents of rdx:rax,
3591 // and stores rcx:rbx into adr if so; otherwise, the value at adr is loaded
3592 // into rdx:rax.  The ZF is set if the compared values were equal, and cleared otherwise.
3593 void Assembler::cmpxchg8(Address adr) {
3594   InstructionMark im(this);
3595   emit_byte(0x0F);
  emit_byte(0xC7);
3597   emit_operand(rcx, adr);
3598 }
3599 
3600 void Assembler::decl(Register dst) {
3601   // Don't use it directly. Use MacroAssembler::decrementl() instead.
  emit_byte(0x48 | dst->encoding());
3603 }
3604 
3605 #endif // _LP64
3606 
3607 // 64bit typically doesn't use the x87 but needs to for the trig funcs
3608 
3609 void Assembler::fabs() {
3610   emit_byte(0xD9);
3611   emit_byte(0xE1);
3612 }
3613 
3614 void Assembler::fadd(int i) {
3615   emit_farith(0xD8, 0xC0, i);
3616 }
3617 
3618 void Assembler::fadd_d(Address src) {
3619   InstructionMark im(this);
3620   emit_byte(0xDC);
3621   emit_operand32(rax, src);
3622 }
3623 
3624 void Assembler::fadd_s(Address src) {
3625   InstructionMark im(this);
3626   emit_byte(0xD8);
3627   emit_operand32(rax, src);
3628 }
3629 
3630 void Assembler::fadda(int i) {
3631   emit_farith(0xDC, 0xC0, i);
3632 }
3633 
3634 void Assembler::faddp(int i) {
3635   emit_farith(0xDE, 0xC0, i);
3636 }
3637 
3638 void Assembler::fchs() {
3639   emit_byte(0xD9);
3640   emit_byte(0xE0);
3641 }
3642 
3643 void Assembler::fcom(int i) {
3644   emit_farith(0xD8, 0xD0, i);
3645 }
3646 
3647 void Assembler::fcomp(int i) {
3648   emit_farith(0xD8, 0xD8, i);
3649 }
3650 
3651 void Assembler::fcomp_d(Address src) {
3652   InstructionMark im(this);
3653   emit_byte(0xDC);
3654   emit_operand32(rbx, src);
3655 }
3656 
3657 void Assembler::fcomp_s(Address src) {
3658   InstructionMark im(this);
3659   emit_byte(0xD8);
3660   emit_operand32(rbx, src);
3661 }
3662 
3663 void Assembler::fcompp() {
3664   emit_byte(0xDE);
3665   emit_byte(0xD9);
3666 }
3667 
3668 void Assembler::fcos() {
3669   emit_byte(0xD9);
3670   emit_byte(0xFF);
3671 }
3672 
3673 void Assembler::fdecstp() {
3674   emit_byte(0xD9);
3675   emit_byte(0xF6);
3676 }
3677 
3678 void Assembler::fdiv(int i) {
3679   emit_farith(0xD8, 0xF0, i);
3680 }
3681 
3682 void Assembler::fdiv_d(Address src) {
3683   InstructionMark im(this);
3684   emit_byte(0xDC);
3685   emit_operand32(rsi, src);
3686 }
3687 
3688 void Assembler::fdiv_s(Address src) {
3689   InstructionMark im(this);
3690   emit_byte(0xD8);
3691   emit_operand32(rsi, src);
3692 }
3693 
3694 void Assembler::fdiva(int i) {
3695   emit_farith(0xDC, 0xF8, i);
3696 }
3697 
3698 // Note: The Intel manual (Pentium Processor User's Manual, Vol.3, 1994)
3699 //       is erroneous for some of the floating-point instructions below.
3700 
3701 void Assembler::fdivp(int i) {
3702   emit_farith(0xDE, 0xF8, i);                    // ST(0) <- ST(0) / ST(1) and pop (Intel manual wrong)
3703 }
3704 
3705 void Assembler::fdivr(int i) {
3706   emit_farith(0xD8, 0xF8, i);
3707 }
3708 
3709 void Assembler::fdivr_d(Address src) {
3710   InstructionMark im(this);
3711   emit_byte(0xDC);
3712   emit_operand32(rdi, src);
3713 }
3714 
3715 void Assembler::fdivr_s(Address src) {
3716   InstructionMark im(this);
3717   emit_byte(0xD8);
3718   emit_operand32(rdi, src);
3719 }
3720 
3721 void Assembler::fdivra(int i) {
3722   emit_farith(0xDC, 0xF0, i);
3723 }
3724 
3725 void Assembler::fdivrp(int i) {
3726   emit_farith(0xDE, 0xF0, i);                    // ST(0) <- ST(1) / ST(0) and pop (Intel manual wrong)
3727 }
3728 
3729 void Assembler::ffree(int i) {
3730   emit_farith(0xDD, 0xC0, i);
3731 }
3732 
3733 void Assembler::fild_d(Address adr) {
3734   InstructionMark im(this);
3735   emit_byte(0xDF);
3736   emit_operand32(rbp, adr);
3737 }
3738 
3739 void Assembler::fild_s(Address adr) {
3740   InstructionMark im(this);
3741   emit_byte(0xDB);
3742   emit_operand32(rax, adr);
3743 }
3744 
3745 void Assembler::fincstp() {
3746   emit_byte(0xD9);
3747   emit_byte(0xF7);
3748 }
3749 
3750 void Assembler::finit() {
3751   emit_byte(0x9B);
3752   emit_byte(0xDB);
3753   emit_byte(0xE3);
3754 }
3755 
3756 void Assembler::fist_s(Address adr) {
3757   InstructionMark im(this);
3758   emit_byte(0xDB);
3759   emit_operand32(rdx, adr);
3760 }
3761 
3762 void Assembler::fistp_d(Address adr) {
3763   InstructionMark im(this);
3764   emit_byte(0xDF);
3765   emit_operand32(rdi, adr);
3766 }
3767 
3768 void Assembler::fistp_s(Address adr) {
3769   InstructionMark im(this);
3770   emit_byte(0xDB);
3771   emit_operand32(rbx, adr);
3772 }
3773 
3774 void Assembler::fld1() {
3775   emit_byte(0xD9);
3776   emit_byte(0xE8);
3777 }
3778 
3779 void Assembler::fld_d(Address adr) {
3780   InstructionMark im(this);
3781   emit_byte(0xDD);
3782   emit_operand32(rax, adr);
3783 }
3784 
3785 void Assembler::fld_s(Address adr) {
3786   InstructionMark im(this);
3787   emit_byte(0xD9);
3788   emit_operand32(rax, adr);
3789 }
3790 
3791 
3792 void Assembler::fld_s(int index) {
3793   emit_farith(0xD9, 0xC0, index);
3794 }
3795 
3796 void Assembler::fld_x(Address adr) {
3797   InstructionMark im(this);
3798   emit_byte(0xDB);
3799   emit_operand32(rbp, adr);
3800 }
3801 
3802 void Assembler::fldcw(Address src) {
3803   InstructionMark im(this);
  emit_byte(0xD9);
3805   emit_operand32(rbp, src);
3806 }
3807 
3808 void Assembler::fldenv(Address src) {
3809   InstructionMark im(this);
3810   emit_byte(0xD9);
3811   emit_operand32(rsp, src);
3812 }
3813 
3814 void Assembler::fldlg2() {
3815   emit_byte(0xD9);
3816   emit_byte(0xEC);
3817 }
3818 
3819 void Assembler::fldln2() {
3820   emit_byte(0xD9);
3821   emit_byte(0xED);
3822 }
3823 
3824 void Assembler::fldz() {
3825   emit_byte(0xD9);
3826   emit_byte(0xEE);
3827 }
3828 
3829 void Assembler::flog() {
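  // ln(x) = ln(2) * log2(x): push ln2, swap so x is back in ST(0), then fyl2x.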
3830   fldln2();
3831   fxch();
3832   fyl2x();
3833 }
3834 
3835 void Assembler::flog10() {
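  // log10(x) = log10(2) * log2(x): same pattern, loading lg(2) instead of ln(2).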
3836   fldlg2();
3837   fxch();
3838   fyl2x();
3839 }
3840 
3841 void Assembler::fmul(int i) {
3842   emit_farith(0xD8, 0xC8, i);
3843 }
3844 
3845 void Assembler::fmul_d(Address src) {
3846   InstructionMark im(this);
3847   emit_byte(0xDC);
3848   emit_operand32(rcx, src);
3849 }
3850 
3851 void Assembler::fmul_s(Address src) {
3852   InstructionMark im(this);
3853   emit_byte(0xD8);
3854   emit_operand32(rcx, src);
3855 }
3856 
3857 void Assembler::fmula(int i) {
3858   emit_farith(0xDC, 0xC8, i);
3859 }
3860 
3861 void Assembler::fmulp(int i) {
3862   emit_farith(0xDE, 0xC8, i);
3863 }
3864 
3865 void Assembler::fnsave(Address dst) {
3866   InstructionMark im(this);
3867   emit_byte(0xDD);
3868   emit_operand32(rsi, dst);
3869 }
3870 
3871 void Assembler::fnstcw(Address src) {
3872   InstructionMark im(this);
3873   emit_byte(0x9B);
3874   emit_byte(0xD9);
3875   emit_operand32(rdi, src);
3876 }
3877 
3878 void Assembler::fnstsw_ax() {
  emit_byte(0xDF);
3880   emit_byte(0xE0);
3881 }
3882 
3883 void Assembler::fprem() {
3884   emit_byte(0xD9);
3885   emit_byte(0xF8);
3886 }
3887 
3888 void Assembler::fprem1() {
3889   emit_byte(0xD9);
3890   emit_byte(0xF5);
3891 }
3892 
3893 void Assembler::frstor(Address src) {
3894   InstructionMark im(this);
3895   emit_byte(0xDD);
3896   emit_operand32(rsp, src);
3897 }
3898 
3899 void Assembler::fsin() {
3900   emit_byte(0xD9);
3901   emit_byte(0xFE);
3902 }
3903 
3904 void Assembler::fsqrt() {
3905   emit_byte(0xD9);
3906   emit_byte(0xFA);
3907 }
3908 
3909 void Assembler::fst_d(Address adr) {
3910   InstructionMark im(this);
3911   emit_byte(0xDD);
3912   emit_operand32(rdx, adr);
3913 }
3914 
3915 void Assembler::fst_s(Address adr) {
3916   InstructionMark im(this);
3917   emit_byte(0xD9);
3918   emit_operand32(rdx, adr);
3919 }
3920 
3921 void Assembler::fstp_d(Address adr) {
3922   InstructionMark im(this);
3923   emit_byte(0xDD);
3924   emit_operand32(rbx, adr);
3925 }
3926 
3927 void Assembler::fstp_d(int index) {
3928   emit_farith(0xDD, 0xD8, index);
3929 }
3930 
3931 void Assembler::fstp_s(Address adr) {
3932   InstructionMark im(this);
3933   emit_byte(0xD9);
3934   emit_operand32(rbx, adr);
3935 }
3936 
3937 void Assembler::fstp_x(Address adr) {
3938   InstructionMark im(this);
3939   emit_byte(0xDB);
3940   emit_operand32(rdi, adr);
3941 }
3942 
3943 void Assembler::fsub(int i) {
3944   emit_farith(0xD8, 0xE0, i);
3945 }
3946 
3947 void Assembler::fsub_d(Address src) {
3948   InstructionMark im(this);
3949   emit_byte(0xDC);
3950   emit_operand32(rsp, src);
3951 }
3952 
3953 void Assembler::fsub_s(Address src) {
3954   InstructionMark im(this);
3955   emit_byte(0xD8);
3956   emit_operand32(rsp, src);
3957 }
3958 
3959 void Assembler::fsuba(int i) {
3960   emit_farith(0xDC, 0xE8, i);
3961 }
3962 
3963 void Assembler::fsubp(int i) {
3964   emit_farith(0xDE, 0xE8, i);                    // ST(0) <- ST(0) - ST(1) and pop (Intel manual wrong)
3965 }
3966 
3967 void Assembler::fsubr(int i) {
3968   emit_farith(0xD8, 0xE8, i);
3969 }
3970 
3971 void Assembler::fsubr_d(Address src) {
3972   InstructionMark im(this);
3973   emit_byte(0xDC);
3974   emit_operand32(rbp, src);
3975 }
3976 
3977 void Assembler::fsubr_s(Address src) {
3978   InstructionMark im(this);
3979   emit_byte(0xD8);
3980   emit_operand32(rbp, src);
3981 }
3982 
3983 void Assembler::fsubra(int i) {
3984   emit_farith(0xDC, 0xE0, i);
3985 }
3986 
3987 void Assembler::fsubrp(int i) {
3988   emit_farith(0xDE, 0xE0, i);                    // ST(0) <- ST(1) - ST(0) and pop (Intel manual wrong)
3989 }
3990 
3991 void Assembler::ftan() {
3992   emit_byte(0xD9);
3993   emit_byte(0xF2);
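  // fptan leaves tan(x) in ST(1) and pushes 1.0; the fstp st(0) below pops the 1.0.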
3994   emit_byte(0xDD);
3995   emit_byte(0xD8);
3996 }
3997 
3998 void Assembler::ftst() {
3999   emit_byte(0xD9);
4000   emit_byte(0xE4);
4001 }
4002 
4003 void Assembler::fucomi(int i) {
4004   // make sure the instruction is supported (introduced for P6, together with cmov)
4005   guarantee(VM_Version::supports_cmov(), "illegal instruction");
4006   emit_farith(0xDB, 0xE8, i);
4007 }
4008 
4009 void Assembler::fucomip(int i) {
4010   // make sure the instruction is supported (introduced for P6, together with cmov)
4011   guarantee(VM_Version::supports_cmov(), "illegal instruction");
4012   emit_farith(0xDF, 0xE8, i);
4013 }
4014 
4015 void Assembler::fwait() {
4016   emit_byte(0x9B);
4017 }
4018 
4019 void Assembler::fxch(int i) {
4020   emit_farith(0xD9, 0xC8, i);
4021 }
4022 
4023 void Assembler::fyl2x() {
4024   emit_byte(0xD9);
4025   emit_byte(0xF1);
4026 }
4027 
4028 void Assembler::frndint() {
4029   emit_byte(0xD9);
4030   emit_byte(0xFC);
4031 }
4032 
4033 void Assembler::f2xm1() {
4034   emit_byte(0xD9);
4035   emit_byte(0xF0);
4036 }
4037 
4038 void Assembler::fldl2e() {
4039   emit_byte(0xD9);
4040   emit_byte(0xEA);
4041 }
4042 
4043 // SSE SIMD prefix byte values corresponding to VexSimdPrefix encoding.
4044 static int simd_pre[4] = { 0, 0x66, 0xF3, 0xF2 };
4045 // SSE opcode second byte values (first is 0x0F) corresponding to VexOpcode encoding.
4046 static int simd_opc[4] = { 0,    0, 0x38, 0x3A };
4047 
4048 // Generate SSE legacy REX prefix and SIMD opcode based on VEX encoding.
4049 void Assembler::rex_prefix(Address adr, XMMRegister xreg, VexSimdPrefix pre, VexOpcode opc, bool rex_w) {
4050   if (pre > 0) {
4051     emit_byte(simd_pre[pre]);
4052   }
4053   if (rex_w) {
4054     prefixq(adr, xreg);
4055   } else {
4056     prefix(adr, xreg);
4057   }
4058   if (opc > 0) {
4059     emit_byte(0x0F);
4060     int opc2 = simd_opc[opc];
4061     if (opc2 > 0) {
4062       emit_byte(opc2);
4063     }
4064   }
4065 }
4066 
4067 int Assembler::rex_prefix_and_encode(int dst_enc, int src_enc, VexSimdPrefix pre, VexOpcode opc, bool rex_w) {
4068   if (pre > 0) {
4069     emit_byte(simd_pre[pre]);
4070   }
4071   int encode = (rex_w) ? prefixq_and_encode(dst_enc, src_enc) :
4072                           prefix_and_encode(dst_enc, src_enc);
4073   if (opc > 0) {
4074     emit_byte(0x0F);
4075     int opc2 = simd_opc[opc];
4076     if (opc2 > 0) {
4077       emit_byte(opc2);
4078     }
4079   }
4080   return encode;
4081 }
4082 
4083 
4084 void Assembler::vex_prefix(bool vex_r, bool vex_b, bool vex_x, bool vex_w, int nds_enc, VexSimdPrefix pre, VexOpcode opc, bool vector256) {
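  // VEX prefix layout (Intel SDM, Vol. 2):
  //   3-byte form: C4 [R X B m-mmmm] [W vvvv L pp]
  //   2-byte form: C5 [R vvvv L pp]
  // R, X, B and vvvv are stored inverted (1s-complement), hence the (~x)
  // masking below; L selects 128- vs 256-bit vectors and pp encodes the
  // implied SIMD prefix (none/66/F3/F2).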
4085   if (vex_b || vex_x || vex_w || (opc == VEX_OPCODE_0F_38) || (opc == VEX_OPCODE_0F_3A)) {
4086     prefix(VEX_3bytes);
4087 
4088     int byte1 = (vex_r ? VEX_R : 0) | (vex_x ? VEX_X : 0) | (vex_b ? VEX_B : 0);
4089     byte1 = (~byte1) & 0xE0;
4090     byte1 |= opc;
    emit_byte(byte1);
4092 
4093     int byte2 = ((~nds_enc) & 0xf) << 3;
4094     byte2 |= (vex_w ? VEX_W : 0) | (vector256 ? 4 : 0) | pre;
4095     emit_byte(byte2);
4096   } else {
4097     prefix(VEX_2bytes);
4098 
4099     int byte1 = vex_r ? VEX_R : 0;
4100     byte1 = (~byte1) & 0x80;
4101     byte1 |= ((~nds_enc) & 0xf) << 3;
4102     byte1 |= (vector256 ? 4 : 0) | pre;
4103     emit_byte(byte1);
4104   }
4105 }
4106 
4107 void Assembler::vex_prefix(Address adr, int nds_enc, int xreg_enc, VexSimdPrefix pre, VexOpcode opc, bool vex_w, bool vector256){
4108   bool vex_r = (xreg_enc >= 8);
4109   bool vex_b = adr.base_needs_rex();
4110   bool vex_x = adr.index_needs_rex();
4111   vex_prefix(vex_r, vex_b, vex_x, vex_w, nds_enc, pre, opc, vector256);
4112 }
4113 
4114 int Assembler::vex_prefix_and_encode(int dst_enc, int nds_enc, int src_enc, VexSimdPrefix pre, VexOpcode opc, bool vex_w, bool vector256) {
4115   bool vex_r = (dst_enc >= 8);
4116   bool vex_b = (src_enc >= 8);
4117   bool vex_x = false;
4118   vex_prefix(vex_r, vex_b, vex_x, vex_w, nds_enc, pre, opc, vector256);
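  // Return the ModRM reg (dst) and rm (src) bits; callers OR in 0xC0 to form
  // a register-direct ModRM byte.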
4119   return (((dst_enc & 7) << 3) | (src_enc & 7));
4120 }
4121 
4122 
4123 void Assembler::simd_prefix(XMMRegister xreg, XMMRegister nds, Address adr, VexSimdPrefix pre, VexOpcode opc, bool rex_w, bool vector256) {
4124   if (UseAVX > 0) {
4125     int xreg_enc = xreg->encoding();
4126     int  nds_enc = nds->is_valid() ? nds->encoding() : 0;
4127     vex_prefix(adr, nds_enc, xreg_enc, pre, opc, rex_w, vector256);
4128   } else {
4129     assert((nds == xreg) || (nds == xnoreg), "wrong sse encoding");
4130     rex_prefix(adr, xreg, pre, opc, rex_w);
4131   }
4132 }
4133 
4134 int Assembler::simd_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegister src, VexSimdPrefix pre, VexOpcode opc, bool rex_w, bool vector256) {
4135   int dst_enc = dst->encoding();
4136   int src_enc = src->encoding();
4137   if (UseAVX > 0) {
4138     int nds_enc = nds->is_valid() ? nds->encoding() : 0;
4139     return vex_prefix_and_encode(dst_enc, nds_enc, src_enc, pre, opc, rex_w, vector256);
4140   } else {
4141     assert((nds == dst) || (nds == src) || (nds == xnoreg), "wrong sse encoding");
4142     return rex_prefix_and_encode(dst_enc, src_enc, pre, opc, rex_w);
4143   }
4144 }
4145 
4146 void Assembler::emit_simd_arith(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre) {
4147   InstructionMark im(this);
4148   simd_prefix(dst, dst, src, pre);
4149   emit_byte(opcode);
4150   emit_operand(dst, src);
4151 }
4152 
4153 void Assembler::emit_simd_arith(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre) {
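  // Register-register form: 0xC0 | encode below builds the ModRM byte with
  // mod = 11 (register direct); the reg/rm bits come from the prefix helper.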
4154   int encode = simd_prefix_and_encode(dst, dst, src, pre);
4155   emit_byte(opcode);
4156   emit_byte(0xC0 | encode);
4157 }
4158 
4159 // Versions with no second source register (non-destructive source).
4160 void Assembler::emit_simd_arith_nonds(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre) {
4161   InstructionMark im(this);
4162   simd_prefix(dst, xnoreg, src, pre);
4163   emit_byte(opcode);
4164   emit_operand(dst, src);
4165 }
4166 
4167 void Assembler::emit_simd_arith_nonds(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre) {
4168   int encode = simd_prefix_and_encode(dst, xnoreg, src, pre);
4169   emit_byte(opcode);
4170   emit_byte(0xC0 | encode);
4171 }
4172 
4173 // 3-operands AVX instructions
4174 void Assembler::emit_vex_arith(int opcode, XMMRegister dst, XMMRegister nds,
4175                                Address src, VexSimdPrefix pre, bool vector256) {
4176   InstructionMark im(this);
4177   vex_prefix(dst, nds, src, pre, vector256);
4178   emit_byte(opcode);
4179   emit_operand(dst, src);
4180 }
4181 
4182 void Assembler::emit_vex_arith(int opcode, XMMRegister dst, XMMRegister nds,
4183                                XMMRegister src, VexSimdPrefix pre, bool vector256) {
4184   int encode = vex_prefix_and_encode(dst, nds, src, pre, vector256);
4185   emit_byte(opcode);
4186   emit_byte(0xC0 | encode);
4187 }
4188 
4189 #ifndef _LP64
4190 
4191 void Assembler::incl(Register dst) {
4192   // Don't use it directly. Use MacroAssembler::incrementl() instead.
4193   emit_byte(0x40 | dst->encoding());
4194 }
4195 
4196 void Assembler::lea(Register dst, Address src) {
4197   leal(dst, src);
4198 }
4199 
4200 void Assembler::mov_literal32(Address dst, int32_t imm32,  RelocationHolder const& rspec) {
4201   InstructionMark im(this);
4202   emit_byte(0xC7);
4203   emit_operand(rax, dst);
4204   emit_data((int)imm32, rspec, 0);
4205 }
4206 
4207 void Assembler::mov_literal32(Register dst, int32_t imm32, RelocationHolder const& rspec) {
4208   InstructionMark im(this);
4209   int encode = prefix_and_encode(dst->encoding());
4210   emit_byte(0xB8 | encode);
4211   emit_data((int)imm32, rspec, 0);
4212 }
4213 
4214 void Assembler::popa() { // 32bit
4215   emit_byte(0x61);
4216 }
4217 
4218 void Assembler::push_literal32(int32_t imm32, RelocationHolder const& rspec) {
4219   InstructionMark im(this);
4220   emit_byte(0x68);
4221   emit_data(imm32, rspec, 0);
4222 }
4223 
4224 void Assembler::pusha() { // 32bit
4225   emit_byte(0x60);
4226 }
4227 
4228 void Assembler::set_byte_if_not_zero(Register dst) {
4229   emit_byte(0x0F);
4230   emit_byte(0x95);
4231   emit_byte(0xE0 | dst->encoding());
4232 }
4233 
4234 void Assembler::shldl(Register dst, Register src) {
4235   emit_byte(0x0F);
4236   emit_byte(0xA5);
4237   emit_byte(0xC0 | src->encoding() << 3 | dst->encoding());
4238 }
4239 
4240 void Assembler::shrdl(Register dst, Register src) {
4241   emit_byte(0x0F);
4242   emit_byte(0xAD);
4243   emit_byte(0xC0 | src->encoding() << 3 | dst->encoding());
4244 }
4245 
4246 #else // LP64
4247 
4248 void Assembler::set_byte_if_not_zero(Register dst) {
4249   int enc = prefix_and_encode(dst->encoding(), true);
4250   emit_byte(0x0F);
4251   emit_byte(0x95);
4252   emit_byte(0xE0 | enc);
4253 }
4254 
4255 // 64bit only pieces of the assembler
// This should only be used by 64bit instructions that can use rip-relative
// addressing; it cannot be used by instructions that want an immediate value.
4258 
4259 bool Assembler::reachable(AddressLiteral adr) {
4260   int64_t disp;
  // None will force a 64bit literal to the code stream. Likely a placeholder
  // for something that will be patched later and we need to be certain it
  // will always be reachable.
4264   if (adr.reloc() == relocInfo::none) {
4265     return false;
4266   }
4267   if (adr.reloc() == relocInfo::internal_word_type) {
4268     // This should be rip relative and easily reachable.
4269     return true;
4270   }
4271   if (adr.reloc() == relocInfo::virtual_call_type ||
4272       adr.reloc() == relocInfo::opt_virtual_call_type ||
4273       adr.reloc() == relocInfo::static_call_type ||
4274       adr.reloc() == relocInfo::static_stub_type ) {
4275     // This should be rip relative within the code cache and easily
4276     // reachable until we get huge code caches. (At which point
4277     // ic code is going to have issues).
4278     return true;
4279   }
4280   if (adr.reloc() != relocInfo::external_word_type &&
4281       adr.reloc() != relocInfo::poll_return_type &&  // these are really external_word but need special
4282       adr.reloc() != relocInfo::poll_type &&         // relocs to identify them
4283       adr.reloc() != relocInfo::runtime_call_type ) {
4284     return false;
4285   }
4286 
4287   // Stress the correction code
4288   if (ForceUnreachable) {
    // Must be a runtime_call reloc; see if it is in the codecache
4290     // Flipping stuff in the codecache to be unreachable causes issues
4291     // with things like inline caches where the additional instructions
4292     // are not handled.
4293     if (CodeCache::find_blob(adr._target) == NULL) {
4294       return false;
4295     }
4296   }
4297   // For external_word_type/runtime_call_type if it is reachable from where we
4298   // are now (possibly a temp buffer) and where we might end up
4299   // anywhere in the codeCache then we are always reachable.
4300   // This would have to change if we ever save/restore shared code
4301   // to be more pessimistic.
4302   disp = (int64_t)adr._target - ((int64_t)CodeCache::low_bound() + sizeof(int));
4303   if (!is_simm32(disp)) return false;
4304   disp = (int64_t)adr._target - ((int64_t)CodeCache::high_bound() + sizeof(int));
4305   if (!is_simm32(disp)) return false;
4306 
4307   disp = (int64_t)adr._target - ((int64_t)_code_pos + sizeof(int));
4308 
4309   // Because rip relative is a disp + address_of_next_instruction and we
4310   // don't know the value of address_of_next_instruction we apply a fudge factor
4311   // to make sure we will be ok no matter the size of the instruction we get placed into.
4312   // We don't have to fudge the checks above here because they are already worst case.
4313 
  // 12 == override/rex byte, opcode byte, rm byte, sib byte, a 4-byte disp, 4-byte literal
  // + 4 because better safe than sorry.
4316   const int fudge = 12 + 4;
4317   if (disp < 0) {
4318     disp -= fudge;
4319   } else {
4320     disp += fudge;
4321   }
4322   return is_simm32(disp);
4323 }
4324 
// Returns true if the polling page is not reachable from the code cache
// using rip-relative addressing.
4327 bool Assembler::is_polling_page_far() {
4328   intptr_t addr = (intptr_t)os::get_polling_page();
4329   return ForceUnreachable ||
4330          !is_simm32(addr - (intptr_t)CodeCache::low_bound()) ||
4331          !is_simm32(addr - (intptr_t)CodeCache::high_bound());
4332 }
4333 
4334 void Assembler::emit_data64(jlong data,
4335                             relocInfo::relocType rtype,
4336                             int format) {
4337   if (rtype == relocInfo::none) {
4338     emit_long64(data);
4339   } else {
4340     emit_data64(data, Relocation::spec_simple(rtype), format);
4341   }
4342 }
4343 
4344 void Assembler::emit_data64(jlong data,
4345                             RelocationHolder const& rspec,
4346                             int format) {
4347   assert(imm_operand == 0, "default format must be immediate in this file");
4348   assert(imm_operand == format, "must be immediate");
4349   assert(inst_mark() != NULL, "must be inside InstructionMark");
4350   // Do not use AbstractAssembler::relocate, which is not intended for
4351   // embedded words.  Instead, relocate to the enclosing instruction.
4352   code_section()->relocate(inst_mark(), rspec, format);
4353 #ifdef ASSERT
4354   check_relocation(rspec, format);
4355 #endif
4356   emit_long64(data);
4357 }
4358 
4359 int Assembler::prefix_and_encode(int reg_enc, bool byteinst) {
4360   if (reg_enc >= 8) {
4361     prefix(REX_B);
4362     reg_enc -= 8;
4363   } else if (byteinst && reg_enc >= 4) {
4364     prefix(REX);
4365   }
4366   return reg_enc;
4367 }
4368 
4369 int Assembler::prefixq_and_encode(int reg_enc) {
4370   if (reg_enc < 8) {
4371     prefix(REX_W);
4372   } else {
4373     prefix(REX_WB);
4374     reg_enc -= 8;
4375   }
4376   return reg_enc;
4377 }
4378 
4379 int Assembler::prefix_and_encode(int dst_enc, int src_enc, bool byteinst) {
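  // Fold the high register bits into REX.R (dst) / REX.B (src) and return
  // the ModRM reg/rm bits built from the low 3 bits of each encoding.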
4380   if (dst_enc < 8) {
4381     if (src_enc >= 8) {
4382       prefix(REX_B);
4383       src_enc -= 8;
4384     } else if (byteinst && src_enc >= 4) {
4385       prefix(REX);
4386     }
4387   } else {
4388     if (src_enc < 8) {
4389       prefix(REX_R);
4390     } else {
4391       prefix(REX_RB);
4392       src_enc -= 8;
4393     }
4394     dst_enc -= 8;
4395   }
4396   return dst_enc << 3 | src_enc;
4397 }
4398 
4399 int Assembler::prefixq_and_encode(int dst_enc, int src_enc) {
4400   if (dst_enc < 8) {
4401     if (src_enc < 8) {
4402       prefix(REX_W);
4403     } else {
4404       prefix(REX_WB);
4405       src_enc -= 8;
4406     }
4407   } else {
4408     if (src_enc < 8) {
4409       prefix(REX_WR);
4410     } else {
4411       prefix(REX_WRB);
4412       src_enc -= 8;
4413     }
4414     dst_enc -= 8;
4415   }
4416   return dst_enc << 3 | src_enc;
4417 }
4418 
4419 void Assembler::prefix(Register reg) {
4420   if (reg->encoding() >= 8) {
4421     prefix(REX_B);
4422   }
4423 }
4424 
4425 void Assembler::prefix(Address adr) {
4426   if (adr.base_needs_rex()) {
4427     if (adr.index_needs_rex()) {
4428       prefix(REX_XB);
4429     } else {
4430       prefix(REX_B);
4431     }
4432   } else {
4433     if (adr.index_needs_rex()) {
4434       prefix(REX_X);
4435     }
4436   }
4437 }
4438 
4439 void Assembler::prefixq(Address adr) {
4440   if (adr.base_needs_rex()) {
4441     if (adr.index_needs_rex()) {
4442       prefix(REX_WXB);
4443     } else {
4444       prefix(REX_WB);
4445     }
4446   } else {
4447     if (adr.index_needs_rex()) {
4448       prefix(REX_WX);
4449     } else {
4450       prefix(REX_W);
4451     }
4452   }
4453 }
4454 
4455 
4456 void Assembler::prefix(Address adr, Register reg, bool byteinst) {
4457   if (reg->encoding() < 8) {
4458     if (adr.base_needs_rex()) {
4459       if (adr.index_needs_rex()) {
4460         prefix(REX_XB);
4461       } else {
4462         prefix(REX_B);
4463       }
4464     } else {
4465       if (adr.index_needs_rex()) {
4466         prefix(REX_X);
4467       } else if (byteinst && reg->encoding() >= 4 ) {
4468         prefix(REX);
4469       }
4470     }
4471   } else {
4472     if (adr.base_needs_rex()) {
4473       if (adr.index_needs_rex()) {
4474         prefix(REX_RXB);
4475       } else {
4476         prefix(REX_RB);
4477       }
4478     } else {
4479       if (adr.index_needs_rex()) {
4480         prefix(REX_RX);
4481       } else {
4482         prefix(REX_R);
4483       }
4484     }
4485   }
4486 }
4487 
4488 void Assembler::prefixq(Address adr, Register src) {
4489   if (src->encoding() < 8) {
4490     if (adr.base_needs_rex()) {
4491       if (adr.index_needs_rex()) {
4492         prefix(REX_WXB);
4493       } else {
4494         prefix(REX_WB);
4495       }
4496     } else {
4497       if (adr.index_needs_rex()) {
4498         prefix(REX_WX);
4499       } else {
4500         prefix(REX_W);
4501       }
4502     }
4503   } else {
4504     if (adr.base_needs_rex()) {
4505       if (adr.index_needs_rex()) {
4506         prefix(REX_WRXB);
4507       } else {
4508         prefix(REX_WRB);
4509       }
4510     } else {
4511       if (adr.index_needs_rex()) {
4512         prefix(REX_WRX);
4513       } else {
4514         prefix(REX_WR);
4515       }
4516     }
4517   }
4518 }
4519 
4520 void Assembler::prefix(Address adr, XMMRegister reg) {
4521   if (reg->encoding() < 8) {
4522     if (adr.base_needs_rex()) {
4523       if (adr.index_needs_rex()) {
4524         prefix(REX_XB);
4525       } else {
4526         prefix(REX_B);
4527       }
4528     } else {
4529       if (adr.index_needs_rex()) {
4530         prefix(REX_X);
4531       }
4532     }
4533   } else {
4534     if (adr.base_needs_rex()) {
4535       if (adr.index_needs_rex()) {
4536         prefix(REX_RXB);
4537       } else {
4538         prefix(REX_RB);
4539       }
4540     } else {
4541       if (adr.index_needs_rex()) {
4542         prefix(REX_RX);
4543       } else {
4544         prefix(REX_R);
4545       }
4546     }
4547   }
4548 }
4549 
4550 void Assembler::prefixq(Address adr, XMMRegister src) {
4551   if (src->encoding() < 8) {
4552     if (adr.base_needs_rex()) {
4553       if (adr.index_needs_rex()) {
4554         prefix(REX_WXB);
4555       } else {
4556         prefix(REX_WB);
4557       }
4558     } else {
4559       if (adr.index_needs_rex()) {
4560         prefix(REX_WX);
4561       } else {
4562         prefix(REX_W);
4563       }
4564     }
4565   } else {
4566     if (adr.base_needs_rex()) {
4567       if (adr.index_needs_rex()) {
4568         prefix(REX_WRXB);
4569       } else {
4570         prefix(REX_WRB);
4571       }
4572     } else {
4573       if (adr.index_needs_rex()) {
4574         prefix(REX_WRX);
4575       } else {
4576         prefix(REX_WR);
4577       }
4578     }
4579   }
4580 }
4581 
4582 void Assembler::adcq(Register dst, int32_t imm32) {
4583   (void) prefixq_and_encode(dst->encoding());
4584   emit_arith(0x81, 0xD0, dst, imm32);
4585 }
4586 
4587 void Assembler::adcq(Register dst, Address src) {
4588   InstructionMark im(this);
4589   prefixq(src, dst);
4590   emit_byte(0x13);
4591   emit_operand(dst, src);
4592 }
4593 
4594 void Assembler::adcq(Register dst, Register src) {
  (void) prefixq_and_encode(dst->encoding(), src->encoding());
4596   emit_arith(0x13, 0xC0, dst, src);
4597 }
4598 
4599 void Assembler::addq(Address dst, int32_t imm32) {
4600   InstructionMark im(this);
4601   prefixq(dst);
  emit_arith_operand(0x81, rax, dst, imm32);
4603 }
4604 
4605 void Assembler::addq(Address dst, Register src) {
4606   InstructionMark im(this);
4607   prefixq(dst, src);
4608   emit_byte(0x01);
4609   emit_operand(src, dst);
4610 }
4611 
4612 void Assembler::addq(Register dst, int32_t imm32) {
4613   (void) prefixq_and_encode(dst->encoding());
4614   emit_arith(0x81, 0xC0, dst, imm32);
4615 }
4616 
4617 void Assembler::addq(Register dst, Address src) {
4618   InstructionMark im(this);
4619   prefixq(src, dst);
4620   emit_byte(0x03);
4621   emit_operand(dst, src);
4622 }
4623 
4624 void Assembler::addq(Register dst, Register src) {
4625   (void) prefixq_and_encode(dst->encoding(), src->encoding());
4626   emit_arith(0x03, 0xC0, dst, src);
4627 }
4628 
4629 void Assembler::andq(Address dst, int32_t imm32) {
4630   InstructionMark im(this);
4631   prefixq(dst);
4632   emit_byte(0x81);
4633   emit_operand(rsp, dst, 4);
4634   emit_long(imm32);
4635 }
4636 
4637 void Assembler::andq(Register dst, int32_t imm32) {
4638   (void) prefixq_and_encode(dst->encoding());
4639   emit_arith(0x81, 0xE0, dst, imm32);
4640 }
4641 
4642 void Assembler::andq(Register dst, Address src) {
4643   InstructionMark im(this);
4644   prefixq(src, dst);
4645   emit_byte(0x23);
4646   emit_operand(dst, src);
4647 }
4648 
4649 void Assembler::andq(Register dst, Register src) {
  (void) prefixq_and_encode(dst->encoding(), src->encoding());
4651   emit_arith(0x23, 0xC0, dst, src);
4652 }
4653 
4654 void Assembler::bsfq(Register dst, Register src) {
4655   int encode = prefixq_and_encode(dst->encoding(), src->encoding());
4656   emit_byte(0x0F);
4657   emit_byte(0xBC);
4658   emit_byte(0xC0 | encode);
4659 }
4660 
4661 void Assembler::bsrq(Register dst, Register src) {
4662   assert(!VM_Version::supports_lzcnt(), "encoding is treated as LZCNT");
4663   int encode = prefixq_and_encode(dst->encoding(), src->encoding());
4664   emit_byte(0x0F);
4665   emit_byte(0xBD);
4666   emit_byte(0xC0 | encode);
4667 }
4668 
4669 void Assembler::bswapq(Register reg) {
4670   int encode = prefixq_and_encode(reg->encoding());
4671   emit_byte(0x0F);
4672   emit_byte(0xC8 | encode);
4673 }
4674 
4675 void Assembler::cdqq() {
4676   prefix(REX_W);
4677   emit_byte(0x99);
4678 }
4679 
4680 void Assembler::clflush(Address adr) {
4681   prefix(adr);
4682   emit_byte(0x0F);
4683   emit_byte(0xAE);
4684   emit_operand(rdi, adr);
4685 }
4686 
4687 void Assembler::cmovq(Condition cc, Register dst, Register src) {
4688   int encode = prefixq_and_encode(dst->encoding(), src->encoding());
4689   emit_byte(0x0F);
4690   emit_byte(0x40 | cc);
4691   emit_byte(0xC0 | encode);
4692 }
4693 
4694 void Assembler::cmovq(Condition cc, Register dst, Address src) {
4695   InstructionMark im(this);
4696   prefixq(src, dst);
4697   emit_byte(0x0F);
4698   emit_byte(0x40 | cc);
4699   emit_operand(dst, src);
4700 }
4701 
4702 void Assembler::cmpq(Address dst, int32_t imm32) {
4703   InstructionMark im(this);
4704   prefixq(dst);
4705   emit_byte(0x81);
4706   emit_operand(rdi, dst, 4);
4707   emit_long(imm32);
4708 }
4709 
4710 void Assembler::cmpq(Register dst, int32_t imm32) {
4711   (void) prefixq_and_encode(dst->encoding());
4712   emit_arith(0x81, 0xF8, dst, imm32);
4713 }
4714 
4715 void Assembler::cmpq(Address dst, Register src) {
4716   InstructionMark im(this);
4717   prefixq(dst, src);
4718   emit_byte(0x3B);
4719   emit_operand(src, dst);
4720 }
4721 
4722 void Assembler::cmpq(Register dst, Register src) {
4723   (void) prefixq_and_encode(dst->encoding(), src->encoding());
4724   emit_arith(0x3B, 0xC0, dst, src);
4725 }
4726 
4727 void Assembler::cmpq(Register dst, Address  src) {
4728   InstructionMark im(this);
4729   prefixq(src, dst);
4730   emit_byte(0x3B);
4731   emit_operand(dst, src);
4732 }
4733 
4734 void Assembler::cmpxchgq(Register reg, Address adr) {
4735   InstructionMark im(this);
4736   prefixq(adr, reg);
4737   emit_byte(0x0F);
4738   emit_byte(0xB1);
4739   emit_operand(reg, adr);
4740 }
4741 
4742 void Assembler::cvtsi2sdq(XMMRegister dst, Register src) {
4743   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
4744   int encode = simd_prefix_and_encode_q(dst, dst, src, VEX_SIMD_F2);
4745   emit_byte(0x2A);
4746   emit_byte(0xC0 | encode);
4747 }
4748 
4749 void Assembler::cvtsi2sdq(XMMRegister dst, Address src) {
4750   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
4751   InstructionMark im(this);
4752   simd_prefix_q(dst, dst, src, VEX_SIMD_F2);
4753   emit_byte(0x2A);
4754   emit_operand(dst, src);
4755 }
4756 
4757 void Assembler::cvtsi2ssq(XMMRegister dst, Register src) {
4758   NOT_LP64(assert(VM_Version::supports_sse(), ""));
4759   int encode = simd_prefix_and_encode_q(dst, dst, src, VEX_SIMD_F3);
4760   emit_byte(0x2A);
4761   emit_byte(0xC0 | encode);
4762 }
4763 
4764 void Assembler::cvtsi2ssq(XMMRegister dst, Address src) {
4765   NOT_LP64(assert(VM_Version::supports_sse(), ""));
4766   InstructionMark im(this);
4767   simd_prefix_q(dst, dst, src, VEX_SIMD_F3);
4768   emit_byte(0x2A);
4769   emit_operand(dst, src);
4770 }
4771 
4772 void Assembler::cvttsd2siq(Register dst, XMMRegister src) {
4773   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
4774   int encode = simd_prefix_and_encode_q(dst, src, VEX_SIMD_F2);
4775   emit_byte(0x2C);
4776   emit_byte(0xC0 | encode);
4777 }
4778 
4779 void Assembler::cvttss2siq(Register dst, XMMRegister src) {
4780   NOT_LP64(assert(VM_Version::supports_sse(), ""));
4781   int encode = simd_prefix_and_encode_q(dst, src, VEX_SIMD_F3);
4782   emit_byte(0x2C);
4783   emit_byte(0xC0 | encode);
4784 }
4785 
4786 void Assembler::decl(Register dst) {
4787   // Don't use it directly. Use MacroAssembler::decrementl() instead.
4788   // Use two-byte form (one-byte form is a REX prefix in 64-bit mode)
4789   int encode = prefix_and_encode(dst->encoding());
4790   emit_byte(0xFF);
4791   emit_byte(0xC8 | encode);
4792 }
4793 
4794 void Assembler::decq(Register dst) {
4795   // Don't use it directly. Use MacroAssembler::decrementq() instead.
  // Use two-byte form (one-byte form is a REX prefix in 64-bit mode)
4797   int encode = prefixq_and_encode(dst->encoding());
4798   emit_byte(0xFF);
4799   emit_byte(0xC8 | encode);
4800 }
4801 
4802 void Assembler::decq(Address dst) {
4803   // Don't use it directly. Use MacroAssembler::decrementq() instead.
4804   InstructionMark im(this);
4805   prefixq(dst);
4806   emit_byte(0xFF);
4807   emit_operand(rcx, dst);
4808 }
4809 
4810 void Assembler::fxrstor(Address src) {
4811   prefixq(src);
4812   emit_byte(0x0F);
4813   emit_byte(0xAE);
4814   emit_operand(as_Register(1), src);
4815 }
4816 
4817 void Assembler::fxsave(Address dst) {
4818   prefixq(dst);
4819   emit_byte(0x0F);
4820   emit_byte(0xAE);
4821   emit_operand(as_Register(0), dst);
4822 }
4823 
4824 void Assembler::idivq(Register src) {
4825   int encode = prefixq_and_encode(src->encoding());
4826   emit_byte(0xF7);
4827   emit_byte(0xF8 | encode);
4828 }
4829 
4830 void Assembler::imulq(Register dst, Register src) {
4831   int encode = prefixq_and_encode(dst->encoding(), src->encoding());
4832   emit_byte(0x0F);
4833   emit_byte(0xAF);
4834   emit_byte(0xC0 | encode);
4835 }
4836 
4837 void Assembler::imulq(Register dst, Register src, int value) {
4838   int encode = prefixq_and_encode(dst->encoding(), src->encoding());
4839   if (is8bit(value)) {
4840     emit_byte(0x6B);
4841     emit_byte(0xC0 | encode);
4842     emit_byte(value & 0xFF);
4843   } else {
4844     emit_byte(0x69);
4845     emit_byte(0xC0 | encode);
4846     emit_long(value);
4847   }
4848 }
4849 
4850 void Assembler::incl(Register dst) {
4851   // Don't use it directly. Use MacroAssembler::incrementl() instead.
  // Use two-byte form (one-byte form is a REX prefix in 64-bit mode)
4853   int encode = prefix_and_encode(dst->encoding());
4854   emit_byte(0xFF);
4855   emit_byte(0xC0 | encode);
4856 }
4857 
4858 void Assembler::incq(Register dst) {
4859   // Don't use it directly. Use MacroAssembler::incrementq() instead.
  // Use two-byte form (one-byte form is a REX prefix in 64-bit mode)
4861   int encode = prefixq_and_encode(dst->encoding());
4862   emit_byte(0xFF);
4863   emit_byte(0xC0 | encode);
4864 }
4865 
4866 void Assembler::incq(Address dst) {
4867   // Don't use it directly. Use MacroAssembler::incrementq() instead.
4868   InstructionMark im(this);
4869   prefixq(dst);
4870   emit_byte(0xFF);
4871   emit_operand(rax, dst);
4872 }
4873 
4874 void Assembler::lea(Register dst, Address src) {
4875   leaq(dst, src);
4876 }
4877 
4878 void Assembler::leaq(Register dst, Address src) {
4879   InstructionMark im(this);
4880   prefixq(src, dst);
4881   emit_byte(0x8D);
4882   emit_operand(dst, src);
4883 }
4884 
4885 void Assembler::mov64(Register dst, int64_t imm64) {
4886   InstructionMark im(this);
4887   int encode = prefixq_and_encode(dst->encoding());
4888   emit_byte(0xB8 | encode);
4889   emit_long64(imm64);
4890 }
4891 
4892 void Assembler::mov_literal64(Register dst, intptr_t imm64, RelocationHolder const& rspec) {
4893   InstructionMark im(this);
4894   int encode = prefixq_and_encode(dst->encoding());
4895   emit_byte(0xB8 | encode);
4896   emit_data64(imm64, rspec);
4897 }
4898 
4899 void Assembler::mov_narrow_oop(Register dst, int32_t imm32, RelocationHolder const& rspec) {
4900   InstructionMark im(this);
4901   int encode = prefix_and_encode(dst->encoding());
4902   emit_byte(0xB8 | encode);
4903   emit_data((int)imm32, rspec, narrow_oop_operand);
4904 }
4905 
4906 void Assembler::mov_narrow_oop(Address dst, int32_t imm32,  RelocationHolder const& rspec) {
4907   InstructionMark im(this);
4908   prefix(dst);
4909   emit_byte(0xC7);
4910   emit_operand(rax, dst, 4);
4911   emit_data((int)imm32, rspec, narrow_oop_operand);
4912 }
4913 
4914 void Assembler::cmp_narrow_oop(Register src1, int32_t imm32, RelocationHolder const& rspec) {
4915   InstructionMark im(this);
4916   int encode = prefix_and_encode(src1->encoding());
4917   emit_byte(0x81);
4918   emit_byte(0xF8 | encode);
4919   emit_data((int)imm32, rspec, narrow_oop_operand);
4920 }
4921 
4922 void Assembler::cmp_narrow_oop(Address src1, int32_t imm32, RelocationHolder const& rspec) {
4923   InstructionMark im(this);
4924   prefix(src1);
4925   emit_byte(0x81);
4926   emit_operand(rax, src1, 4);
4927   emit_data((int)imm32, rspec, narrow_oop_operand);
4928 }
4929 
4930 void Assembler::lzcntq(Register dst, Register src) {
4931   assert(VM_Version::supports_lzcnt(), "encoding is treated as BSR");
4932   emit_byte(0xF3);
4933   int encode = prefixq_and_encode(dst->encoding(), src->encoding());
4934   emit_byte(0x0F);
4935   emit_byte(0xBD);
4936   emit_byte(0xC0 | encode);
4937 }
4938 
4939 void Assembler::movdq(XMMRegister dst, Register src) {
4940   // table D-1 says MMX/SSE2
4941   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
4942   int encode = simd_prefix_and_encode_q(dst, src, VEX_SIMD_66);
4943   emit_byte(0x6E);
4944   emit_byte(0xC0 | encode);
4945 }
4946 
4947 void Assembler::movdq(Register dst, XMMRegister src) {
4948   // table D-1 says MMX/SSE2
4949   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
4950   // swap src/dst to get correct prefix
4951   int encode = simd_prefix_and_encode_q(src, dst, VEX_SIMD_66);
4952   emit_byte(0x7E);
4953   emit_byte(0xC0 | encode);
4954 }
4955 
4956 void Assembler::movq(Register dst, Register src) {
4957   int encode = prefixq_and_encode(dst->encoding(), src->encoding());
4958   emit_byte(0x8B);
4959   emit_byte(0xC0 | encode);
4960 }
4961 
4962 void Assembler::movq(Register dst, Address src) {
4963   InstructionMark im(this);
4964   prefixq(src, dst);
4965   emit_byte(0x8B);
4966   emit_operand(dst, src);
4967 }
4968 
4969 void Assembler::movq(Address dst, Register src) {
4970   InstructionMark im(this);
4971   prefixq(dst, src);
4972   emit_byte(0x89);
4973   emit_operand(src, dst);
4974 }
4975 
4976 void Assembler::movsbq(Register dst, Address src) {
4977   InstructionMark im(this);
4978   prefixq(src, dst);
4979   emit_byte(0x0F);
4980   emit_byte(0xBE);
4981   emit_operand(dst, src);
4982 }
4983 
4984 void Assembler::movsbq(Register dst, Register src) {
4985   int encode = prefixq_and_encode(dst->encoding(), src->encoding());
4986   emit_byte(0x0F);
4987   emit_byte(0xBE);
4988   emit_byte(0xC0 | encode);
4989 }
4990 
4991 void Assembler::movslq(Register dst, int32_t imm32) {
4992   // dbx shows movslq(rcx, 3) as movq     $0x0000000049000000,(%rbx)
4993   // and movslq(r8, 3) as movl     $0x0000000048000000,(%rbx);
4994   // as a result we shouldn't use this until it has been tested at runtime...
4995   ShouldNotReachHere();
4996   InstructionMark im(this);
4997   int encode = prefixq_and_encode(dst->encoding());
4998   emit_byte(0xC7); emit_byte(0xC0 | encode); // C7 /0 needs a ModRM byte; "0xC7 | encode" was wrong
4999   emit_long(imm32);
5000 }
5001 
5002 void Assembler::movslq(Address dst, int32_t imm32) {
5003   assert(is_simm32(imm32), "lost bits");
5004   InstructionMark im(this);
5005   prefixq(dst);
5006   emit_byte(0xC7);
5007   emit_operand(rax, dst, 4);
5008   emit_long(imm32);
5009 }
5010 
5011 void Assembler::movslq(Register dst, Address src) {
5012   InstructionMark im(this);
5013   prefixq(src, dst);
5014   emit_byte(0x63);
5015   emit_operand(dst, src);
5016 }
5017 
5018 void Assembler::movslq(Register dst, Register src) {
5019   int encode = prefixq_and_encode(dst->encoding(), src->encoding());
5020   emit_byte(0x63);
5021   emit_byte(0xC0 | encode);
5022 }
5023 
5024 void Assembler::movswq(Register dst, Address src) {
5025   InstructionMark im(this);
5026   prefixq(src, dst);
5027   emit_byte(0x0F);
5028   emit_byte(0xBF);
5029   emit_operand(dst, src);
5030 }
5031 
5032 void Assembler::movswq(Register dst, Register src) {
5033   int encode = prefixq_and_encode(dst->encoding(), src->encoding());
5034   emit_byte(0x0F);
5035   emit_byte(0xBF);
5036   emit_byte(0xC0 | encode);
5037 }
5038 
5039 void Assembler::movzbq(Register dst, Address src) {
5040   InstructionMark im(this);
5041   prefixq(src, dst);
5042   emit_byte(0x0F);
5043   emit_byte(0xB6);
5044   emit_operand(dst, src);
5045 }
5046 
5047 void Assembler::movzbq(Register dst, Register src) {
5048   int encode = prefixq_and_encode(dst->encoding(), src->encoding());
5049   emit_byte(0x0F);
5050   emit_byte(0xB6);
5051   emit_byte(0xC0 | encode);
5052 }
5053 
5054 void Assembler::movzwq(Register dst, Address src) {
5055   InstructionMark im(this);
5056   prefixq(src, dst);
5057   emit_byte(0x0F);
5058   emit_byte(0xB7);
5059   emit_operand(dst, src);
5060 }
5061 
5062 void Assembler::movzwq(Register dst, Register src) {
5063   int encode = prefixq_and_encode(dst->encoding(), src->encoding());
5064   emit_byte(0x0F);
5065   emit_byte(0xB7);
5066   emit_byte(0xC0 | encode);
5067 }
5068 
5069 void Assembler::negq(Register dst) {
5070   int encode = prefixq_and_encode(dst->encoding());
5071   emit_byte(0xF7);
5072   emit_byte(0xD8 | encode);
5073 }
5074 
5075 void Assembler::notq(Register dst) {
5076   int encode = prefixq_and_encode(dst->encoding());
5077   emit_byte(0xF7);
5078   emit_byte(0xD0 | encode);
5079 }
5080 
5081 void Assembler::orq(Address dst, int32_t imm32) {
5082   InstructionMark im(this);
5083   prefixq(dst);
5084   emit_byte(0x81);
5085   emit_operand(rcx, dst, 4);
5086   emit_long(imm32);
5087 }
5088 
5089 void Assembler::orq(Register dst, int32_t imm32) {
5090   (void) prefixq_and_encode(dst->encoding());
5091   emit_arith(0x81, 0xC8, dst, imm32);
5092 }
5093 
5094 void Assembler::orq(Register dst, Address src) {
5095   InstructionMark im(this);
5096   prefixq(src, dst);
5097   emit_byte(0x0B);
5098   emit_operand(dst, src);
5099 }
5100 
5101 void Assembler::orq(Register dst, Register src) {
5102   (void) prefixq_and_encode(dst->encoding(), src->encoding());
5103   emit_arith(0x0B, 0xC0, dst, src);
5104 }
5105 
5106 void Assembler::popa() { // 64bit
5107   movq(r15, Address(rsp, 0));
5108   movq(r14, Address(rsp, wordSize));
5109   movq(r13, Address(rsp, 2 * wordSize));
5110   movq(r12, Address(rsp, 3 * wordSize));
5111   movq(r11, Address(rsp, 4 * wordSize));
5112   movq(r10, Address(rsp, 5 * wordSize));
5113   movq(r9,  Address(rsp, 6 * wordSize));
5114   movq(r8,  Address(rsp, 7 * wordSize));
5115   movq(rdi, Address(rsp, 8 * wordSize));
5116   movq(rsi, Address(rsp, 9 * wordSize));
5117   movq(rbp, Address(rsp, 10 * wordSize));
5118   // skip rsp
5119   movq(rbx, Address(rsp, 12 * wordSize));
5120   movq(rdx, Address(rsp, 13 * wordSize));
5121   movq(rcx, Address(rsp, 14 * wordSize));
5122   movq(rax, Address(rsp, 15 * wordSize));
5123 
5124   addq(rsp, 16 * wordSize);
5125 }
5126 
5127 void Assembler::popcntq(Register dst, Address src) {
5128   assert(VM_Version::supports_popcnt(), "must support");
5129   InstructionMark im(this);
5130   emit_byte(0xF3);
5131   prefixq(src, dst);
5132   emit_byte(0x0F);
5133   emit_byte(0xB8);
5134   emit_operand(dst, src);
5135 }
5136 
5137 void Assembler::popcntq(Register dst, Register src) {
5138   assert(VM_Version::supports_popcnt(), "must support");
5139   emit_byte(0xF3);
5140   int encode = prefixq_and_encode(dst->encoding(), src->encoding());
5141   emit_byte(0x0F);
5142   emit_byte(0xB8);
5143   emit_byte(0xC0 | encode);
5144 }
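     // For illustration: popcntq(rax, rbx) emits 0xF3 0x48 0x0F 0xB8 0xC3;
     // note that the mandatory F3 prefix is emitted before the REX.W byte.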
5145 
5146 void Assembler::popq(Address dst) {
5147   InstructionMark im(this);
5148   prefixq(dst);
5149   emit_byte(0x8F);
5150   emit_operand(rax, dst);
5151 }
5152 
5153 void Assembler::pusha() { // 64bit
5154   // we have to store the original rsp.  The ABI says that the 128 bytes
5155   // below rsp are local scratch (the red zone).
5156   movq(Address(rsp, -5 * wordSize), rsp);
5157 
5158   subq(rsp, 16 * wordSize);
5159 
5160   movq(Address(rsp, 15 * wordSize), rax);
5161   movq(Address(rsp, 14 * wordSize), rcx);
5162   movq(Address(rsp, 13 * wordSize), rdx);
5163   movq(Address(rsp, 12 * wordSize), rbx);
5164   // skip rsp
5165   movq(Address(rsp, 10 * wordSize), rbp);
5166   movq(Address(rsp, 9 * wordSize), rsi);
5167   movq(Address(rsp, 8 * wordSize), rdi);
5168   movq(Address(rsp, 7 * wordSize), r8);
5169   movq(Address(rsp, 6 * wordSize), r9);
5170   movq(Address(rsp, 5 * wordSize), r10);
5171   movq(Address(rsp, 4 * wordSize), r11);
5172   movq(Address(rsp, 3 * wordSize), r12);
5173   movq(Address(rsp, 2 * wordSize), r13);
5174   movq(Address(rsp, wordSize), r14);
5175   movq(Address(rsp, 0), r15);
5176 }
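     // Resulting layout (illustrative), lowest address first:
     //   [rsp + 0]           r15 .. [rsp + 10*wordSize] rbp,
     //   [rsp + 11*wordSize] the original rsp,
     //   [rsp + 12*wordSize] rbx .. [rsp + 15*wordSize] rax.
     // popa() above reads the slots back in mirror order and skips slot 11.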
5177 
5178 void Assembler::pushq(Address src) {
5179   InstructionMark im(this);
5180   prefixq(src);
5181   emit_byte(0xFF);
5182   emit_operand(rsi, src);
5183 }
5184 
5185 void Assembler::rclq(Register dst, int imm8) {
5186   assert(isShiftCount(imm8 >> 1), "illegal shift count");
5187   int encode = prefixq_and_encode(dst->encoding());
5188   if (imm8 == 1) {
5189     emit_byte(0xD1);
5190     emit_byte(0xD0 | encode);
5191   } else {
5192     emit_byte(0xC1);
5193     emit_byte(0xD0 | encode);
5194     emit_byte(imm8);
5195   }
5196 }

5197 void Assembler::sarq(Register dst, int imm8) {
5198   assert(isShiftCount(imm8 >> 1), "illegal shift count");
5199   int encode = prefixq_and_encode(dst->encoding());
5200   if (imm8 == 1) {
5201     emit_byte(0xD1);
5202     emit_byte(0xF8 | encode);
5203   } else {
5204     emit_byte(0xC1);
5205     emit_byte(0xF8 | encode);
5206     emit_byte(imm8);
5207   }
5208 }
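     // For illustration: sarq(rax, 1) emits 0x48 0xD1 0xF8 (the shift-by-one
     // form), while sarq(rax, 4) emits 0x48 0xC1 0xF8 0x04.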
5209 
5210 void Assembler::sarq(Register dst) {
5211   int encode = prefixq_and_encode(dst->encoding());
5212   emit_byte(0xD3);
5213   emit_byte(0xF8 | encode);
5214 }
5215 
5216 void Assembler::sbbq(Address dst, int32_t imm32) {
5217   InstructionMark im(this);
5218   prefixq(dst);
5219   emit_arith_operand(0x81, rbx, dst, imm32);
5220 }
5221 
5222 void Assembler::sbbq(Register dst, int32_t imm32) {
5223   (void) prefixq_and_encode(dst->encoding());
5224   emit_arith(0x81, 0xD8, dst, imm32);
5225 }
5226 
5227 void Assembler::sbbq(Register dst, Address src) {
5228   InstructionMark im(this);
5229   prefixq(src, dst);
5230   emit_byte(0x1B);
5231   emit_operand(dst, src);
5232 }
5233 
5234 void Assembler::sbbq(Register dst, Register src) {
5235   (void) prefixq_and_encode(dst->encoding(), src->encoding());
5236   emit_arith(0x1B, 0xC0, dst, src);
5237 }
5238 
5239 void Assembler::shlq(Register dst, int imm8) {
5240   assert(isShiftCount(imm8 >> 1), "illegal shift count");
5241   int encode = prefixq_and_encode(dst->encoding());
5242   if (imm8 == 1) {
5243     emit_byte(0xD1);
5244     emit_byte(0xE0 | encode);
5245   } else {
5246     emit_byte(0xC1);
5247     emit_byte(0xE0 | encode);
5248     emit_byte(imm8);
5249   }
5250 }
5251 
5252 void Assembler::shlq(Register dst) {
5253   int encode = prefixq_and_encode(dst->encoding());
5254   emit_byte(0xD3);
5255   emit_byte(0xE0 | encode);
5256 }
5257 
5258 void Assembler::shrq(Register dst, int imm8) {
5259   assert(isShiftCount(imm8 >> 1), "illegal shift count");
5260   int encode = prefixq_and_encode(dst->encoding());
5261   emit_byte(0xC1);
5262   emit_byte(0xE8 | encode);
5263   emit_byte(imm8);
5264 }
5265 
5266 void Assembler::shrq(Register dst) {
5267   int encode = prefixq_and_encode(dst->encoding());
5268   emit_byte(0xD3);
5269   emit_byte(0xE8 | encode);
5270 }
5271 
5272 void Assembler::subq(Address dst, int32_t imm32) {
5273   InstructionMark im(this);
5274   prefixq(dst);
5275   emit_arith_operand(0x81, rbp, dst, imm32);
5276 }
5277 
5278 void Assembler::subq(Address dst, Register src) {
5279   InstructionMark im(this);
5280   prefixq(dst, src);
5281   emit_byte(0x29);
5282   emit_operand(src, dst);
5283 }
5284 
5285 void Assembler::subq(Register dst, int32_t imm32) {
5286   (void) prefixq_and_encode(dst->encoding());
5287   emit_arith(0x81, 0xE8, dst, imm32);
5288 }
5289 
5290 // Force generation of a 4 byte immediate value even if it fits into 8bit
5291 void Assembler::subq_imm32(Register dst, int32_t imm32) {
5292   (void) prefixq_and_encode(dst->encoding());
5293   emit_arith_imm32(0x81, 0xE8, dst, imm32);
5294 }
5295 
5296 void Assembler::subq(Register dst, Address src) {
5297   InstructionMark im(this);
5298   prefixq(src, dst);
5299   emit_byte(0x2B);
5300   emit_operand(dst, src);
5301 }
5302 
5303 void Assembler::subq(Register dst, Register src) {
5304   (void) prefixq_and_encode(dst->encoding(), src->encoding());
5305   emit_arith(0x2B, 0xC0, dst, src);
5306 }
5307 
5308 void Assembler::testq(Register dst, int32_t imm32) {
5309   // not using emit_arith because test
5310   // doesn't support sign-extension of
5311   // 8bit operands
5312   int encode = dst->encoding();
5313   if (encode == 0) {
5314     prefix(REX_W);
5315     emit_byte(0xA9);
5316   } else {
5317     encode = prefixq_and_encode(encode);
5318     emit_byte(0xF7);
5319     emit_byte(0xC0 | encode);
5320   }
5321   emit_long(imm32);
5322 }
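     // For illustration: testq(rax, imm32) takes the shorter A9 form,
     // 0x48 0xA9 imm32, with no ModRM byte; any other register needs the
     // F7 /0 form, e.g. testq(rbx, imm32) emits 0x48 0xF7 0xC3 imm32.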
5323 
5324 void Assembler::testq(Register dst, Register src) {
5325   (void) prefixq_and_encode(dst->encoding(), src->encoding());
5326   emit_arith(0x85, 0xC0, dst, src);
5327 }
5328 
5329 void Assembler::xaddq(Address dst, Register src) {
5330   InstructionMark im(this);
5331   prefixq(dst, src);
5332   emit_byte(0x0F);
5333   emit_byte(0xC1);
5334   emit_operand(src, dst);
5335 }
5336 
5337 void Assembler::xchgq(Register dst, Address src) {
5338   InstructionMark im(this);
5339   prefixq(src, dst);
5340   emit_byte(0x87);
5341   emit_operand(dst, src);
5342 }
5343 
5344 void Assembler::xchgq(Register dst, Register src) {
5345   int encode = prefixq_and_encode(dst->encoding(), src->encoding());
5346   emit_byte(0x87);
5347   emit_byte(0xc0 | encode);
5348 }
5349 
5350 void Assembler::xorq(Register dst, Register src) {
5351   (void) prefixq_and_encode(dst->encoding(), src->encoding());
5352   emit_arith(0x33, 0xC0, dst, src);
5353 }
5354 
5355 void Assembler::xorq(Register dst, Address src) {
5356   InstructionMark im(this);
5357   prefixq(src, dst);
5358   emit_byte(0x33);
5359   emit_operand(dst, src);
5360 }
5361 
5362 #endif // _LP64
5363 
5364 static Assembler::Condition reverse[] = {
5365     Assembler::noOverflow     /* overflow      = 0x0 */ ,
5366     Assembler::overflow       /* noOverflow    = 0x1 */ ,
5367     Assembler::aboveEqual     /* carrySet      = 0x2, below         = 0x2 */ ,
5368     Assembler::below          /* aboveEqual    = 0x3, carryClear    = 0x3 */ ,
5369     Assembler::notZero        /* zero          = 0x4, equal         = 0x4 */ ,
5370     Assembler::zero           /* notZero       = 0x5, notEqual      = 0x5 */ ,
5371     Assembler::above          /* belowEqual    = 0x6 */ ,
5372     Assembler::belowEqual     /* above         = 0x7 */ ,
5373     Assembler::positive       /* negative      = 0x8 */ ,
5374     Assembler::negative       /* positive      = 0x9 */ ,
5375     Assembler::noParity       /* parity        = 0xa */ ,
5376     Assembler::parity         /* noParity      = 0xb */ ,
5377     Assembler::greaterEqual   /* less          = 0xc */ ,
5378     Assembler::less           /* greaterEqual  = 0xd */ ,
5379     Assembler::greater        /* lessEqual     = 0xe */ ,
5380     Assembler::lessEqual      /* greater       = 0xf */
5381 
5382 };
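     // For illustration: indexing with a condition yields its negation,
     // e.g. reverse[Assembler::zero] == Assembler::notZero, so a branch can
     // be inverted by writing jcc(reverse[cc], label).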
5383 
5384 
5385 // Implementation of MacroAssembler
5386 
5387 // First, all the versions that have distinct implementations depending on
5388 // 32/64 bit, unless the difference is trivial (a line or so).
5389 
5390 #ifndef _LP64
5391 
5392 // 32bit versions
5393 
5394 Address MacroAssembler::as_Address(AddressLiteral adr) {
5395   return Address(adr.target(), adr.rspec());
5396 }
5397 
5398 Address MacroAssembler::as_Address(ArrayAddress adr) {
5399   return Address::make_array(adr);
5400 }
5401 
5402 int MacroAssembler::biased_locking_enter(Register lock_reg,
5403                                          Register obj_reg,
5404                                          Register swap_reg,
5405                                          Register tmp_reg,
5406                                          bool swap_reg_contains_mark,
5407                                          Label& done,
5408                                          Label* slow_case,
5409                                          BiasedLockingCounters* counters) {
5410   assert(UseBiasedLocking, "why call this otherwise?");
5411   assert(swap_reg == rax, "swap_reg must be rax for cmpxchg");
5412   assert_different_registers(lock_reg, obj_reg, swap_reg);
5413 
5414   if (PrintBiasedLockingStatistics && counters == NULL)
5415     counters = BiasedLocking::counters();
5416 
5417   bool need_tmp_reg = false;
5418   if (tmp_reg == noreg) {
5419     need_tmp_reg = true;
5420     tmp_reg = lock_reg;
5421   } else {
5422     assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg);
5423   }
5424   assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
5425   Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
5426   Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
5427   Address saved_mark_addr(lock_reg, 0);
5428 
5429   // Biased locking
5430   // See whether the lock is currently biased toward our thread and
5431   // whether the epoch is still valid
5432   // Note that the runtime guarantees sufficient alignment of JavaThread
5433   // pointers to allow age to be placed into low bits
5434   // First check to see whether biasing is even enabled for this object
5435   Label cas_label;
5436   int null_check_offset = -1;
5437   if (!swap_reg_contains_mark) {
5438     null_check_offset = offset();
5439     movl(swap_reg, mark_addr);
5440   }
5441   if (need_tmp_reg) {
5442     push(tmp_reg);
5443   }
5444   movl(tmp_reg, swap_reg);
5445   andl(tmp_reg, markOopDesc::biased_lock_mask_in_place);
5446   cmpl(tmp_reg, markOopDesc::biased_lock_pattern);
5447   if (need_tmp_reg) {
5448     pop(tmp_reg);
5449   }
5450   jcc(Assembler::notEqual, cas_label);
5451   // The bias pattern is present in the object's header. Need to check
5452   // whether the bias owner and the epoch are both still current.
5453   // Note that because there is no current thread register on x86 we
5454   // need to store off the mark word we read out of the object to
5455   // avoid reloading it and needing to recheck invariants below. This
5456   // store is unfortunate but it makes the overall code shorter and
5457   // simpler.
5458   movl(saved_mark_addr, swap_reg);
5459   if (need_tmp_reg) {
5460     push(tmp_reg);
5461   }
5462   get_thread(tmp_reg);
5463   xorl(swap_reg, tmp_reg);
5464   if (swap_reg_contains_mark) {
5465     null_check_offset = offset();
5466   }
5467   movl(tmp_reg, klass_addr);
5468   xorl(swap_reg, Address(tmp_reg, Klass::prototype_header_offset()));
5469   andl(swap_reg, ~((int) markOopDesc::age_mask_in_place));
5470   if (need_tmp_reg) {
5471     pop(tmp_reg);
5472   }
5473   if (counters != NULL) {
5474     cond_inc32(Assembler::zero,
5475                ExternalAddress((address)counters->biased_lock_entry_count_addr()));
5476   }
5477   jcc(Assembler::equal, done);
5478 
5479   Label try_revoke_bias;
5480   Label try_rebias;
5481 
5482   // At this point we know that the header has the bias pattern and
5483   // that we are not the bias owner in the current epoch. We need to
5484   // figure out more details about the state of the header in order to
5485   // know what operations can be legally performed on the object's
5486   // header.
5487 
5488   // If the low three bits in the xor result aren't clear, that means
5489   // the prototype header is no longer biased and we have to revoke
5490   // the bias on this object.
5491   testl(swap_reg, markOopDesc::biased_lock_mask_in_place);
5492   jcc(Assembler::notZero, try_revoke_bias);
5493 
5494   // Biasing is still enabled for this data type. See whether the
5495   // epoch of the current bias is still valid, meaning that the epoch
5496   // bits of the mark word are equal to the epoch bits of the
5497   // prototype header. (Note that the prototype header's epoch bits
5498   // only change at a safepoint.) If not, attempt to rebias the object
5499   // toward the current thread. Note that we must be absolutely sure
5500   // that the current epoch is invalid in order to do this because
5501   // otherwise the manipulations it performs on the mark word are
5502   // illegal.
5503   testl(swap_reg, markOopDesc::epoch_mask_in_place);
5504   jcc(Assembler::notZero, try_rebias);
5505 
5506   // The epoch of the current bias is still valid but we know nothing
5507   // about the owner; it might be set or it might be clear. Try to
5508   // acquire the bias of the object using an atomic operation. If this
5509   // fails we will go in to the runtime to revoke the object's bias.
5510   // Note that we first construct the presumed unbiased header so we
5511   // don't accidentally blow away another thread's valid bias.
5512   movl(swap_reg, saved_mark_addr);
5513   andl(swap_reg,
5514        markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
5515   if (need_tmp_reg) {
5516     push(tmp_reg);
5517   }
5518   get_thread(tmp_reg);
5519   orl(tmp_reg, swap_reg);
5520   if (os::is_MP()) {
5521     lock();
5522   }
5523   cmpxchgptr(tmp_reg, Address(obj_reg, 0));
5524   if (need_tmp_reg) {
5525     pop(tmp_reg);
5526   }
5527   // If the biasing toward our thread failed, this means that
5528   // another thread succeeded in biasing it toward itself and we
5529   // need to revoke that bias. The revocation will occur in the
5530   // interpreter runtime in the slow case.
5531   if (counters != NULL) {
5532     cond_inc32(Assembler::zero,
5533                ExternalAddress((address)counters->anonymously_biased_lock_entry_count_addr()));
5534   }
5535   if (slow_case != NULL) {
5536     jcc(Assembler::notZero, *slow_case);
5537   }
5538   jmp(done);
5539 
5540   bind(try_rebias);
5541   // At this point we know the epoch has expired, meaning that the
5542   // current "bias owner", if any, is actually invalid. Under these
5543   // circumstances _only_, we are allowed to use the current header's
5544   // value as the comparison value when doing the cas to acquire the
5545   // bias in the current epoch. In other words, we allow transfer of
5546   // the bias from one thread to another directly in this situation.
5547   //
5548   // FIXME: due to a lack of registers we currently blow away the age
5549   // bits in this situation. Should attempt to preserve them.
5550   if (need_tmp_reg) {
5551     push(tmp_reg);
5552   }
5553   get_thread(tmp_reg);
5554   movl(swap_reg, klass_addr);
5555   orl(tmp_reg, Address(swap_reg, Klass::prototype_header_offset()));
5556   movl(swap_reg, saved_mark_addr);
5557   if (os::is_MP()) {
5558     lock();
5559   }
5560   cmpxchgptr(tmp_reg, Address(obj_reg, 0));
5561   if (need_tmp_reg) {
5562     pop(tmp_reg);
5563   }
5564   // If the biasing toward our thread failed, then another thread
5565   // succeeded in biasing it toward itself and we need to revoke that
5566   // bias. The revocation will occur in the runtime in the slow case.
5567   if (counters != NULL) {
5568     cond_inc32(Assembler::zero,
5569                ExternalAddress((address)counters->rebiased_lock_entry_count_addr()));
5570   }
5571   if (slow_case != NULL) {
5572     jcc(Assembler::notZero, *slow_case);
5573   }
5574   jmp(done);
5575 
5576   bind(try_revoke_bias);
5577   // The prototype mark in the klass doesn't have the bias bit set any
5578   // more, indicating that objects of this data type are not supposed
5579   // to be biased any more. We are going to try to reset the mark of
5580   // this object to the prototype value and fall through to the
5581   // CAS-based locking scheme. Note that if our CAS fails, it means
5582   // that another thread raced us for the privilege of revoking the
5583   // bias of this particular object, so it's okay to continue in the
5584   // normal locking code.
5585   //
5586   // FIXME: due to a lack of registers we currently blow away the age
5587   // bits in this situation. Should attempt to preserve them.
5588   movl(swap_reg, saved_mark_addr);
5589   if (need_tmp_reg) {
5590     push(tmp_reg);
5591   }
5592   movl(tmp_reg, klass_addr);
5593   movl(tmp_reg, Address(tmp_reg, Klass::prototype_header_offset()));
5594   if (os::is_MP()) {
5595     lock();
5596   }
5597   cmpxchgptr(tmp_reg, Address(obj_reg, 0));
5598   if (need_tmp_reg) {
5599     pop(tmp_reg);
5600   }
5601   // Fall through to the normal CAS-based lock, because no matter what
5602   // the result of the above CAS, some thread must have succeeded in
5603   // removing the bias bit from the object's header.
5604   if (counters != NULL) {
5605     cond_inc32(Assembler::zero,
5606                ExternalAddress((address)counters->revoked_lock_entry_count_addr()));
5607   }
5608 
5609   bind(cas_label);
5610 
5611   return null_check_offset;
5612 }

5613 void MacroAssembler::call_VM_leaf_base(address entry_point,
5614                                        int number_of_arguments) {
5615   call(RuntimeAddress(entry_point));
5616   increment(rsp, number_of_arguments * wordSize);
5617 }
5618 
5619 void MacroAssembler::cmpklass(Address src1, Metadata* obj) {
5620   cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
5621 }
5622 
5623 void MacroAssembler::cmpklass(Register src1, Metadata* obj) {
5624   cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
5625 }
5626 
5627 void MacroAssembler::cmpoop(Address src1, jobject obj) {
5628   cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
5629 }
5630 
5631 void MacroAssembler::cmpoop(Register src1, jobject obj) {
5632   cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
5633 }
5634 
5635 void MacroAssembler::extend_sign(Register hi, Register lo) {
5636   // According to Intel Doc. AP-526, "Integer Divide", p.18.
5637   if (VM_Version::is_P6() && hi == rdx && lo == rax) {
5638     cdql();
5639   } else {
5640     movl(hi, lo);
5641     sarl(hi, 31);
5642   }
5643 }
5644 
5645 void MacroAssembler::jC2(Register tmp, Label& L) {
5646   // set parity bit if FPU flag C2 is set (via rax)
5647   save_rax(tmp);
5648   fwait(); fnstsw_ax();
5649   sahf();
5650   restore_rax(tmp);
5651   // branch
5652   jcc(Assembler::parity, L);
5653 }
5654 
5655 void MacroAssembler::jnC2(Register tmp, Label& L) {
5656   // set parity bit if FPU flag C2 is set (via rax)
5657   save_rax(tmp);
5658   fwait(); fnstsw_ax();
5659   sahf();
5660   restore_rax(tmp);
5661   // branch
5662   jcc(Assembler::noParity, L);
5663 }
5664 
5665 // 32bit can do a case table jump in one instruction but we no longer allow the base
5666 // to be installed in the Address class
5667 void MacroAssembler::jump(ArrayAddress entry) {
5668   jmp(as_Address(entry));
5669 }
5670 
5671 // Note: y_lo will be destroyed
5672 void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
5673   // Long compare for Java (semantics as described in JVM spec.)
5674   Label high, low, done;
5675 
5676   cmpl(x_hi, y_hi);
5677   jcc(Assembler::less, low);
5678   jcc(Assembler::greater, high);
5679   // x_hi is the return register
5680   xorl(x_hi, x_hi);
5681   cmpl(x_lo, y_lo);
5682   jcc(Assembler::below, low);
5683   jcc(Assembler::equal, done);
5684 
5685   bind(high);
5686   xorl(x_hi, x_hi);
5687   increment(x_hi);
5688   jmp(done);
5689 
5690   bind(low);
5691   xorl(x_hi, x_hi);
5692   decrementl(x_hi);
5693 
5694   bind(done);
5695 }
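     // For illustration: this implements Java's lcmp, leaving 1 in x_hi if
     // x > y, -1 if x < y and 0 if equal; the low words are compared
     // unsigned once the high words tie.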
5696 
5697 void MacroAssembler::lea(Register dst, AddressLiteral src) {
5698     mov_literal32(dst, (int32_t)src.target(), src.rspec());
5699 }
5700 
5701 void MacroAssembler::lea(Address dst, AddressLiteral adr) {
5702   // leal(dst, as_Address(adr));
5703   // see note in movl as to why we must use a move
5704   mov_literal32(dst, (int32_t) adr.target(), adr.rspec());
5705 }
5706 
5707 void MacroAssembler::leave() {
5708   mov(rsp, rbp);
5709   pop(rbp);
5710 }
5711 
5712 void MacroAssembler::lmul(int x_rsp_offset, int y_rsp_offset) {
5713   // Multiplication of two Java long values stored on the stack
5714   // as illustrated below. Result is in rdx:rax.
5715   //
5716   // rsp ---> [  ??  ] \               \
5717   //            ....    | y_rsp_offset  |
5718   //          [ y_lo ] /  (in bytes)    | x_rsp_offset
5719   //          [ y_hi ]                  | (in bytes)
5720   //            ....                    |
5721   //          [ x_lo ]                 /
5722   //          [ x_hi ]
5723   //            ....
5724   //
5725   // Basic idea: lo(result) = lo(x_lo * y_lo)
5726   //             hi(result) = hi(x_lo * y_lo) + lo(x_hi * y_lo) + lo(x_lo * y_hi)
5727   Address x_hi(rsp, x_rsp_offset + wordSize); Address x_lo(rsp, x_rsp_offset);
5728   Address y_hi(rsp, y_rsp_offset + wordSize); Address y_lo(rsp, y_rsp_offset);
5729   Label quick;
5730   // load x_hi, y_hi and check if quick
5731   // multiplication is possible
5732   movl(rbx, x_hi);
5733   movl(rcx, y_hi);
5734   movl(rax, rbx);
5735   orl(rbx, rcx);                                 // rbx, = 0 <=> x_hi = 0 and y_hi = 0
5736   jcc(Assembler::zero, quick);                   // if rbx, = 0 do quick multiply
5737   // do full multiplication
5738   // 1st step
5739   mull(y_lo);                                    // x_hi * y_lo
5740   movl(rbx, rax);                                // save lo(x_hi * y_lo) in rbx,
5741   // 2nd step
5742   movl(rax, x_lo);
5743   mull(rcx);                                     // x_lo * y_hi
5744   addl(rbx, rax);                                // add lo(x_lo * y_hi) to rbx,
5745   // 3rd step
5746   bind(quick);                                   // note: rbx, = 0 if quick multiply!
5747   movl(rax, x_lo);
5748   mull(y_lo);                                    // x_lo * y_lo
5749   addl(rdx, rbx);                                // correct hi(x_lo * y_lo)
5750 }
5751 
5752 void MacroAssembler::lneg(Register hi, Register lo) {
5753   negl(lo);
5754   adcl(hi, 0);
5755   negl(hi);
5756 }
5757 
5758 void MacroAssembler::lshl(Register hi, Register lo) {
5759   // Java shift left long support (semantics as described in JVM spec., p.305)
5760   // (basic idea for shift counts s >= n: x << s == (x << n) << (s - n))
5761   // shift value is in rcx !
5762   assert(hi != rcx, "must not use rcx");
5763   assert(lo != rcx, "must not use rcx");
5764   const Register s = rcx;                        // shift count
5765   const int      n = BitsPerWord;
5766   Label L;
5767   andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
5768   cmpl(s, n);                                    // if (s < n)
5769   jcc(Assembler::less, L);                       // else (s >= n)
5770   movl(hi, lo);                                  // x := x << n
5771   xorl(lo, lo);
5772   // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
5773   bind(L);                                       // s (mod n) < n
5774   shldl(hi, lo);                                 // x := x << s
5775   shll(lo);
5776 }
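     // Worked example (illustrative): for a shift count of 40, the s >= n
     // path does hi := lo, lo := 0 (a shift by 32), and the shld/shl pair
     // then shifts by 40 mod 32 == 8 more, since hardware masks cl to 5 bits.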
5777 
5778 
5779 void MacroAssembler::lshr(Register hi, Register lo, bool sign_extension) {
5780   // Java shift right long support (semantics as described in JVM spec., p.306 & p.310)
5781   // (basic idea for shift counts s >= n: x >> s == (x >> n) >> (s - n))
5782   assert(hi != rcx, "must not use rcx");
5783   assert(lo != rcx, "must not use rcx");
5784   const Register s = rcx;                        // shift count
5785   const int      n = BitsPerWord;
5786   Label L;
5787   andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
5788   cmpl(s, n);                                    // if (s < n)
5789   jcc(Assembler::less, L);                       // else (s >= n)
5790   movl(lo, hi);                                  // x := x >> n
5791   if (sign_extension) sarl(hi, 31);
5792   else                xorl(hi, hi);
5793   // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
5794   bind(L);                                       // s (mod n) < n
5795   shrdl(lo, hi);                                 // x := x >> s
5796   if (sign_extension) sarl(hi);
5797   else                shrl(hi);
5798 }
5799 
5800 void MacroAssembler::movoop(Register dst, jobject obj) {
5801   mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
5802 }
5803 
5804 void MacroAssembler::movoop(Address dst, jobject obj) {
5805   mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
5806 }
5807 
5808 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
5809   mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
5810 }
5811 
5812 void MacroAssembler::mov_metadata(Address dst, Metadata* obj) {
5813   mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
5814 }
5815 
5816 void MacroAssembler::movptr(Register dst, AddressLiteral src) {
5817   if (src.is_lval()) {
5818     mov_literal32(dst, (intptr_t)src.target(), src.rspec());
5819   } else {
5820     movl(dst, as_Address(src));
5821   }
5822 }
5823 
5824 void MacroAssembler::movptr(ArrayAddress dst, Register src) {
5825   movl(as_Address(dst), src);
5826 }
5827 
5828 void MacroAssembler::movptr(Register dst, ArrayAddress src) {
5829   movl(dst, as_Address(src));
5830 }
5831 
5832 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
5833 void MacroAssembler::movptr(Address dst, intptr_t src) {
5834   movl(dst, src);
5835 }
5836 
5837 
5838 void MacroAssembler::pop_callee_saved_registers() {
5839   pop(rcx);
5840   pop(rdx);
5841   pop(rdi);
5842   pop(rsi);
5843 }
5844 
5845 void MacroAssembler::pop_fTOS() {
5846   fld_d(Address(rsp, 0));
5847   addl(rsp, 2 * wordSize);
5848 }
5849 
5850 void MacroAssembler::push_callee_saved_registers() {
5851   push(rsi);
5852   push(rdi);
5853   push(rdx);
5854   push(rcx);
5855 }
5856 
5857 void MacroAssembler::push_fTOS() {
5858   subl(rsp, 2 * wordSize);
5859   fstp_d(Address(rsp, 0));
5860 }
5861 
5862 
5863 void MacroAssembler::pushoop(jobject obj) {
5864   push_literal32((int32_t)obj, oop_Relocation::spec_for_immediate());
5865 }
5866 
5867 void MacroAssembler::pushklass(Metadata* obj) {
5868   push_literal32((int32_t)obj, metadata_Relocation::spec_for_immediate());
5869 }
5870 
5871 void MacroAssembler::pushptr(AddressLiteral src) {
5872   if (src.is_lval()) {
5873     push_literal32((int32_t)src.target(), src.rspec());
5874   } else {
5875     pushl(as_Address(src));
5876   }
5877 }
5878 
5879 void MacroAssembler::set_word_if_not_zero(Register dst) {
5880   xorl(dst, dst);
5881   set_byte_if_not_zero(dst);
5882 }
5883 
5884 static void pass_arg0(MacroAssembler* masm, Register arg) {
5885   masm->push(arg);
5886 }
5887 
5888 static void pass_arg1(MacroAssembler* masm, Register arg) {
5889   masm->push(arg);
5890 }
5891 
5892 static void pass_arg2(MacroAssembler* masm, Register arg) {
5893   masm->push(arg);
5894 }
5895 
5896 static void pass_arg3(MacroAssembler* masm, Register arg) {
5897   masm->push(arg);
5898 }
5899 
5900 #ifndef PRODUCT
5901 extern "C" void findpc(intptr_t x);
5902 #endif
5903 
5904 void MacroAssembler::debug32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip, char* msg) {
5905   // In order to get locks to work, we need to fake an in_VM state
5906   JavaThread* thread = JavaThread::current();
5907   JavaThreadState saved_state = thread->thread_state();
5908   thread->set_thread_state(_thread_in_vm);
5909   if (ShowMessageBoxOnError) {
5913     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
5914       ttyLocker ttyl;
5915       BytecodeCounter::print();
5916     }
5917     // To see where a verify_oop failed, get $ebx+40/X for this frame.
5918     // This is the value of eip which points to where verify_oop will return.
5919     if (os::message_box(msg, "Execution stopped, print registers?")) {
5920       print_state32(rdi, rsi, rbp, rsp, rbx, rdx, rcx, rax, eip);
5921       BREAKPOINT;
5922     }
5923   } else {
5924     ttyLocker ttyl;
5925     ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n", msg);
5926   }
5927   // Don't assert holding the ttyLock
5928   assert(false, err_msg("DEBUG MESSAGE: %s", msg));
5929   ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
5930 }
5931 
5932 void MacroAssembler::print_state32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip) {
5933   ttyLocker ttyl;
5934   FlagSetting fs(Debugging, true);
5935   tty->print_cr("eip = 0x%08x", eip);
5936 #ifndef PRODUCT
5937   if ((WizardMode || Verbose) && PrintMiscellaneous) {
5938     tty->cr();
5939     findpc(eip);
5940     tty->cr();
5941   }
5942 #endif
5943 #define PRINT_REG(rax) \
5944   { tty->print("%s = ", #rax); os::print_location(tty, rax); }
5945   PRINT_REG(rax);
5946   PRINT_REG(rbx);
5947   PRINT_REG(rcx);
5948   PRINT_REG(rdx);
5949   PRINT_REG(rdi);
5950   PRINT_REG(rsi);
5951   PRINT_REG(rbp);
5952   PRINT_REG(rsp);
5953 #undef PRINT_REG
5954   // Print some words near the top of the stack.
5955   int* dump_sp = (int*) rsp;
5956   for (int col1 = 0; col1 < 8; col1++) {
5957     tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
5958     os::print_location(tty, *dump_sp++);
5959   }
5960   for (int row = 0; row < 16; row++) {
5961     tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
5962     for (int col = 0; col < 8; col++) {
5963       tty->print(" 0x%08x", *dump_sp++);
5964     }
5965     tty->cr();
5966   }
5967   // Print some instructions around pc:
5968   Disassembler::decode((address)eip-64, (address)eip);
5969   tty->print_cr("--------");
5970   Disassembler::decode((address)eip, (address)eip+32);
5971 }
5972 
5973 void MacroAssembler::stop(const char* msg) {
5974   ExternalAddress message((address)msg);
5975   // push address of message
5976   pushptr(message.addr());
5977   { Label L; call(L, relocInfo::none); bind(L); }     // push eip
5978   pusha();                                            // push registers
5979   call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug32)));
5980   hlt();
5981 }
5982 
5983 void MacroAssembler::warn(const char* msg) {
5984   push_CPU_state();
5985 
5986   ExternalAddress message((address) msg);
5987   // push address of message
5988   pushptr(message.addr());
5989 
5990   call(RuntimeAddress(CAST_FROM_FN_PTR(address, warning)));
5991   addl(rsp, wordSize);       // discard argument
5992   pop_CPU_state();
5993 }
5994 
5995 void MacroAssembler::print_state() {
5996   { Label L; call(L, relocInfo::none); bind(L); }     // push eip
5997   pusha();                                            // push registers
5998 
5999   push_CPU_state();
6000   call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::print_state32)));
6001   pop_CPU_state();
6002 
6003   popa();
6004   addl(rsp, wordSize);
6005 }
6006 
6007 #else // _LP64
6008 
6009 // 64 bit versions
6010 
6011 Address MacroAssembler::as_Address(AddressLiteral adr) {
6012   // amd64 always does this as a pc-relative address; we can be absolute
6013   // or displacement-based depending on the instruction type: jmp/call use
6014   // displacements, others are absolute.
6015   assert(!adr.is_lval(), "must be rval");
6016   assert(reachable(adr), "must be");
6017   return Address((int32_t)(intptr_t)(adr.target() - pc()), adr.target(), adr.reloc());
6019 }
6020 
6021 Address MacroAssembler::as_Address(ArrayAddress adr) {
6022   AddressLiteral base = adr.base();
6023   lea(rscratch1, base);
6024   Address index = adr.index();
6025   assert(index._disp == 0, "must not have disp"); // maybe it can?
6026   Address array(rscratch1, index._index, index._scale, index._disp);
6027   return array;
6028 }
6029 
6030 int MacroAssembler::biased_locking_enter(Register lock_reg,
6031                                          Register obj_reg,
6032                                          Register swap_reg,
6033                                          Register tmp_reg,
6034                                          bool swap_reg_contains_mark,
6035                                          Label& done,
6036                                          Label* slow_case,
6037                                          BiasedLockingCounters* counters) {
6038   assert(UseBiasedLocking, "why call this otherwise?");
6039   assert(swap_reg == rax, "swap_reg must be rax for cmpxchgq");
6040   assert(tmp_reg != noreg, "tmp_reg must be supplied");
6041   assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg);
6042   assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
6043   Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
6044   Address saved_mark_addr(lock_reg, 0);
6045 
6046   if (PrintBiasedLockingStatistics && counters == NULL)
6047     counters = BiasedLocking::counters();
6048 
6049   // Biased locking
6050   // See whether the lock is currently biased toward our thread and
6051   // whether the epoch is still valid
6052   // Note that the runtime guarantees sufficient alignment of JavaThread
6053   // pointers to allow age to be placed into low bits
6054   // First check to see whether biasing is even enabled for this object
6055   Label cas_label;
6056   int null_check_offset = -1;
6057   if (!swap_reg_contains_mark) {
6058     null_check_offset = offset();
6059     movq(swap_reg, mark_addr);
6060   }
6061   movq(tmp_reg, swap_reg);
6062   andq(tmp_reg, markOopDesc::biased_lock_mask_in_place);
6063   cmpq(tmp_reg, markOopDesc::biased_lock_pattern);
6064   jcc(Assembler::notEqual, cas_label);
6065   // The bias pattern is present in the object's header. Need to check
6066   // whether the bias owner and the epoch are both still current.
6067   load_prototype_header(tmp_reg, obj_reg);
6068   orq(tmp_reg, r15_thread);
6069   xorq(tmp_reg, swap_reg);
6070   andq(tmp_reg, ~((int) markOopDesc::age_mask_in_place));
6071   if (counters != NULL) {
6072     cond_inc32(Assembler::zero,
6073                ExternalAddress((address) counters->biased_lock_entry_count_addr()));
6074   }
6075   jcc(Assembler::equal, done);
6076 
6077   Label try_revoke_bias;
6078   Label try_rebias;
6079 
6080   // At this point we know that the header has the bias pattern and
6081   // that we are not the bias owner in the current epoch. We need to
6082   // figure out more details about the state of the header in order to
6083   // know what operations can be legally performed on the object's
6084   // header.
6085 
6086   // If the low three bits in the xor result aren't clear, that means
6087   // the prototype header is no longer biased and we have to revoke
6088   // the bias on this object.
6089   testq(tmp_reg, markOopDesc::biased_lock_mask_in_place);
6090   jcc(Assembler::notZero, try_revoke_bias);
6091 
6092   // Biasing is still enabled for this data type. See whether the
6093   // epoch of the current bias is still valid, meaning that the epoch
6094   // bits of the mark word are equal to the epoch bits of the
6095   // prototype header. (Note that the prototype header's epoch bits
6096   // only change at a safepoint.) If not, attempt to rebias the object
6097   // toward the current thread. Note that we must be absolutely sure
6098   // that the current epoch is invalid in order to do this because
6099   // otherwise the manipulations it performs on the mark word are
6100   // illegal.
6101   testq(tmp_reg, markOopDesc::epoch_mask_in_place);
6102   jcc(Assembler::notZero, try_rebias);
6103 
6104   // The epoch of the current bias is still valid but we know nothing
6105   // about the owner; it might be set or it might be clear. Try to
6106   // acquire the bias of the object using an atomic operation. If this
6107   // fails we will go in to the runtime to revoke the object's bias.
6108   // Note that we first construct the presumed unbiased header so we
6109   // don't accidentally blow away another thread's valid bias.
6110   andq(swap_reg,
6111        markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
6112   movq(tmp_reg, swap_reg);
6113   orq(tmp_reg, r15_thread);
6114   if (os::is_MP()) {
6115     lock();
6116   }
6117   cmpxchgq(tmp_reg, Address(obj_reg, 0));
6118   // If the biasing toward our thread failed, this means that
6119   // another thread succeeded in biasing it toward itself and we
6120   // need to revoke that bias. The revocation will occur in the
6121   // interpreter runtime in the slow case.
6122   if (counters != NULL) {
6123     cond_inc32(Assembler::zero,
6124                ExternalAddress((address) counters->anonymously_biased_lock_entry_count_addr()));
6125   }
6126   if (slow_case != NULL) {
6127     jcc(Assembler::notZero, *slow_case);
6128   }
6129   jmp(done);
6130 
6131   bind(try_rebias);
6132   // At this point we know the epoch has expired, meaning that the
6133   // current "bias owner", if any, is actually invalid. Under these
6134   // circumstances _only_, we are allowed to use the current header's
6135   // value as the comparison value when doing the cas to acquire the
6136   // bias in the current epoch. In other words, we allow transfer of
6137   // the bias from one thread to another directly in this situation.
6138   //
6139   // FIXME: due to a lack of registers we currently blow away the age
6140   // bits in this situation. Should attempt to preserve them.
6141   load_prototype_header(tmp_reg, obj_reg);
6142   orq(tmp_reg, r15_thread);
6143   if (os::is_MP()) {
6144     lock();
6145   }
6146   cmpxchgq(tmp_reg, Address(obj_reg, 0));
6147   // If the biasing toward our thread failed, then another thread
6148   // succeeded in biasing it toward itself and we need to revoke that
6149   // bias. The revocation will occur in the runtime in the slow case.
6150   if (counters != NULL) {
6151     cond_inc32(Assembler::zero,
6152                ExternalAddress((address) counters->rebiased_lock_entry_count_addr()));
6153   }
6154   if (slow_case != NULL) {
6155     jcc(Assembler::notZero, *slow_case);
6156   }
6157   jmp(done);
6158 
6159   bind(try_revoke_bias);
6160   // The prototype mark in the klass doesn't have the bias bit set any
6161   // more, indicating that objects of this data type are not supposed
6162   // to be biased any more. We are going to try to reset the mark of
6163   // this object to the prototype value and fall through to the
6164   // CAS-based locking scheme. Note that if our CAS fails, it means
6165   // that another thread raced us for the privilege of revoking the
6166   // bias of this particular object, so it's okay to continue in the
6167   // normal locking code.
6168   //
6169   // FIXME: due to a lack of registers we currently blow away the age
6170   // bits in this situation. Should attempt to preserve them.
6171   load_prototype_header(tmp_reg, obj_reg);
6172   if (os::is_MP()) {
6173     lock();
6174   }
6175   cmpxchgq(tmp_reg, Address(obj_reg, 0));
6176   // Fall through to the normal CAS-based lock, because no matter what
6177   // the result of the above CAS, some thread must have succeeded in
6178   // removing the bias bit from the object's header.
6179   if (counters != NULL) {
6180     cond_inc32(Assembler::zero,
6181                ExternalAddress((address) counters->revoked_lock_entry_count_addr()));
6182   }
6183 
6184   bind(cas_label);
6185 
6186   return null_check_offset;
6187 }
6188 
6189 void MacroAssembler::call_VM_leaf_base(address entry_point, int num_args) {
6190   Label L, E;
6191 
6192 #ifdef _WIN64
6193   // Windows always allocates space for its register args
6194   assert(num_args <= 4, "only register arguments supported");
6195   subq(rsp,  frame::arg_reg_save_area_bytes);
6196 #endif
6197 
6198   // Align stack if necessary
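       // Stack slots are 8 bytes, so rsp & 15 is either 0 or 8 here; when it
       // is 8, the single subq(rsp, 8) below restores the 16-byte alignment
       // the ABI requires at the call instruction.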
6199   testl(rsp, 15);
6200   jcc(Assembler::zero, L);
6201 
6202   subq(rsp, 8);
6203   {
6204     call(RuntimeAddress(entry_point));
6205   }
6206   addq(rsp, 8);
6207   jmp(E);
6208 
6209   bind(L);
6210   {
6211     call(RuntimeAddress(entry_point));
6212   }
6213 
6214   bind(E);
6215 
6216 #ifdef _WIN64
6217   // restore stack pointer
6218   addq(rsp, frame::arg_reg_save_area_bytes);
6219 #endif
6220 
6221 }
6222 
6223 void MacroAssembler::cmp64(Register src1, AddressLiteral src2) {
6224   assert(!src2.is_lval(), "should use cmpptr");
6225 
6226   if (reachable(src2)) {
6227     cmpq(src1, as_Address(src2));
6228   } else {
6229     lea(rscratch1, src2);
6230     Assembler::cmpq(src1, Address(rscratch1, 0));
6231   }
6232 }
6233 
6234 int MacroAssembler::corrected_idivq(Register reg) {
6235   // Full implementation of Java ldiv and lrem; checks for special
6236   // case as described in JVM spec., p.243 & p.271.  The function
6237   // returns the (pc) offset of the idivl instruction - may be needed
6238   // for implicit exceptions.
6239   //
6240   //         normal case                           special case
6241   //
6242   // input : rax: dividend                         min_long
6243   //         reg: divisor   (may not be eax/edx)   -1
6244   //
6245   // output: rax: quotient  (= rax idiv reg)       min_long
6246   //         rdx: remainder (= rax irem reg)       0
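       //         (Java defines min_long / -1 == min_long with remainder 0;
       //         the idivq instruction itself would fault on that operand pair.)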
6247   assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register");
6248   static const int64_t min_long = 0x8000000000000000;
6249   Label normal_case, special_case;
6250 
6251   // check for special case
6252   cmp64(rax, ExternalAddress((address) &min_long));
6253   jcc(Assembler::notEqual, normal_case);
6254   xorl(rdx, rdx); // prepare rdx for possible special case (where
6255                   // remainder = 0)
6256   cmpq(reg, -1);
6257   jcc(Assembler::equal, special_case);
6258 
6259   // handle normal case
6260   bind(normal_case);
6261   cdqq();
6262   int idivq_offset = offset();
6263   idivq(reg);
6264 
6265   // normal and special case exit
6266   bind(special_case);
6267 
6268   return idivq_offset;
6269 }
6270 
6271 void MacroAssembler::decrementq(Register reg, int value) {
6272   if (value == min_jint) { subq(reg, value); return; }
6273   if (value <  0) { incrementq(reg, -value); return; }
6274   if (value == 0) {                        ; return; }
6275   if (value == 1 && UseIncDec) { decq(reg) ; return; }
6276   /* else */      { subq(reg, value)       ; return; }
6277 }
6278 
6279 void MacroAssembler::decrementq(Address dst, int value) {
6280   if (value == min_jint) { subq(dst, value); return; }
6281   if (value <  0) { incrementq(dst, -value); return; }
6282   if (value == 0) {                        ; return; }
6283   if (value == 1 && UseIncDec) { decq(dst) ; return; }
6284   /* else */      { subq(dst, value)       ; return; }
6285 }
6286 
6287 void MacroAssembler::incrementq(Register reg, int value) {
6288   if (value == min_jint) { addq(reg, value); return; }
6289   if (value <  0) { decrementq(reg, -value); return; }
6290   if (value == 0) {                        ; return; }
6291   if (value == 1 && UseIncDec) { incq(reg) ; return; }
6292   /* else */      { addq(reg, value)       ; return; }
6293 }
6294 
6295 void MacroAssembler::incrementq(Address dst, int value) {
6296   if (value == min_jint) { addq(dst, value); return; }
6297   if (value <  0) { decrementq(dst, -value); return; }
6298   if (value == 0) {                        ; return; }
6299   if (value == 1 && UseIncDec) { incq(dst) ; return; }
6300   /* else */      { addq(dst, value)       ; return; }
6301 }
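     // Note: min_jint is special-cased first in the four helpers above
     // because negating it overflows a 32-bit int, so it cannot take the
     // negate-and-forward path used for other negative values.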
6302 
6303 // 32bit can do a case table jump in one instruction but we no longer allow the base
6304 // to be installed in the Address class
6305 void MacroAssembler::jump(ArrayAddress entry) {
6306   lea(rscratch1, entry.base());
6307   Address dispatch = entry.index();
6308   assert(dispatch._base == noreg, "must be");
6309   dispatch._base = rscratch1;
6310   jmp(dispatch);
6311 }
6312 
6313 void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
6314   ShouldNotReachHere(); // 64bit doesn't use two regs
6315   cmpq(x_lo, y_lo);
6316 }
6317 
6318 void MacroAssembler::lea(Register dst, AddressLiteral src) {
6319     mov_literal64(dst, (intptr_t)src.target(), src.rspec());
6320 }
6321 
6322 void MacroAssembler::lea(Address dst, AddressLiteral adr) {
6323   mov_literal64(rscratch1, (intptr_t)adr.target(), adr.rspec());
6324   movptr(dst, rscratch1);
6325 }
6326 
6327 void MacroAssembler::leave() {
6328   // %%% is this really better? Why not on 32bit too?
6329   emit_byte(0xC9); // LEAVE
6330 }
6331 
6332 void MacroAssembler::lneg(Register hi, Register lo) {
6333   ShouldNotReachHere(); // 64bit doesn't use two regs
6334   negq(lo);
6335 }
6336 
6337 void MacroAssembler::movoop(Register dst, jobject obj) {
6338   mov_literal64(dst, (intptr_t)obj, oop_Relocation::spec_for_immediate());
6339 }
6340 
6341 void MacroAssembler::movoop(Address dst, jobject obj) {
6342   mov_literal64(rscratch1, (intptr_t)obj, oop_Relocation::spec_for_immediate());
6343   movq(dst, rscratch1);
6344 }
6345 
6346 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
6347   mov_literal64(dst, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
6348 }
6349 
6350 void MacroAssembler::mov_metadata(Address dst, Metadata* obj) {
6351   mov_literal64(rscratch1, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
6352   movq(dst, rscratch1);
6353 }
6354 
6355 void MacroAssembler::movptr(Register dst, AddressLiteral src) {
6356   if (src.is_lval()) {
6357     mov_literal64(dst, (intptr_t)src.target(), src.rspec());
6358   } else {
6359     if (reachable(src)) {
6360       movq(dst, as_Address(src));
6361     } else {
6362       lea(rscratch1, src);
6363       movq(dst, Address(rscratch1,0));
6364     }
6365   }
6366 }
6367 
6368 void MacroAssembler::movptr(ArrayAddress dst, Register src) {
6369   movq(as_Address(dst), src);
6370 }
6371 
6372 void MacroAssembler::movptr(Register dst, ArrayAddress src) {
6373   movq(dst, as_Address(src));
6374 }
6375 
6376 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
6377 void MacroAssembler::movptr(Address dst, intptr_t src) {
6378   mov64(rscratch1, src);
6379   movq(dst, rscratch1);
6380 }
6381 
6382 // These are mostly for initializing NULL
6383 void MacroAssembler::movptr(Address dst, int32_t src) {
6384   movslq(dst, src);
6385 }
6386 
6387 void MacroAssembler::movptr(Register dst, int32_t src) {
6388   mov64(dst, (intptr_t)src);
6389 }
6390 
6391 void MacroAssembler::pushoop(jobject obj) {
6392   movoop(rscratch1, obj);
6393   push(rscratch1);
6394 }
6395 
6396 void MacroAssembler::pushklass(Metadata* obj) {
6397   mov_metadata(rscratch1, obj);
6398   push(rscratch1);
6399 }
6400 
6401 void MacroAssembler::pushptr(AddressLiteral src) {
6402   lea(rscratch1, src);
6403   if (src.is_lval()) {
6404     push(rscratch1);
6405   } else {
6406     pushq(Address(rscratch1, 0));
6407   }
6408 }
6409 
6410 void MacroAssembler::reset_last_Java_frame(bool clear_fp,
6411                                            bool clear_pc) {
6412   // we must set sp to zero to clear frame
6413   movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
6414   // must clear fp, so that compiled frames are not confused; it is
6415   // possible that we need it only for debugging
6416   if (clear_fp) {
6417     movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
6418   }
6419 
6420   if (clear_pc) {
6421     movptr(Address(r15_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
6422   }
6423 }
6424 
6425 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
6426                                          Register last_java_fp,
6427                                          address  last_java_pc) {
6428   // determine last_java_sp register
6429   if (!last_java_sp->is_valid()) {
6430     last_java_sp = rsp;
6431   }
6432 
6433   // last_java_fp is optional
6434   if (last_java_fp->is_valid()) {
6435     movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()),
6436            last_java_fp);
6437   }
6438 
6439   // last_java_pc is optional
6440   if (last_java_pc != NULL) {
6441     Address java_pc(r15_thread,
6442                     JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset());
6443     lea(rscratch1, InternalAddress(last_java_pc));
6444     movptr(java_pc, rscratch1);
6445   }
6446 
6447   movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
6448 }
6449 
6450 static void pass_arg0(MacroAssembler* masm, Register arg) {
6451   if (c_rarg0 != arg ) {
6452     masm->mov(c_rarg0, arg);
6453   }
6454 }
6455 
6456 static void pass_arg1(MacroAssembler* masm, Register arg) {
6457   if (c_rarg1 != arg ) {
6458     masm->mov(c_rarg1, arg);
6459   }
6460 }
6461 
6462 static void pass_arg2(MacroAssembler* masm, Register arg) {
6463   if (c_rarg2 != arg ) {
6464     masm->mov(c_rarg2, arg);
6465   }
6466 }
6467 
6468 static void pass_arg3(MacroAssembler* masm, Register arg) {
6469   if (c_rarg3 != arg ) {
6470     masm->mov(c_rarg3, arg);
6471   }
6472 }
6473 
6474 void MacroAssembler::stop(const char* msg) {
6475   address rip = pc();
6476   pusha(); // get regs on stack
6477   lea(c_rarg0, ExternalAddress((address) msg));
6478   lea(c_rarg1, InternalAddress(rip));
6479   movq(c_rarg2, rsp); // pass pointer to regs array
6480   andq(rsp, -16); // align stack as required by ABI
6481   call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
6482   hlt();
6483 }
6484 
6485 void MacroAssembler::warn(const char* msg) {
6486   push(rbp);
6487   movq(rbp, rsp);
6488   andq(rsp, -16);     // align stack as required by push_CPU_state and call
6489   push_CPU_state();   // keeps alignment at 16 bytes
6490   lea(c_rarg0, ExternalAddress((address) msg));
6491   call_VM_leaf(CAST_FROM_FN_PTR(address, warning), c_rarg0);
6492   pop_CPU_state();
6493   mov(rsp, rbp);
6494   pop(rbp);
6495 }
6496 
6497 void MacroAssembler::print_state() {
6498   address rip = pc();
6499   pusha();            // get regs on stack
6500   push(rbp);
6501   movq(rbp, rsp);
6502   andq(rsp, -16);     // align stack as required by push_CPU_state and call
6503   push_CPU_state();   // keeps alignment at 16 bytes
6504 
6505   lea(c_rarg0, InternalAddress(rip));
6506   lea(c_rarg1, Address(rbp, wordSize)); // pass pointer to regs array
6507   call_VM_leaf(CAST_FROM_FN_PTR(address, MacroAssembler::print_state64), c_rarg0, c_rarg1);
6508 
6509   pop_CPU_state();
6510   mov(rsp, rbp);
6511   pop(rbp);
6512   popa();
6513 }
6514 
6515 #ifndef PRODUCT
6516 extern "C" void findpc(intptr_t x);
6517 #endif
6518 
6519 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[]) {
  // In order to get locks to work, we need to fake an in_VM state
6521   if (ShowMessageBoxOnError) {
6522     JavaThread* thread = JavaThread::current();
6523     JavaThreadState saved_state = thread->thread_state();
6524     thread->set_thread_state(_thread_in_vm);
6525 #ifndef PRODUCT
6526     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
6527       ttyLocker ttyl;
6528       BytecodeCounter::print();
6529     }
6530 #endif
6531     // To see where a verify_oop failed, get $ebx+40/X for this frame.
6532     // XXX correct this offset for amd64
6533     // This is the value of eip which points to where verify_oop will return.
6534     if (os::message_box(msg, "Execution stopped, print registers?")) {
6535       print_state64(pc, regs);
6536       BREAKPOINT;
6537       assert(false, "start up GDB");
6538     }
6539     ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
6540   } else {
6541     ttyLocker ttyl;
6542     ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n",
6543                     msg);
6544     assert(false, err_msg("DEBUG MESSAGE: %s", msg));
6545   }
6546 }
6547 
6548 void MacroAssembler::print_state64(int64_t pc, int64_t regs[]) {
6549   ttyLocker ttyl;
6550   FlagSetting fs(Debugging, true);
6551   tty->print_cr("rip = 0x%016lx", pc);
6552 #ifndef PRODUCT
6553   tty->cr();
6554   findpc(pc);
6555   tty->cr();
6556 #endif
6557 #define PRINT_REG(rax, value) \
6558   { tty->print("%s = ", #rax); os::print_location(tty, value); }
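  // Note: the regs[] indices below mirror the order in which pusha() saved
  // the registers in stop()/print_state(): rax, pushed first, lands in the
  // highest slot (regs[15]); r15, pushed last, sits at regs[0].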
6559   PRINT_REG(rax, regs[15]);
6560   PRINT_REG(rbx, regs[12]);
6561   PRINT_REG(rcx, regs[14]);
6562   PRINT_REG(rdx, regs[13]);
6563   PRINT_REG(rdi, regs[8]);
6564   PRINT_REG(rsi, regs[9]);
6565   PRINT_REG(rbp, regs[10]);
6566   PRINT_REG(rsp, regs[11]);
6567   PRINT_REG(r8 , regs[7]);
6568   PRINT_REG(r9 , regs[6]);
6569   PRINT_REG(r10, regs[5]);
6570   PRINT_REG(r11, regs[4]);
6571   PRINT_REG(r12, regs[3]);
6572   PRINT_REG(r13, regs[2]);
6573   PRINT_REG(r14, regs[1]);
6574   PRINT_REG(r15, regs[0]);
6575 #undef PRINT_REG
  // Print some words near top of stack.
6577   int64_t* rsp = (int64_t*) regs[11];
6578   int64_t* dump_sp = rsp;
6579   for (int col1 = 0; col1 < 8; col1++) {
6580     tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (int64_t)dump_sp);
6581     os::print_location(tty, *dump_sp++);
6582   }
6583   for (int row = 0; row < 25; row++) {
6584     tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (int64_t)dump_sp);
6585     for (int col = 0; col < 4; col++) {
6586       tty->print(" 0x%016lx", *dump_sp++);
6587     }
6588     tty->cr();
6589   }
6590   // Print some instructions around pc:
6591   Disassembler::decode((address)pc-64, (address)pc);
6592   tty->print_cr("--------");
6593   Disassembler::decode((address)pc, (address)pc+32);
6594 }
6595 
6596 #endif // _LP64
6597 
6598 // Now versions that are common to 32/64 bit
6599 
6600 void MacroAssembler::addptr(Register dst, int32_t imm32) {
6601   LP64_ONLY(addq(dst, imm32)) NOT_LP64(addl(dst, imm32));
6602 }
6603 
6604 void MacroAssembler::addptr(Register dst, Register src) {
6605   LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
6606 }
6607 
6608 void MacroAssembler::addptr(Address dst, Register src) {
6609   LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
6610 }
6611 
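// Many of the AddressLiteral wrappers below share a single reachability
// pattern: if the literal can be addressed rip-relative from the emitted
// code, use it directly; otherwise materialize the 64-bit address into
// rscratch1 and use an indirect operand. A minimal sketch (illustrative
// only, with 'op' standing in for the wrapped instruction):
//
//   if (reachable(src)) {
//     Assembler::op(dst, as_Address(src));        // rip-relative operand
//   } else {
//     lea(rscratch1, src);                        // 64-bit literal -> scratch
//     Assembler::op(dst, Address(rscratch1, 0));  // indirect via scratch
//   }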
6612 void MacroAssembler::addsd(XMMRegister dst, AddressLiteral src) {
6613   if (reachable(src)) {
6614     Assembler::addsd(dst, as_Address(src));
6615   } else {
6616     lea(rscratch1, src);
6617     Assembler::addsd(dst, Address(rscratch1, 0));
6618   }
6619 }
6620 
6621 void MacroAssembler::addss(XMMRegister dst, AddressLiteral src) {
6622   if (reachable(src)) {
6623     addss(dst, as_Address(src));
6624   } else {
6625     lea(rscratch1, src);
6626     addss(dst, Address(rscratch1, 0));
6627   }
6628 }
6629 
6630 void MacroAssembler::align(int modulus) {
6631   if (offset() % modulus != 0) {
6632     nop(modulus - (offset() % modulus));
6633   }
6634 }
6635 
6636 void MacroAssembler::andpd(XMMRegister dst, AddressLiteral src) {
6637   // Used in sign-masking with aligned address.
6638   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
6639   if (reachable(src)) {
6640     Assembler::andpd(dst, as_Address(src));
6641   } else {
6642     lea(rscratch1, src);
6643     Assembler::andpd(dst, Address(rscratch1, 0));
6644   }
6645 }
6646 
6647 void MacroAssembler::andps(XMMRegister dst, AddressLiteral src) {
6648   // Used in sign-masking with aligned address.
6649   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
6650   if (reachable(src)) {
6651     Assembler::andps(dst, as_Address(src));
6652   } else {
6653     lea(rscratch1, src);
6654     Assembler::andps(dst, Address(rscratch1, 0));
6655   }
6656 }
6657 
6658 void MacroAssembler::andptr(Register dst, int32_t imm32) {
6659   LP64_ONLY(andq(dst, imm32)) NOT_LP64(andl(dst, imm32));
6660 }
6661 
6662 void MacroAssembler::atomic_incl(AddressLiteral counter_addr) {
6663   pushf();
6664   if (os::is_MP())
6665     lock();
6666   incrementl(counter_addr);
6667   popf();
6668 }
6669 
6670 // Writes to stack successive pages until offset reached to check for
6671 // stack overflow + shadow pages.  This clobbers tmp.
6672 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
6673   movptr(tmp, rsp);
6674   // Bang stack for total size given plus shadow page size.
6675   // Bang one page at a time because large size can bang beyond yellow and
6676   // red zones.
6677   Label loop;
6678   bind(loop);
6679   movl(Address(tmp, (-os::vm_page_size())), size );
6680   subptr(tmp, os::vm_page_size());
6681   subl(size, os::vm_page_size());
6682   jcc(Assembler::greater, loop);
6683 
6684   // Bang down shadow pages too.
6685   // The -1 because we already subtracted 1 page.
  for (int i = 0; i < StackShadowPages-1; i++) {
    // this could be any sized move but it can also serve as a debugging
    // crumb, so the bigger the better.
6689     movptr(Address(tmp, (-i*os::vm_page_size())), size );
6690   }
6691 }
6692 
6693 void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
6694   assert(UseBiasedLocking, "why call this otherwise?");
6695 
6696   // Check for biased locking unlock case, which is a no-op
6697   // Note: we do not have to check the thread ID for two reasons.
6698   // First, the interpreter checks for IllegalMonitorStateException at
6699   // a higher level. Second, if the bias was revoked while we held the
6700   // lock, the object could not be rebiased toward another thread, so
6701   // the bias bit would be clear.
6702   movptr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
6703   andptr(temp_reg, markOopDesc::biased_lock_mask_in_place);
6704   cmpptr(temp_reg, markOopDesc::biased_lock_pattern);
6705   jcc(Assembler::equal, done);
6706 }
6707 
6708 void MacroAssembler::c2bool(Register x) {
6709   // implements x == 0 ? 0 : 1
6710   // note: must only look at least-significant byte of x
6711   //       since C-style booleans are stored in one byte
6712   //       only! (was bug)
6713   andl(x, 0xFF);
6714   setb(Assembler::notZero, x);
6715 }
6716 
// Wouldn't be needed if the AddressLiteral version had a different name
6718 void MacroAssembler::call(Label& L, relocInfo::relocType rtype) {
6719   Assembler::call(L, rtype);
6720 }
6721 
6722 void MacroAssembler::call(Register entry) {
6723   Assembler::call(entry);
6724 }
6725 
6726 void MacroAssembler::call(AddressLiteral entry) {
6727   if (reachable(entry)) {
6728     Assembler::call_literal(entry.target(), entry.rspec());
6729   } else {
6730     lea(rscratch1, entry);
6731     Assembler::call(rscratch1);
6732   }
6733 }
6734 
6735 void MacroAssembler::ic_call(address entry) {
6736   RelocationHolder rh = virtual_call_Relocation::spec(pc());
6737   movptr(rax, (intptr_t)Universe::non_oop_word());
6738   call(AddressLiteral(entry, rh));
6739 }
6740 
6741 // Implementation of call_VM versions
6742 
6743 void MacroAssembler::call_VM(Register oop_result,
6744                              address entry_point,
6745                              bool check_exceptions) {
6746   Label C, E;
6747   call(C, relocInfo::none);
6748   jmp(E);
6749 
6750   bind(C);
6751   call_VM_helper(oop_result, entry_point, 0, check_exceptions);
6752   ret(0);
6753 
6754   bind(E);
6755 }
6756 
6757 void MacroAssembler::call_VM(Register oop_result,
6758                              address entry_point,
6759                              Register arg_1,
6760                              bool check_exceptions) {
6761   Label C, E;
6762   call(C, relocInfo::none);
6763   jmp(E);
6764 
6765   bind(C);
6766   pass_arg1(this, arg_1);
6767   call_VM_helper(oop_result, entry_point, 1, check_exceptions);
6768   ret(0);
6769 
6770   bind(E);
6771 }
6772 
6773 void MacroAssembler::call_VM(Register oop_result,
6774                              address entry_point,
6775                              Register arg_1,
6776                              Register arg_2,
6777                              bool check_exceptions) {
6778   Label C, E;
6779   call(C, relocInfo::none);
6780   jmp(E);
6781 
6782   bind(C);
6783 
6784   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
6785 
6786   pass_arg2(this, arg_2);
6787   pass_arg1(this, arg_1);
6788   call_VM_helper(oop_result, entry_point, 2, check_exceptions);
6789   ret(0);
6790 
6791   bind(E);
6792 }
6793 
6794 void MacroAssembler::call_VM(Register oop_result,
6795                              address entry_point,
6796                              Register arg_1,
6797                              Register arg_2,
6798                              Register arg_3,
6799                              bool check_exceptions) {
6800   Label C, E;
6801   call(C, relocInfo::none);
6802   jmp(E);
6803 
6804   bind(C);
6805 
6806   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
6807   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
6808   pass_arg3(this, arg_3);
6809 
6810   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
6811   pass_arg2(this, arg_2);
6812 
6813   pass_arg1(this, arg_1);
6814   call_VM_helper(oop_result, entry_point, 3, check_exceptions);
6815   ret(0);
6816 
6817   bind(E);
6818 }
6819 
6820 void MacroAssembler::call_VM(Register oop_result,
6821                              Register last_java_sp,
6822                              address entry_point,
6823                              int number_of_arguments,
6824                              bool check_exceptions) {
6825   Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
6826   call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
6827 }
6828 
6829 void MacroAssembler::call_VM(Register oop_result,
6830                              Register last_java_sp,
6831                              address entry_point,
6832                              Register arg_1,
6833                              bool check_exceptions) {
6834   pass_arg1(this, arg_1);
6835   call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
6836 }
6837 
6838 void MacroAssembler::call_VM(Register oop_result,
6839                              Register last_java_sp,
6840                              address entry_point,
6841                              Register arg_1,
6842                              Register arg_2,
6843                              bool check_exceptions) {
6844 
6845   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
6846   pass_arg2(this, arg_2);
6847   pass_arg1(this, arg_1);
6848   call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
6849 }
6850 
6851 void MacroAssembler::call_VM(Register oop_result,
6852                              Register last_java_sp,
6853                              address entry_point,
6854                              Register arg_1,
6855                              Register arg_2,
6856                              Register arg_3,
6857                              bool check_exceptions) {
6858   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
6859   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
6860   pass_arg3(this, arg_3);
6861   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
6862   pass_arg2(this, arg_2);
6863   pass_arg1(this, arg_1);
6864   call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
6865 }
6866 
6867 void MacroAssembler::super_call_VM(Register oop_result,
6868                                    Register last_java_sp,
6869                                    address entry_point,
6870                                    int number_of_arguments,
6871                                    bool check_exceptions) {
6872   Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
6873   MacroAssembler::call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
6874 }
6875 
6876 void MacroAssembler::super_call_VM(Register oop_result,
6877                                    Register last_java_sp,
6878                                    address entry_point,
6879                                    Register arg_1,
6880                                    bool check_exceptions) {
6881   pass_arg1(this, arg_1);
6882   super_call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
6883 }
6884 
6885 void MacroAssembler::super_call_VM(Register oop_result,
6886                                    Register last_java_sp,
6887                                    address entry_point,
6888                                    Register arg_1,
6889                                    Register arg_2,
6890                                    bool check_exceptions) {
6891 
6892   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
6893   pass_arg2(this, arg_2);
6894   pass_arg1(this, arg_1);
6895   super_call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
6896 }
6897 
6898 void MacroAssembler::super_call_VM(Register oop_result,
6899                                    Register last_java_sp,
6900                                    address entry_point,
6901                                    Register arg_1,
6902                                    Register arg_2,
6903                                    Register arg_3,
6904                                    bool check_exceptions) {
6905   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
6906   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
6907   pass_arg3(this, arg_3);
6908   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
6909   pass_arg2(this, arg_2);
6910   pass_arg1(this, arg_1);
6911   super_call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
6912 }
6913 
6914 void MacroAssembler::call_VM_base(Register oop_result,
6915                                   Register java_thread,
6916                                   Register last_java_sp,
6917                                   address  entry_point,
6918                                   int      number_of_arguments,
6919                                   bool     check_exceptions) {
6920   // determine java_thread register
6921   if (!java_thread->is_valid()) {
6922 #ifdef _LP64
6923     java_thread = r15_thread;
6924 #else
6925     java_thread = rdi;
6926     get_thread(java_thread);
6927 #endif // LP64
6928   }
6929   // determine last_java_sp register
6930   if (!last_java_sp->is_valid()) {
6931     last_java_sp = rsp;
6932   }
6933   // debugging support
6934   assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
6935   LP64_ONLY(assert(java_thread == r15_thread, "unexpected register"));
6936 #ifdef ASSERT
6937   // TraceBytecodes does not use r12 but saves it over the call, so don't verify
6938   // r12 is the heapbase.
6939   LP64_ONLY(if (UseCompressedOops && !TraceBytecodes) verify_heapbase("call_VM_base");)
6940 #endif // ASSERT
6941 
6942   assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
6943   assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
6944 
6945   // push java thread (becomes first argument of C function)
6946 
6947   NOT_LP64(push(java_thread); number_of_arguments++);
6948   LP64_ONLY(mov(c_rarg0, r15_thread));
6949 
6950   // set last Java frame before call
6951   assert(last_java_sp != rbp, "can't use ebp/rbp");
6952 
6953   // Only interpreter should have to set fp
6954   set_last_Java_frame(java_thread, last_java_sp, rbp, NULL);
6955 
6956   // do the call, remove parameters
6957   MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments);
6958 
6959   // restore the thread (cannot use the pushed argument since arguments
6960   // may be overwritten by C code generated by an optimizing compiler);
  // however we can use the register value directly if it is callee saved.
6962   if (LP64_ONLY(true ||) java_thread == rdi || java_thread == rsi) {
6963     // rdi & rsi (also r15) are callee saved -> nothing to do
6964 #ifdef ASSERT
6965     guarantee(java_thread != rax, "change this code");
6966     push(rax);
6967     { Label L;
6968       get_thread(rax);
6969       cmpptr(java_thread, rax);
6970       jcc(Assembler::equal, L);
6971       STOP("MacroAssembler::call_VM_base: rdi not callee saved?");
6972       bind(L);
6973     }
6974     pop(rax);
6975 #endif
6976   } else {
6977     get_thread(java_thread);
6978   }
6979   // reset last Java frame
6980   // Only interpreter should have to clear fp
6981   reset_last_Java_frame(java_thread, true, false);
6982 
6983 #ifndef CC_INTERP
6984    // C++ interp handles this in the interpreter
6985   check_and_handle_popframe(java_thread);
6986   check_and_handle_earlyret(java_thread);
6987 #endif /* CC_INTERP */
6988 
6989   if (check_exceptions) {
6990     // check for pending exceptions (java_thread is set upon return)
6991     cmpptr(Address(java_thread, Thread::pending_exception_offset()), (int32_t) NULL_WORD);
6992 #ifndef _LP64
6993     jump_cc(Assembler::notEqual,
6994             RuntimeAddress(StubRoutines::forward_exception_entry()));
6995 #else
    // This used to be a conditional jump to forward_exception, but after
    // relocation the branch might no longer reach its target. So we branch
    // around an unconditional jump, which can always reach.
6999 
7000     Label ok;
7001     jcc(Assembler::equal, ok);
7002     jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
7003     bind(ok);
7004 #endif // LP64
7005   }
7006 
7007   // get oop result if there is one and reset the value in the thread
7008   if (oop_result->is_valid()) {
7009     get_vm_result(oop_result, java_thread);
7010   }
7011 }
7012 
7013 void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
7014 
  // Calculating the value for last_Java_sp is somewhat subtle. call_VM
  // does an intermediate call which places a return address on the stack
  // just under the stack pointer as the user finished with it. This allows
  // us to retrieve last_Java_pc from last_Java_sp[-1].
7020   // On 32bit we then have to push additional args on the stack to accomplish
7021   // the actual requested call. On 64bit call_VM only can use register args
7022   // so the only extra space is the return address that call_VM created.
7023   // This hopefully explains the calculations here.
7024 
7025 #ifdef _LP64
7026   // We've pushed one address, correct last_Java_sp
7027   lea(rax, Address(rsp, wordSize));
7028 #else
7029   lea(rax, Address(rsp, (1 + number_of_arguments) * wordSize));
7030 #endif // LP64
7031 
7032   call_VM_base(oop_result, noreg, rax, entry_point, number_of_arguments, check_exceptions);
7033 
7034 }
7035 
7036 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
7037   call_VM_leaf_base(entry_point, number_of_arguments);
7038 }
7039 
7040 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
7041   pass_arg0(this, arg_0);
7042   call_VM_leaf(entry_point, 1);
7043 }
7044 
7045 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
7046 
7047   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
7048   pass_arg1(this, arg_1);
7049   pass_arg0(this, arg_0);
7050   call_VM_leaf(entry_point, 2);
7051 }
7052 
7053 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
7054   LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
7055   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
7056   pass_arg2(this, arg_2);
7057   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
7058   pass_arg1(this, arg_1);
7059   pass_arg0(this, arg_0);
7060   call_VM_leaf(entry_point, 3);
7061 }
7062 
7063 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
7064   pass_arg0(this, arg_0);
7065   MacroAssembler::call_VM_leaf_base(entry_point, 1);
7066 }
7067 
7068 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
7069 
7070   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
7071   pass_arg1(this, arg_1);
7072   pass_arg0(this, arg_0);
7073   MacroAssembler::call_VM_leaf_base(entry_point, 2);
7074 }
7075 
7076 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
7077   LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
7078   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
7079   pass_arg2(this, arg_2);
7080   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
7081   pass_arg1(this, arg_1);
7082   pass_arg0(this, arg_0);
7083   MacroAssembler::call_VM_leaf_base(entry_point, 3);
7084 }
7085 
7086 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
7087   LP64_ONLY(assert(arg_0 != c_rarg3, "smashed arg"));
7088   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
7089   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
7090   pass_arg3(this, arg_3);
7091   LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
7092   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
7093   pass_arg2(this, arg_2);
7094   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
7095   pass_arg1(this, arg_1);
7096   pass_arg0(this, arg_0);
7097   MacroAssembler::call_VM_leaf_base(entry_point, 4);
7098 }
7099 
7100 void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
7101   movptr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
7102   movptr(Address(java_thread, JavaThread::vm_result_offset()), NULL_WORD);
7103   verify_oop(oop_result, "broken oop in call_VM_base");
7104 }
7105 
7106 void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
7107   movptr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
7108   movptr(Address(java_thread, JavaThread::vm_result_2_offset()), NULL_WORD);
7109 }
7110 
7111 void MacroAssembler::check_and_handle_earlyret(Register java_thread) {
7112 }
7113 
7114 void MacroAssembler::check_and_handle_popframe(Register java_thread) {
7115 }
7116 
7117 void MacroAssembler::cmp32(AddressLiteral src1, int32_t imm) {
7118   if (reachable(src1)) {
7119     cmpl(as_Address(src1), imm);
7120   } else {
7121     lea(rscratch1, src1);
7122     cmpl(Address(rscratch1, 0), imm);
7123   }
7124 }
7125 
7126 void MacroAssembler::cmp32(Register src1, AddressLiteral src2) {
7127   assert(!src2.is_lval(), "use cmpptr");
7128   if (reachable(src2)) {
7129     cmpl(src1, as_Address(src2));
7130   } else {
7131     lea(rscratch1, src2);
7132     cmpl(src1, Address(rscratch1, 0));
7133   }
7134 }
7135 
7136 void MacroAssembler::cmp32(Register src1, int32_t imm) {
7137   Assembler::cmpl(src1, imm);
7138 }
7139 
7140 void MacroAssembler::cmp32(Register src1, Address src2) {
7141   Assembler::cmpl(src1, src2);
7142 }
7143 
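// The two helpers below materialize a three-way compare as -1/0/+1 with the
// NaN case folded in: when unordered_is_less, an unordered compare (PF set)
// yields -1, otherwise +1. This matches the -1/+1 split between Java's
// fcmpl/dcmpl and fcmpg/dcmpg bytecodes (an assumption about the intended
// callers, but consistent with the flag's name).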
7144 void MacroAssembler::cmpsd2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
7145   ucomisd(opr1, opr2);
7146 
7147   Label L;
7148   if (unordered_is_less) {
7149     movl(dst, -1);
7150     jcc(Assembler::parity, L);
7151     jcc(Assembler::below , L);
7152     movl(dst, 0);
7153     jcc(Assembler::equal , L);
7154     increment(dst);
7155   } else { // unordered is greater
7156     movl(dst, 1);
7157     jcc(Assembler::parity, L);
7158     jcc(Assembler::above , L);
7159     movl(dst, 0);
7160     jcc(Assembler::equal , L);
7161     decrementl(dst);
7162   }
7163   bind(L);
7164 }
7165 
7166 void MacroAssembler::cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
7167   ucomiss(opr1, opr2);
7168 
7169   Label L;
7170   if (unordered_is_less) {
7171     movl(dst, -1);
7172     jcc(Assembler::parity, L);
7173     jcc(Assembler::below , L);
7174     movl(dst, 0);
7175     jcc(Assembler::equal , L);
7176     increment(dst);
7177   } else { // unordered is greater
7178     movl(dst, 1);
7179     jcc(Assembler::parity, L);
7180     jcc(Assembler::above , L);
7181     movl(dst, 0);
7182     jcc(Assembler::equal , L);
7183     decrementl(dst);
7184   }
7185   bind(L);
7186 }
7187 
7188 
7189 void MacroAssembler::cmp8(AddressLiteral src1, int imm) {
7190   if (reachable(src1)) {
7191     cmpb(as_Address(src1), imm);
7192   } else {
7193     lea(rscratch1, src1);
7194     cmpb(Address(rscratch1, 0), imm);
7195   }
7196 }
7197 
7198 void MacroAssembler::cmpptr(Register src1, AddressLiteral src2) {
7199 #ifdef _LP64
7200   if (src2.is_lval()) {
7201     movptr(rscratch1, src2);
7202     Assembler::cmpq(src1, rscratch1);
7203   } else if (reachable(src2)) {
7204     cmpq(src1, as_Address(src2));
7205   } else {
7206     lea(rscratch1, src2);
7207     Assembler::cmpq(src1, Address(rscratch1, 0));
7208   }
7209 #else
7210   if (src2.is_lval()) {
7211     cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
7212   } else {
7213     cmpl(src1, as_Address(src2));
7214   }
7215 #endif // _LP64
7216 }
7217 
7218 void MacroAssembler::cmpptr(Address src1, AddressLiteral src2) {
7219   assert(src2.is_lval(), "not a mem-mem compare");
7220 #ifdef _LP64
7221   // moves src2's literal address
7222   movptr(rscratch1, src2);
7223   Assembler::cmpq(src1, rscratch1);
7224 #else
7225   cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
7226 #endif // _LP64
7227 }
7228 
7229 void MacroAssembler::locked_cmpxchgptr(Register reg, AddressLiteral adr) {
7230   if (reachable(adr)) {
7231     if (os::is_MP())
7232       lock();
7233     cmpxchgptr(reg, as_Address(adr));
7234   } else {
7235     lea(rscratch1, adr);
7236     if (os::is_MP())
7237       lock();
7238     cmpxchgptr(reg, Address(rscratch1, 0));
7239   }
7240 }
7241 
7242 void MacroAssembler::cmpxchgptr(Register reg, Address adr) {
7243   LP64_ONLY(cmpxchgq(reg, adr)) NOT_LP64(cmpxchgl(reg, adr));
7244 }
7245 
7246 void MacroAssembler::comisd(XMMRegister dst, AddressLiteral src) {
7247   if (reachable(src)) {
7248     Assembler::comisd(dst, as_Address(src));
7249   } else {
7250     lea(rscratch1, src);
7251     Assembler::comisd(dst, Address(rscratch1, 0));
7252   }
7253 }
7254 
7255 void MacroAssembler::comiss(XMMRegister dst, AddressLiteral src) {
7256   if (reachable(src)) {
7257     Assembler::comiss(dst, as_Address(src));
7258   } else {
7259     lea(rscratch1, src);
7260     Assembler::comiss(dst, Address(rscratch1, 0));
7261   }
7262 }
7263 
7264 
7265 void MacroAssembler::cond_inc32(Condition cond, AddressLiteral counter_addr) {
7266   Condition negated_cond = negate_condition(cond);
7267   Label L;
7268   jcc(negated_cond, L);
7269   atomic_incl(counter_addr);
7270   bind(L);
7271 }
7272 
7273 int MacroAssembler::corrected_idivl(Register reg) {
7274   // Full implementation of Java idiv and irem; checks for
7275   // special case as described in JVM spec., p.243 & p.271.
7276   // The function returns the (pc) offset of the idivl
7277   // instruction - may be needed for implicit exceptions.
7278   //
7279   //         normal case                           special case
7280   //
  // input : rax: dividend                          min_int
  //         reg: divisor   (may not be rax/rdx)    -1
  //
  // output: rax: quotient  (= rax idiv reg)        min_int
  //         rdx: remainder (= rax irem reg)        0
  assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register");
7287   const int min_int = 0x80000000;
7288   Label normal_case, special_case;
7289 
7290   // check for special case
7291   cmpl(rax, min_int);
7292   jcc(Assembler::notEqual, normal_case);
7293   xorl(rdx, rdx); // prepare rdx for possible special case (where remainder = 0)
7294   cmpl(reg, -1);
7295   jcc(Assembler::equal, special_case);
7296 
7297   // handle normal case
7298   bind(normal_case);
7299   cdql();
7300   int idivl_offset = offset();
7301   idivl(reg);
7302 
7303   // normal and special case exit
7304   bind(special_case);
7305 
7306   return idivl_offset;
7307 }
7308 
7309 
7310 
7311 void MacroAssembler::decrementl(Register reg, int value) {
7312   if (value == min_jint) {subl(reg, value) ; return; }
7313   if (value <  0) { incrementl(reg, -value); return; }
7314   if (value == 0) {                        ; return; }
7315   if (value == 1 && UseIncDec) { decl(reg) ; return; }
7316   /* else */      { subl(reg, value)       ; return; }
7317 }
7318 
7319 void MacroAssembler::decrementl(Address dst, int value) {
7320   if (value == min_jint) {subl(dst, value) ; return; }
7321   if (value <  0) { incrementl(dst, -value); return; }
7322   if (value == 0) {                        ; return; }
7323   if (value == 1 && UseIncDec) { decl(dst) ; return; }
7324   /* else */      { subl(dst, value)       ; return; }
7325 }
7326 
void MacroAssembler::division_with_shift(Register reg, int shift_value) {
  assert(shift_value > 0, "illegal shift value");
  Label _is_positive;
  testl(reg, reg);
  jcc(Assembler::positive, _is_positive);
  int offset = (1 << shift_value) - 1;
7333 
7334   if (offset == 1) {
7335     incrementl(reg);
7336   } else {
7337     addl(reg, offset);
7338   }
7339 
  bind(_is_positive);
7341   sarl(reg, shift_value);
7342 }
7343 
7344 void MacroAssembler::divsd(XMMRegister dst, AddressLiteral src) {
7345   if (reachable(src)) {
7346     Assembler::divsd(dst, as_Address(src));
7347   } else {
7348     lea(rscratch1, src);
7349     Assembler::divsd(dst, Address(rscratch1, 0));
7350   }
7351 }
7352 
7353 void MacroAssembler::divss(XMMRegister dst, AddressLiteral src) {
7354   if (reachable(src)) {
7355     Assembler::divss(dst, as_Address(src));
7356   } else {
7357     lea(rscratch1, src);
7358     Assembler::divss(dst, Address(rscratch1, 0));
7359   }
7360 }
7361 
// !defined(COMPILER2) is because core builds define neither COMPILER1 nor COMPILER2
7363 #if !defined(_LP64) || defined(COMPILER1) || !defined(COMPILER2)
7364 void MacroAssembler::empty_FPU_stack() {
7365   if (VM_Version::supports_mmx()) {
7366     emms();
7367   } else {
7368     for (int i = 8; i-- > 0; ) ffree(i);
7369   }
7370 }
7371 #endif // !LP64 || C1 || !C2
7372 
7373 
7374 // Defines obj, preserves var_size_in_bytes
7375 void MacroAssembler::eden_allocate(Register obj,
7376                                    Register var_size_in_bytes,
7377                                    int con_size_in_bytes,
7378                                    Register t1,
7379                                    Label& slow_case) {
7380   assert(obj == rax, "obj must be in rax, for cmpxchg");
7381   assert_different_registers(obj, var_size_in_bytes, t1);
7382   if (CMSIncrementalMode || !Universe::heap()->supports_inline_contig_alloc()) {
7383     jmp(slow_case);
7384   } else {
7385     Register end = t1;
7386     Label retry;
7387     bind(retry);
7388     ExternalAddress heap_top((address) Universe::heap()->top_addr());
7389     movptr(obj, heap_top);
7390     if (var_size_in_bytes == noreg) {
7391       lea(end, Address(obj, con_size_in_bytes));
7392     } else {
7393       lea(end, Address(obj, var_size_in_bytes, Address::times_1));
7394     }
7395     // if end < obj then we wrapped around => object too long => slow case
7396     cmpptr(end, obj);
7397     jcc(Assembler::below, slow_case);
7398     cmpptr(end, ExternalAddress((address) Universe::heap()->end_addr()));
7399     jcc(Assembler::above, slow_case);
7400     // Compare obj with the top addr, and if still equal, store the new top addr in
7401     // end at the address of the top addr pointer. Sets ZF if was equal, and clears
7402     // it otherwise. Use lock prefix for atomicity on MPs.
7403     locked_cmpxchgptr(end, heap_top);
7404     jcc(Assembler::notEqual, retry);
7405   }
7406 }
7407 
7408 void MacroAssembler::enter() {
7409   push(rbp);
7410   mov(rbp, rsp);
7411 }
7412 
7413 // A 5 byte nop that is safe for patching (see patch_verified_entry)
7414 void MacroAssembler::fat_nop() {
7415   if (UseAddressNop) {
7416     addr_nop_5();
7417   } else {
7418     emit_byte(0x26); // es:
7419     emit_byte(0x2e); // cs:
7420     emit_byte(0x64); // fs:
7421     emit_byte(0x65); // gs:
7422     emit_byte(0x90);
7423   }
7424 }
7425 
7426 void MacroAssembler::fcmp(Register tmp) {
7427   fcmp(tmp, 1, true, true);
7428 }
7429 
7430 void MacroAssembler::fcmp(Register tmp, int index, bool pop_left, bool pop_right) {
7431   assert(!pop_right || pop_left, "usage error");
7432   if (VM_Version::supports_cmov()) {
7433     assert(tmp == noreg, "unneeded temp");
7434     if (pop_left) {
7435       fucomip(index);
7436     } else {
7437       fucomi(index);
7438     }
7439     if (pop_right) {
7440       fpop();
7441     }
7442   } else {
7443     assert(tmp != noreg, "need temp");
7444     if (pop_left) {
7445       if (pop_right) {
7446         fcompp();
7447       } else {
7448         fcomp(index);
7449       }
7450     } else {
7451       fcom(index);
7452     }
7453     // convert FPU condition into eflags condition via rax,
7454     save_rax(tmp);
7455     fwait(); fnstsw_ax();
7456     sahf();
7457     restore_rax(tmp);
7458   }
7459   // condition codes set as follows:
7460   //
7461   // CF (corresponds to C0) if x < y
7462   // PF (corresponds to C2) if unordered
7463   // ZF (corresponds to C3) if x = y
7464 }
7465 
7466 void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less) {
7467   fcmp2int(dst, unordered_is_less, 1, true, true);
7468 }
7469 
7470 void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less, int index, bool pop_left, bool pop_right) {
7471   fcmp(VM_Version::supports_cmov() ? noreg : dst, index, pop_left, pop_right);
7472   Label L;
7473   if (unordered_is_less) {
7474     movl(dst, -1);
7475     jcc(Assembler::parity, L);
7476     jcc(Assembler::below , L);
7477     movl(dst, 0);
7478     jcc(Assembler::equal , L);
7479     increment(dst);
7480   } else { // unordered is greater
7481     movl(dst, 1);
7482     jcc(Assembler::parity, L);
7483     jcc(Assembler::above , L);
7484     movl(dst, 0);
7485     jcc(Assembler::equal , L);
7486     decrementl(dst);
7487   }
7488   bind(L);
7489 }
7490 
7491 void MacroAssembler::fld_d(AddressLiteral src) {
7492   fld_d(as_Address(src));
7493 }
7494 
7495 void MacroAssembler::fld_s(AddressLiteral src) {
7496   fld_s(as_Address(src));
7497 }
7498 
7499 void MacroAssembler::fld_x(AddressLiteral src) {
7500   Assembler::fld_x(as_Address(src));
7501 }
7502 
7503 void MacroAssembler::fldcw(AddressLiteral src) {
7504   Assembler::fldcw(as_Address(src));
7505 }
7506 
7507 void MacroAssembler::pow_exp_core_encoding() {
7508   // kills rax, rcx, rdx
7509   subptr(rsp,sizeof(jdouble));
7510   // computes 2^X. Stack: X ...
7511   // f2xm1 computes 2^X-1 but only operates on -1<=X<=1. Get int(X) and
7512   // keep it on the thread's stack to compute 2^int(X) later
7513   // then compute 2^(X-int(X)) as (2^(X-int(X)-1+1)
7514   // final result is obtained with: 2^X = 2^int(X) * 2^(X-int(X))
7515   fld_s(0);                 // Stack: X X ...
7516   frndint();                // Stack: int(X) X ...
7517   fsuba(1);                 // Stack: int(X) X-int(X) ...
7518   fistp_s(Address(rsp,0));  // move int(X) as integer to thread's stack. Stack: X-int(X) ...
7519   f2xm1();                  // Stack: 2^(X-int(X))-1 ...
7520   fld1();                   // Stack: 1 2^(X-int(X))-1 ...
7521   faddp(1);                 // Stack: 2^(X-int(X))
7522   // computes 2^(int(X)): add exponent bias (1023) to int(X), then
7523   // shift int(X)+1023 to exponent position.
7524   // Exponent is limited to 11 bits if int(X)+1023 does not fit in 11
7525   // bits, set result to NaN. 0x000 and 0x7FF are reserved exponent
7526   // values so detect them and set result to NaN.
7527   movl(rax,Address(rsp,0));
7528   movl(rcx, -2048); // 11 bit mask and valid NaN binary encoding
7529   addl(rax, 1023);
7530   movl(rdx,rax);
7531   shll(rax,20);
7532   // Check that 0 < int(X)+1023 < 2047. Otherwise set rax to NaN.
7533   addl(rdx,1);
7534   // Check that 1 < int(X)+1023+1 < 2048
7535   // in 3 steps:
7536   // 1- (int(X)+1023+1)&-2048 == 0 => 0 <= int(X)+1023+1 < 2048
7537   // 2- (int(X)+1023+1)&-2048 != 0
7538   // 3- (int(X)+1023+1)&-2048 != 1
7539   // Do 2- first because addl just updated the flags.
7540   cmov32(Assembler::equal,rax,rcx);
7541   cmpl(rdx,1);
7542   cmov32(Assembler::equal,rax,rcx);
7543   testl(rdx,rcx);
7544   cmov32(Assembler::notEqual,rax,rcx);
7545   movl(Address(rsp,4),rax);
7546   movl(Address(rsp,0),0);
7547   fmul_d(Address(rsp,0));   // Stack: 2^X ...
7548   addptr(rsp,sizeof(jdouble));
7549 }
7550 
7551 void MacroAssembler::increase_precision() {
7552   subptr(rsp, BytesPerWord);
7553   fnstcw(Address(rsp, 0));
7554   movl(rax, Address(rsp, 0));
7555   orl(rax, 0x300);
7556   push(rax);
7557   fldcw(Address(rsp, 0));
7558   pop(rax);
7559 }
7560 
7561 void MacroAssembler::restore_precision() {
7562   fldcw(Address(rsp, 0));
7563   addptr(rsp, BytesPerWord);
7564 }
7565 
7566 void MacroAssembler::fast_pow() {
7567   // computes X^Y = 2^(Y * log2(X))
7568   // if fast computation is not possible, result is NaN. Requires
7569   // fallback from user of this macro.
7570   // increase precision for intermediate steps of the computation
7571   increase_precision();
7572   fyl2x();                 // Stack: (Y*log2(X)) ...
7573   pow_exp_core_encoding(); // Stack: exp(X) ...
7574   restore_precision();
7575 }
7576 
7577 void MacroAssembler::fast_exp() {
7578   // computes exp(X) = 2^(X * log2(e))
7579   // if fast computation is not possible, result is NaN. Requires
7580   // fallback from user of this macro.
7581   // increase precision for intermediate steps of the computation
7582   increase_precision();
7583   fldl2e();                // Stack: log2(e) X ...
7584   fmulp(1);                // Stack: (X*log2(e)) ...
7585   pow_exp_core_encoding(); // Stack: exp(X) ...
7586   restore_precision();
7587 }
7588 
7589 void MacroAssembler::pow_or_exp(bool is_exp, int num_fpu_regs_in_use) {
7590   // kills rax, rcx, rdx
7591   // pow and exp needs 2 extra registers on the fpu stack.
7592   Label slow_case, done;
7593   Register tmp = noreg;
7594   if (!VM_Version::supports_cmov()) {
    // fcmp needs a temporary, so preserve rdx
7596     tmp = rdx;
7597   }
7598   Register tmp2 = rax;
7599   Register tmp3 = rcx;
7600 
7601   if (is_exp) {
7602     // Stack: X
7603     fld_s(0);                   // duplicate argument for runtime call. Stack: X X
7604     fast_exp();                 // Stack: exp(X) X
7605     fcmp(tmp, 0, false, false); // Stack: exp(X) X
    // exp(X) not equal to itself: exp(X) is NaN, go to slow case.
7607     jcc(Assembler::parity, slow_case);
7608     // get rid of duplicate argument. Stack: exp(X)
7609     if (num_fpu_regs_in_use > 0) {
7610       fxch();
7611       fpop();
7612     } else {
7613       ffree(1);
7614     }
7615     jmp(done);
7616   } else {
7617     // Stack: X Y
7618     Label x_negative, y_odd;
7619 
7620     fldz();                     // Stack: 0 X Y
7621     fcmp(tmp, 1, true, false);  // Stack: X Y
7622     jcc(Assembler::above, x_negative);
7623 
7624     // X >= 0
7625 
7626     fld_s(1);                   // duplicate arguments for runtime call. Stack: Y X Y
7627     fld_s(1);                   // Stack: X Y X Y
7628     fast_pow();                 // Stack: X^Y X Y
7629     fcmp(tmp, 0, false, false); // Stack: X^Y X Y
    // X^Y not equal to itself: X^Y is NaN, go to slow case.
7631     jcc(Assembler::parity, slow_case);
7632     // get rid of duplicate arguments. Stack: X^Y
7633     if (num_fpu_regs_in_use > 0) {
7634       fxch(); fpop();
7635       fxch(); fpop();
7636     } else {
7637       ffree(2);
7638       ffree(1);
7639     }
7640     jmp(done);
7641 
    // X < 0
7643     bind(x_negative);
7644 
7645     fld_s(1);                   // Stack: Y X Y
7646     frndint();                  // Stack: int(Y) X Y
7647     fcmp(tmp, 2, false, false); // Stack: int(Y) X Y
7648     jcc(Assembler::notEqual, slow_case);
7649 
7650     subptr(rsp, 8);
7651 
7652     // For X^Y, when X < 0, Y has to be an integer and the final
7653     // result depends on whether it's odd or even. We just checked
7654     // that int(Y) == Y.  We move int(Y) to gp registers as a 64 bit
7655     // integer to test its parity. If int(Y) is huge and doesn't fit
7656     // in the 64 bit integer range, the integer indefinite value will
    // end up in the gp registers. Huge numbers are all even, and the
    // integer indefinite value is even too, so it's fine.
7659 
7660 #ifdef ASSERT
7661     // Let's check we don't end up with an integer indefinite number
7662     // when not expected. First test for huge numbers: check whether
7663     // int(Y)+1 == int(Y) which is true for very large numbers and
7664     // those are all even. A 64 bit integer is guaranteed to not
7665     // overflow for numbers where y+1 != y (when precision is set to
7666     // double precision).
7667     Label y_not_huge;
7668 
7669     fld1();                     // Stack: 1 int(Y) X Y
7670     fadd(1);                    // Stack: 1+int(Y) int(Y) X Y
7671 
7672 #ifdef _LP64
7673     // trip to memory to force the precision down from double extended
7674     // precision
7675     fstp_d(Address(rsp, 0));
7676     fld_d(Address(rsp, 0));
7677 #endif
7678 
7679     fcmp(tmp, 1, true, false);  // Stack: int(Y) X Y
7680 #endif
7681 
7682     // move int(Y) as 64 bit integer to thread's stack
7683     fistp_d(Address(rsp,0));    // Stack: X Y
7684 
7685 #ifdef ASSERT
7686     jcc(Assembler::notEqual, y_not_huge);
7687 
7688     // Y is huge so we know it's even. It may not fit in a 64 bit
7689     // integer and we don't want the debug code below to see the
7690     // integer indefinite value so overwrite int(Y) on the thread's
7691     // stack with 0.
7692     movl(Address(rsp, 0), 0);
7693     movl(Address(rsp, 4), 0);
7694 
7695     bind(y_not_huge);
7696 #endif
7697 
7698     fld_s(1);                   // duplicate arguments for runtime call. Stack: Y X Y
7699     fld_s(1);                   // Stack: X Y X Y
7700     fabs();                     // Stack: abs(X) Y X Y
7701     fast_pow();                 // Stack: abs(X)^Y X Y
7702     fcmp(tmp, 0, false, false); // Stack: abs(X)^Y X Y
    // abs(X)^Y not equal to itself: abs(X)^Y is NaN, go to slow case.
7704 
7705     pop(tmp2);
7706     NOT_LP64(pop(tmp3));
7707     jcc(Assembler::parity, slow_case);
7708 
7709 #ifdef ASSERT
7710     // Check that int(Y) is not integer indefinite value (int
7711     // overflow). Shouldn't happen because for values that would
7712     // overflow, 1+int(Y)==Y which was tested earlier.
7713 #ifndef _LP64
7714     {
7715       Label integer;
7716       testl(tmp2, tmp2);
7717       jcc(Assembler::notZero, integer);
7718       cmpl(tmp3, 0x80000000);
7719       jcc(Assembler::notZero, integer);
7720       STOP("integer indefinite value shouldn't be seen here");
7721       bind(integer);
7722     }
7723 #else
7724     {
7725       Label integer;
7726       mov(tmp3, tmp2); // preserve tmp2 for parity check below
7727       shlq(tmp3, 1);
7728       jcc(Assembler::carryClear, integer);
7729       jcc(Assembler::notZero, integer);
7730       STOP("integer indefinite value shouldn't be seen here");
7731       bind(integer);
7732     }
7733 #endif
7734 #endif
7735 
7736     // get rid of duplicate arguments. Stack: X^Y
7737     if (num_fpu_regs_in_use > 0) {
7738       fxch(); fpop();
7739       fxch(); fpop();
7740     } else {
7741       ffree(2);
7742       ffree(1);
7743     }
7744 
7745     testl(tmp2, 1);
    jcc(Assembler::zero, done); // X < 0, Y even: X^Y = abs(X)^Y
    // X < 0, Y odd: X^Y = -abs(X)^Y
7748 
    fchs();                     // Stack: -abs(X)^Y
7750     jmp(done);
7751   }
7752 
7753   // slow case: runtime call
7754   bind(slow_case);
7755 
7756   fpop();                       // pop incorrect result or int(Y)
7757 
7758   fp_runtime_fallback(is_exp ? CAST_FROM_FN_PTR(address, SharedRuntime::dexp) : CAST_FROM_FN_PTR(address, SharedRuntime::dpow),
7759                       is_exp ? 1 : 2, num_fpu_regs_in_use);
7760 
7761   // Come here with result in F-TOS
7762   bind(done);
7763 }
7764 
7765 void MacroAssembler::fpop() {
7766   ffree();
7767   fincstp();
7768 }
7769 
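// Computes the x87 partial remainder in a loop: fprem can reduce the
// exponent by at most 63 bits per step and reports an incomplete reduction
// by setting C2 (bit 10, mask 0x400) of the FPU status word, hence the
// retry until C2 clears. On 32-bit, sahf maps C2 to PF, so the loop keys
// off parity instead.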
7770 void MacroAssembler::fremr(Register tmp) {
7771   save_rax(tmp);
7772   { Label L;
7773     bind(L);
7774     fprem();
7775     fwait(); fnstsw_ax();
7776 #ifdef _LP64
7777     testl(rax, 0x400);
7778     jcc(Assembler::notEqual, L);
7779 #else
7780     sahf();
7781     jcc(Assembler::parity, L);
7782 #endif // _LP64
7783   }
7784   restore_rax(tmp);
7785   // Result is in ST0.
7786   // Note: fxch & fpop to get rid of ST1
7787   // (otherwise FPU stack could overflow eventually)
7788   fxch(1);
7789   fpop();
7790 }
7791 
7792 
7793 void MacroAssembler::incrementl(AddressLiteral dst) {
7794   if (reachable(dst)) {
7795     incrementl(as_Address(dst));
7796   } else {
7797     lea(rscratch1, dst);
7798     incrementl(Address(rscratch1, 0));
7799   }
7800 }
7801 
7802 void MacroAssembler::incrementl(ArrayAddress dst) {
7803   incrementl(as_Address(dst));
7804 }
7805 
7806 void MacroAssembler::incrementl(Register reg, int value) {
7807   if (value == min_jint) {addl(reg, value) ; return; }
7808   if (value <  0) { decrementl(reg, -value); return; }
7809   if (value == 0) {                        ; return; }
7810   if (value == 1 && UseIncDec) { incl(reg) ; return; }
7811   /* else */      { addl(reg, value)       ; return; }
7812 }
7813 
7814 void MacroAssembler::incrementl(Address dst, int value) {
7815   if (value == min_jint) {addl(dst, value) ; return; }
7816   if (value <  0) { decrementl(dst, -value); return; }
7817   if (value == 0) {                        ; return; }
7818   if (value == 1 && UseIncDec) { incl(dst) ; return; }
7819   /* else */      { addl(dst, value)       ; return; }
7820 }
7821 
7822 void MacroAssembler::jump(AddressLiteral dst) {
7823   if (reachable(dst)) {
7824     jmp_literal(dst.target(), dst.rspec());
7825   } else {
7826     lea(rscratch1, dst);
7827     jmp(rscratch1);
7828   }
7829 }
7830 
7831 void MacroAssembler::jump_cc(Condition cc, AddressLiteral dst) {
7832   if (reachable(dst)) {
7833     InstructionMark im(this);
7834     relocate(dst.reloc());
7835     const int short_size = 2;
7836     const int long_size = 6;
7837     int offs = (intptr_t)dst.target() - ((intptr_t)_code_pos);
7838     if (dst.reloc() == relocInfo::none && is8bit(offs - short_size)) {
7839       // 0111 tttn #8-bit disp
7840       emit_byte(0x70 | cc);
7841       emit_byte((offs - short_size) & 0xFF);
7842     } else {
7843       // 0000 1111 1000 tttn #32-bit disp
7844       emit_byte(0x0F);
7845       emit_byte(0x80 | cc);
7846       emit_long(offs - long_size);
7847     }
7848   } else {
7849 #ifdef ASSERT
7850     warning("reversing conditional branch");
7851 #endif /* ASSERT */
7852     Label skip;
7853     jccb(reverse[cc], skip);
7854     lea(rscratch1, dst);
7855     Assembler::jmp(rscratch1);
7856     bind(skip);
7857   }
7858 }
7859 
7860 void MacroAssembler::ldmxcsr(AddressLiteral src) {
7861   if (reachable(src)) {
7862     Assembler::ldmxcsr(as_Address(src));
7863   } else {
7864     lea(rscratch1, src);
7865     Assembler::ldmxcsr(Address(rscratch1, 0));
7866   }
7867 }
7868 
7869 int MacroAssembler::load_signed_byte(Register dst, Address src) {
7870   int off;
7871   if (LP64_ONLY(true ||) VM_Version::is_P6()) {
7872     off = offset();
7873     movsbl(dst, src); // movsxb
7874   } else {
7875     off = load_unsigned_byte(dst, src);
7876     shll(dst, 24);
7877     sarl(dst, 24);
7878   }
7879   return off;
7880 }
7881 
7882 // Note: load_signed_short used to be called load_signed_word.
7883 // Although the 'w' in x86 opcodes refers to the term "word" in the assembler
7884 // manual, which means 16 bits, that usage is found nowhere in HotSpot code.
7885 // The term "word" in HotSpot means a 32- or 64-bit machine word.
7886 int MacroAssembler::load_signed_short(Register dst, Address src) {
7887   int off;
7888   if (LP64_ONLY(true ||) VM_Version::is_P6()) {
    // This is dubious to me since it seems safe to do a signed 16 => 64 bit
    // version, but this is what the 64-bit code has always done. It seems to
    // imply that callers only use 32 bits' worth.
7892     off = offset();
7893     movswl(dst, src); // movsxw
7894   } else {
7895     off = load_unsigned_short(dst, src);
7896     shll(dst, 16);
7897     sarl(dst, 16);
7898   }
7899   return off;
7900 }
7901 
7902 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
  // According to Intel Doc. AP-526 ("Zero-Extension of Short", p. 16,
  // and "3.9 Partial Register Penalties", p. 22).
7905   int off;
7906   if (LP64_ONLY(true || ) VM_Version::is_P6() || src.uses(dst)) {
7907     off = offset();
7908     movzbl(dst, src); // movzxb
7909   } else {
7910     xorl(dst, dst);
7911     off = offset();
7912     movb(dst, src);
7913   }
7914   return off;
7915 }
7916 
7917 // Note: load_unsigned_short used to be called load_unsigned_word.
7918 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
  // According to Intel Doc. AP-526 ("Zero-Extension of Short", p. 16,
  // and "3.9 Partial Register Penalties", p. 22).
7921   int off;
7922   if (LP64_ONLY(true ||) VM_Version::is_P6() || src.uses(dst)) {
7923     off = offset();
7924     movzwl(dst, src); // movzxw
7925   } else {
7926     xorl(dst, dst);
7927     off = offset();
7928     movw(dst, src);
7929   }
7930   return off;
7931 }
7932 
7933 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
7934   switch (size_in_bytes) {
7935 #ifndef _LP64
7936   case  8:
7937     assert(dst2 != noreg, "second dest register required");
7938     movl(dst,  src);
7939     movl(dst2, src.plus_disp(BytesPerInt));
7940     break;
7941 #else
7942   case  8:  movq(dst, src); break;
7943 #endif
7944   case  4:  movl(dst, src); break;
7945   case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
7946   case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
7947   default:  ShouldNotReachHere();
7948   }
7949 }
7950 
7951 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
7952   switch (size_in_bytes) {
7953 #ifndef _LP64
7954   case  8:
7955     assert(src2 != noreg, "second source register required");
7956     movl(dst,                        src);
7957     movl(dst.plus_disp(BytesPerInt), src2);
7958     break;
7959 #else
7960   case  8:  movq(dst, src); break;
7961 #endif
7962   case  4:  movl(dst, src); break;
7963   case  2:  movw(dst, src); break;
7964   case  1:  movb(dst, src); break;
7965   default:  ShouldNotReachHere();
7966   }
7967 }
7968 
7969 void MacroAssembler::mov32(AddressLiteral dst, Register src) {
7970   if (reachable(dst)) {
7971     movl(as_Address(dst), src);
7972   } else {
7973     lea(rscratch1, dst);
7974     movl(Address(rscratch1, 0), src);
7975   }
7976 }
7977 
7978 void MacroAssembler::mov32(Register dst, AddressLiteral src) {
7979   if (reachable(src)) {
7980     movl(dst, as_Address(src));
7981   } else {
7982     lea(rscratch1, src);
7983     movl(dst, Address(rscratch1, 0));
7984   }
7985 }
7986 
7987 // C++ bool manipulation
7988 
7989 void MacroAssembler::movbool(Register dst, Address src) {
7990   if(sizeof(bool) == 1)
7991     movb(dst, src);
7992   else if(sizeof(bool) == 2)
7993     movw(dst, src);
7994   else if(sizeof(bool) == 4)
7995     movl(dst, src);
7996   else
7997     // unsupported
7998     ShouldNotReachHere();
7999 }
8000 
8001 void MacroAssembler::movbool(Address dst, bool boolconst) {
8002   if(sizeof(bool) == 1)
8003     movb(dst, (int) boolconst);
8004   else if(sizeof(bool) == 2)
8005     movw(dst, (int) boolconst);
8006   else if(sizeof(bool) == 4)
8007     movl(dst, (int) boolconst);
8008   else
8009     // unsupported
8010     ShouldNotReachHere();
8011 }
8012 
8013 void MacroAssembler::movbool(Address dst, Register src) {
8014   if(sizeof(bool) == 1)
8015     movb(dst, src);
8016   else if(sizeof(bool) == 2)
8017     movw(dst, src);
8018   else if(sizeof(bool) == 4)
8019     movl(dst, src);
8020   else
8021     // unsupported
8022     ShouldNotReachHere();
8023 }
8024 
8025 void MacroAssembler::movbyte(ArrayAddress dst, int src) {
8026   movb(as_Address(dst), src);
8027 }
8028 
8029 void MacroAssembler::movdl(XMMRegister dst, AddressLiteral src) {
8030   if (reachable(src)) {
8031     movdl(dst, as_Address(src));
8032   } else {
8033     lea(rscratch1, src);
8034     movdl(dst, Address(rscratch1, 0));
8035   }
8036 }
8037 
8038 void MacroAssembler::movq(XMMRegister dst, AddressLiteral src) {
8039   if (reachable(src)) {
8040     movq(dst, as_Address(src));
8041   } else {
8042     lea(rscratch1, src);
8043     movq(dst, Address(rscratch1, 0));
8044   }
8045 }
8046 
8047 void MacroAssembler::movdbl(XMMRegister dst, AddressLiteral src) {
8048   if (reachable(src)) {
8049     if (UseXmmLoadAndClearUpper) {
8050       movsd (dst, as_Address(src));
8051     } else {
8052       movlpd(dst, as_Address(src));
8053     }
8054   } else {
8055     lea(rscratch1, src);
8056     if (UseXmmLoadAndClearUpper) {
8057       movsd (dst, Address(rscratch1, 0));
8058     } else {
8059       movlpd(dst, Address(rscratch1, 0));
8060     }
8061   }
8062 }
8063 
8064 void MacroAssembler::movflt(XMMRegister dst, AddressLiteral src) {
8065   if (reachable(src)) {
8066     movss(dst, as_Address(src));
8067   } else {
8068     lea(rscratch1, src);
8069     movss(dst, Address(rscratch1, 0));
8070   }
8071 }
8072 
8073 void MacroAssembler::movptr(Register dst, Register src) {
8074   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
8075 }
8076 
8077 void MacroAssembler::movptr(Register dst, Address src) {
8078   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
8079 }
8080 
8081 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
8082 void MacroAssembler::movptr(Register dst, intptr_t src) {
8083   LP64_ONLY(mov64(dst, src)) NOT_LP64(movl(dst, src));
8084 }
8085 
8086 void MacroAssembler::movptr(Address dst, Register src) {
8087   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
8088 }
8089 
8090 void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src) {
8091   if (reachable(src)) {
8092     Assembler::movsd(dst, as_Address(src));
8093   } else {
8094     lea(rscratch1, src);
8095     Assembler::movsd(dst, Address(rscratch1, 0));
8096   }
8097 }
8098 
8099 void MacroAssembler::movss(XMMRegister dst, AddressLiteral src) {
8100   if (reachable(src)) {
8101     Assembler::movss(dst, as_Address(src));
8102   } else {
8103     lea(rscratch1, src);
8104     Assembler::movss(dst, Address(rscratch1, 0));
8105   }
8106 }
8107 
8108 void MacroAssembler::mulsd(XMMRegister dst, AddressLiteral src) {
8109   if (reachable(src)) {
8110     Assembler::mulsd(dst, as_Address(src));
8111   } else {
8112     lea(rscratch1, src);
8113     Assembler::mulsd(dst, Address(rscratch1, 0));
8114   }
8115 }
8116 
8117 void MacroAssembler::mulss(XMMRegister dst, AddressLiteral src) {
8118   if (reachable(src)) {
8119     Assembler::mulss(dst, as_Address(src));
8120   } else {
8121     lea(rscratch1, src);
8122     Assembler::mulss(dst, Address(rscratch1, 0));
8123   }
8124 }
8125 
8126 void MacroAssembler::null_check(Register reg, int offset) {
8127   if (needs_explicit_null_check(offset)) {
    // provoke OS NULL exception if reg == NULL by
8129     // accessing M[reg] w/o changing any (non-CC) registers
8130     // NOTE: cmpl is plenty here to provoke a segv
8131     cmpptr(rax, Address(reg, 0));
8132     // Note: should probably use testl(rax, Address(reg, 0));
8133     //       may be shorter code (however, this version of
8134     //       testl needs to be implemented first)
8135   } else {
    // nothing to do, (later) access of M[reg + offset]
    // will provoke OS NULL exception if reg == NULL
8138   }
8139 }
8140 
8141 void MacroAssembler::os_breakpoint() {
  // instead of directly emitting a breakpoint, call os::breakpoint for better debuggability
8143   // (e.g., MSVC can't call ps() otherwise)
8144   call(RuntimeAddress(CAST_FROM_FN_PTR(address, os::breakpoint)));
8145 }
8146 
8147 void MacroAssembler::pop_CPU_state() {
8148   pop_FPU_state();
8149   pop_IU_state();
8150 }
8151 
8152 void MacroAssembler::pop_FPU_state() {
8153   NOT_LP64(frstor(Address(rsp, 0));)
8154   LP64_ONLY(fxrstor(Address(rsp, 0));)
8155   addptr(rsp, FPUStateSizeInWords * wordSize);
8156 }
8157 
8158 void MacroAssembler::pop_IU_state() {
8159   popa();
8160   LP64_ONLY(addq(rsp, 8));
8161   popf();
8162 }
8163 
8164 // Save Integer and Float state
8165 // Warning: Stack must be 16 byte aligned (64bit)
8166 void MacroAssembler::push_CPU_state() {
8167   push_IU_state();
8168   push_FPU_state();
8169 }
8170 
8171 void MacroAssembler::push_FPU_state() {
8172   subptr(rsp, FPUStateSizeInWords * wordSize);
8173 #ifndef _LP64
8174   fnsave(Address(rsp, 0));
8175   fwait();
8176 #else
8177   fxsave(Address(rsp, 0));
#endif // _LP64
8179 }
8180 
8181 void MacroAssembler::push_IU_state() {
8182   // Push flags first because pusha kills them
8183   pushf();
8184   // Make sure rsp stays 16-byte aligned
8185   LP64_ONLY(subq(rsp, 8));
8186   pusha();
8187 }
8188 
8189 void MacroAssembler::reset_last_Java_frame(Register java_thread, bool clear_fp, bool clear_pc) {
8190   // determine java_thread register
8191   if (!java_thread->is_valid()) {
8192     java_thread = rdi;
8193     get_thread(java_thread);
8194   }
8195   // we must set sp to zero to clear frame
8196   movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
8197   if (clear_fp) {
8198     movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
8199   }
8200 
  if (clear_pc) {
    movptr(Address(java_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
  }
}
8205 
8206 void MacroAssembler::restore_rax(Register tmp) {
8207   if (tmp == noreg) pop(rax);
8208   else if (tmp != rax) mov(rax, tmp);
8209 }
8210 
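// Rounds reg up to the next multiple of modulus; modulus must be a power of
// two. For example, round_to(reg, 8) emits addptr(reg, 7); andptr(reg, -8).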
8211 void MacroAssembler::round_to(Register reg, int modulus) {
8212   addptr(reg, modulus - 1);
8213   andptr(reg, -modulus);
8214 }
8215 
8216 void MacroAssembler::save_rax(Register tmp) {
8217   if (tmp == noreg) push(rax);
8218   else if (tmp != rax) mov(tmp, rax);
8219 }
8220 
8221 // Write serialization page so VM thread can do a pseudo remote membar.
8222 // We use the current thread pointer to calculate a thread specific
8223 // offset to write to within the page. This minimizes bus traffic
8224 // due to cache line collision.
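// For example (illustrative values): with a 4K page and sizeof(int) == 4, the
// store below lands at offset (thread >> shift_count) & 0xffc within the page,
// so differently-placed threads tend to dirty different words.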
8225 void MacroAssembler::serialize_memory(Register thread, Register tmp) {
8226   movl(tmp, thread);
8227   shrl(tmp, os::get_serialize_page_shift_count());
8228   andl(tmp, (os::vm_page_size() - sizeof(int)));
8229 
8230   Address index(noreg, tmp, Address::times_1);
8231   ExternalAddress page(os::get_memory_serialize_page());
8232 
8233   // Size of store must match masking code above
8234   movl(as_Address(ArrayAddress(page, index)), tmp);
8235 }
8236 
8237 // Calls to C land
8238 //
// When entering C land, the rbp and rsp of the last Java frame have to be recorded
8240 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp
8241 // has to be reset to 0. This is required to allow proper stack traversal.
8242 void MacroAssembler::set_last_Java_frame(Register java_thread,
8243                                          Register last_java_sp,
8244                                          Register last_java_fp,
8245                                          address  last_java_pc) {
8246   // determine java_thread register
8247   if (!java_thread->is_valid()) {
8248     java_thread = rdi;
8249     get_thread(java_thread);
8250   }
8251   // determine last_java_sp register
8252   if (!last_java_sp->is_valid()) {
8253     last_java_sp = rsp;
8254   }
8255 
8256   // last_java_fp is optional
8257 
8258   if (last_java_fp->is_valid()) {
8259     movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), last_java_fp);
8260   }
8261 
8262   // last_java_pc is optional
8263 
8264   if (last_java_pc != NULL) {
8265     lea(Address(java_thread,
8266                  JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()),
8267         InternalAddress(last_java_pc));
8268 
8269   }
8270   movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
8271 }
8272 
8273 void MacroAssembler::shlptr(Register dst, int imm8) {
8274   LP64_ONLY(shlq(dst, imm8)) NOT_LP64(shll(dst, imm8));
8275 }
8276 
8277 void MacroAssembler::shrptr(Register dst, int imm8) {
8278   LP64_ONLY(shrq(dst, imm8)) NOT_LP64(shrl(dst, imm8));
8279 }
8280 
8281 void MacroAssembler::sign_extend_byte(Register reg) {
8282   if (LP64_ONLY(true ||) (VM_Version::is_P6() && reg->has_byte_register())) {
8283     movsbl(reg, reg); // movsxb
8284   } else {
8285     shll(reg, 24);
8286     sarl(reg, 24);
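    // e.g. reg == 0x000000ff: shll leaves 0xff000000, sarl yields 0xffffffff (-1).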
8287   }
8288 }
8289 
8290 void MacroAssembler::sign_extend_short(Register reg) {
8291   if (LP64_ONLY(true ||) VM_Version::is_P6()) {
8292     movswl(reg, reg); // movsxw
8293   } else {
8294     shll(reg, 16);
8295     sarl(reg, 16);
8296   }
8297 }
8298 
8299 void MacroAssembler::testl(Register dst, AddressLiteral src) {
8300   assert(reachable(src), "Address should be reachable");
8301   testl(dst, as_Address(src));
8302 }
8303 
8304 void MacroAssembler::sqrtsd(XMMRegister dst, AddressLiteral src) {
8305   if (reachable(src)) {
8306     Assembler::sqrtsd(dst, as_Address(src));
8307   } else {
8308     lea(rscratch1, src);
8309     Assembler::sqrtsd(dst, Address(rscratch1, 0));
8310   }
8311 }
8312 
8313 void MacroAssembler::sqrtss(XMMRegister dst, AddressLiteral src) {
8314   if (reachable(src)) {
8315     Assembler::sqrtss(dst, as_Address(src));
8316   } else {
8317     lea(rscratch1, src);
8318     Assembler::sqrtss(dst, Address(rscratch1, 0));
8319   }
8320 }
8321 
8322 void MacroAssembler::subsd(XMMRegister dst, AddressLiteral src) {
8323   if (reachable(src)) {
8324     Assembler::subsd(dst, as_Address(src));
8325   } else {
8326     lea(rscratch1, src);
8327     Assembler::subsd(dst, Address(rscratch1, 0));
8328   }
8329 }
8330 
8331 void MacroAssembler::subss(XMMRegister dst, AddressLiteral src) {
8332   if (reachable(src)) {
8333     Assembler::subss(dst, as_Address(src));
8334   } else {
8335     lea(rscratch1, src);
8336     Assembler::subss(dst, Address(rscratch1, 0));
8337   }
8338 }
8339 
8340 void MacroAssembler::ucomisd(XMMRegister dst, AddressLiteral src) {
8341   if (reachable(src)) {
8342     Assembler::ucomisd(dst, as_Address(src));
8343   } else {
8344     lea(rscratch1, src);
8345     Assembler::ucomisd(dst, Address(rscratch1, 0));
8346   }
8347 }
8348 
8349 void MacroAssembler::ucomiss(XMMRegister dst, AddressLiteral src) {
8350   if (reachable(src)) {
8351     Assembler::ucomiss(dst, as_Address(src));
8352   } else {
8353     lea(rscratch1, src);
8354     Assembler::ucomiss(dst, Address(rscratch1, 0));
8355   }
8356 }
8357 
8358 void MacroAssembler::xorpd(XMMRegister dst, AddressLiteral src) {
8359   // Used in sign-bit flipping with aligned address.
8360   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
8361   if (reachable(src)) {
8362     Assembler::xorpd(dst, as_Address(src));
8363   } else {
8364     lea(rscratch1, src);
8365     Assembler::xorpd(dst, Address(rscratch1, 0));
8366   }
8367 }
8368 
8369 void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src) {
8370   // Used in sign-bit flipping with aligned address.
8371   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
8372   if (reachable(src)) {
8373     Assembler::xorps(dst, as_Address(src));
8374   } else {
8375     lea(rscratch1, src);
8376     Assembler::xorps(dst, Address(rscratch1, 0));
8377   }
8378 }
8379 
8380 // AVX 3-operands instructions
8381 
8382 void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
8383   if (reachable(src)) {
8384     vaddsd(dst, nds, as_Address(src));
8385   } else {
8386     lea(rscratch1, src);
8387     vaddsd(dst, nds, Address(rscratch1, 0));
8388   }
8389 }
8390 
8391 void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
8392   if (reachable(src)) {
8393     vaddss(dst, nds, as_Address(src));
8394   } else {
8395     lea(rscratch1, src);
8396     vaddss(dst, nds, Address(rscratch1, 0));
8397   }
8398 }
8399 
8400 void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256) {
8401   if (reachable(src)) {
8402     vandpd(dst, nds, as_Address(src), vector256);
8403   } else {
8404     lea(rscratch1, src);
8405     vandpd(dst, nds, Address(rscratch1, 0), vector256);
8406   }
8407 }
8408 
8409 void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256) {
8410   if (reachable(src)) {
8411     vandps(dst, nds, as_Address(src), vector256);
8412   } else {
8413     lea(rscratch1, src);
8414     vandps(dst, nds, Address(rscratch1, 0), vector256);
8415   }
8416 }
8417 
8418 void MacroAssembler::vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
8419   if (reachable(src)) {
8420     vdivsd(dst, nds, as_Address(src));
8421   } else {
8422     lea(rscratch1, src);
8423     vdivsd(dst, nds, Address(rscratch1, 0));
8424   }
8425 }
8426 
8427 void MacroAssembler::vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
8428   if (reachable(src)) {
8429     vdivss(dst, nds, as_Address(src));
8430   } else {
8431     lea(rscratch1, src);
8432     vdivss(dst, nds, Address(rscratch1, 0));
8433   }
8434 }
8435 
8436 void MacroAssembler::vmulsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
8437   if (reachable(src)) {
8438     vmulsd(dst, nds, as_Address(src));
8439   } else {
8440     lea(rscratch1, src);
8441     vmulsd(dst, nds, Address(rscratch1, 0));
8442   }
8443 }
8444 
8445 void MacroAssembler::vmulss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
8446   if (reachable(src)) {
8447     vmulss(dst, nds, as_Address(src));
8448   } else {
8449     lea(rscratch1, src);
8450     vmulss(dst, nds, Address(rscratch1, 0));
8451   }
8452 }
8453 
8454 void MacroAssembler::vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
8455   if (reachable(src)) {
8456     vsubsd(dst, nds, as_Address(src));
8457   } else {
8458     lea(rscratch1, src);
8459     vsubsd(dst, nds, Address(rscratch1, 0));
8460   }
8461 }
8462 
8463 void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
8464   if (reachable(src)) {
8465     vsubss(dst, nds, as_Address(src));
8466   } else {
8467     lea(rscratch1, src);
8468     vsubss(dst, nds, Address(rscratch1, 0));
8469   }
8470 }
8471 
8472 void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256) {
8473   if (reachable(src)) {
8474     vxorpd(dst, nds, as_Address(src), vector256);
8475   } else {
8476     lea(rscratch1, src);
8477     vxorpd(dst, nds, Address(rscratch1, 0), vector256);
8478   }
8479 }
8480 
8481 void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256) {
8482   if (reachable(src)) {
8483     vxorps(dst, nds, as_Address(src), vector256);
8484   } else {
8485     lea(rscratch1, src);
8486     vxorps(dst, nds, Address(rscratch1, 0), vector256);
8487   }
8488 }
8489 
8490 
8491 //////////////////////////////////////////////////////////////////////////////////
8492 #ifndef SERIALGC
8493 
8494 void MacroAssembler::g1_write_barrier_pre(Register obj,
8495                                           Register pre_val,
8496                                           Register thread,
8497                                           Register tmp,
8498                                           bool tosca_live,
8499                                           bool expand_call) {
8500 
  // If expand_call is true then we expand the call_VM_leaf macro
  // directly so that we skip the _last_sp check generated by
  // InterpreterMacroAssembler::call_VM_leaf_base.
8504 
8505 #ifdef _LP64
8506   assert(thread == r15_thread, "must be");
8507 #endif // _LP64
8508 
8509   Label done;
8510   Label runtime;
8511 
8512   assert(pre_val != noreg, "check this code");
8513 
8514   if (obj != noreg) {
8515     assert_different_registers(obj, pre_val, tmp);
8516     assert(pre_val != rax, "check this code");
8517   }
8518 
8519   Address in_progress(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
8520                                        PtrQueue::byte_offset_of_active()));
8521   Address index(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
8522                                        PtrQueue::byte_offset_of_index()));
8523   Address buffer(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
8524                                        PtrQueue::byte_offset_of_buf()));
8525 
8526 
8527   // Is marking active?
8528   if (in_bytes(PtrQueue::byte_width_of_active()) == 4) {
8529     cmpl(in_progress, 0);
8530   } else {
8531     assert(in_bytes(PtrQueue::byte_width_of_active()) == 1, "Assumption");
8532     cmpb(in_progress, 0);
8533   }
8534   jcc(Assembler::equal, done);
8535 
8536   // Do we need to load the previous value?
8537   if (obj != noreg) {
8538     load_heap_oop(pre_val, Address(obj, 0));
8539   }
8540 
8541   // Is the previous value null?
8542   cmpptr(pre_val, (int32_t) NULL_WORD);
8543   jcc(Assembler::equal, done);
8544 
8545   // Can we store original value in the thread's buffer?
8546   // Is index == 0?
8547   // (The index field is typed as size_t.)
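  //
  // Pseudo-code of the fast path below (a sketch):
  //   if (index == 0) goto runtime;   // buffer is full
  //   index -= wordSize;
  //   buffer[index] = pre_val;        // log the previous value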
8548 
8549   movptr(tmp, index);                   // tmp := *index_adr
8550   cmpptr(tmp, 0);                       // tmp == 0?
8551   jcc(Assembler::equal, runtime);       // If yes, goto runtime
8552 
8553   subptr(tmp, wordSize);                // tmp := tmp - wordSize
8554   movptr(index, tmp);                   // *index_adr := tmp
8555   addptr(tmp, buffer);                  // tmp := tmp + *buffer_adr
8556 
8557   // Record the previous value
8558   movptr(Address(tmp, 0), pre_val);
8559   jmp(done);
8560 
8561   bind(runtime);
8562   // save the live input values
  if (tosca_live) push(rax);
8564 
8565   if (obj != noreg && obj != rax)
8566     push(obj);
8567 
8568   if (pre_val != rax)
8569     push(pre_val);
8570 
  // Calling the runtime using the regular call_VM_leaf mechanism generates
  // code (generated by InterpreterMacroAssembler::call_VM_leaf_base)
  // that checks that *(ebp+frame::interpreter_frame_last_sp) == NULL.
  //
  // If we are generating the pre-barrier without a frame (e.g. in the
  // intrinsified Reference.get() routine) then ebp might be pointing to
  // the caller frame and so this check will most likely fail at runtime.
  //
  // Expanding the call directly bypasses the generation of the check.
  // So when we do not have a full interpreter frame on the stack
  // expand_call should be passed true.
8582 
8583   NOT_LP64( push(thread); )
8584 
8585   if (expand_call) {
8586     LP64_ONLY( assert(pre_val != c_rarg1, "smashed arg"); )
8587     pass_arg1(this, thread);
8588     pass_arg0(this, pre_val);
8589     MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), 2);
8590   } else {
8591     call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), pre_val, thread);
8592   }
8593 
8594   NOT_LP64( pop(thread); )
8595 
  // restore the live input values
8597   if (pre_val != rax)
8598     pop(pre_val);
8599 
8600   if (obj != noreg && obj != rax)
8601     pop(obj);
8602 
  if (tosca_live) pop(rax);
8604 
8605   bind(done);
8606 }
8607 
8608 void MacroAssembler::g1_write_barrier_post(Register store_addr,
8609                                            Register new_val,
8610                                            Register thread,
8611                                            Register tmp,
8612                                            Register tmp2) {
8613 #ifdef _LP64
8614   assert(thread == r15_thread, "must be");
8615 #endif // _LP64
8616 
8617   Address queue_index(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
8618                                        PtrQueue::byte_offset_of_index()));
8619   Address buffer(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
8620                                        PtrQueue::byte_offset_of_buf()));
8621 
8622   BarrierSet* bs = Universe::heap()->barrier_set();
8623   CardTableModRefBS* ct = (CardTableModRefBS*)bs;
8624   Label done;
8625   Label runtime;
8626 
8627   // Does store cross heap regions?
8628 
8629   movptr(tmp, store_addr);
8630   xorptr(tmp, new_val);
8631   shrptr(tmp, HeapRegion::LogOfHRGrainBytes);
8632   jcc(Assembler::equal, done);
8633 
8634   // crosses regions, storing NULL?
8635 
8636   cmpptr(new_val, (int32_t) NULL_WORD);
8637   jcc(Assembler::equal, done);
8638 
8639   // storing region crossing non-NULL, is card already dirty?
8640 
8641   ExternalAddress cardtable((address) ct->byte_map_base);
8642   assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
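  // The card byte for an address lives at byte_map_base + (addr >> card_shift);
  // the code below forms that sum explicitly on 64-bit and via an ArrayAddress
  // on 32-bit.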
8643 #ifdef _LP64
8644   const Register card_addr = tmp;
8645 
8646   movq(card_addr, store_addr);
8647   shrq(card_addr, CardTableModRefBS::card_shift);
8648 
8649   lea(tmp2, cardtable);
8650 
8651   // get the address of the card
8652   addq(card_addr, tmp2);
8653 #else
8654   const Register card_index = tmp;
8655 
8656   movl(card_index, store_addr);
8657   shrl(card_index, CardTableModRefBS::card_shift);
8658 
8659   Address index(noreg, card_index, Address::times_1);
8660   const Register card_addr = tmp;
8661   lea(card_addr, as_Address(ArrayAddress(cardtable, index)));
8662 #endif
8663   cmpb(Address(card_addr, 0), 0);
8664   jcc(Assembler::equal, done);
8665 
  // storing a region-crossing, non-NULL oop and the card is clean:
  // dirty the card and log it.
8668 
8669   movb(Address(card_addr, 0), 0);
8670 
8671   cmpl(queue_index, 0);
8672   jcc(Assembler::equal, runtime);
8673   subl(queue_index, wordSize);
8674   movptr(tmp2, buffer);
8675 #ifdef _LP64
8676   movslq(rscratch1, queue_index);
8677   addq(tmp2, rscratch1);
8678   movq(Address(tmp2, 0), card_addr);
8679 #else
8680   addl(tmp2, queue_index);
8681   movl(Address(tmp2, 0), card_index);
8682 #endif
8683   jmp(done);
8684 
8685   bind(runtime);
8686   // save the live input values
8687   push(store_addr);
8688   push(new_val);
8689 #ifdef _LP64
8690   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), card_addr, r15_thread);
8691 #else
8692   push(thread);
8693   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), card_addr, thread);
8694   pop(thread);
8695 #endif
8696   pop(new_val);
8697   pop(store_addr);
8698 
8699   bind(done);
8700 }
8701 
8702 #endif // SERIALGC
8703 //////////////////////////////////////////////////////////////////////////////////
8704 
8705 
8706 void MacroAssembler::store_check(Register obj) {
8707   // Does a store check for the oop in register obj. The content of
8708   // register obj is destroyed afterwards.
8709   store_check_part_1(obj);
8710   store_check_part_2(obj);
8711 }
8712 
8713 void MacroAssembler::store_check(Register obj, Address dst) {
8714   store_check(obj);
8715 }
8716 
8717 
// split the store check operation so that other instructions can be scheduled in between
8719 void MacroAssembler::store_check_part_1(Register obj) {
8720   BarrierSet* bs = Universe::heap()->barrier_set();
8721   assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
8722   shrptr(obj, CardTableModRefBS::card_shift);
8723 }
8724 
8725 void MacroAssembler::store_check_part_2(Register obj) {
8726   BarrierSet* bs = Universe::heap()->barrier_set();
8727   assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
8728   CardTableModRefBS* ct = (CardTableModRefBS*)bs;
8729   assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
8730 
8731   // The calculation for byte_map_base is as follows:
8732   // byte_map_base = _byte_map - (uintptr_t(low_bound) >> card_shift);
8733   // So this essentially converts an address to a displacement and
8734   // it will never need to be relocated. On 64bit however the value may be too
8735   // large for a 32bit displacement
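  //
  // Example: store_check_part_1 already shifted obj right by card_shift, so
  // the card byte for the original address is simply at byte_map_base + obj.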
8736 
8737   intptr_t disp = (intptr_t) ct->byte_map_base;
8738   if (is_simm32(disp)) {
8739     Address cardtable(noreg, obj, Address::times_1, disp);
8740     movb(cardtable, 0);
8741   } else {
    // By doing it as an ExternalAddress, disp could be converted to a rip-relative
    // displacement and done in a single instruction given favorable mapping and
    // a smarter version of as_Address. Worst case it is two instructions, which
    // is no worse than loading disp into a register and using a simple
    // Address() as above.
    // We can't use ExternalAddress as the only style since if disp == 0 we'll
    // assert, because NULL isn't acceptable in a relocation (see 6644928).
    // Even so, in some cases we'll get a single-instruction version.
8750 
8751     ExternalAddress cardtable((address)disp);
8752     Address index(noreg, obj, Address::times_1);
8753     movb(as_Address(ArrayAddress(cardtable, index)), 0);
8754   }
8755 }
8756 
8757 void MacroAssembler::subptr(Register dst, int32_t imm32) {
8758   LP64_ONLY(subq(dst, imm32)) NOT_LP64(subl(dst, imm32));
8759 }
8760 
8761 // Force generation of a 4 byte immediate value even if it fits into 8bit
8762 void MacroAssembler::subptr_imm32(Register dst, int32_t imm32) {
8763   LP64_ONLY(subq_imm32(dst, imm32)) NOT_LP64(subl_imm32(dst, imm32));
8764 }
8765 
8766 void MacroAssembler::subptr(Register dst, Register src) {
8767   LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src));
8768 }
8769 
8770 // C++ bool manipulation
8771 void MacroAssembler::testbool(Register dst) {
  if (sizeof(bool) == 1)
    testb(dst, 0xff);
  else if (sizeof(bool) == 2) {
    // testw implementation needed for two byte bools
    ShouldNotReachHere();
  } else if (sizeof(bool) == 4)
    testl(dst, dst);
  else
    // unsupported
    ShouldNotReachHere();
8782 }
8783 
8784 void MacroAssembler::testptr(Register dst, Register src) {
8785   LP64_ONLY(testq(dst, src)) NOT_LP64(testl(dst, src));
8786 }
8787 
8788 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
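// Pseudo-code of the fast path (a sketch of what the emitted code does):
//   obj = thread->tlab_top;
//   end = obj + size;
//   if (end > thread->tlab_end) goto slow_case;
//   thread->tlab_top = end;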
8789 void MacroAssembler::tlab_allocate(Register obj,
8790                                    Register var_size_in_bytes,
8791                                    int con_size_in_bytes,
8792                                    Register t1,
8793                                    Register t2,
8794                                    Label& slow_case) {
8795   assert_different_registers(obj, t1, t2);
8796   assert_different_registers(obj, var_size_in_bytes, t1);
8797   Register end = t2;
8798   Register thread = NOT_LP64(t1) LP64_ONLY(r15_thread);
8799 
8800   verify_tlab();
8801 
8802   NOT_LP64(get_thread(thread));
8803 
8804   movptr(obj, Address(thread, JavaThread::tlab_top_offset()));
8805   if (var_size_in_bytes == noreg) {
8806     lea(end, Address(obj, con_size_in_bytes));
8807   } else {
8808     lea(end, Address(obj, var_size_in_bytes, Address::times_1));
8809   }
8810   cmpptr(end, Address(thread, JavaThread::tlab_end_offset()));
8811   jcc(Assembler::above, slow_case);
8812 
8813   // update the tlab top pointer
8814   movptr(Address(thread, JavaThread::tlab_top_offset()), end);
8815 
8816   // recover var_size_in_bytes if necessary
8817   if (var_size_in_bytes == end) {
8818     subptr(var_size_in_bytes, obj);
8819   }
8820   verify_tlab();
8821 }
8822 
// Preserves rbx and rdx.
8824 Register MacroAssembler::tlab_refill(Label& retry,
8825                                      Label& try_eden,
8826                                      Label& slow_case) {
8827   Register top = rax;
8828   Register t1  = rcx;
8829   Register t2  = rsi;
8830   Register thread_reg = NOT_LP64(rdi) LP64_ONLY(r15_thread);
8831   assert_different_registers(top, thread_reg, t1, t2, /* preserve: */ rbx, rdx);
8832   Label do_refill, discard_tlab;
8833 
8834   if (CMSIncrementalMode || !Universe::heap()->supports_inline_contig_alloc()) {
8835     // No allocation in the shared eden.
8836     jmp(slow_case);
8837   }
8838 
8839   NOT_LP64(get_thread(thread_reg));
8840 
8841   movptr(top, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
8842   movptr(t1,  Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())));
8843 
8844   // calculate amount of free space
8845   subptr(t1, top);
8846   shrptr(t1, LogHeapWordSize);
8847 
8848   // Retain tlab and allocate object in shared space if
8849   // the amount free in the tlab is too large to discard.
8850   cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_refill_waste_limit_offset())));
8851   jcc(Assembler::lessEqual, discard_tlab);
8852 
8853   // Retain
8854   // %%% yuck as movptr...
8855   movptr(t2, (int32_t) ThreadLocalAllocBuffer::refill_waste_limit_increment());
8856   addptr(Address(thread_reg, in_bytes(JavaThread::tlab_refill_waste_limit_offset())), t2);
8857   if (TLABStats) {
8858     // increment number of slow_allocations
8859     addl(Address(thread_reg, in_bytes(JavaThread::tlab_slow_allocations_offset())), 1);
8860   }
8861   jmp(try_eden);
8862 
8863   bind(discard_tlab);
8864   if (TLABStats) {
8865     // increment number of refills
8866     addl(Address(thread_reg, in_bytes(JavaThread::tlab_number_of_refills_offset())), 1);
8867     // accumulate wastage -- t1 is amount free in tlab
8868     addl(Address(thread_reg, in_bytes(JavaThread::tlab_fast_refill_waste_offset())), t1);
8869   }
8870 
8871   // if tlab is currently allocated (top or end != null) then
8872   // fill [top, end + alignment_reserve) with array object
8873   testptr(top, top);
8874   jcc(Assembler::zero, do_refill);
8875 
8876   // set up the mark word
8877   movptr(Address(top, oopDesc::mark_offset_in_bytes()), (intptr_t)markOopDesc::prototype()->copy_set_hash(0x2));
8878   // set the length to the remaining space
8879   subptr(t1, typeArrayOopDesc::header_size(T_INT));
8880   addptr(t1, (int32_t)ThreadLocalAllocBuffer::alignment_reserve());
8881   shlptr(t1, log2_intptr(HeapWordSize/sizeof(jint)));
8882   movl(Address(top, arrayOopDesc::length_offset_in_bytes()), t1);
8883   // set klass to intArrayKlass
  // dubious reloc: why not an oop reloc?
8885   movptr(t1, ExternalAddress((address)Universe::intArrayKlassObj_addr()));
  // store klass last.  Concurrent GCs assume the length is valid if the
  // klass field is not null.
8888   store_klass(top, t1);
8889 
8890   movptr(t1, top);
8891   subptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())));
8892   incr_allocated_bytes(thread_reg, t1, 0);
8893 
8894   // refill the tlab with an eden allocation
8895   bind(do_refill);
8896   movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_size_offset())));
8897   shlptr(t1, LogHeapWordSize);
8898   // allocate new tlab, address returned in top
8899   eden_allocate(top, t1, 0, t2, slow_case);
8900 
8901   // Check that t1 was preserved in eden_allocate.
8902 #ifdef ASSERT
8903   if (UseTLAB) {
8904     Label ok;
8905     Register tsize = rsi;
8906     assert_different_registers(tsize, thread_reg, t1);
8907     push(tsize);
8908     movptr(tsize, Address(thread_reg, in_bytes(JavaThread::tlab_size_offset())));
8909     shlptr(tsize, LogHeapWordSize);
8910     cmpptr(t1, tsize);
8911     jcc(Assembler::equal, ok);
8912     STOP("assert(t1 != tlab size)");
8913     should_not_reach_here();
8914 
8915     bind(ok);
8916     pop(tsize);
8917   }
8918 #endif
8919   movptr(Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())), top);
8920   movptr(Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())), top);
8921   addptr(top, t1);
8922   subptr(top, (int32_t)ThreadLocalAllocBuffer::alignment_reserve_in_bytes());
8923   movptr(Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())), top);
8924   verify_tlab();
8925   jmp(retry);
8926 
8927   return thread_reg; // for use by caller
8928 }
8929 
8930 void MacroAssembler::incr_allocated_bytes(Register thread,
8931                                           Register var_size_in_bytes,
8932                                           int con_size_in_bytes,
8933                                           Register t1) {
8934   if (!thread->is_valid()) {
8935 #ifdef _LP64
8936     thread = r15_thread;
8937 #else
8938     assert(t1->is_valid(), "need temp reg");
8939     thread = t1;
8940     get_thread(thread);
8941 #endif
8942   }
8943 
8944 #ifdef _LP64
8945   if (var_size_in_bytes->is_valid()) {
8946     addq(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), var_size_in_bytes);
8947   } else {
8948     addq(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), con_size_in_bytes);
8949   }
8950 #else
8951   if (var_size_in_bytes->is_valid()) {
8952     addl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), var_size_in_bytes);
8953   } else {
8954     addl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), con_size_in_bytes);
8955   }
8956   adcl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())+4), 0);
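  // (The addl/adcl pair above implements a 64-bit add of the allocation size
  //  into the allocated_bytes counter on 32-bit platforms.)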
8957 #endif
8958 }
8959 
8960 void MacroAssembler::fp_runtime_fallback(address runtime_entry, int nb_args, int num_fpu_regs_in_use) {
8961   pusha();
8962 
8963   // if we are coming from c1, xmm registers may be live
8964   int off = 0;
8965   if (UseSSE == 1)  {
8966     subptr(rsp, sizeof(jdouble)*8);
8967     movflt(Address(rsp,off++*sizeof(jdouble)),xmm0);
8968     movflt(Address(rsp,off++*sizeof(jdouble)),xmm1);
8969     movflt(Address(rsp,off++*sizeof(jdouble)),xmm2);
8970     movflt(Address(rsp,off++*sizeof(jdouble)),xmm3);
8971     movflt(Address(rsp,off++*sizeof(jdouble)),xmm4);
8972     movflt(Address(rsp,off++*sizeof(jdouble)),xmm5);
8973     movflt(Address(rsp,off++*sizeof(jdouble)),xmm6);
8974     movflt(Address(rsp,off++*sizeof(jdouble)),xmm7);
8975   } else if (UseSSE >= 2)  {
8976 #ifdef COMPILER2
8977     if (MaxVectorSize > 16) {
8978       assert(UseAVX > 0, "256bit vectors are supported only with AVX");
      // Save upper half of YMM registers
8980       subptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
8981       vextractf128h(Address(rsp,  0),xmm0);
8982       vextractf128h(Address(rsp, 16),xmm1);
8983       vextractf128h(Address(rsp, 32),xmm2);
8984       vextractf128h(Address(rsp, 48),xmm3);
8985       vextractf128h(Address(rsp, 64),xmm4);
8986       vextractf128h(Address(rsp, 80),xmm5);
8987       vextractf128h(Address(rsp, 96),xmm6);
8988       vextractf128h(Address(rsp,112),xmm7);
8989 #ifdef _LP64
8990       vextractf128h(Address(rsp,128),xmm8);
8991       vextractf128h(Address(rsp,144),xmm9);
8992       vextractf128h(Address(rsp,160),xmm10);
8993       vextractf128h(Address(rsp,176),xmm11);
8994       vextractf128h(Address(rsp,192),xmm12);
8995       vextractf128h(Address(rsp,208),xmm13);
8996       vextractf128h(Address(rsp,224),xmm14);
8997       vextractf128h(Address(rsp,240),xmm15);
8998 #endif
8999     }
9000 #endif
    // Save whole 128-bit (16 bytes) XMM registers
9002     subptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
9003     movdqu(Address(rsp,off++*16),xmm0);
9004     movdqu(Address(rsp,off++*16),xmm1);
9005     movdqu(Address(rsp,off++*16),xmm2);
9006     movdqu(Address(rsp,off++*16),xmm3);
9007     movdqu(Address(rsp,off++*16),xmm4);
9008     movdqu(Address(rsp,off++*16),xmm5);
9009     movdqu(Address(rsp,off++*16),xmm6);
9010     movdqu(Address(rsp,off++*16),xmm7);
9011 #ifdef _LP64
9012     movdqu(Address(rsp,off++*16),xmm8);
9013     movdqu(Address(rsp,off++*16),xmm9);
9014     movdqu(Address(rsp,off++*16),xmm10);
9015     movdqu(Address(rsp,off++*16),xmm11);
9016     movdqu(Address(rsp,off++*16),xmm12);
9017     movdqu(Address(rsp,off++*16),xmm13);
9018     movdqu(Address(rsp,off++*16),xmm14);
9019     movdqu(Address(rsp,off++*16),xmm15);
9020 #endif
9021   }
9022 
9023   // Preserve registers across runtime call
9024   int incoming_argument_and_return_value_offset = -1;
9025   if (num_fpu_regs_in_use > 1) {
    // Must preserve all other FPU regs (could alternatively convert
    // SharedRuntime::dsin, dcos etc. into assembly routines known not to trash
    // FPU state, but cannot trust the C compiler)
9029     NEEDS_CLEANUP;
    // NOTE that in this case we also push the incoming argument(s) to
    // the stack and restore them later; we also use this stack slot to
    // hold the return value from dsin, dcos etc.
9033     for (int i = 0; i < num_fpu_regs_in_use; i++) {
9034       subptr(rsp, sizeof(jdouble));
9035       fstp_d(Address(rsp, 0));
9036     }
9037     incoming_argument_and_return_value_offset = sizeof(jdouble)*(num_fpu_regs_in_use-1);
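    // (The first fstp_d above popped ST(0), the incoming argument, so it now
    //  sits at this offset; the same slot receives the return value later.)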
9038     for (int i = nb_args-1; i >= 0; i--) {
9039       fld_d(Address(rsp, incoming_argument_and_return_value_offset-i*sizeof(jdouble)));
9040     }
9041   }
9042 
9043   subptr(rsp, nb_args*sizeof(jdouble));
9044   for (int i = 0; i < nb_args; i++) {
9045     fstp_d(Address(rsp, i*sizeof(jdouble)));
9046   }
9047 
9048 #ifdef _LP64
9049   if (nb_args > 0) {
9050     movdbl(xmm0, Address(rsp, 0));
9051   }
9052   if (nb_args > 1) {
9053     movdbl(xmm1, Address(rsp, sizeof(jdouble)));
9054   }
9055   assert(nb_args <= 2, "unsupported number of args");
9056 #endif // _LP64
9057 
  // NOTE: we must not use call_VM_leaf here because that requires a
  // complete interpreter frame in debug mode -- same bug as 4387334
  // MacroAssembler::call_VM_leaf_base is perfectly safe and will
  // honor the proper 64-bit ABI
9062 
9063   NEEDS_CLEANUP;
9064   // Need to add stack banging before this runtime call if it needs to
9065   // be taken; however, there is no generic stack banging routine at
9066   // the MacroAssembler level
9067 
9068   MacroAssembler::call_VM_leaf_base(runtime_entry, 0);
9069 
9070 #ifdef _LP64
9071   movsd(Address(rsp, 0), xmm0);
9072   fld_d(Address(rsp, 0));
9073 #endif // _LP64
9074   addptr(rsp, sizeof(jdouble) * nb_args);
9075   if (num_fpu_regs_in_use > 1) {
9076     // Must save return value to stack and then restore entire FPU
9077     // stack except incoming arguments
9078     fstp_d(Address(rsp, incoming_argument_and_return_value_offset));
9079     for (int i = 0; i < num_fpu_regs_in_use - nb_args; i++) {
9080       fld_d(Address(rsp, 0));
9081       addptr(rsp, sizeof(jdouble));
9082     }
9083     fld_d(Address(rsp, (nb_args-1)*sizeof(jdouble)));
9084     addptr(rsp, sizeof(jdouble) * nb_args);
9085   }
9086 
9087   off = 0;
9088   if (UseSSE == 1)  {
9089     movflt(xmm0, Address(rsp,off++*sizeof(jdouble)));
9090     movflt(xmm1, Address(rsp,off++*sizeof(jdouble)));
9091     movflt(xmm2, Address(rsp,off++*sizeof(jdouble)));
9092     movflt(xmm3, Address(rsp,off++*sizeof(jdouble)));
9093     movflt(xmm4, Address(rsp,off++*sizeof(jdouble)));
9094     movflt(xmm5, Address(rsp,off++*sizeof(jdouble)));
9095     movflt(xmm6, Address(rsp,off++*sizeof(jdouble)));
9096     movflt(xmm7, Address(rsp,off++*sizeof(jdouble)));
9097     addptr(rsp, sizeof(jdouble)*8);
9098   } else if (UseSSE >= 2)  {
    // Restore whole 128-bit (16 bytes) XMM registers
9100     movdqu(xmm0, Address(rsp,off++*16));
9101     movdqu(xmm1, Address(rsp,off++*16));
9102     movdqu(xmm2, Address(rsp,off++*16));
9103     movdqu(xmm3, Address(rsp,off++*16));
9104     movdqu(xmm4, Address(rsp,off++*16));
9105     movdqu(xmm5, Address(rsp,off++*16));
9106     movdqu(xmm6, Address(rsp,off++*16));
9107     movdqu(xmm7, Address(rsp,off++*16));
9108 #ifdef _LP64
9109     movdqu(xmm8, Address(rsp,off++*16));
9110     movdqu(xmm9, Address(rsp,off++*16));
9111     movdqu(xmm10, Address(rsp,off++*16));
9112     movdqu(xmm11, Address(rsp,off++*16));
9113     movdqu(xmm12, Address(rsp,off++*16));
9114     movdqu(xmm13, Address(rsp,off++*16));
9115     movdqu(xmm14, Address(rsp,off++*16));
9116     movdqu(xmm15, Address(rsp,off++*16));
9117 #endif
9118     addptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
9119 #ifdef COMPILER2
9120     if (MaxVectorSize > 16) {
      // Restore upper half of YMM registers.
9122       vinsertf128h(xmm0, Address(rsp,  0));
9123       vinsertf128h(xmm1, Address(rsp, 16));
9124       vinsertf128h(xmm2, Address(rsp, 32));
9125       vinsertf128h(xmm3, Address(rsp, 48));
9126       vinsertf128h(xmm4, Address(rsp, 64));
9127       vinsertf128h(xmm5, Address(rsp, 80));
9128       vinsertf128h(xmm6, Address(rsp, 96));
9129       vinsertf128h(xmm7, Address(rsp,112));
9130 #ifdef _LP64
9131       vinsertf128h(xmm8, Address(rsp,128));
9132       vinsertf128h(xmm9, Address(rsp,144));
9133       vinsertf128h(xmm10, Address(rsp,160));
9134       vinsertf128h(xmm11, Address(rsp,176));
9135       vinsertf128h(xmm12, Address(rsp,192));
9136       vinsertf128h(xmm13, Address(rsp,208));
9137       vinsertf128h(xmm14, Address(rsp,224));
9138       vinsertf128h(xmm15, Address(rsp,240));
9139 #endif
9140       addptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
9141     }
9142 #endif
9143   }
9144   popa();
9145 }
9146 
9147 static const double     pi_4 =  0.7853981633974483;
9148 
9149 void MacroAssembler::trigfunc(char trig, int num_fpu_regs_in_use) {
9150   // A hand-coded argument reduction for values in fabs(pi/4, pi/2)
9151   // was attempted in this code; unfortunately it appears that the
9152   // switch to 80-bit precision and back causes this to be
9153   // unprofitable compared with simply performing a runtime call if
9154   // the argument is out of the (-pi/4, pi/4) range.
9155 
9156   Register tmp = noreg;
9157   if (!VM_Version::supports_cmov()) {
    // fcmp needs a temporary, so preserve rbx
9159     tmp = rbx;
9160     push(tmp);
9161   }
9162 
9163   Label slow_case, done;
9164 
9165   ExternalAddress pi4_adr = (address)&pi_4;
9166   if (reachable(pi4_adr)) {
9167     // x ?<= pi/4
9168     fld_d(pi4_adr);
9169     fld_s(1);                // Stack:  X  PI/4  X
9170     fabs();                  // Stack: |X| PI/4  X
9171     fcmp(tmp);
9172     jcc(Assembler::above, slow_case);
9173 
9174     // fastest case: -pi/4 <= x <= pi/4
9175     switch(trig) {
9176     case 's':
9177       fsin();
9178       break;
9179     case 'c':
9180       fcos();
9181       break;
9182     case 't':
9183       ftan();
9184       break;
9185     default:
9186       assert(false, "bad intrinsic");
9187       break;
9188     }
9189     jmp(done);
9190   }
9191 
9192   // slow case: runtime call
9193   bind(slow_case);
9194 
9195   switch(trig) {
9196   case 's':
9197     {
9198       fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dsin), 1, num_fpu_regs_in_use);
9199     }
9200     break;
9201   case 'c':
9202     {
9203       fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dcos), 1, num_fpu_regs_in_use);
9204     }
9205     break;
9206   case 't':
9207     {
9208       fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dtan), 1, num_fpu_regs_in_use);
9209     }
9210     break;
9211   default:
9212     assert(false, "bad intrinsic");
9213     break;
9214   }
9215 
9216   // Come here with result in F-TOS
9217   bind(done);
9218 
9219   if (tmp != noreg) {
9220     pop(tmp);
9221   }
9222 }
9223 
9224 
9225 // Look up the method for a megamorphic invokeinterface call.
9226 // The target method is determined by <intf_klass, itable_index>.
9227 // The receiver klass is in recv_klass.
9228 // On success, the result will be in method_result, and execution falls through.
9229 // On failure, execution transfers to the given label.
9230 void MacroAssembler::lookup_interface_method(Register recv_klass,
9231                                              Register intf_klass,
9232                                              RegisterOrConstant itable_index,
9233                                              Register method_result,
9234                                              Register scan_temp,
9235                                              Label& L_no_such_interface) {
9236   assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
9237   assert(itable_index.is_constant() || itable_index.as_register() == method_result,
9238          "caller must use same register for non-constant itable index as for method");
9239 
9240   // Compute start of first itableOffsetEntry (which is at the end of the vtable)
9241   int vtable_base = InstanceKlass::vtable_start_offset() * wordSize;
9242   int itentry_off = itableMethodEntry::method_offset_in_bytes();
9243   int scan_step   = itableOffsetEntry::size() * wordSize;
9244   int vte_size    = vtableEntry::size() * wordSize;
9245   Address::ScaleFactor times_vte_scale = Address::times_ptr;
9246   assert(vte_size == wordSize, "else adjust times_vte_scale");
9247 
9248   movl(scan_temp, Address(recv_klass, InstanceKlass::vtable_length_offset() * wordSize));
9249 
9250   // %%% Could store the aligned, prescaled offset in the klassoop.
9251   lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
9252   if (HeapWordsPerLong > 1) {
9253     // Round up to align_object_offset boundary
9254     // see code for InstanceKlass::start_of_itable!
9255     round_to(scan_temp, BytesPerLong);
9256   }
9257 
9258   // Adjust recv_klass by scaled itable_index, so we can free itable_index.
9259   assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
9260   lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
9261 
9262   // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
9263   //   if (scan->interface() == intf) {
9264   //     result = (klass + scan->offset() + itable_index);
9265   //   }
9266   // }
9267   Label search, found_method;
9268 
9269   for (int peel = 1; peel >= 0; peel--) {
9270     movptr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
9271     cmpptr(intf_klass, method_result);
9272 
9273     if (peel) {
9274       jccb(Assembler::equal, found_method);
9275     } else {
9276       jccb(Assembler::notEqual, search);
9277       // (invert the test to fall through to found_method...)
9278     }
9279 
9280     if (!peel)  break;
9281 
9282     bind(search);
9283 
9284     // Check that the previous entry is non-null.  A null entry means that
9285     // the receiver class doesn't implement the interface, and wasn't the
9286     // same as when the caller was compiled.
9287     testptr(method_result, method_result);
9288     jcc(Assembler::zero, L_no_such_interface);
9289     addptr(scan_temp, scan_step);
9290   }
9291 
9292   bind(found_method);
9293 
9294   // Got a hit.
9295   movl(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
9296   movptr(method_result, Address(recv_klass, scan_temp, Address::times_1));
9297 }
9298 
9299 
9300 // virtual method calling
9301 void MacroAssembler::lookup_virtual_method(Register recv_klass,
9302                                            RegisterOrConstant vtable_index,
9303                                            Register method_result) {
9304   const int base = InstanceKlass::vtable_start_offset() * wordSize;
9305   assert(vtableEntry::size() * wordSize == wordSize, "else adjust the scaling in the code below");
9306   Address vtable_entry_addr(recv_klass,
9307                             vtable_index, Address::times_ptr,
9308                             base + vtableEntry::method_offset_in_bytes());
9309   movptr(method_result, vtable_entry_addr);
9310 }
9311 
9312 
9313 void MacroAssembler::check_klass_subtype(Register sub_klass,
9314                            Register super_klass,
9315                            Register temp_reg,
9316                            Label& L_success) {
9317   Label L_failure;
9318   check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
9319   check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
9320   bind(L_failure);
9321 }
9322 
9323 
9324 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
9325                                                    Register super_klass,
9326                                                    Register temp_reg,
9327                                                    Label* L_success,
9328                                                    Label* L_failure,
9329                                                    Label* L_slow_path,
9330                                         RegisterOrConstant super_check_offset) {
9331   assert_different_registers(sub_klass, super_klass, temp_reg);
9332   bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
9333   if (super_check_offset.is_register()) {
9334     assert_different_registers(sub_klass, super_klass,
9335                                super_check_offset.as_register());
9336   } else if (must_load_sco) {
9337     assert(temp_reg != noreg, "supply either a temp or a register offset");
9338   }
9339 
9340   Label L_fallthrough;
9341   int label_nulls = 0;
9342   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
9343   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
9344   if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
9345   assert(label_nulls <= 1, "at most one NULL in the batch");
9346 
9347   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
9348   int sco_offset = in_bytes(Klass::super_check_offset_offset());
9349   Address super_check_offset_addr(super_klass, sco_offset);
9350 
9351   // Hacked jcc, which "knows" that L_fallthrough, at least, is in
9352   // range of a jccb.  If this routine grows larger, reconsider at
9353   // least some of these.
9354 #define local_jcc(assembler_cond, label)                                \
9355   if (&(label) == &L_fallthrough)  jccb(assembler_cond, label);         \
9356   else                             jcc( assembler_cond, label) /*omit semi*/
9357 
9358   // Hacked jmp, which may only be used just before L_fallthrough.
9359 #define final_jmp(label)                                                \
9360   if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
9361   else                            jmp(label)                /*omit semi*/
9362 
9363   // If the pointers are equal, we are done (e.g., String[] elements).
9364   // This self-check enables sharing of secondary supertype arrays among
9365   // non-primary types such as array-of-interface.  Otherwise, each such
9366   // type would need its own customized SSA.
9367   // We move this check to the front of the fast path because many
9368   // type checks are in fact trivially successful in this manner,
9369   // so we get a nicely predicted branch right at the start of the check.
9370   cmpptr(sub_klass, super_klass);
9371   local_jcc(Assembler::equal, *L_success);
9372 
9373   // Check the supertype display:
9374   if (must_load_sco) {
    // Positive movl does the right thing on LP64.
9376     movl(temp_reg, super_check_offset_addr);
9377     super_check_offset = RegisterOrConstant(temp_reg);
9378   }
9379   Address super_check_addr(sub_klass, super_check_offset, Address::times_1, 0);
9380   cmpptr(super_klass, super_check_addr); // load displayed supertype
9381 
9382   // This check has worked decisively for primary supers.
9383   // Secondary supers are sought in the super_cache ('super_cache_addr').
9384   // (Secondary supers are interfaces and very deeply nested subtypes.)
9385   // This works in the same check above because of a tricky aliasing
9386   // between the super_cache and the primary super display elements.
9387   // (The 'super_check_addr' can address either, as the case requires.)
9388   // Note that the cache is updated below if it does not help us find
9389   // what we need immediately.
9390   // So if it was a primary super, we can just fail immediately.
9391   // Otherwise, it's the slow path for us (no success at this point).
9392 
9393   if (super_check_offset.is_register()) {
9394     local_jcc(Assembler::equal, *L_success);
9395     cmpl(super_check_offset.as_register(), sc_offset);
9396     if (L_failure == &L_fallthrough) {
9397       local_jcc(Assembler::equal, *L_slow_path);
9398     } else {
9399       local_jcc(Assembler::notEqual, *L_failure);
9400       final_jmp(*L_slow_path);
9401     }
9402   } else if (super_check_offset.as_constant() == sc_offset) {
9403     // Need a slow path; fast failure is impossible.
9404     if (L_slow_path == &L_fallthrough) {
9405       local_jcc(Assembler::equal, *L_success);
9406     } else {
9407       local_jcc(Assembler::notEqual, *L_slow_path);
9408       final_jmp(*L_success);
9409     }
9410   } else {
9411     // No slow path; it's a fast decision.
9412     if (L_failure == &L_fallthrough) {
9413       local_jcc(Assembler::equal, *L_success);
9414     } else {
9415       local_jcc(Assembler::notEqual, *L_failure);
9416       final_jmp(*L_success);
9417     }
9418   }
9419 
9420   bind(L_fallthrough);
9421 
9422 #undef local_jcc
9423 #undef final_jmp
9424 }
9425 
9426 
9427 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
9428                                                    Register super_klass,
9429                                                    Register temp_reg,
9430                                                    Register temp2_reg,
9431                                                    Label* L_success,
9432                                                    Label* L_failure,
9433                                                    bool set_cond_codes) {
9434   assert_different_registers(sub_klass, super_klass, temp_reg);
9435   if (temp2_reg != noreg)
9436     assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg);
9437 #define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)
9438 
9439   Label L_fallthrough;
9440   int label_nulls = 0;
9441   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
9442   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
9443   assert(label_nulls <= 1, "at most one NULL in the batch");
9444 
9445   // a couple of useful fields in sub_klass:
9446   int ss_offset = in_bytes(Klass::secondary_supers_offset());
9447   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
9448   Address secondary_supers_addr(sub_klass, ss_offset);
9449   Address super_cache_addr(     sub_klass, sc_offset);
9450 
9451   // Do a linear scan of the secondary super-klass chain.
9452   // This code is rarely used, so simplicity is a virtue here.
9453   // The repne_scan instruction uses fixed registers, which we must spill.
9454   // Don't worry too much about pre-existing connections with the input regs.
9455 
9456   assert(sub_klass != rax, "killed reg"); // killed by mov(rax, super)
9457   assert(sub_klass != rcx, "killed reg"); // killed by lea(rcx, &pst_counter)
9458 
9459   // Get super_klass value into rax (even if it was in rdi or rcx).
9460   bool pushed_rax = false, pushed_rcx = false, pushed_rdi = false;
9461   if (super_klass != rax || UseCompressedOops) {
9462     if (!IS_A_TEMP(rax)) { push(rax); pushed_rax = true; }
9463     mov(rax, super_klass);
9464   }
9465   if (!IS_A_TEMP(rcx)) { push(rcx); pushed_rcx = true; }
9466   if (!IS_A_TEMP(rdi)) { push(rdi); pushed_rdi = true; }
9467 
9468 #ifndef PRODUCT
9469   int* pst_counter = &SharedRuntime::_partial_subtype_ctr;
9470   ExternalAddress pst_counter_addr((address) pst_counter);
9471   NOT_LP64(  incrementl(pst_counter_addr) );
9472   LP64_ONLY( lea(rcx, pst_counter_addr) );
9473   LP64_ONLY( incrementl(Address(rcx, 0)) );
9474 #endif //PRODUCT
9475 
9476   // We will consult the secondary-super array.
9477   movptr(rdi, secondary_supers_addr);
  // Load the array length.  (Positive movl does the right thing on LP64.)
9479   movl(rcx, Address(rdi, Array<Klass*>::length_offset_in_bytes()));
9480   // Skip to start of data.
9481   addptr(rdi, Array<Klass*>::base_offset_in_bytes());
9482 
9483   // Scan RCX words at [RDI] for an occurrence of RAX.
9484   // Set NZ/Z based on last compare.
  // The Z flag is not set by 'repne' if RCX == 0, since 'repne' itself does
  // not change flags (only the repeated scas instruction sets them).
  // Set Z = 0 (not equal) before 'repne' to indicate that the class was not found.
9488 
  testptr(rax, rax); // Set Z = 0
  repne_scan();
9491 
9492   // Unspill the temp. registers:
9493   if (pushed_rdi)  pop(rdi);
9494   if (pushed_rcx)  pop(rcx);
9495   if (pushed_rax)  pop(rax);
9496 
9497   if (set_cond_codes) {
9498     // Special hack for the AD files:  rdi is guaranteed non-zero.
9499     assert(!pushed_rdi, "rdi must be left non-NULL");
9500     // Also, the condition codes are properly set Z/NZ on succeed/failure.
9501   }
9502 
9503   if (L_failure == &L_fallthrough)
9504         jccb(Assembler::notEqual, *L_failure);
9505   else  jcc(Assembler::notEqual, *L_failure);
9506 
9507   // Success.  Cache the super we found and proceed in triumph.
9508   movptr(super_cache_addr, super_klass);
9509 
9510   if (L_success != &L_fallthrough) {
9511     jmp(*L_success);
9512   }
9513 
9514 #undef IS_A_TEMP
9515 
9516   bind(L_fallthrough);
9517 }
9518 
9519 
9520 void MacroAssembler::cmov32(Condition cc, Register dst, Address src) {
9521   if (VM_Version::supports_cmov()) {
9522     cmovl(cc, dst, src);
9523   } else {
9524     Label L;
9525     jccb(negate_condition(cc), L);
9526     movl(dst, src);
9527     bind(L);
9528   }
9529 }
9530 
9531 void MacroAssembler::cmov32(Condition cc, Register dst, Register src) {
9532   if (VM_Version::supports_cmov()) {
9533     cmovl(cc, dst, src);
9534   } else {
9535     Label L;
9536     jccb(negate_condition(cc), L);
9537     movl(dst, src);
9538     bind(L);
9539   }
9540 }
9541 
9542 void MacroAssembler::verify_oop(Register reg, const char* s) {
9543   if (!VerifyOops) return;
9544 
9545   // Pass register number to verify_oop_subroutine
9546   char* b = new char[strlen(s) + 50];
9547   sprintf(b, "verify_oop: %s: %s", reg->name(), s);
9548   BLOCK_COMMENT("verify_oop {");
9549 #ifdef _LP64
9550   push(rscratch1);                    // save r10, trashed by movptr()
9551 #endif
  push(rax);                          // save rax
9553   push(reg);                          // pass register argument
9554   ExternalAddress buffer((address) b);
9555   // avoid using pushptr, as it modifies scratch registers
9556   // and our contract is not to modify anything
9557   movptr(rax, buffer.addr());
9558   push(rax);
9559   // call indirectly to solve generation ordering problem
9560   movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
9561   call(rax);
9562   // Caller pops the arguments (oop, message) and restores rax, r10
9563   BLOCK_COMMENT("} verify_oop");
9564 }
9565 
9566 
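     // Returns the delayed value as a constant if it has already been computed;
     // otherwise emits an indirect load (plus optional offset) so the value is
     // picked up at run time, sidestepping the code-generation ordering problem.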
9567 RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
9568                                                       Register tmp,
9569                                                       int offset) {
9570   intptr_t value = *delayed_value_addr;
9571   if (value != 0)
9572     return RegisterOrConstant(value + offset);
9573 
9574   // load indirectly to solve generation ordering problem
9575   movptr(tmp, ExternalAddress((address) delayed_value_addr));
9576 
9577 #ifdef ASSERT
9578   { Label L;
9579     testptr(tmp, tmp);
9580     if (WizardMode) {
9581       jcc(Assembler::notZero, L);
9582       char* buf = new char[40];
9583       sprintf(buf, "DelayedValue="INTPTR_FORMAT, delayed_value_addr[1]);
9584       STOP(buf);
9585     } else {
9586       jccb(Assembler::notZero, L);
9587       hlt();
9588     }
9589     bind(L);
9590   }
9591 #endif
9592 
9593   if (offset != 0)
9594     addptr(tmp, offset);
9595 
9596   return RegisterOrConstant(tmp);
9597 }
9598 
9599 
9600 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
9601                                          int extra_slot_offset) {
9602   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
9603   int stackElementSize = Interpreter::stackElementSize;
9604   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
9605 #ifdef ASSERT
9606   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
9607   assert(offset1 - offset == stackElementSize, "correct arithmetic");
9608 #endif
9609   Register             scale_reg    = noreg;
9610   Address::ScaleFactor scale_factor = Address::no_scale;
9611   if (arg_slot.is_constant()) {
9612     offset += arg_slot.as_constant() * stackElementSize;
9613   } else {
9614     scale_reg    = arg_slot.as_register();
9615     scale_factor = Address::times(stackElementSize);
9616   }
9617   offset += wordSize;           // return PC is on stack
9618   return Address(rsp, scale_reg, scale_factor, offset);
9619 }
9620 
9621 
9622 void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
9623   if (!VerifyOops) return;
9624 
9625   // Address adjust(addr.base(), addr.index(), addr.scale(), addr.disp() + BytesPerWord);
9626   // Pass register number to verify_oop_subroutine
9627   char* b = new char[strlen(s) + 50];
9628   sprintf(b, "verify_oop_addr: %s", s);
9629 
9630 #ifdef _LP64
9631   push(rscratch1);                    // save r10, trashed by movptr()
9632 #endif
9633   push(rax);                          // save rax
9634   // addr may contain rsp so we will have to adjust it based on the push
9635   // we just did (and on 64 bit we do two pushes)
9636   // NOTE: the 64-bit code appears to have had a bug here: it did movq(addr, rax),
9637   // which stored rax into addr, the reverse of what was intended.
9638   if (addr.uses(rsp)) {
9639     lea(rax, addr);
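         // rsp has already moved down by the pushes above (rscratch1 and rax on
         // 64-bit, just rax on 32-bit), so compensate with a 1- or 2-word displacement.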
9640     pushptr(Address(rax, LP64_ONLY(2 *) BytesPerWord));
9641   } else {
9642     pushptr(addr);
9643   }
9644 
9645   ExternalAddress buffer((address) b);
9646   // pass msg argument
9647   // avoid using pushptr, as it modifies scratch registers
9648   // and our contract is not to modify anything
9649   movptr(rax, buffer.addr());
9650   push(rax);
9651 
9652   // call indirectly to solve generation ordering problem
9653   movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
9654   call(rax);
9655   // Caller pops the arguments (addr, message) and restores rax, r10.
9656 }
9657 
9658 void MacroAssembler::verify_tlab() {
9659 #ifdef ASSERT
9660   if (UseTLAB && VerifyOops) {
9661     Label next, ok;
9662     Register t1 = rsi;
9663     Register thread_reg = NOT_LP64(rbx) LP64_ONLY(r15_thread);
9664 
9665     push(t1);
9666     NOT_LP64(push(thread_reg));
9667     NOT_LP64(get_thread(thread_reg));
9668 
9669     movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
9670     cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())));
9671     jcc(Assembler::aboveEqual, next);
9672     STOP("assert(top >= start)");
9673     should_not_reach_here();
9674 
9675     bind(next);
9676     movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())));
9677     cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
9678     jcc(Assembler::aboveEqual, ok);
9679     STOP("assert(top <= end)");
9680     should_not_reach_here();
9681 
9682     bind(ok);
9683     NOT_LP64(pop(thread_reg));
9684     pop(t1);
9685   }
9686 #endif
9687 }
9688 
9689 class ControlWord {
9690  public:
9691   int32_t _value;
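       // x87 control word layout: rounding control in bits 11:10, precision
       // control in bits 9:8, exception mask bits (PM..IM) in bits 5..0.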
9692 
9693   int  rounding_control() const        { return  (_value >> 10) & 3      ; }
9694   int  precision_control() const       { return  (_value >>  8) & 3      ; }
9695   bool precision() const               { return ((_value >>  5) & 1) != 0; }
9696   bool underflow() const               { return ((_value >>  4) & 1) != 0; }
9697   bool overflow() const                { return ((_value >>  3) & 1) != 0; }
9698   bool zero_divide() const             { return ((_value >>  2) & 1) != 0; }
9699   bool denormalized() const            { return ((_value >>  1) & 1) != 0; }
9700   bool invalid() const                 { return ((_value >>  0) & 1) != 0; }
9701 
9702   void print() const {
9703     // rounding control
9704     const char* rc;
9705     switch (rounding_control()) {
9706       case 0: rc = "round near"; break;
9707       case 1: rc = "round down"; break;
9708       case 2: rc = "round up  "; break;
9709       case 3: rc = "chop      "; break;
9710     }
9711     // precision control
9712     const char* pc;
9713     switch (precision_control()) {
9714       case 0: pc = "24 bits "; break;
9715       case 1: pc = "reserved"; break;
9716       case 2: pc = "53 bits "; break;
9717       case 3: pc = "64 bits "; break;
9718     }
9719     // flags
9720     char f[9];
9721     f[0] = ' ';
9722     f[1] = ' ';
9723     f[2] = (precision   ()) ? 'P' : 'p';
9724     f[3] = (underflow   ()) ? 'U' : 'u';
9725     f[4] = (overflow    ()) ? 'O' : 'o';
9726     f[5] = (zero_divide ()) ? 'Z' : 'z';
9727     f[6] = (denormalized()) ? 'D' : 'd';
9728     f[7] = (invalid     ()) ? 'I' : 'i';
9729     f[8] = '\x0';
9730     // output
9731     printf("%04x  masks = %s, %s, %s", _value & 0xFFFF, f, rc, pc);
9732   }
9733 
9734 };
9735 
9736 class StatusWord {
9737  public:
9738   int32_t _value;
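       // x87 status word layout: B = bit 15, C3 = 14, TOP = bits 13:11, C2 = 10,
       // C1 = 9, C0 = 8, ES = 7, SF = 6, exception flags (PE..IE) in bits 5..0.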
9739 
9740   bool busy() const                    { return ((_value >> 15) & 1) != 0; }
9741   bool C3() const                      { return ((_value >> 14) & 1) != 0; }
9742   bool C2() const                      { return ((_value >> 10) & 1) != 0; }
9743   bool C1() const                      { return ((_value >>  9) & 1) != 0; }
9744   bool C0() const                      { return ((_value >>  8) & 1) != 0; }
9745   int  top() const                     { return  (_value >> 11) & 7      ; }
9746   bool error_status() const            { return ((_value >>  7) & 1) != 0; }
9747   bool stack_fault() const             { return ((_value >>  6) & 1) != 0; }
9748   bool precision() const               { return ((_value >>  5) & 1) != 0; }
9749   bool underflow() const               { return ((_value >>  4) & 1) != 0; }
9750   bool overflow() const                { return ((_value >>  3) & 1) != 0; }
9751   bool zero_divide() const             { return ((_value >>  2) & 1) != 0; }
9752   bool denormalized() const            { return ((_value >>  1) & 1) != 0; }
9753   bool invalid() const                 { return ((_value >>  0) & 1) != 0; }
9754 
9755   void print() const {
9756     // condition codes
9757     char c[5];
9758     c[0] = (C3()) ? '3' : '-';
9759     c[1] = (C2()) ? '2' : '-';
9760     c[2] = (C1()) ? '1' : '-';
9761     c[3] = (C0()) ? '0' : '-';
9762     c[4] = '\x0';
9763     // flags
9764     char f[9];
9765     f[0] = (error_status()) ? 'E' : '-';
9766     f[1] = (stack_fault ()) ? 'S' : '-';
9767     f[2] = (precision   ()) ? 'P' : '-';
9768     f[3] = (underflow   ()) ? 'U' : '-';
9769     f[4] = (overflow    ()) ? 'O' : '-';
9770     f[5] = (zero_divide ()) ? 'Z' : '-';
9771     f[6] = (denormalized()) ? 'D' : '-';
9772     f[7] = (invalid     ()) ? 'I' : '-';
9773     f[8] = '\x0';
9774     // output
9775     printf("%04x  flags = %s, cc =  %s, top = %d", _value & 0xFFFF, f, c, top());
9776   }
9777 
9778 };
9779 
9780 class TagWord {
9781  public:
9782   int32_t _value;
9783 
9784   int tag_at(int i) const              { return (_value >> (i*2)) & 3; }
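       // One 2-bit tag per register: 0 = valid, 1 = zero, 2 = special, 3 = empty
       // (decoded by FPU_State::tag_as_string below).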
9785 
9786   void print() const {
9787     printf("%04x", _value & 0xFFFF);
9788   }
9789 
9790 };
9791 
9792 class FPU_Register {
9793  public:
9794   int32_t _m0;
9795   int32_t _m1;
9796   int16_t _ex;
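       // Image of one 80-bit x87 register: _m1:_m0 hold the 64-bit significand,
       // _ex holds the sign bit and 15-bit biased exponent.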
9797 
9798   bool is_indefinite() const           {
9799     return _ex == -1 && _m1 == (int32_t)0xC0000000 && _m0 == 0;
9800   }
9801 
9802   void print() const {
9803     char  sign = (_ex < 0) ? '-' : '+';
9804     const char* kind = (_ex == 0x7FFF || _ex == (int16_t)-1) ? "NaN" : "   ";
9805     printf("%c%04hx.%08x%08x  %s", sign, _ex, _m1, _m0, kind);
9806   };
9807 
9808 };
9809 
9810 class FPU_State {
9811  public:
9812   enum {
9813     register_size       = 10,
9814     number_of_registers =  8,
9815     register_mask       =  7
9816   };
9817 
9818   ControlWord  _control_word;
9819   StatusWord   _status_word;
9820   TagWord      _tag_word;
9821   int32_t      _error_offset;
9822   int32_t      _error_selector;
9823   int32_t      _data_offset;
9824   int32_t      _data_selector;
9825   int8_t       _register[register_size * number_of_registers];
9826 
9827   int tag_for_st(int i) const          { return _tag_word.tag_at((_status_word.top() + i) & register_mask); }
9828   FPU_Register* st(int i) const        { return (FPU_Register*)&_register[register_size * i]; }
9829 
9830   const char* tag_as_string(int tag) const {
9831     switch (tag) {
9832       case 0: return "valid";
9833       case 1: return "zero";
9834       case 2: return "special";
9835       case 3: return "empty";
9836     }
9837     ShouldNotReachHere();
9838     return NULL;
9839   }
9840 
9841   void print() const {
9842     // print computation registers
9843     { int t = _status_word.top();
9844       for (int i = 0; i < number_of_registers; i++) {
9845         int j = (i - t) & register_mask;
9846         printf("%c r%d = ST%d = ", (j == 0 ? '*' : ' '), i, j);
9847         st(j)->print();
9848         printf(" %s\n", tag_as_string(_tag_word.tag_at(i)));
9849       }
9850     }
9851     printf("\n");
9852     // print control registers
9853     printf("ctrl = "); _control_word.print(); printf("\n");
9854     printf("stat = "); _status_word .print(); printf("\n");
9855     printf("tags = "); _tag_word    .print(); printf("\n");
9856   }
9857 
9858 };
9859 
9860 class Flag_Register {
9861  public:
9862   int32_t _value;
9863 
9864   bool overflow() const                { return ((_value >> 11) & 1) != 0; }
9865   bool direction() const               { return ((_value >> 10) & 1) != 0; }
9866   bool sign() const                    { return ((_value >>  7) & 1) != 0; }
9867   bool zero() const                    { return ((_value >>  6) & 1) != 0; }
9868   bool auxiliary_carry() const         { return ((_value >>  4) & 1) != 0; }
9869   bool parity() const                  { return ((_value >>  2) & 1) != 0; }
9870   bool carry() const                   { return ((_value >>  0) & 1) != 0; }
9871 
9872   void print() const {
9873     // flags
9874     char f[8];
9875     f[0] = (overflow       ()) ? 'O' : '-';
9876     f[1] = (direction      ()) ? 'D' : '-';
9877     f[2] = (sign           ()) ? 'S' : '-';
9878     f[3] = (zero           ()) ? 'Z' : '-';
9879     f[4] = (auxiliary_carry()) ? 'A' : '-';
9880     f[5] = (parity         ()) ? 'P' : '-';
9881     f[6] = (carry          ()) ? 'C' : '-';
9882     f[7] = '\x0';
9883     // output
9884     printf("%08x  flags = %s", _value, f);
9885   }
9886 
9887 };
9888 
9889 class IU_Register {
9890  public:
9891   int32_t _value;
9892 
9893   void print() const {
9894     printf("%08x  %11d", _value, _value);
9895   }
9896 
9897 };
9898 
9899 class IU_State {
9900  public:
9901   Flag_Register _eflags;
9902   IU_Register   _rdi;
9903   IU_Register   _rsi;
9904   IU_Register   _rbp;
9905   IU_Register   _rsp;
9906   IU_Register   _rbx;
9907   IU_Register   _rdx;
9908   IU_Register   _rcx;
9909   IU_Register   _rax;
9910 
9911   void print() const {
9912     // computation registers
9913     printf("rax,  = "); _rax.print(); printf("\n");
9914     printf("rbx,  = "); _rbx.print(); printf("\n");
9915     printf("rcx  = "); _rcx.print(); printf("\n");
9916     printf("rdx  = "); _rdx.print(); printf("\n");
9917     printf("rdi  = "); _rdi.print(); printf("\n");
9918     printf("rsi  = "); _rsi.print(); printf("\n");
9919     printf("rbp,  = "); _rbp.print(); printf("\n");
9920     printf("rsp  = "); _rsp.print(); printf("\n");
9921     printf("\n");
9922     // control registers
9923     printf("flgs = "); _eflags.print(); printf("\n");
9924   }
9925 };
9926 
9927 
9928 class CPU_State {
9929  public:
9930   FPU_State _fpu_state;
9931   IU_State  _iu_state;
9932 
9933   void print() const {
9934     printf("--------------------------------------------------\n");
9935     _iu_state .print();
9936     printf("\n");
9937     _fpu_state.print();
9938     printf("--------------------------------------------------\n");
9939   }
9940 
9941 };
9942 
9943 
9944 static void _print_CPU_state(CPU_State* state) {
9945   state->print();
9946 }
9947 
9948 
9949 void MacroAssembler::print_CPU_state() {
9950   push_CPU_state();
9951   push(rsp);                // pass CPU state
9952   call(RuntimeAddress(CAST_FROM_FN_PTR(address, _print_CPU_state)));
9953   addptr(rsp, wordSize);       // discard argument
9954   pop_CPU_state();
9955 }
9956 
9957 
9958 static bool _verify_FPU(int stack_depth, char* s, CPU_State* state) {
9959   static int counter = 0;
9960   FPU_State* fs = &state->_fpu_state;
9961   counter++;
9962   // For leaf calls, only verify that the top few elements remain empty.
9963   // We only need 1 empty at the top for C2 code.
9964   if( stack_depth < 0 ) {
9965     if( fs->tag_for_st(7) != 3 ) {
9966       printf("FPR7 not empty\n");
9967       state->print();
9968       assert(false, "error");
9969       return false;
9970     }
9971     return true;                // All other stack states do not matter
9972   }
9973 
9974   assert((fs->_control_word._value & 0xffff) == StubRoutines::_fpu_cntrl_wrd_std,
9975          "bad FPU control word");
9976 
9977   // compute stack depth
9978   int i = 0;
9979   while (i < FPU_State::number_of_registers && fs->tag_for_st(i)  < 3) i++;
9980   int d = i;
9981   while (i < FPU_State::number_of_registers && fs->tag_for_st(i) == 3) i++;
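       // e.g. tags (from TOS) V V E E E E E E give d == 2; a non-empty register
       // after the first empty one means the stack is not contiguous.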
9982   // verify findings
9983   if (i != FPU_State::number_of_registers) {
9984     // stack not contiguous
9985     printf("%s: stack not contiguous at ST%d\n", s, i);
9986     state->print();
9987     assert(false, "error");
9988     return false;
9989   }
9990   // check if computed stack depth corresponds to expected stack depth
9991   if (stack_depth < 0) {
9992     // expected stack depth is -stack_depth or less
9993     if (d > -stack_depth) {
9994       // too many elements on the stack
9995       printf("%s: <= %d stack elements expected but found %d\n", s, -stack_depth, d);
9996       state->print();
9997       assert(false, "error");
9998       return false;
9999     }
10000   } else {
10001     // expected stack depth is stack_depth
10002     if (d != stack_depth) {
10003       // wrong stack depth
10004       printf("%s: %d stack elements expected but found %d\n", s, stack_depth, d);
10005       state->print();
10006       assert(false, "error");
10007       return false;
10008     }
10009   }
10010   // everything is cool
10011   return true;
10012 }
10013 
10014 
10015 void MacroAssembler::verify_FPU(int stack_depth, const char* s) {
10016   if (!VerifyFPU) return;
10017   push_CPU_state();
10018   push(rsp);                // pass CPU state
10019   ExternalAddress msg((address) s);
10020   // pass message string s
10021   pushptr(msg.addr());
10022   push(stack_depth);        // pass stack depth
10023   call(RuntimeAddress(CAST_FROM_FN_PTR(address, _verify_FPU)));
10024   addptr(rsp, 3 * wordSize);   // discard arguments
10025   // check for error
10026   { Label L;
10027     testl(rax, rax);
10028     jcc(Assembler::notZero, L);
10029     int3();                  // break if error condition
10030     bind(L);
10031   }
10032   pop_CPU_state();
10033 }
10034 
10035 void MacroAssembler::load_klass(Register dst, Register src) {
10036 #ifdef _LP64
10037   if (UseCompressedKlassPointers) {
10038     movl(dst, Address(src, oopDesc::klass_offset_in_bytes()));
10039     decode_heap_oop_not_null(dst);
10040   } else
10041 #endif
10042     movptr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
10043 }
10044 
10045 void MacroAssembler::load_prototype_header(Register dst, Register src) {
10046 #ifdef _LP64
10047   if (UseCompressedKlassPointers) {
10048     assert (Universe::heap() != NULL, "java heap should be initialized");
10049     movl(dst, Address(src, oopDesc::klass_offset_in_bytes()));
10050     if (Universe::narrow_oop_shift() != 0) {
10051       assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
10052       if (LogMinObjAlignmentInBytes == Address::times_8) {
10053         movq(dst, Address(r12_heapbase, dst, Address::times_8, Klass::prototype_header_offset()));
10054       } else {
10055         // OK to use shift since we don't need to preserve flags.
10056         shlq(dst, LogMinObjAlignmentInBytes);
10057         movq(dst, Address(r12_heapbase, dst, Address::times_1, Klass::prototype_header_offset()));
10058       }
10059     } else {
10060       movq(dst, Address(dst, Klass::prototype_header_offset()));
10061     }
10062   } else
10063 #endif
10064   {
10065     movptr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
10066     movptr(dst, Address(dst, Klass::prototype_header_offset()));
10067   }
10068 }
10069 
10070 void MacroAssembler::store_klass(Register dst, Register src) {
10071 #ifdef _LP64
10072   if (UseCompressedKlassPointers) {
10073     encode_heap_oop_not_null(src);
10074     movl(Address(dst, oopDesc::klass_offset_in_bytes()), src);
10075   } else
10076 #endif
10077     movptr(Address(dst, oopDesc::klass_offset_in_bytes()), src);
10078 }
10079 
10080 void MacroAssembler::load_heap_oop(Register dst, Address src) {
10081 #ifdef _LP64
10082   // FIXME: Must change all places where we try to load the klass.
10083   if (UseCompressedOops) {
10084     movl(dst, src);
10085     decode_heap_oop(dst);
10086   } else
10087 #endif
10088     movptr(dst, src);
10089 }
10090 
10091 // Doesn't do verification, generates fixed size code
10092 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src) {
10093 #ifdef _LP64
10094   if (UseCompressedOops) {
10095     movl(dst, src);
10096     decode_heap_oop_not_null(dst);
10097   } else
10098 #endif
10099     movptr(dst, src);
10100 }
10101 
10102 void MacroAssembler::store_heap_oop(Address dst, Register src) {
10103 #ifdef _LP64
10104   if (UseCompressedOops) {
10105     assert(!dst.uses(src), "not enough registers");
10106     encode_heap_oop(src);
10107     movl(dst, src);
10108   } else
10109 #endif
10110     movptr(dst, src);
10111 }
10112 
10113 void MacroAssembler::cmp_heap_oop(Register src1, Address src2, Register tmp) {
10114   assert_different_registers(src1, tmp);
10115 #ifdef _LP64
10116   if (UseCompressedOops) {
10117     bool did_push = false;
10118     if (tmp == noreg) {
10119       tmp = rax;
10120       push(tmp);
10121       did_push = true;
10122       assert(!src2.uses(rsp), "can't push");
10123     }
10124     load_heap_oop(tmp, src2);
10125     cmpptr(src1, tmp);
10126     if (did_push)  pop(tmp);
10127   } else
10128 #endif
10129     cmpptr(src1, src2);
10130 }
10131 
10132 // Used for storing NULLs.
10133 void MacroAssembler::store_heap_oop_null(Address dst) {
10134 #ifdef _LP64
10135   if (UseCompressedOops) {
10136     movl(dst, (int32_t)NULL_WORD);
10137   } else {
10138     movslq(dst, (int32_t)NULL_WORD);
10139   }
10140 #else
10141   movl(dst, (int32_t)NULL_WORD);
10142 #endif
10143 }
10144 
10145 #ifdef _LP64
10146 void MacroAssembler::store_klass_gap(Register dst, Register src) {
10147   if (UseCompressedKlassPointers) {
10148     // Store to klass gap in destination
10149     movl(Address(dst, oopDesc::klass_gap_offset_in_bytes()), src);
10150   }
10151 }
10152 
10153 #ifdef ASSERT
10154 void MacroAssembler::verify_heapbase(const char* msg) {
10155   assert (UseCompressedOops, "should be compressed");
10156   assert (Universe::heap() != NULL, "java heap should be initialized");
10157   if (CheckCompressedOops) {
10158     Label ok;
10159     push(rscratch1); // cmpptr trashes rscratch1
10160     cmpptr(r12_heapbase, ExternalAddress((address)Universe::narrow_oop_base_addr()));
10161     jcc(Assembler::equal, ok);
10162     STOP(msg);
10163     bind(ok);
10164     pop(rscratch1);
10165   }
10166 }
10167 #endif
10168 
10169 // Algorithm must match oop.inline.hpp encode_heap_oop.
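      // i.e. narrow = (oop == NULL) ? 0 : (oop - base) >> shift; the cmov below
      // maps a NULL oop onto the heap base so that the subtraction yields 0.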
10170 void MacroAssembler::encode_heap_oop(Register r) {
10171 #ifdef ASSERT
10172   verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
10173 #endif
10174   verify_oop(r, "broken oop in encode_heap_oop");
10175   if (Universe::narrow_oop_base() == NULL) {
10176     if (Universe::narrow_oop_shift() != 0) {
10177       assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
10178       shrq(r, LogMinObjAlignmentInBytes);
10179     }
10180     return;
10181   }
10182   testq(r, r);
10183   cmovq(Assembler::equal, r, r12_heapbase);
10184   subq(r, r12_heapbase);
10185   shrq(r, LogMinObjAlignmentInBytes);
10186 }
10187 
10188 void MacroAssembler::encode_heap_oop_not_null(Register r) {
10189 #ifdef ASSERT
10190   verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
10191   if (CheckCompressedOops) {
10192     Label ok;
10193     testq(r, r);
10194     jcc(Assembler::notEqual, ok);
10195     STOP("null oop passed to encode_heap_oop_not_null");
10196     bind(ok);
10197   }
10198 #endif
10199   verify_oop(r, "broken oop in encode_heap_oop_not_null");
10200   if (Universe::narrow_oop_base() != NULL) {
10201     subq(r, r12_heapbase);
10202   }
10203   if (Universe::narrow_oop_shift() != 0) {
10204     assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
10205     shrq(r, LogMinObjAlignmentInBytes);
10206   }
10207 }
10208 
10209 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
10210 #ifdef ASSERT
10211   verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
10212   if (CheckCompressedOops) {
10213     Label ok;
10214     testq(src, src);
10215     jcc(Assembler::notEqual, ok);
10216     STOP("null oop passed to encode_heap_oop_not_null2");
10217     bind(ok);
10218   }
10219 #endif
10220   verify_oop(src, "broken oop in encode_heap_oop_not_null2");
10221   if (dst != src) {
10222     movq(dst, src);
10223   }
10224   if (Universe::narrow_oop_base() != NULL) {
10225     subq(dst, r12_heapbase);
10226   }
10227   if (Universe::narrow_oop_shift() != 0) {
10228     assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
10229     shrq(dst, LogMinObjAlignmentInBytes);
10230   }
10231 }
10232 
10233 void  MacroAssembler::decode_heap_oop(Register r) {
10234 #ifdef ASSERT
10235   verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
10236 #endif
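        // i.e. oop = (narrow == 0) ? NULL : ((intptr_t)narrow << shift) + base.
        // shlq leaves ZF set when the narrow oop is zero, so the addq of the
        // heap base is skipped for NULL.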
10237   if (Universe::narrow_oop_base() == NULL) {
10238     if (Universe::narrow_oop_shift() != 0) {
10239       assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
10240       shlq(r, LogMinObjAlignmentInBytes);
10241     }
10242   } else {
10243     Label done;
10244     shlq(r, LogMinObjAlignmentInBytes);
10245     jccb(Assembler::equal, done);
10246     addq(r, r12_heapbase);
10247     bind(done);
10248   }
10249   verify_oop(r, "broken oop in decode_heap_oop");
10250 }
10251 
10252 void  MacroAssembler::decode_heap_oop_not_null(Register r) {
10253   // Note: it will change flags
10254   assert (UseCompressedOops, "should only be used for compressed headers");
10255   assert (Universe::heap() != NULL, "java heap should be initialized");
10256   // Cannot assert, unverified entry point counts instructions (see .ad file)
10257   // vtableStubs also counts instructions in pd_code_size_limit.
10258   // Also do not verify_oop as this is called by verify_oop.
10259   if (Universe::narrow_oop_shift() != 0) {
10260     assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
10261     shlq(r, LogMinObjAlignmentInBytes);
10262     if (Universe::narrow_oop_base() != NULL) {
10263       addq(r, r12_heapbase);
10264     }
10265   } else {
10266     assert (Universe::narrow_oop_base() == NULL, "sanity");
10267   }
10268 }
10269 
10270 void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
10271   // Note: it will change flags
10272   assert (UseCompressedOops, "should only be used for compressed headers");
10273   assert (Universe::heap() != NULL, "java heap should be initialized");
10274   // Cannot assert, unverified entry point counts instructions (see .ad file)
10275   // vtableStubs also counts instructions in pd_code_size_limit.
10276   // Also do not verify_oop as this is called by verify_oop.
10277   if (Universe::narrow_oop_shift() != 0) {
10278     assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
10279     if (LogMinObjAlignmentInBytes == Address::times_8) {
10280       leaq(dst, Address(r12_heapbase, src, Address::times_8, 0));
10281     } else {
10282       if (dst != src) {
10283         movq(dst, src);
10284       }
10285       shlq(dst, LogMinObjAlignmentInBytes);
10286       if (Universe::narrow_oop_base() != NULL) {
10287         addq(dst, r12_heapbase);
10288       }
10289     }
10290   } else {
10291     assert (Universe::narrow_oop_base() == NULL, "sanity");
10292     if (dst != src) {
10293       movq(dst, src);
10294     }
10295   }
10296 }
10297 
10298 void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
10299   assert (UseCompressedOops, "should only be used for compressed headers");
10300   assert (Universe::heap() != NULL, "java heap should be initialized");
10301   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
10302   int oop_index = oop_recorder()->find_index(obj);
10303   RelocationHolder rspec = oop_Relocation::spec(oop_index);
10304   mov_narrow_oop(dst, oop_index, rspec);
10305 }
10306 
10307 void  MacroAssembler::set_narrow_oop(Address dst, jobject obj) {
10308   assert (UseCompressedOops, "should only be used for compressed headers");
10309   assert (Universe::heap() != NULL, "java heap should be initialized");
10310   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
10311   int oop_index = oop_recorder()->find_index(obj);
10312   RelocationHolder rspec = oop_Relocation::spec(oop_index);
10313   mov_narrow_oop(dst, oop_index, rspec);
10314 }
10315 
10316 void  MacroAssembler::cmp_narrow_oop(Register dst, jobject obj) {
10317   assert (UseCompressedOops, "should only be used for compressed headers");
10318   assert (Universe::heap() != NULL, "java heap should be initialized");
10319   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
10320   int oop_index = oop_recorder()->find_index(obj);
10321   RelocationHolder rspec = oop_Relocation::spec(oop_index);
10322   Assembler::cmp_narrow_oop(dst, oop_index, rspec);
10323 }
10324 
10325 void  MacroAssembler::cmp_narrow_oop(Address dst, jobject obj) {
10326   assert (UseCompressedOops, "should only be used for compressed headers");
10327   assert (Universe::heap() != NULL, "java heap should be initialized");
10328   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
10329   int oop_index = oop_recorder()->find_index(obj);
10330   RelocationHolder rspec = oop_Relocation::spec(oop_index);
10331   Assembler::cmp_narrow_oop(dst, oop_index, rspec);
10332 }
10333 
10334 void MacroAssembler::reinit_heapbase() {
10335   if (UseCompressedOops) {
10336     movptr(r12_heapbase, ExternalAddress((address)Universe::narrow_oop_base_addr()));
10337   }
10338 }
10339 #endif // _LP64
10340 
10341 
10342 // C2 compiled method's prolog code.
10343 void MacroAssembler::verified_entry(int framesize, bool stack_bang, bool fp_mode_24b) {
10344 
10345   // WARNING: Initial instruction MUST be 5 bytes or longer so that
10346   // NativeJump::patch_verified_entry will be able to patch out the entry
10347   // code safely. The push to verify stack depth is ok at 5 bytes;
10348   // the frame allocation can be either 3 or 6 bytes. So if we don't do a
10349   // stack bang then we must use the 6-byte frame allocation even if
10350   // we have no frame. :-(
10351 
10352   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
10353   // Remove word for return addr
10354   framesize -= wordSize;
10355 
10356   // Calls to C2R adapters often do not accept exceptional returns.
10357   // We require that their callers must bang for them.  Be careful, because
10358   // some VM calls (such as call site linkage) can use several kilobytes of
10359   // stack; the stack safety zone should account for that.
10360   // See bugs 4446381, 4468289, 4497237.
10361   if (stack_bang) {
10362     generate_stack_overflow_check(framesize);
10363 
10364     // We always push rbp, so that on return to interpreter rbp, will be
10365     // restored correctly and we can correct the stack.
10366     push(rbp);
10367     // Remove word for ebp
10368     framesize -= wordSize;
10369 
10370     // Create frame
10371     if (framesize) {
10372       subptr(rsp, framesize);
10373     }
10374   } else {
10375     // Create frame (force generation of a 4 byte immediate value)
10376     subptr_imm32(rsp, framesize);
10377 
10378     // Save RBP register now.
10379     framesize -= wordSize;
10380     movptr(Address(rsp, framesize), rbp);
10381   }
10382 
10383   if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
10384     framesize -= wordSize;
10385     movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
10386   }
10387 
10388 #ifndef _LP64
10389   // If method sets FPU control word do it now
10390   if (fp_mode_24b) {
10391     fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_24()));
10392   }
10393   if (UseSSE >= 2 && VerifyFPU) {
10394     verify_FPU(0, "FPU stack must be clean on entry");
10395   }
10396 #endif
10397 
10398 #ifdef ASSERT
10399   if (VerifyStackAtCalls) {
10400     Label L;
10401     push(rax);
10402     mov(rax, rsp);
10403     andptr(rax, StackAlignmentInBytes-1);
10404     cmpptr(rax, StackAlignmentInBytes-wordSize);
10405     pop(rax);
10406     jcc(Assembler::equal, L);
10407     STOP("Stack is not properly aligned!");
10408     bind(L);
10409   }
10410 #endif
10411 
10412 }
10413 
10414 
10415 // IndexOf for constant substrings with size >= 8 chars
10416 // which don't need to be loaded through the stack.
10417 void MacroAssembler::string_indexofC8(Register str1, Register str2,
10418                                       Register cnt1, Register cnt2,
10419                                       int int_cnt2,  Register result,
10420                                       XMMRegister vec, Register tmp) {
10421   ShortBranchVerifier sbv(this);
10422   assert(UseSSE42Intrinsics, "SSE4.2 is required");
10423 
10424   // This method uses the pcmpestri instruction with bound registers
10425   //   inputs:
10426   //     xmm - substring
10427   //     rax - substring length (elements count)
10428   //     mem - scanned string
10429   //     rdx - string length (elements count)
10430   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
10431   //   outputs:
10432   //     rcx - matched index in string
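        //     flags - CF == 1 if a match candidate was found (tested with
        //             'below'); OF == 1 if the match starts at element 0,
        //             i.e. the whole vector matched (tested with 'overflow')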
10433   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
10434 
10435   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
10436         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
10437         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
10438 
10439   // Note, inline_string_indexOf() generates checks:
10440   // if (substr.count > string.count) return -1;
10441   // if (substr.count == 0) return 0;
10442   assert(int_cnt2 >= 8, "this code is used only for cnt2 >= 8 chars");
10443 
10444   // Load substring.
10445   movdqu(vec, Address(str2, 0));
10446   movl(cnt2, int_cnt2);
10447   movptr(result, str1); // string addr
10448 
10449   if (int_cnt2 > 8) {
10450     jmpb(SCAN_TO_SUBSTR);
10451 
10452     // Reload substr for rescan; this code
10453     // is executed only for large substrings (> 8 chars).
10454     bind(RELOAD_SUBSTR);
10455     movdqu(vec, Address(str2, 0));
10456     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
10457 
10458     bind(RELOAD_STR);
10459     // We came here after the beginning of the substring was
10460     // matched but the rest of it was not, so we need to search
10461     // again. Start from the next element after the previous match.
10462 
10463     // cnt2 is the number of remaining substring elements and
10464     // cnt1 is the number of remaining string elements when the compare failed.
10465     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
10466     subl(cnt1, cnt2);
10467     addl(cnt1, int_cnt2);
10468     movl(cnt2, int_cnt2); // Now restore cnt2
10469 
10470     decrementl(cnt1);     // Shift to next element
10471     cmpl(cnt1, cnt2);
10472     jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
10473 
10474     addptr(result, 2);
10475 
10476   } // (int_cnt2 > 8)
10477 
10478   // Scan string for start of substr in 16-byte vectors
10479   bind(SCAN_TO_SUBSTR);
10480   pcmpestri(vec, Address(result, 0), 0x0d);
10481   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
10482   subl(cnt1, 8);
10483   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
10484   cmpl(cnt1, cnt2);
10485   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
10486   addptr(result, 16);
10487   jmpb(SCAN_TO_SUBSTR);
10488 
10489   // Found a potential substr
10490   bind(FOUND_CANDIDATE);
10491   // Matched whole vector if first element matched (tmp(rcx) == 0).
10492   if (int_cnt2 == 8) {
10493     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
10494   } else { // int_cnt2 > 8
10495     jccb(Assembler::overflow, FOUND_SUBSTR);
10496   }
10497   // After pcmpestri tmp(rcx) contains matched element index
10498   // Compute start addr of substr
10499   lea(result, Address(result, tmp, Address::times_2));
10500 
10501   // Make sure string is still long enough
10502   subl(cnt1, tmp);
10503   cmpl(cnt1, cnt2);
10504   if (int_cnt2 == 8) {
10505     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
10506   } else { // int_cnt2 > 8
10507     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
10508   }
10509   // Left less than substring.
10510 
10511   bind(RET_NOT_FOUND);
10512   movl(result, -1);
10513   jmpb(EXIT);
10514 
10515   if (int_cnt2 > 8) {
10516     // This code is optimized for the case where the whole substring
10517     // matches whenever its head matches.
10518     bind(MATCH_SUBSTR_HEAD);
10519     pcmpestri(vec, Address(result, 0), 0x0d);
10520     // Reload only the string if it does not match
10521     jccb(Assembler::noOverflow, RELOAD_STR); // OF == 0
10522 
10523     Label CONT_SCAN_SUBSTR;
10524     // Compare the rest of substring (> 8 chars).
10525     bind(FOUND_SUBSTR);
10526     // First 8 chars are already matched.
10527     negptr(cnt2);
10528     addptr(cnt2, 8);
10529 
10530     bind(SCAN_SUBSTR);
10531     subl(cnt1, 8);
10532     cmpl(cnt2, -8); // Do not read beyond substring
10533     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
10534     // Back-up strings to avoid reading beyond substring:
10535     // cnt1 = cnt1 - cnt2 + 8
10536     addl(cnt1, cnt2); // cnt2 is negative
10537     addl(cnt1, 8);
10538     movl(cnt2, 8); negptr(cnt2);
10539     bind(CONT_SCAN_SUBSTR);
10540     if (int_cnt2 < (int)G) {
10541       movdqu(vec, Address(str2, cnt2, Address::times_2, int_cnt2*2));
10542       pcmpestri(vec, Address(result, cnt2, Address::times_2, int_cnt2*2), 0x0d);
10543     } else {
10544       // calculate index in register to avoid integer overflow (int_cnt2*2)
10545       movl(tmp, int_cnt2);
10546       addptr(tmp, cnt2);
10547       movdqu(vec, Address(str2, tmp, Address::times_2, 0));
10548       pcmpestri(vec, Address(result, tmp, Address::times_2, 0), 0x0d);
10549     }
10550     // Need to reload string pointers if we did not match the whole vector
10551     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
10552     addptr(cnt2, 8);
10553     jcc(Assembler::negative, SCAN_SUBSTR);
10554     // Fall through if found full substring
10555 
10556   } // (int_cnt2 > 8)
10557 
10558   bind(RET_FOUND);
10559   // Found result if we matched full small substring.
10560   // Compute substr offset
10561   subptr(result, str1);
10562   shrl(result, 1); // index
10563   bind(EXIT);
10564 
10565 } // string_indexofC8
10566 
10567 // Small strings are loaded through the stack if they cross a page boundary.
10568 void MacroAssembler::string_indexof(Register str1, Register str2,
10569                                     Register cnt1, Register cnt2,
10570                                     int int_cnt2,  Register result,
10571                                     XMMRegister vec, Register tmp) {
10572   ShortBranchVerifier sbv(this);
10573   assert(UseSSE42Intrinsics, "SSE4.2 is required");
10574   //
10575   // int_cnt2 is the length of a small (< 8 chars) constant substring
10576   // or (-1) for a non-constant substring, in which case its length
10577   // is in the cnt2 register.
10578   //
10579   // Note, inline_string_indexOf() generates checks:
10580   // if (substr.count > string.count) return -1;
10581   // if (substr.count == 0) return 0;
10582   //
10583   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < 8), "should be != 0");
10584 
10585   // This method uses the pcmpestri instruction with bound registers
10586   //   inputs:
10587   //     xmm - substring
10588   //     rax - substring length (elements count)
10589   //     mem - scanned string
10590   //     rdx - string length (elements count)
10591   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
10592   //   outputs:
10593   //     rcx - matched index in string
10594   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
10595 
10596   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
10597         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
10598         FOUND_CANDIDATE;
10599 
10600   { //========================================================
10601     // We don't know where these strings are located
10602     // and we can't read beyond them. Load them through the stack.
10603     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
10604 
10605     movptr(tmp, rsp); // save old SP
10606 
10607     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
10608       if (int_cnt2 == 1) {  // One char
10609         load_unsigned_short(result, Address(str2, 0));
10610         movdl(vec, result); // move 32 bits
10611       } else if (int_cnt2 == 2) { // Two chars
10612         movdl(vec, Address(str2, 0)); // move 32 bits
10613       } else if (int_cnt2 == 4) { // Four chars
10614         movq(vec, Address(str2, 0));  // move 64 bits
10615       } else { // cnt2 = { 3, 5, 6, 7 }
10616         // Array header size is 12 bytes in 32-bit VM
10617         // + 6 bytes for 3 chars == 18 bytes,
10618         // enough space to load vec and shift.
10619         assert(HeapWordSize*typeArrayKlass::header_size() >= 12,"sanity");
10620         movdqu(vec, Address(str2, (int_cnt2*2)-16));
10621         psrldq(vec, 16-(int_cnt2*2));
10622       }
10623     } else { // not constant substring
10624       cmpl(cnt2, 8);
10625       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
10626 
10627       // We can read beyond the string if str+16 does not cross a page boundary,
10628       // since heaps are aligned and mapped by pages.
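            // i.e. if (str2 & (page_size-1)) <= page_size-16, a 16-byte movdqu
            // from str2 stays within the page and cannot fault.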
10629       assert(os::vm_page_size() < (int)G, "default page should be small");
10630       movl(result, str2); // We need only low 32 bits
10631       andl(result, (os::vm_page_size()-1));
10632       cmpl(result, (os::vm_page_size()-16));
10633       jccb(Assembler::belowEqual, CHECK_STR);
10634 
10635       // Move small strings to the stack to allow loading 16 bytes into vec.
10636       subptr(rsp, 16);
10637       int stk_offset = wordSize-2;
10638       push(cnt2);
10639 
10640       bind(COPY_SUBSTR);
10641       load_unsigned_short(result, Address(str2, cnt2, Address::times_2, -2));
10642       movw(Address(rsp, cnt2, Address::times_2, stk_offset), result);
10643       decrement(cnt2);
10644       jccb(Assembler::notZero, COPY_SUBSTR);
10645 
10646       pop(cnt2);
10647       movptr(str2, rsp);  // New substring address
10648     } // non constant
10649 
10650     bind(CHECK_STR);
10651     cmpl(cnt1, 8);
10652     jccb(Assembler::aboveEqual, BIG_STRINGS);
10653 
10654     // Check cross page boundary.
10655     movl(result, str1); // We need only low 32 bits
10656     andl(result, (os::vm_page_size()-1));
10657     cmpl(result, (os::vm_page_size()-16));
10658     jccb(Assembler::belowEqual, BIG_STRINGS);
10659 
10660     subptr(rsp, 16);
10661     int stk_offset = -2;
10662     if (int_cnt2 < 0) { // not constant
10663       push(cnt2);
10664       stk_offset += wordSize;
10665     }
10666     movl(cnt2, cnt1);
10667 
10668     bind(COPY_STR);
10669     load_unsigned_short(result, Address(str1, cnt2, Address::times_2, -2));
10670     movw(Address(rsp, cnt2, Address::times_2, stk_offset), result);
10671     decrement(cnt2);
10672     jccb(Assembler::notZero, COPY_STR);
10673 
10674     if (int_cnt2 < 0) { // not constant
10675       pop(cnt2);
10676     }
10677     movptr(str1, rsp);  // New string address
10678 
10679     bind(BIG_STRINGS);
10680     // Load substring.
10681     if (int_cnt2 < 0) { // -1
10682       movdqu(vec, Address(str2, 0));
10683       push(cnt2);       // substr count
10684       push(str2);       // substr addr
10685       push(str1);       // string addr
10686     } else {
10687       // Small (< 8 chars) constant substrings are loaded already.
10688       movl(cnt2, int_cnt2);
10689     }
10690     push(tmp);  // original SP
10691 
10692   } // Finished loading
10693 
10694   //========================================================
10695   // Start search
10696   //
10697 
10698   movptr(result, str1); // string addr
10699 
10700   if (int_cnt2  < 0) {  // Only for non constant substring
10701     jmpb(SCAN_TO_SUBSTR);
10702 
10703     // SP saved at sp+0
10704     // String saved at sp+1*wordSize
10705     // Substr saved at sp+2*wordSize
10706     // Substr count saved at sp+3*wordSize
10707 
10708     // Reload substr for rescan; this code
10709     // is executed only for large substrings (> 8 chars).
10710     bind(RELOAD_SUBSTR);
10711     movptr(str2, Address(rsp, 2*wordSize));
10712     movl(cnt2, Address(rsp, 3*wordSize));
10713     movdqu(vec, Address(str2, 0));
10714     // We came here after the beginning of the substring was
10715     // matched but the rest of it was not, so we need to search
10716     // again. Start from the next element after the previous match.
10717     subptr(str1, result); // Restore counter
10718     shrl(str1, 1);
10719     addl(cnt1, str1);
10720     decrementl(cnt1);   // Shift to next element
10721     cmpl(cnt1, cnt2);
10722     jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
10723 
10724     addptr(result, 2);
10725   } // non constant
10726 
10727   // Scan string for start of substr in 16-byte vectors
10728   bind(SCAN_TO_SUBSTR);
10729   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
10730   pcmpestri(vec, Address(result, 0), 0x0d);
10731   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
10732   subl(cnt1, 8);
10733   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
10734   cmpl(cnt1, cnt2);
10735   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
10736   addptr(result, 16);
10737 
10738   bind(ADJUST_STR);
10739   cmpl(cnt1, 8); // Do not read beyond string
10740   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
10741   // Back-up string to avoid reading beyond string.
10742   lea(result, Address(result, cnt1, Address::times_2, -16));
10743   movl(cnt1, 8);
10744   jmpb(SCAN_TO_SUBSTR);
10745 
10746   // Found a potential substr
10747   bind(FOUND_CANDIDATE);
10748   // After pcmpestri tmp(rcx) contains matched element index
10749 
10750   // Make sure string is still long enough
10751   subl(cnt1, tmp);
10752   cmpl(cnt1, cnt2);
10753   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
10754   // Left less than substring.
10755 
10756   bind(RET_NOT_FOUND);
10757   movl(result, -1);
10758   jmpb(CLEANUP);
10759 
10760   bind(FOUND_SUBSTR);
10761   // Compute start addr of substr
10762   lea(result, Address(result, tmp, Address::times_2));
10763 
10764   if (int_cnt2 > 0) { // Constant substring
10765     // Repeat search for small substring (< 8 chars)
10766     // from new point without reloading substring.
10767     // Have to check that we don't read beyond string.
10768     cmpl(tmp, 8-int_cnt2);
10769     jccb(Assembler::greater, ADJUST_STR);
10770     // Fall through if matched whole substring.
10771   } else { // non constant
10772     assert(int_cnt2 == -1, "should be != 0");
10773 
10774     addl(tmp, cnt2);
10775     // Found result if we matched whole substring.
10776     cmpl(tmp, 8);
10777     jccb(Assembler::lessEqual, RET_FOUND);
10778 
10779     // Repeat search for small substring (<= 8 chars)
10780     // from new point 'str1' without reloading substring.
10781     cmpl(cnt2, 8);
10782     // Have to check that we don't read beyond string.
10783     jccb(Assembler::lessEqual, ADJUST_STR);
10784 
10785     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
10786     // Compare the rest of substring (> 8 chars).
10787     movptr(str1, result);
10788 
10789     cmpl(tmp, cnt2);
10790     // First 8 chars are already matched.
10791     jccb(Assembler::equal, CHECK_NEXT);
10792 
10793     bind(SCAN_SUBSTR);
10794     pcmpestri(vec, Address(str1, 0), 0x0d);
10795     // Need to reload string pointers if we did not match the whole vector
10796     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
10797 
10798     bind(CHECK_NEXT);
10799     subl(cnt2, 8);
10800     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
10801     addptr(str1, 16);
10802     addptr(str2, 16);
10803     subl(cnt1, 8);
10804     cmpl(cnt2, 8); // Do not read beyond substring
10805     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
10806     // Back-up strings to avoid reading beyond substring.
10807     lea(str2, Address(str2, cnt2, Address::times_2, -16));
10808     lea(str1, Address(str1, cnt2, Address::times_2, -16));
10809     subl(cnt1, cnt2);
10810     movl(cnt2, 8);
10811     addl(cnt1, 8);
10812     bind(CONT_SCAN_SUBSTR);
10813     movdqu(vec, Address(str2, 0));
10814     jmpb(SCAN_SUBSTR);
10815 
10816     bind(RET_FOUND_LONG);
10817     movptr(str1, Address(rsp, wordSize));
10818   } // non constant
10819 
10820   bind(RET_FOUND);
10821   // Compute substr offset
10822   subptr(result, str1);
10823   shrl(result, 1); // index
10824 
10825   bind(CLEANUP);
10826   pop(rsp); // restore SP
10827 
10828 } // string_indexof
10829 
10830 // Compare strings.
10831 void MacroAssembler::string_compare(Register str1, Register str2,
10832                                     Register cnt1, Register cnt2, Register result,
10833                                     XMMRegister vec1) {
10834   ShortBranchVerifier sbv(this);
10835   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
10836 
10837   // Compute the minimum of the string lengths and the
10838   // difference of the string lengths (stack).
10839   // Do the conditional move stuff
10840   movl(result, cnt1);
10841   subl(cnt1, cnt2);
10842   push(cnt1);
10843   cmov32(Assembler::lessEqual, cnt2, result);
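        // e.g. cnt1 == 5, cnt2 == 3: the difference 2 is pushed and the cmov
        // leaves cnt2 == 3 == min; with cnt1 == 3, cnt2 == 5 the cmov copies
        // the original cnt1, again yielding the minimum, 3.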
10844 
10845   // Is the minimum length zero?
10846   testl(cnt2, cnt2);
10847   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
10848 
10849   // Load first characters
10850   load_unsigned_short(result, Address(str1, 0));
10851   load_unsigned_short(cnt1, Address(str2, 0));
10852 
10853   // Compare first characters
10854   subl(result, cnt1);
10855   jcc(Assembler::notZero,  POP_LABEL);
10856   decrementl(cnt2);
10857   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
10858 
10859   {
10860     // Check after comparing first character to see if strings are equivalent
10861     Label LSkip2;
10862     // Check if the strings start at same location
10863     cmpptr(str1, str2);
10864     jccb(Assembler::notEqual, LSkip2);
10865 
10866     // Check if the length difference is zero (from stack)
10867     cmpl(Address(rsp, 0), 0x0);
10868     jcc(Assembler::equal,  LENGTH_DIFF_LABEL);
10869 
10870     // Strings might not be equivalent
10871     bind(LSkip2);
10872   }
10873 
10874   Address::ScaleFactor scale = Address::times_2;
10875   int stride = 8;
10876 
10877   // Advance to next element
10878   addptr(str1, 16/stride);
10879   addptr(str2, 16/stride);
10880 
10881   if (UseSSE42Intrinsics) {
10882     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
10883     int pcmpmask = 0x19;
10884     // Setup to compare 16-byte vectors
10885     movl(result, cnt2);
10886     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
10887     jccb(Assembler::zero, COMPARE_TAIL);
10888 
10889     lea(str1, Address(str1, result, scale));
10890     lea(str2, Address(str2, result, scale));
10891     negptr(result);
10892 
10893     // pcmpestri
10894     //   inputs:
10895     //     vec1- substring
10896     //     rax - negative string length (elements count)
10897     //     mem - scanned string
10898     //     rdx - string length (elements count)
10899     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
10900     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
10901     //   outputs:
10902     //     rcx - first mismatched element index
10903     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
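          // Both strings are addressed from their ends using the negative index
          // in 'result'; cnt2 counts down the vector portion, and any remainder
          // is handled by re-comparing the last 8 elements below.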
10904 
10905     bind(COMPARE_WIDE_VECTORS);
10906     movdqu(vec1, Address(str1, result, scale));
10907     pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
10908     // After pcmpestri cnt1(rcx) contains mismatched element index
10909 
10910     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
10911     addptr(result, stride);
10912     subptr(cnt2, stride);
10913     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
10914 
10915     // compare wide vectors tail
10916     testl(result, result);
10917     jccb(Assembler::zero, LENGTH_DIFF_LABEL);
10918 
10919     movl(cnt2, stride);
10920     movl(result, stride);
10921     negptr(result);
10922     movdqu(vec1, Address(str1, result, scale));
10923     pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
10924     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
10925 
10926     // Mismatched characters in the vectors
10927     bind(VECTOR_NOT_EQUAL);
10928     addptr(result, cnt1);
10929     movptr(cnt2, result);
10930     load_unsigned_short(result, Address(str1, cnt2, scale));
10931     load_unsigned_short(cnt1, Address(str2, cnt2, scale));
10932     subl(result, cnt1);
10933     jmpb(POP_LABEL);
10934 
10935     bind(COMPARE_TAIL); // limit is zero
10936     movl(cnt2, result);
10937     // Fallthru to tail compare
10938   }
10939 
10940   // Shift str2 and str1 to the end of the arrays, negate min
10941   lea(str1, Address(str1, cnt2, scale, 0));
10942   lea(str2, Address(str2, cnt2, scale, 0));
10943   negptr(cnt2);
10944 
10945   // Compare the rest of the elements
10946   bind(WHILE_HEAD_LABEL);
10947   load_unsigned_short(result, Address(str1, cnt2, scale, 0));
10948   load_unsigned_short(cnt1, Address(str2, cnt2, scale, 0));
10949   subl(result, cnt1);
10950   jccb(Assembler::notZero, POP_LABEL);
10951   increment(cnt2);
10952   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
10953 
10954   // Strings are equal up to min length.  Return the length difference.
10955   bind(LENGTH_DIFF_LABEL);
10956   pop(result);
10957   jmpb(DONE_LABEL);
10958 
10959   // Discard the stored length difference
10960   bind(POP_LABEL);
10961   pop(cnt1);
10962 
10963   // That's it
10964   bind(DONE_LABEL);
10965 }
10966 
10967 // Compare char[] arrays aligned to 4 bytes or substrings.
10968 void MacroAssembler::char_arrays_equals(bool is_array_equ, Register ary1, Register ary2,
10969                                         Register limit, Register result, Register chr,
10970                                         XMMRegister vec1, XMMRegister vec2) {
10971   ShortBranchVerifier sbv(this);
10972   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR;
10973 
10974   int length_offset  = arrayOopDesc::length_offset_in_bytes();
10975   int base_offset    = arrayOopDesc::base_offset_in_bytes(T_CHAR);
10976 
10977   // Check the input args
10978   cmpptr(ary1, ary2);
10979   jcc(Assembler::equal, TRUE_LABEL);
10980 
10981   if (is_array_equ) {
10982     // Need additional checks for arrays_equals.
10983     testptr(ary1, ary1);
10984     jcc(Assembler::zero, FALSE_LABEL);
10985     testptr(ary2, ary2);
10986     jcc(Assembler::zero, FALSE_LABEL);
10987 
10988     // Check the lengths
10989     movl(limit, Address(ary1, length_offset));
10990     cmpl(limit, Address(ary2, length_offset));
10991     jcc(Assembler::notEqual, FALSE_LABEL);
10992   }
10993 
10994   // count == 0
10995   testl(limit, limit);
10996   jcc(Assembler::zero, TRUE_LABEL);
10997 
10998   if (is_array_equ) {
10999     // Load array address
11000     lea(ary1, Address(ary1, base_offset));
11001     lea(ary2, Address(ary2, base_offset));
11002   }
11003 
11004   shll(limit, 1);      // byte count != 0
11005   movl(result, limit); // copy
11006 
11007   if (UseSSE42Intrinsics) {
11008     // With SSE4.2, use double quad vector compare
11009     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
11010 
11011     // Compare 16-byte vectors
11012     andl(result, 0x0000000e);  //   tail count (in bytes)
11013     andl(limit, 0xfffffff0);   // vector count (in bytes)
11014     jccb(Assembler::zero, COMPARE_TAIL);
11015 
11016     lea(ary1, Address(ary1, limit, Address::times_1));
11017     lea(ary2, Address(ary2, limit, Address::times_1));
11018     negptr(limit);
11019 
11020     bind(COMPARE_WIDE_VECTORS);
11021     movdqu(vec1, Address(ary1, limit, Address::times_1));
11022     movdqu(vec2, Address(ary2, limit, Address::times_1));
11023     pxor(vec1, vec2);
11024 
11025     ptest(vec1, vec1);
11026     jccb(Assembler::notZero, FALSE_LABEL);
11027     addptr(limit, 16);
11028     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
11029 
11030     testl(result, result);
11031     jccb(Assembler::zero, TRUE_LABEL);
11032 
11033     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
11034     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
11035     pxor(vec1, vec2);
11036 
11037     ptest(vec1, vec1);
11038     jccb(Assembler::notZero, FALSE_LABEL);
11039     jmpb(TRUE_LABEL);
11040 
11041     bind(COMPARE_TAIL); // limit is zero
11042     movl(limit, result);
11043     // Fallthru to tail compare
11044   }
11045 
11046   // Compare 4-byte vectors
11047   andl(limit, 0xfffffffc); // vector count (in bytes)
11048   jccb(Assembler::zero, COMPARE_CHAR);
11049 
11050   lea(ary1, Address(ary1, limit, Address::times_1));
11051   lea(ary2, Address(ary2, limit, Address::times_1));
11052   negptr(limit);
11053 
11054   bind(COMPARE_VECTORS);
11055   movl(chr, Address(ary1, limit, Address::times_1));
11056   cmpl(chr, Address(ary2, limit, Address::times_1));
11057   jccb(Assembler::notEqual, FALSE_LABEL);
11058   addptr(limit, 4);
11059   jcc(Assembler::notZero, COMPARE_VECTORS);
11060 
11061   // Compare trailing char (final 2 bytes), if any
11062   bind(COMPARE_CHAR);
  testl(result, 0x2);   // tail char
  jccb(Assembler::zero, TRUE_LABEL);
  load_unsigned_short(chr, Address(ary1, 0));
  load_unsigned_short(limit, Address(ary2, 0));
  cmpl(chr, limit);
  jccb(Assembler::notEqual, FALSE_LABEL);

  bind(TRUE_LABEL);
  movl(result, 1);   // return true
  jmpb(DONE);

  bind(FALSE_LABEL);
  xorl(result, result); // return false

  // That's it
  bind(DONE);
}

void MacroAssembler::generate_fill(BasicType t, bool aligned,
                                   Register to, Register value, Register count,
                                   Register rtmp, XMMRegister xtmp) {
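  // Strategy: replicate the fill value across 32 bits, align the
  // destination when needed, fill 32-byte chunks (with SSE2 stores when
  // available), then mop up the trailing 4-, 2- and 1-byte remainders.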
  ShortBranchVerifier sbv(this);
  assert_different_registers(to, value, count, rtmp);
  Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
  Label L_fill_2_bytes, L_fill_4_bytes;

  int shift = -1;
  switch (t) {
    case T_BYTE:
      shift = 2;
      break;
    case T_SHORT:
      shift = 1;
      break;
    case T_INT:
      shift = 0;
      break;
    default: ShouldNotReachHere();
  }
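  // shift relates element counts to 4-byte units: (n << shift) elements
  // occupy n * 4 bytes (T_BYTE: 4 per unit, T_SHORT: 2, T_INT: 1).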

  if (t == T_BYTE) {
    andl(value, 0xff);
    movl(rtmp, value);
    shll(rtmp, 8);
    orl(value, rtmp);
  }
  if (t == T_SHORT) {
    andl(value, 0xffff);
  }
  if (t == T_BYTE || t == T_SHORT) {
    movl(rtmp, value);
    shll(rtmp, 16);
    orl(value, rtmp);
  }
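  // value now holds the pattern replicated across 32 bits; e.g. a T_BYTE
  // fill of 0x41 becomes 0x41414141, a T_SHORT fill of 0x1234 becomes
  // 0x12341234.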

  cmpl(count, 2<<shift); // Short arrays (< 8 bytes) fill by element
  jcc(Assembler::below, L_fill_4_bytes); // use unsigned cmp
  if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
    // align the destination address to a 4-byte boundary
    if (t == T_BYTE) {
      // One-byte misalignment happens only for byte arrays
      testptr(to, 1);
      jccb(Assembler::zero, L_skip_align1);
      movb(Address(to, 0), value);
      increment(to);
      decrement(count);
      BIND(L_skip_align1);
    }
    // Two-byte misalignment happens only for byte and short (char) arrays
    testptr(to, 2);
    jccb(Assembler::zero, L_skip_align2);
    movw(Address(to, 0), value);
    addptr(to, 2);
    subl(count, 1<<(shift-1));
    BIND(L_skip_align2);
  }
  if (UseSSE < 2) {
    Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
    // Fill 32-byte chunks
    subl(count, 8 << shift);
    jcc(Assembler::less, L_check_fill_8_bytes);
    align(16);

    BIND(L_fill_32_bytes_loop);

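    // unrolled: eight 4-byte stores per iteration fill one 32-byte chunk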
    for (int i = 0; i < 32; i += 4) {
      movl(Address(to, i), value);
    }

    addptr(to, 32);
    subl(count, 8 << shift);
    jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
    BIND(L_check_fill_8_bytes);
    addl(count, 8 << shift);
    jccb(Assembler::zero, L_exit);
    jmpb(L_fill_8_bytes);

    //
    // length is too short, just fill qwords
    //
    BIND(L_fill_8_bytes_loop);
    movl(Address(to, 0), value);
    movl(Address(to, 4), value);
    addptr(to, 8);
    BIND(L_fill_8_bytes);
    subl(count, 1 << (shift + 1));
    jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
    // fall through to fill 4 bytes
  } else {
    Label L_fill_32_bytes;
    if (!UseUnalignedLoadStores) {
      // align to 8 bytes; we know we are 4-byte aligned to start
      testptr(to, 4);
      jccb(Assembler::zero, L_fill_32_bytes);
      movl(Address(to, 0), value);
      addptr(to, 4);
      subl(count, 1<<shift);
    }
    BIND(L_fill_32_bytes);
    {
      assert(UseSSE >= 2, "supported cpu only");
      Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
      // Fill 32-byte chunks
      movdl(xtmp, value);
      pshufd(xtmp, xtmp, 0);
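      // xtmp now holds the 32-bit pattern broadcast to all four dword lanes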

      subl(count, 8 << shift);
      jcc(Assembler::less, L_check_fill_8_bytes);
      align(16);

      BIND(L_fill_32_bytes_loop);

      if (UseUnalignedLoadStores) {
        movdqu(Address(to, 0), xtmp);
        movdqu(Address(to, 16), xtmp);
      } else {
        movq(Address(to, 0), xtmp);
        movq(Address(to, 8), xtmp);
        movq(Address(to, 16), xtmp);
        movq(Address(to, 24), xtmp);
      }

      addptr(to, 32);
      subl(count, 8 << shift);
      jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
      BIND(L_check_fill_8_bytes);
      addl(count, 8 << shift);
      jccb(Assembler::zero, L_exit);
      jmpb(L_fill_8_bytes);

      //
      // length is too short, just fill qwords
      //
      BIND(L_fill_8_bytes_loop);
      movq(Address(to, 0), xtmp);
      addptr(to, 8);
      BIND(L_fill_8_bytes);
      subl(count, 1 << (shift + 1));
      jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
    }
  }
  // fill trailing 4 bytes
  BIND(L_fill_4_bytes);
  testl(count, 1<<shift);
  jccb(Assembler::zero, L_fill_2_bytes);
  movl(Address(to, 0), value);
  if (t == T_BYTE || t == T_SHORT) {
    addptr(to, 4);
    BIND(L_fill_2_bytes);
    // fill trailing 2 bytes
    testl(count, 1<<(shift-1));
    jccb(Assembler::zero, L_fill_byte);
    movw(Address(to, 0), value);
    if (t == T_BYTE) {
      addptr(to, 2);
      BIND(L_fill_byte);
      // fill trailing byte
      testl(count, 1);
      jccb(Assembler::zero, L_exit);
      movb(Address(to, 0), value);
    } else {
      BIND(L_fill_byte);
    }
  } else {
    BIND(L_fill_2_bytes);
  }
  BIND(L_exit);
}
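
// A typical call site (a sketch; register choices are illustrative) from a
// fill stub generator:
//   __ generate_fill(T_BYTE, /*aligned*/ false, to, value, count, rax, xmm0);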
#undef BIND
#undef BLOCK_COMMENT


Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
  switch (cond) {
    // Note: some conditions are synonyms for others
    case Assembler::zero:         return Assembler::notZero;
    case Assembler::notZero:      return Assembler::zero;
    case Assembler::less:         return Assembler::greaterEqual;
    case Assembler::lessEqual:    return Assembler::greater;
    case Assembler::greater:      return Assembler::lessEqual;
    case Assembler::greaterEqual: return Assembler::less;
    case Assembler::below:        return Assembler::aboveEqual;
    case Assembler::belowEqual:   return Assembler::above;
    case Assembler::above:        return Assembler::belowEqual;
    case Assembler::aboveEqual:   return Assembler::below;
    case Assembler::overflow:     return Assembler::noOverflow;
    case Assembler::noOverflow:   return Assembler::overflow;
    case Assembler::negative:     return Assembler::positive;
    case Assembler::positive:     return Assembler::negative;
    case Assembler::parity:       return Assembler::noParity;
    case Assembler::noParity:     return Assembler::parity;
  }
  ShouldNotReachHere(); return Assembler::overflow;
}

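// SkipIfEqual is an RAII guard: the constructor emits a compare of a bool
// flag in memory against the given value plus a conditional branch over
// whatever code is emitted while the guard is live; the destructor binds
// the branch target.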
SkipIfEqual::SkipIfEqual(
    MacroAssembler* masm, const bool* flag_addr, bool value) {
  _masm = masm;
  _masm->cmp8(ExternalAddress((address)flag_addr), value);
  _masm->jcc(Assembler::equal, _label);
}

SkipIfEqual::~SkipIfEqual() {
  _masm->bind(_label);
}
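
// A minimal usage sketch (flag name for illustration only):
//   {
//     SkipIfEqual skip(_masm, &DTraceAllocProbes, false);
//     // probe code emitted here is skipped when the flag is false
//   }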