1 /*
   2  * Copyright (c) 2009, 2014, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  */
  23 package com.oracle.graal.asm.amd64;
  24 
  25 import static com.oracle.graal.amd64.AMD64.*;
  26 import static com.oracle.graal.api.code.MemoryBarriers.*;
  27 import static com.oracle.graal.asm.NumUtil.*;
  28 import static com.oracle.graal.asm.amd64.AMD64AsmOptions.*;
  29 
  30 import com.oracle.graal.amd64.*;
  31 import com.oracle.graal.amd64.AMD64.CPUFeature;
  32 import com.oracle.graal.api.code.*;
  33 import com.oracle.graal.asm.*;
  34 
  35 /**
  36  * This class implements an assembler that can encode most X86 instructions.
  37  */
  38 public class AMD64Assembler extends Assembler {
  39 
  40     private static final int MinEncodingNeedsRex = 8;
  41 
  42     /**
  43      * A sentinel value used as a place holder in an instruction stream for an address that will be
  44      * patched.
  45      */
  46     private static final AMD64Address Placeholder = new AMD64Address(rip);
  47 
  48     /**
  49      * The x86 condition codes used for conditional jumps/moves.
  50      */
  51     public enum ConditionFlag {
  52         Zero(0x4, "|zero|"),
  53         NotZero(0x5, "|nzero|"),
  54         Equal(0x4, "="),
  55         NotEqual(0x5, "!="),
  56         Less(0xc, "<"),
  57         LessEqual(0xe, "<="),
  58         Greater(0xf, ">"),
  59         GreaterEqual(0xd, ">="),
  60         Below(0x2, "|<|"),
  61         BelowEqual(0x6, "|<=|"),
  62         Above(0x7, "|>|"),
  63         AboveEqual(0x3, "|>=|"),
  64         Overflow(0x0, "|of|"),
  65         NoOverflow(0x1, "|nof|"),
  66         CarrySet(0x2, "|carry|"),
  67         CarryClear(0x3, "|ncarry|"),
  68         Negative(0x8, "|neg|"),
  69         Positive(0x9, "|pos|"),
  70         Parity(0xa, "|par|"),
  71         NoParity(0xb, "|npar|");
  72 
  73         private final int value;
  74         private final String operator;
  75 
  76         private ConditionFlag(int value, String operator) {
  77             this.value = value;
  78             this.operator = operator;
  79         }
  80 
  81         public ConditionFlag negate() {
  82             switch (this) {
  83                 case Zero:
  84                     return NotZero;
  85                 case NotZero:
  86                     return Zero;
  87                 case Equal:
  88                     return NotEqual;
  89                 case NotEqual:
  90                     return Equal;
  91                 case Less:
  92                     return GreaterEqual;
  93                 case LessEqual:
  94                     return Greater;
  95                 case Greater:
  96                     return LessEqual;
  97                 case GreaterEqual:
  98                     return Less;
  99                 case Below:
 100                     return AboveEqual;
 101                 case BelowEqual:
 102                     return Above;
 103                 case Above:
 104                     return BelowEqual;
 105                 case AboveEqual:
 106                     return Below;
 107                 case Overflow:
 108                     return NoOverflow;
 109                 case NoOverflow:
 110                     return Overflow;
 111                 case CarrySet:
 112                     return CarryClear;
 113                 case CarryClear:
 114                     return CarrySet;
 115                 case Negative:
 116                     return Positive;
 117                 case Positive:
 118                     return Negative;
 119                 case Parity:
 120                     return NoParity;
 121                 case NoParity:
 122                     return Parity;
 123             }
 124             throw new IllegalArgumentException();
 125         }
 126 
 127         public int getValue() {
 128             return value;
 129         }
 130 
 131         @Override
 132         public String toString() {
 133             return operator;
 134         }
 135     }
 136 
 137     /**
 138      * Constants for X86 prefix bytes.
 139      */
 140     private static class Prefix {
 141 
 142         private static final int REX = 0x40;
 143         private static final int REXB = 0x41;
 144         private static final int REXX = 0x42;
 145         private static final int REXXB = 0x43;
 146         private static final int REXR = 0x44;
 147         private static final int REXRB = 0x45;
 148         private static final int REXRX = 0x46;
 149         private static final int REXRXB = 0x47;
 150         private static final int REXW = 0x48;
 151         private static final int REXWB = 0x49;
 152         private static final int REXWX = 0x4A;
 153         private static final int REXWXB = 0x4B;
 154         private static final int REXWR = 0x4C;
 155         private static final int REXWRB = 0x4D;
 156         private static final int REXWRX = 0x4E;
 157         private static final int REXWRXB = 0x4F;
 158     }
 159 
 160     /**
 161      * The register to which {@link Register#Frame} and {@link Register#CallerFrame} are bound.
 162      */
 163     public final Register frameRegister;
 164 
 165     /**
 166      * Constructs an assembler for the AMD64 architecture.
 167      *
 168      * @param registerConfig the register configuration used to bind {@link Register#Frame} and
 169      *            {@link Register#CallerFrame} to physical registers. This value can be null if this
 170      *            assembler instance will not be used to assemble instructions using these logical
 171      *            registers.
 172      */
 173     public AMD64Assembler(TargetDescription target, RegisterConfig registerConfig) {
 174         super(target);
 175         this.frameRegister = registerConfig == null ? null : registerConfig.getFrameRegister();
 176     }
 177 
 178     private boolean supports(CPUFeature feature) {
 179         return ((AMD64) target.arch).getFeatures().contains(feature);
 180     }
 181 
 182     private static int encode(Register r) {
 183         assert r.encoding < 16 && r.encoding >= 0 : "encoding out of range: " + r.encoding;
 184         return r.encoding & 0x7;
 185     }
 186 
 187     private void emitArithImm8(int op, Register dst, int imm8) {
 188         int encode = prefixAndEncode(op, false, dst.encoding, true);
 189         emitByte(0x80);
 190         emitByte(0xC0 | encode);
 191         emitByte(imm8);
 192     }
 193 
 194     private void emitArithImm16(int op, Register dst, int imm16) {
 195         emitByte(0x66);
 196         int encode = prefixAndEncode(op, dst.encoding);
 197         if (isByte(imm16)) {
 198             emitByte(0x83); // imm8 sign extend
 199             emitByte(0xC0 | encode);
 200             emitByte(imm16 & 0xFF);
 201         } else {
 202             emitByte(0x81);
 203             emitByte(0xC0 | encode);
 204             emitShort(imm16);
 205         }
 206     }
 207 
 208     private void emitArithImm32(int op, Register dst, int imm32) {
 209         int encode = prefixAndEncode(op, dst.encoding);
 210         if (isByte(imm32)) {
 211             emitByte(0x83); // imm8 sign extend
 212             emitByte(0xC0 | encode);
 213             emitByte(imm32 & 0xFF);
 214         } else {
 215             emitByte(0x81);
 216             emitByte(0xC0 | encode);
 217             emitInt(imm32);
 218         }
 219     }
 220 
 221     private void emitArithImm32q(int op, Register dst, int imm32) {
 222         emitArithImm32q(op, dst, imm32, false);
 223     }
 224 
 225     private void emitArithImm32q(int op, Register dst, int imm32, boolean force32Imm) {
 226         int encode = prefixqAndEncode(op, dst.encoding);
 227         if (isByte(imm32) && !force32Imm) {
 228             emitByte(0x83); // imm8 sign extend
 229             emitByte(0xC0 | encode);
 230             emitByte(imm32 & 0xFF);
 231         } else {
 232             emitByte(0x81);
 233             emitByte(0xC0 | encode);
 234             emitInt(imm32);
 235         }
 236     }
 237 
 238     // immediate-to-memory forms
 239     private void emitArithImm8(int op, AMD64Address adr, int imm8) {
 240         prefix(adr);
 241         emitByte(0x80);
 242         emitOperandHelper(op, adr);
 243         emitByte(imm8);
 244     }
 245 
 246     private void emitArithImm16(int op, AMD64Address adr, int imm16) {
 247         emitByte(0x66);
 248         prefix(adr);
 249         if (isByte(imm16)) {
 250             emitByte(0x83); // imm8 sign extend
 251             emitOperandHelper(op, adr);
 252             emitByte(imm16 & 0xFF);
 253         } else {
 254             emitByte(0x81);
 255             emitOperandHelper(op, adr);
 256             emitShort(imm16);
 257         }
 258     }
 259 
 260     private void emitArithImm32(int op, AMD64Address adr, int imm32) {
 261         prefix(adr);
 262         if (isByte(imm32)) {
 263             emitByte(0x83); // imm8 sign extend
 264             emitOperandHelper(op, adr);
 265             emitByte(imm32 & 0xFF);
 266         } else {
 267             emitByte(0x81);
 268             emitOperandHelper(op, adr);
 269             emitInt(imm32);
 270         }
 271     }
 272 
 273     protected void emitOperandHelper(Register reg, AMD64Address addr) {
 274         assert !reg.equals(Register.None);
 275         emitOperandHelper(encode(reg), addr);
 276     }
 277 
 278     protected void emitOperandHelper(int reg, AMD64Address addr) {
 279         assert (reg & 0x07) == reg;
 280         int regenc = reg << 3;
 281 
 282         Register base = addr.getBase();
 283         Register index = addr.getIndex();
 284 
 285         AMD64Address.Scale scale = addr.getScale();
 286         int disp = addr.getDisplacement();
 287 
 288         if (base.equals(Register.Frame)) {
 289             assert frameRegister != null : "cannot use register " + Register.Frame + " in assembler with null register configuration";
 290             base = frameRegister;
 291         }
 292 
 293         if (base.equals(AMD64.rip)) { // also matches Placeholder
 294             // [00 000 101] disp32
 295             assert index.equals(Register.None) : "cannot use RIP relative addressing with index register";
 296             emitByte(0x05 | regenc);
 297             emitInt(disp);
 298         } else if (base.isValid()) {
 299             int baseenc = base.isValid() ? encode(base) : 0;
 300             if (index.isValid()) {
 301                 int indexenc = encode(index) << 3;
 302                 // [base + indexscale + disp]
 303                 if (disp == 0 && !base.equals(rbp) && !base.equals(r13)) {
 304                     // [base + indexscale]
 305                     // [00 reg 100][ss index base]
 306                     assert !index.equals(rsp) : "illegal addressing mode";
 307                     emitByte(0x04 | regenc);
 308                     emitByte(scale.log2 << 6 | indexenc | baseenc);
 309                 } else if (isByte(disp)) {
 310                     // [base + indexscale + imm8]
 311                     // [01 reg 100][ss index base] imm8
 312                     assert !index.equals(rsp) : "illegal addressing mode";
 313                     emitByte(0x44 | regenc);
 314                     emitByte(scale.log2 << 6 | indexenc | baseenc);
 315                     emitByte(disp & 0xFF);
 316                 } else {
 317                     // [base + indexscale + disp32]
 318                     // [10 reg 100][ss index base] disp32
 319                     assert !index.equals(rsp) : "illegal addressing mode";
 320                     emitByte(0x84 | regenc);
 321                     emitByte(scale.log2 << 6 | indexenc | baseenc);
 322                     emitInt(disp);
 323                 }
 324             } else if (base.equals(rsp) || base.equals(r12)) {
 325                 // [rsp + disp]
 326                 if (disp == 0) {
 327                     // [rsp]
 328                     // [00 reg 100][00 100 100]
 329                     emitByte(0x04 | regenc);
 330                     emitByte(0x24);
 331                 } else if (isByte(disp)) {
 332                     // [rsp + imm8]
 333                     // [01 reg 100][00 100 100] disp8
 334                     emitByte(0x44 | regenc);
 335                     emitByte(0x24);
 336                     emitByte(disp & 0xFF);
 337                 } else {
 338                     // [rsp + imm32]
 339                     // [10 reg 100][00 100 100] disp32
 340                     emitByte(0x84 | regenc);
 341                     emitByte(0x24);
 342                     emitInt(disp);
 343                 }
 344             } else {
 345                 // [base + disp]
 346                 assert !base.equals(rsp) && !base.equals(r12) : "illegal addressing mode";
 347                 if (disp == 0 && !base.equals(rbp) && !base.equals(r13)) {
 348                     // [base]
 349                     // [00 reg base]
 350                     emitByte(0x00 | regenc | baseenc);
 351                 } else if (isByte(disp)) {
 352                     // [base + disp8]
 353                     // [01 reg base] disp8
 354                     emitByte(0x40 | regenc | baseenc);
 355                     emitByte(disp & 0xFF);
 356                 } else {
 357                     // [base + disp32]
 358                     // [10 reg base] disp32
 359                     emitByte(0x80 | regenc | baseenc);
 360                     emitInt(disp);
 361                 }
 362             }
 363         } else {
 364             if (index.isValid()) {
 365                 int indexenc = encode(index) << 3;
 366                 // [indexscale + disp]
 367                 // [00 reg 100][ss index 101] disp32
 368                 assert !index.equals(rsp) : "illegal addressing mode";
 369                 emitByte(0x04 | regenc);
 370                 emitByte(scale.log2 << 6 | indexenc | 0x05);
 371                 emitInt(disp);
 372             } else {
 373                 // [disp] ABSOLUTE
 374                 // [00 reg 100][00 100 101] disp32
 375                 emitByte(0x04 | regenc);
 376                 emitByte(0x25);
 377                 emitInt(disp);
 378             }
 379         }
 380     }
 381 
 382     public final void addl(AMD64Address dst, int imm32) {
 383         emitArithImm32(0, dst, imm32);
 384     }
 385 
 386     public final void addl(Register dst, int imm32) {
 387         emitArithImm32(0, dst, imm32);
 388     }
 389 
 390     public final void addl(Register dst, AMD64Address src) {
 391         prefix(src, dst);
 392         emitByte(0x03);
 393         emitOperandHelper(dst, src);
 394     }
 395 
 396     public final void addl(Register dst, Register src) {
 397         int encode = prefixAndEncode(dst.encoding, src.encoding);
 398         emitByte(0x03);
 399         emitByte(0xC0 | encode);
 400     }
 401 
 402     private void addrNop4() {
 403         // 4 bytes: NOP DWORD PTR [EAX+0]
 404         emitByte(0x0F);
 405         emitByte(0x1F);
 406         emitByte(0x40); // emitRm(cbuf, 0x1, EAXEnc, EAXEnc);
 407         emitByte(0); // 8-bits offset (1 byte)
 408     }
 409 
 410     private void addrNop5() {
 411         // 5 bytes: NOP DWORD PTR [EAX+EAX*0+0] 8-bits offset
 412         emitByte(0x0F);
 413         emitByte(0x1F);
 414         emitByte(0x44); // emitRm(cbuf, 0x1, EAXEnc, 0x4);
 415         emitByte(0x00); // emitRm(cbuf, 0x0, EAXEnc, EAXEnc);
 416         emitByte(0); // 8-bits offset (1 byte)
 417     }
 418 
 419     private void addrNop7() {
 420         // 7 bytes: NOP DWORD PTR [EAX+0] 32-bits offset
 421         emitByte(0x0F);
 422         emitByte(0x1F);
 423         emitByte(0x80); // emitRm(cbuf, 0x2, EAXEnc, EAXEnc);
 424         emitInt(0); // 32-bits offset (4 bytes)
 425     }
 426 
 427     private void addrNop8() {
 428         // 8 bytes: NOP DWORD PTR [EAX+EAX*0+0] 32-bits offset
 429         emitByte(0x0F);
 430         emitByte(0x1F);
 431         emitByte(0x84); // emitRm(cbuf, 0x2, EAXEnc, 0x4);
 432         emitByte(0x00); // emitRm(cbuf, 0x0, EAXEnc, EAXEnc);
 433         emitInt(0); // 32-bits offset (4 bytes)
 434     }
 435 
 436     public final void addsd(Register dst, Register src) {
 437         assert dst.getRegisterCategory().equals(AMD64.XMM) && src.getRegisterCategory().equals(AMD64.XMM);
 438         emitByte(0xF2);
 439         int encode = prefixAndEncode(dst.encoding, src.encoding);
 440         emitByte(0x0F);
 441         emitByte(0x58);
 442         emitByte(0xC0 | encode);
 443     }
 444 
 445     public final void addsd(Register dst, AMD64Address src) {
 446         assert dst.getRegisterCategory().equals(AMD64.XMM);
 447         emitByte(0xF2);
 448         prefix(src, dst);
 449         emitByte(0x0F);
 450         emitByte(0x58);
 451         emitOperandHelper(dst, src);
 452     }
 453 
 454     public final void addss(Register dst, Register src) {
 455         assert dst.getRegisterCategory().equals(AMD64.XMM) && src.getRegisterCategory().equals(AMD64.XMM);
 456         emitByte(0xF3);
 457         int encode = prefixAndEncode(dst.encoding, src.encoding);
 458         emitByte(0x0F);
 459         emitByte(0x58);
 460         emitByte(0xC0 | encode);
 461     }
 462 
 463     public final void addss(Register dst, AMD64Address src) {
 464         assert dst.getRegisterCategory().equals(AMD64.XMM);
 465         emitByte(0xF3);
 466         prefix(src, dst);
 467         emitByte(0x0F);
 468         emitByte(0x58);
 469         emitOperandHelper(dst, src);
 470     }
 471 
 472     public final void andl(Register dst, int imm32) {
 473         emitArithImm32(4, dst, imm32);
 474     }
 475 
 476     public final void andl(Register dst, AMD64Address src) {
 477         prefix(src, dst);
 478         emitByte(0x23);
 479         emitOperandHelper(dst, src);
 480     }
 481 
 482     public final void andl(Register dst, Register src) {
 483         int encode = prefixAndEncode(dst.encoding, src.encoding);
 484         emitByte(0x23);
 485         emitByte(0xC0 | encode);
 486     }
 487 
 488     public final void bsfq(Register dst, Register src) {
 489         int encode = prefixqAndEncode(dst.encoding, src.encoding);
 490         emitByte(0x0F);
 491         emitByte(0xBC);
 492         emitByte(0xC0 | encode);
 493     }
 494 
 495     public final void bsfq(Register dst, AMD64Address src) {
 496         prefixq(src, dst);
 497         emitByte(0x0F);
 498         emitByte(0xBC);
 499         emitOperandHelper(dst, src);
 500     }
 501 
 502     public final void bsrq(Register dst, Register src) {
 503         int encode = prefixqAndEncode(dst.encoding, src.encoding);
 504         emitByte(0x0F);
 505         emitByte(0xBD);
 506         emitByte(0xC0 | encode);
 507     }
 508 
 509     public final void bsrq(Register dst, AMD64Address src) {
 510         prefixq(src, dst);
 511         emitByte(0x0F);
 512         emitByte(0xBD);
 513         emitOperandHelper(dst, src);
 514     }
 515 
 516     public final void bsrl(Register dst, Register src) {
 517         int encode = prefixAndEncode(dst.encoding, src.encoding);
 518         emitByte(0x0F);
 519         emitByte(0xBD);
 520         emitByte(0xC0 | encode);
 521     }
 522 
 523     public final void bsrl(Register dst, AMD64Address src) {
 524         prefix(src, dst);
 525         emitByte(0x0F);
 526         emitByte(0xBD);
 527         emitOperandHelper(dst, src);
 528     }
 529 
 530     public final void bswapl(Register reg) {
 531         int encode = prefixAndEncode(reg.encoding);
 532         emitByte(0x0F);
 533         emitByte(0xC8 | encode);
 534     }
 535 
 536     public final void cdql() {
 537         emitByte(0x99);
 538     }
 539 
 540     public final void cmovl(ConditionFlag cc, Register dst, Register src) {
 541         int encode = prefixAndEncode(dst.encoding, src.encoding);
 542         emitByte(0x0F);
 543         emitByte(0x40 | cc.getValue());
 544         emitByte(0xC0 | encode);
 545     }
 546 
 547     public final void cmovl(ConditionFlag cc, Register dst, AMD64Address src) {
 548         prefix(src, dst);
 549         emitByte(0x0F);
 550         emitByte(0x40 | cc.getValue());
 551         emitOperandHelper(dst, src);
 552     }
 553 
 554     public final void cmpb(Register dst, int imm8) {
 555         emitArithImm8(7, dst, imm8);
 556     }
 557 
 558     public final void cmpb(Register dst, Register src) {
 559         int encode = prefixAndEncode(dst.encoding, true, src.encoding, true);
 560         emitByte(0x3A);
 561         emitByte(0xC0 | encode);
 562     }
 563 
 564     public final void cmpb(Register dst, AMD64Address src) {
 565         prefix(src, dst, true);
 566         emitByte(0x3A);
 567         emitOperandHelper(dst, src);
 568     }
 569 
 570     public final void cmpb(AMD64Address dst, int imm8) {
 571         emitArithImm8(7, dst, imm8);
 572     }
 573 
 574     public final void cmpw(Register dst, int imm16) {
 575         emitArithImm16(7, dst, imm16);
 576     }
 577 
 578     public final void cmpw(Register dst, Register src) {
 579         emitByte(0x66);
 580         int encode = prefixAndEncode(dst.encoding, src.encoding);
 581         emitByte(0x3B);
 582         emitByte(0xC0 | encode);
 583     }
 584 
 585     public final void cmpw(Register dst, AMD64Address src) {
 586         emitByte(0x66);
 587         prefix(src, dst);
 588         emitByte(0x3B);
 589         emitOperandHelper(dst, src);
 590     }
 591 
 592     public final void cmpw(AMD64Address dst, int imm16) {
 593         emitArithImm16(7, dst, imm16);
 594     }
 595 
 596     public final void cmpl(Register dst, int imm32) {
 597         emitArithImm32(7, dst, imm32);
 598     }
 599 
 600     public final void cmpl(Register dst, Register src) {
 601         int encode = prefixAndEncode(dst.encoding, src.encoding);
 602         emitByte(0x3B);
 603         emitByte(0xC0 | encode);
 604     }
 605 
 606     public final void cmpl(Register dst, AMD64Address src) {
 607         prefix(src, dst);
 608         emitByte(0x3B);
 609         emitOperandHelper(dst, src);
 610     }
 611 
 612     public final void cmpl(AMD64Address dst, int imm32) {
 613         emitArithImm32(7, dst, imm32);
 614     }
 615 
 616     // The 32-bit cmpxchg compares the value at adr with the contents of X86.rax,
 617     // and stores reg into adr if so; otherwise, the value at adr is loaded into X86.rax,.
 618     // The ZF is set if the compared values were equal, and cleared otherwise.
 619     public final void cmpxchgl(Register reg, AMD64Address adr) { // cmpxchg
 620         prefix(adr, reg);
 621         emitByte(0x0F);
 622         emitByte(0xB1);
 623         emitOperandHelper(reg, adr);
 624     }
 625 
 626     public final void cvtsd2ss(Register dst, AMD64Address src) {
 627         assert dst.getRegisterCategory().equals(AMD64.XMM);
 628         emitByte(0xF2);
 629         prefix(src, dst);
 630         emitByte(0x0F);
 631         emitByte(0x5A);
 632         emitOperandHelper(dst, src);
 633     }
 634 
 635     public final void cvtsd2ss(Register dst, Register src) {
 636         assert dst.getRegisterCategory().equals(AMD64.XMM);
 637         assert src.getRegisterCategory().equals(AMD64.XMM);
 638         emitByte(0xF2);
 639         int encode = prefixAndEncode(dst.encoding, src.encoding);
 640         emitByte(0x0F);
 641         emitByte(0x5A);
 642         emitByte(0xC0 | encode);
 643     }
 644 
 645     public final void cvtsi2sdl(Register dst, AMD64Address src) {
 646         assert dst.getRegisterCategory().equals(AMD64.XMM);
 647         emitByte(0xF2);
 648         prefix(src, dst);
 649         emitByte(0x0F);
 650         emitByte(0x2A);
 651         emitOperandHelper(dst, src);
 652     }
 653 
 654     public final void cvtsi2sdl(Register dst, Register src) {
 655         assert dst.getRegisterCategory().equals(AMD64.XMM);
 656         emitByte(0xF2);
 657         int encode = prefixAndEncode(dst.encoding, src.encoding);
 658         emitByte(0x0F);
 659         emitByte(0x2A);
 660         emitByte(0xC0 | encode);
 661     }
 662 
 663     public final void cvtsi2ssl(Register dst, AMD64Address src) {
 664         assert dst.getRegisterCategory().equals(AMD64.XMM);
 665         emitByte(0xF3);
 666         prefix(src, dst);
 667         emitByte(0x0F);
 668         emitByte(0x2A);
 669         emitOperandHelper(dst, src);
 670     }
 671 
 672     public final void cvtsi2ssl(Register dst, Register src) {
 673         assert dst.getRegisterCategory().equals(AMD64.XMM);
 674         emitByte(0xF3);
 675         int encode = prefixAndEncode(dst.encoding, src.encoding);
 676         emitByte(0x0F);
 677         emitByte(0x2A);
 678         emitByte(0xC0 | encode);
 679     }
 680 
 681     public final void cvtss2sd(Register dst, AMD64Address src) {
 682         assert dst.getRegisterCategory().equals(AMD64.XMM);
 683         emitByte(0xF3);
 684         prefix(src, dst);
 685         emitByte(0x0F);
 686         emitByte(0x5A);
 687         emitOperandHelper(dst, src);
 688     }
 689 
 690     public final void cvtss2sd(Register dst, Register src) {
 691         assert dst.getRegisterCategory().equals(AMD64.XMM);
 692         assert src.getRegisterCategory().equals(AMD64.XMM);
 693         emitByte(0xF3);
 694         int encode = prefixAndEncode(dst.encoding, src.encoding);
 695         emitByte(0x0F);
 696         emitByte(0x5A);
 697         emitByte(0xC0 | encode);
 698     }
 699 
 700     public final void cvttsd2sil(Register dst, AMD64Address src) {
 701         emitByte(0xF2);
 702         prefix(src, dst);
 703         emitByte(0x0F);
 704         emitByte(0x2C);
 705         emitOperandHelper(dst, src);
 706     }
 707 
 708     public final void cvttsd2sil(Register dst, Register src) {
 709         assert src.getRegisterCategory().equals(AMD64.XMM);
 710         emitByte(0xF2);
 711         int encode = prefixAndEncode(dst.encoding, src.encoding);
 712         emitByte(0x0F);
 713         emitByte(0x2C);
 714         emitByte(0xC0 | encode);
 715     }
 716 
 717     public final void cvttss2sil(Register dst, AMD64Address src) {
 718         emitByte(0xF3);
 719         prefix(src, dst);
 720         emitByte(0x0F);
 721         emitByte(0x2C);
 722         emitOperandHelper(dst, src);
 723     }
 724 
 725     public final void cvttss2sil(Register dst, Register src) {
 726         assert src.getRegisterCategory().equals(AMD64.XMM);
 727         emitByte(0xF3);
 728         int encode = prefixAndEncode(dst.encoding, src.encoding);
 729         emitByte(0x0F);
 730         emitByte(0x2C);
 731         emitByte(0xC0 | encode);
 732     }
 733 
 734     protected final void decl(AMD64Address dst) {
 735         prefix(dst);
 736         emitByte(0xFF);
 737         emitOperandHelper(1, dst);
 738     }
 739 
 740     public final void divsd(Register dst, AMD64Address src) {
 741         assert dst.getRegisterCategory().equals(AMD64.XMM);
 742         emitByte(0xF2);
 743         prefix(src, dst);
 744         emitByte(0x0F);
 745         emitByte(0x5E);
 746         emitOperandHelper(dst, src);
 747     }
 748 
 749     public final void divsd(Register dst, Register src) {
 750         assert dst.getRegisterCategory().equals(AMD64.XMM);
 751         assert src.getRegisterCategory().equals(AMD64.XMM);
 752         emitByte(0xF2);
 753         int encode = prefixAndEncode(dst.encoding, src.encoding);
 754         emitByte(0x0F);
 755         emitByte(0x5E);
 756         emitByte(0xC0 | encode);
 757     }
 758 
 759     public final void divss(Register dst, AMD64Address src) {
 760         assert dst.getRegisterCategory().equals(AMD64.XMM);
 761         emitByte(0xF3);
 762         prefix(src, dst);
 763         emitByte(0x0F);
 764         emitByte(0x5E);
 765         emitOperandHelper(dst, src);
 766     }
 767 
 768     public final void divss(Register dst, Register src) {
 769         assert dst.getRegisterCategory().equals(AMD64.XMM);
 770         assert src.getRegisterCategory().equals(AMD64.XMM);
 771         emitByte(0xF3);
 772         int encode = prefixAndEncode(dst.encoding, src.encoding);
 773         emitByte(0x0F);
 774         emitByte(0x5E);
 775         emitByte(0xC0 | encode);
 776     }
 777 
 778     public final void hlt() {
 779         emitByte(0xF4);
 780     }
 781 
 782     public final void idivl(Register src) {
 783         int encode = prefixAndEncode(7, src.encoding);
 784         emitByte(0xF7);
 785         emitByte(0xC0 | encode);
 786     }
 787 
 788     public final void divl(Register src) {
 789         int encode = prefixAndEncode(6, src.encoding);
 790         emitByte(0xF7);
 791         emitByte(0xC0 | encode);
 792     }
 793 
 794     public final void mull(Register src) {
 795         int encode = prefixAndEncode(4, src.encoding);
 796         emitByte(0xF7);
 797         emitByte(0xC0 | encode);
 798     }
 799 
 800     public final void mull(AMD64Address src) {
 801         prefix(src);
 802         emitByte(0xF7);
 803         emitOperandHelper(4, src);
 804     }
 805 
 806     public final void imull(Register src) {
 807         int encode = prefixAndEncode(5, src.encoding);
 808         emitByte(0xF7);
 809         emitByte(0xC0 | encode);
 810     }
 811 
 812     public final void imull(AMD64Address src) {
 813         prefix(src);
 814         emitByte(0xF7);
 815         emitOperandHelper(5, src);
 816     }
 817 
 818     public final void imull(Register dst, Register src) {
 819         int encode = prefixAndEncode(dst.encoding, src.encoding);
 820         emitByte(0x0F);
 821         emitByte(0xAF);
 822         emitByte(0xC0 | encode);
 823     }
 824 
 825     public final void imull(Register dst, AMD64Address src) {
 826         prefix(src, dst);
 827         emitByte(0x0F);
 828         emitByte(0xAF);
 829         emitOperandHelper(dst, src);
 830     }
 831 
 832     public final void imull(Register dst, Register src, int value) {
 833         int encode = prefixAndEncode(dst.encoding, src.encoding);
 834         if (isByte(value)) {
 835             emitByte(0x6B);
 836             emitByte(0xC0 | encode);
 837             emitByte(value & 0xFF);
 838         } else {
 839             emitByte(0x69);
 840             emitByte(0xC0 | encode);
 841             emitInt(value);
 842         }
 843     }
 844 
 845     protected final void incl(AMD64Address dst) {
 846         prefix(dst);
 847         emitByte(0xFF);
 848         emitOperandHelper(0, dst);
 849     }
 850 
 851     public void jcc(ConditionFlag cc, int jumpTarget, boolean forceDisp32) {
 852         int shortSize = 2;
 853         int longSize = 6;
 854         long disp = jumpTarget - position();
 855         if (!forceDisp32 && isByte(disp - shortSize)) {
 856             // 0111 tttn #8-bit disp
 857             emitByte(0x70 | cc.getValue());
 858             emitByte((int) ((disp - shortSize) & 0xFF));
 859         } else {
 860             // 0000 1111 1000 tttn #32-bit disp
 861             assert isInt(disp - longSize) : "must be 32bit offset (call4)";
 862             emitByte(0x0F);
 863             emitByte(0x80 | cc.getValue());
 864             emitInt((int) (disp - longSize));
 865         }
 866     }
 867 
 868     public final void jcc(ConditionFlag cc, Label l) {
 869         assert (0 <= cc.getValue()) && (cc.getValue() < 16) : "illegal cc";
 870         if (l.isBound()) {
 871             jcc(cc, l.position(), false);
 872         } else {
 873             // Note: could eliminate cond. jumps to this jump if condition
 874             // is the same however, seems to be rather unlikely case.
 875             // Note: use jccb() if label to be bound is very close to get
 876             // an 8-bit displacement
 877             l.addPatchAt(position());
 878             emitByte(0x0F);
 879             emitByte(0x80 | cc.getValue());
 880             emitInt(0);
 881         }
 882 
 883     }
 884 
 885     public final void jccb(ConditionFlag cc, Label l) {
 886         if (l.isBound()) {
 887             int shortSize = 2;
 888             int entry = l.position();
 889             assert isByte(entry - (position() + shortSize)) : "Dispacement too large for a short jmp";
 890             long disp = entry - position();
 891             // 0111 tttn #8-bit disp
 892             emitByte(0x70 | cc.getValue());
 893             emitByte((int) ((disp - shortSize) & 0xFF));
 894         } else {
 895             l.addPatchAt(position());
 896             emitByte(0x70 | cc.getValue());
 897             emitByte(0);
 898         }
 899     }
 900 
 901     public final void jmp(int jumpTarget, boolean forceDisp32) {
 902         int shortSize = 2;
 903         int longSize = 5;
 904         long disp = jumpTarget - position();
 905         if (!forceDisp32 && isByte(disp - shortSize)) {
 906             emitByte(0xEB);
 907             emitByte((int) ((disp - shortSize) & 0xFF));
 908         } else {
 909             emitByte(0xE9);
 910             emitInt((int) (disp - longSize));
 911         }
 912     }
 913 
 914     @Override
 915     public final void jmp(Label l) {
 916         if (l.isBound()) {
 917             jmp(l.position(), false);
 918         } else {
 919             // By default, forward jumps are always 32-bit displacements, since
 920             // we can't yet know where the label will be bound. If you're sure that
 921             // the forward jump will not run beyond 256 bytes, use jmpb to
 922             // force an 8-bit displacement.
 923 
 924             l.addPatchAt(position());
 925             emitByte(0xE9);
 926             emitInt(0);
 927         }
 928     }
 929 
 930     public final void jmp(Register entry) {
 931         int encode = prefixAndEncode(entry.encoding);
 932         emitByte(0xFF);
 933         emitByte(0xE0 | encode);
 934     }
 935 
 936     public final void jmpb(Label l) {
 937         if (l.isBound()) {
 938             int shortSize = 2;
 939             int entry = l.position();
 940             assert isByte((entry - position()) + shortSize) : "Dispacement too large for a short jmp";
 941             long offs = entry - position();
 942             emitByte(0xEB);
 943             emitByte((int) ((offs - shortSize) & 0xFF));
 944         } else {
 945 
 946             l.addPatchAt(position());
 947             emitByte(0xEB);
 948             emitByte(0);
 949         }
 950     }
 951 
 952     public final void leaq(Register dst, AMD64Address src) {
 953         prefixq(src, dst);
 954         emitByte(0x8D);
 955         emitOperandHelper(dst, src);
 956     }
 957 
 958     public final void leave() {
 959         emitByte(0xC9);
 960     }
 961 
 962     public final void lock() {
 963         emitByte(0xF0);
 964     }
 965 
 966     public final void movapd(Register dst, Register src) {
 967         assert dst.getRegisterCategory().equals(AMD64.XMM);
 968         assert src.getRegisterCategory().equals(AMD64.XMM);
 969         int dstenc = dst.encoding;
 970         int srcenc = src.encoding;
 971         emitByte(0x66);
 972         if (dstenc < 8) {
 973             if (srcenc >= 8) {
 974                 emitByte(Prefix.REXB);
 975                 srcenc -= 8;
 976             }
 977         } else {
 978             if (srcenc < 8) {
 979                 emitByte(Prefix.REXR);
 980             } else {
 981                 emitByte(Prefix.REXRB);
 982                 srcenc -= 8;
 983             }
 984             dstenc -= 8;
 985         }
 986         emitByte(0x0F);
 987         emitByte(0x28);
 988         emitByte(0xC0 | dstenc << 3 | srcenc);
 989     }
 990 
 991     public final void movaps(Register dst, Register src) {
 992         assert dst.getRegisterCategory().equals(AMD64.XMM);
 993         assert src.getRegisterCategory().equals(AMD64.XMM);
 994         int dstenc = dst.encoding;
 995         int srcenc = src.encoding;
 996         if (dstenc < 8) {
 997             if (srcenc >= 8) {
 998                 emitByte(Prefix.REXB);
 999                 srcenc -= 8;
1000             }
1001         } else {
1002             if (srcenc < 8) {
1003                 emitByte(Prefix.REXR);
1004             } else {
1005                 emitByte(Prefix.REXRB);
1006                 srcenc -= 8;
1007             }
1008             dstenc -= 8;
1009         }
1010         emitByte(0x0F);
1011         emitByte(0x28);
1012         emitByte(0xC0 | dstenc << 3 | srcenc);
1013     }
1014 
1015     public final void movb(AMD64Address dst, int imm8) {
1016         prefix(dst);
1017         emitByte(0xC6);
1018         emitOperandHelper(0, dst);
1019         emitByte(imm8);
1020     }
1021 
1022     public final void movb(AMD64Address dst, Register src) {
1023         assert src.getRegisterCategory().equals(AMD64.CPU) : "must have byte register";
1024         prefix(dst, src, true);
1025         emitByte(0x88);
1026         emitOperandHelper(src, dst);
1027     }
1028 
1029     public final void movdl(Register dst, Register src) {
1030         if (dst.getRegisterCategory().equals(AMD64.XMM)) {
1031             assert !src.getRegisterCategory().equals(AMD64.XMM) : "does this hold?";
1032             emitByte(0x66);
1033             int encode = prefixAndEncode(dst.encoding, src.encoding);
1034             emitByte(0x0F);
1035             emitByte(0x6E);
1036             emitByte(0xC0 | encode);
1037         } else if (src.getRegisterCategory().equals(AMD64.XMM)) {
1038             assert !dst.getRegisterCategory().equals(AMD64.XMM);
1039             emitByte(0x66);
1040             // swap src/dst to get correct prefix
1041             int encode = prefixAndEncode(src.encoding, dst.encoding);
1042             emitByte(0x0F);
1043             emitByte(0x7E);
1044             emitByte(0xC0 | encode);
1045         }
1046     }
1047 
1048     public final void movl(Register dst, int imm32) {
1049         int encode = prefixAndEncode(dst.encoding);
1050         emitByte(0xB8 | encode);
1051         emitInt(imm32);
1052     }
1053 
1054     public final void movl(Register dst, Register src) {
1055         int encode = prefixAndEncode(dst.encoding, src.encoding);
1056         emitByte(0x8B);
1057         emitByte(0xC0 | encode);
1058     }
1059 
1060     public final void movl(Register dst, AMD64Address src) {
1061         prefix(src, dst);
1062         emitByte(0x8B);
1063         emitOperandHelper(dst, src);
1064     }
1065 
1066     public final void movl(AMD64Address dst, int imm32) {
1067         prefix(dst);
1068         emitByte(0xC7);
1069         emitOperandHelper(0, dst);
1070         emitInt(imm32);
1071     }
1072 
1073     public final void movl(AMD64Address dst, Register src) {
1074         prefix(dst, src);
1075         emitByte(0x89);
1076         emitOperandHelper(src, dst);
1077     }
1078 
1079     /**
1080      * New CPUs require use of movsd and movss to avoid partial register stall when loading from
1081      * memory. But for old Opteron use movlpd instead of movsd. The selection is done in
1082      * {@link AMD64MacroAssembler#movdbl(Register, AMD64Address)} and
1083      * {@link AMD64MacroAssembler#movflt(Register, Register)}.
1084      */
1085     public final void movlpd(Register dst, AMD64Address src) {
1086         assert dst.getRegisterCategory().equals(AMD64.XMM);
1087         emitByte(0x66);
1088         prefix(src, dst);
1089         emitByte(0x0F);
1090         emitByte(0x12);
1091         emitOperandHelper(dst, src);
1092     }
1093 
1094     public final void movq(Register dst, AMD64Address src) {
1095         if (dst.getRegisterCategory().equals(AMD64.XMM)) {
1096             emitByte(0xF3);
1097             prefixq(src, dst);
1098             emitByte(0x0F);
1099             emitByte(0x7E);
1100             emitOperandHelper(dst, src);
1101         } else {
1102             prefixq(src, dst);
1103             emitByte(0x8B);
1104             emitOperandHelper(dst, src);
1105         }
1106     }
1107 
1108     public final void movq(Register dst, Register src) {
1109         int encode = prefixqAndEncode(dst.encoding, src.encoding);
1110         emitByte(0x8B);
1111         emitByte(0xC0 | encode);
1112     }
1113 
1114     public final void movq(AMD64Address dst, Register src) {
1115         if (src.getRegisterCategory().equals(AMD64.XMM)) {
1116             emitByte(0x66);
1117             prefixq(dst, src);
1118             emitByte(0x0F);
1119             emitByte(0xD6);
1120             emitOperandHelper(src, dst);
1121         } else {
1122             prefixq(dst, src);
1123             emitByte(0x89);
1124             emitOperandHelper(src, dst);
1125         }
1126     }
1127 
1128     public final void movsbl(Register dst, AMD64Address src) {
1129         prefix(src, dst);
1130         emitByte(0x0F);
1131         emitByte(0xBE);
1132         emitOperandHelper(dst, src);
1133     }
1134 
1135     public final void movsbl(Register dst, Register src) {
1136         int encode = prefixAndEncode(dst.encoding, false, src.encoding, true);
1137         emitByte(0x0F);
1138         emitByte(0xBE);
1139         emitByte(0xC0 | encode);
1140     }
1141 
1142     public final void movsbq(Register dst, AMD64Address src) {
1143         prefixq(src, dst);
1144         emitByte(0x0F);
1145         emitByte(0xBE);
1146         emitOperandHelper(dst, src);
1147     }
1148 
1149     public final void movsbq(Register dst, Register src) {
1150         int encode = prefixqAndEncode(dst.encoding, src.encoding);
1151         emitByte(0x0F);
1152         emitByte(0xBE);
1153         emitByte(0xC0 | encode);
1154     }
1155 
1156     public final void movsd(Register dst, Register src) {
1157         assert dst.getRegisterCategory().equals(AMD64.XMM);
1158         assert src.getRegisterCategory().equals(AMD64.XMM);
1159         emitByte(0xF2);
1160         int encode = prefixAndEncode(dst.encoding, src.encoding);
1161         emitByte(0x0F);
1162         emitByte(0x10);
1163         emitByte(0xC0 | encode);
1164     }
1165 
1166     public final void movsd(Register dst, AMD64Address src) {
1167         assert dst.getRegisterCategory().equals(AMD64.XMM);
1168         emitByte(0xF2);
1169         prefix(src, dst);
1170         emitByte(0x0F);
1171         emitByte(0x10);
1172         emitOperandHelper(dst, src);
1173     }
1174 
1175     public final void movsd(AMD64Address dst, Register src) {
1176         assert src.getRegisterCategory().equals(AMD64.XMM);
1177         emitByte(0xF2);
1178         prefix(dst, src);
1179         emitByte(0x0F);
1180         emitByte(0x11);
1181         emitOperandHelper(src, dst);
1182     }
1183 
1184     public final void movss(Register dst, Register src) {
1185         assert dst.getRegisterCategory().equals(AMD64.XMM);
1186         assert src.getRegisterCategory().equals(AMD64.XMM);
1187         emitByte(0xF3);
1188         int encode = prefixAndEncode(dst.encoding, src.encoding);
1189         emitByte(0x0F);
1190         emitByte(0x10);
1191         emitByte(0xC0 | encode);
1192     }
1193 
1194     public final void movss(Register dst, AMD64Address src) {
1195         assert dst.getRegisterCategory().equals(AMD64.XMM);
1196         emitByte(0xF3);
1197         prefix(src, dst);
1198         emitByte(0x0F);
1199         emitByte(0x10);
1200         emitOperandHelper(dst, src);
1201     }
1202 
1203     public final void movss(AMD64Address dst, Register src) {
1204         assert src.getRegisterCategory().equals(AMD64.XMM);
1205         emitByte(0xF3);
1206         prefix(dst, src);
1207         emitByte(0x0F);
1208         emitByte(0x11);
1209         emitOperandHelper(src, dst);
1210     }
1211 
1212     public final void movswl(Register dst, AMD64Address src) {
1213         prefix(src, dst);
1214         emitByte(0x0F);
1215         emitByte(0xBF);
1216         emitOperandHelper(dst, src);
1217     }
1218 
1219     public final void movswl(Register dst, Register src) {
1220         int encode = prefixAndEncode(dst.encoding, src.encoding);
1221         emitByte(0x0F);
1222         emitByte(0xBF);
1223         emitByte(0xC0 | encode);
1224     }
1225 
1226     public final void movswq(Register dst, AMD64Address src) {
1227         prefixq(src, dst);
1228         emitByte(0x0F);
1229         emitByte(0xBF);
1230         emitOperandHelper(dst, src);
1231     }
1232 
1233     public final void movswq(Register dst, Register src) {
1234         int encode = prefixqAndEncode(dst.encoding, src.encoding);
1235         emitByte(0x0F);
1236         emitByte(0xBF);
1237         emitByte(0xC0 | encode);
1238     }
1239 
1240     public final void movw(AMD64Address dst, int imm16) {
1241         emitByte(0x66); // switch to 16-bit mode
1242         prefix(dst);
1243         emitByte(0xC7);
1244         emitOperandHelper(0, dst);
1245         emitShort(imm16);
1246     }
1247 
1248     public final void movw(AMD64Address dst, Register src) {
1249         emitByte(0x66);
1250         prefix(dst, src);
1251         emitByte(0x89);
1252         emitOperandHelper(src, dst);
1253     }
1254 
1255     public final void movzbl(Register dst, AMD64Address src) {
1256         prefix(src, dst);
1257         emitByte(0x0F);
1258         emitByte(0xB6);
1259         emitOperandHelper(dst, src);
1260     }
1261 
1262     public final void movzwl(Register dst, AMD64Address src) {
1263         prefix(src, dst);
1264         emitByte(0x0F);
1265         emitByte(0xB7);
1266         emitOperandHelper(dst, src);
1267     }
1268 
1269     public final void mulsd(Register dst, AMD64Address src) {
1270         assert dst.getRegisterCategory().equals(AMD64.XMM);
1271         emitByte(0xF2);
1272         prefix(src, dst);
1273         emitByte(0x0F);
1274         emitByte(0x59);
1275         emitOperandHelper(dst, src);
1276     }
1277 
1278     public final void mulsd(Register dst, Register src) {
1279         assert dst.getRegisterCategory().equals(AMD64.XMM);
1280         assert src.getRegisterCategory().equals(AMD64.XMM);
1281 
1282         emitByte(0xF2);
1283         int encode = prefixAndEncode(dst.encoding, src.encoding);
1284         emitByte(0x0F);
1285         emitByte(0x59);
1286         emitByte(0xC0 | encode);
1287     }
1288 
1289     public final void mulss(Register dst, AMD64Address src) {
1290         assert dst.getRegisterCategory().equals(AMD64.XMM);
1291 
1292         emitByte(0xF3);
1293         prefix(src, dst);
1294         emitByte(0x0F);
1295         emitByte(0x59);
1296         emitOperandHelper(dst, src);
1297     }
1298 
1299     public final void mulss(Register dst, Register src) {
1300         assert dst.getRegisterCategory().equals(AMD64.XMM);
1301         assert src.getRegisterCategory().equals(AMD64.XMM);
1302         emitByte(0xF3);
1303         int encode = prefixAndEncode(dst.encoding, src.encoding);
1304         emitByte(0x0F);
1305         emitByte(0x59);
1306         emitByte(0xC0 | encode);
1307     }
1308 
1309     public final void negl(Register dst) {
1310         int encode = prefixAndEncode(dst.encoding);
1311         emitByte(0xF7);
1312         emitByte(0xD8 | encode);
1313     }
1314 
1315     public final void notl(Register dst) {
1316         int encode = prefixAndEncode(dst.encoding);
1317         emitByte(0xF7);
1318         emitByte(0xD0 | encode);
1319     }
1320 
1321     @Override
1322     public final void ensureUniquePC() {
1323         nop();
1324     }
1325 
1326     public final void lzcntl(Register dst, Register src) {
1327         assert supports(CPUFeature.LZCNT);
1328         emitByte(0xF3);
1329         int encode = prefixAndEncode(dst.encoding, src.encoding);
1330         emitByte(0x0F);
1331         emitByte(0xBD);
1332         emitByte(0xC0 | encode);
1333     }
1334 
1335     public final void lzcntq(Register dst, Register src) {
1336         assert supports(CPUFeature.LZCNT);
1337         emitByte(0xF3);
1338         int encode = prefixqAndEncode(dst.encoding, src.encoding);
1339         emitByte(0x0F);
1340         emitByte(0xBD);
1341         emitByte(0xC0 | encode);
1342     }
1343 
1344     public final void lzcntl(Register dst, AMD64Address src) {
1345         assert supports(CPUFeature.LZCNT);
1346         emitByte(0xF3);
1347         prefix(src, dst);
1348         emitByte(0x0F);
1349         emitByte(0xBD);
1350         emitOperandHelper(dst, src);
1351     }
1352 
1353     public final void lzcntq(Register dst, AMD64Address src) {
1354         assert supports(CPUFeature.LZCNT);
1355         emitByte(0xF3);
1356         prefixq(src, dst);
1357         emitByte(0x0F);
1358         emitByte(0xBD);
1359         emitOperandHelper(dst, src);
1360     }
1361 
1362     public final void nop() {
1363         nop(1);
1364     }
1365 
1366     public void nop(int count) {
1367         int i = count;
1368         if (UseNormalNop) {
1369             assert i > 0 : " ";
1370             // The fancy nops aren't currently recognized by debuggers making it a
1371             // pain to disassemble code while debugging. If assert are on clearly
1372             // speed is not an issue so simply use the single byte traditional nop
1373             // to do alignment.
1374 
1375             for (; i > 0; i--) {
1376                 emitByte(0x90);
1377             }
1378             return;
1379         }
1380 
1381         if (UseAddressNop) {
1382             //
1383             // Using multi-bytes nops "0x0F 0x1F [Address]" for AMD.
1384             // 1: 0x90
1385             // 2: 0x66 0x90
1386             // 3: 0x66 0x66 0x90 (don't use "0x0F 0x1F 0x00" - need patching safe padding)
1387             // 4: 0x0F 0x1F 0x40 0x00
1388             // 5: 0x0F 0x1F 0x44 0x00 0x00
1389             // 6: 0x66 0x0F 0x1F 0x44 0x00 0x00
1390             // 7: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
1391             // 8: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
1392             // 9: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
1393             // 10: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
1394             // 11: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
1395 
1396             // The rest coding is AMD specific - use consecutive Address nops
1397 
1398             // 12: 0x66 0x0F 0x1F 0x44 0x00 0x00 0x66 0x0F 0x1F 0x44 0x00 0x00
1399             // 13: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00 0x66 0x0F 0x1F 0x44 0x00 0x00
1400             // 14: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
1401             // 15: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
1402             // 16: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
1403             // Size prefixes (0x66) are added for larger sizes
1404 
1405             while (i >= 22) {
1406                 i -= 11;
1407                 emitByte(0x66); // size prefix
1408                 emitByte(0x66); // size prefix
1409                 emitByte(0x66); // size prefix
1410                 addrNop8();
1411             }
1412             // Generate first nop for size between 21-12
1413             switch (i) {
1414                 case 21:
1415                     i -= 1;
1416                     emitByte(0x66); // size prefix
1417                     // fall through
1418                 case 20:
1419                     // fall through
1420                 case 19:
1421                     i -= 1;
1422                     emitByte(0x66); // size prefix
1423                     // fall through
1424                 case 18:
1425                     // fall through
1426                 case 17:
1427                     i -= 1;
1428                     emitByte(0x66); // size prefix
1429                     // fall through
1430                 case 16:
1431                     // fall through
1432                 case 15:
1433                     i -= 8;
1434                     addrNop8();
1435                     break;
1436                 case 14:
1437                 case 13:
1438                     i -= 7;
1439                     addrNop7();
1440                     break;
1441                 case 12:
1442                     i -= 6;
1443                     emitByte(0x66); // size prefix
1444                     addrNop5();
1445                     break;
1446                 default:
1447                     assert i < 12;
1448             }
1449 
1450             // Generate second nop for size between 11-1
1451             switch (i) {
1452                 case 11:
1453                     emitByte(0x66); // size prefix
1454                     emitByte(0x66); // size prefix
1455                     emitByte(0x66); // size prefix
1456                     addrNop8();
1457                     break;
1458                 case 10:
1459                     emitByte(0x66); // size prefix
1460                     emitByte(0x66); // size prefix
1461                     addrNop8();
1462                     break;
1463                 case 9:
1464                     emitByte(0x66); // size prefix
1465                     addrNop8();
1466                     break;
1467                 case 8:
1468                     addrNop8();
1469                     break;
1470                 case 7:
1471                     addrNop7();
1472                     break;
1473                 case 6:
1474                     emitByte(0x66); // size prefix
1475                     addrNop5();
1476                     break;
1477                 case 5:
1478                     addrNop5();
1479                     break;
1480                 case 4:
1481                     addrNop4();
1482                     break;
1483                 case 3:
1484                     // Don't use "0x0F 0x1F 0x00" - need patching safe padding
1485                     emitByte(0x66); // size prefix
1486                     emitByte(0x66); // size prefix
1487                     emitByte(0x90); // nop
1488                     break;
1489                 case 2:
1490                     emitByte(0x66); // size prefix
1491                     emitByte(0x90); // nop
1492                     break;
1493                 case 1:
1494                     emitByte(0x90); // nop
1495                     break;
1496                 default:
1497                     assert i == 0;
1498             }
1499             return;
1500         }
1501 
1502         // Using nops with size prefixes "0x66 0x90".
1503         // From AMD Optimization Guide:
1504         // 1: 0x90
1505         // 2: 0x66 0x90
1506         // 3: 0x66 0x66 0x90
1507         // 4: 0x66 0x66 0x66 0x90
1508         // 5: 0x66 0x66 0x90 0x66 0x90
1509         // 6: 0x66 0x66 0x90 0x66 0x66 0x90
1510         // 7: 0x66 0x66 0x66 0x90 0x66 0x66 0x90
1511         // 8: 0x66 0x66 0x66 0x90 0x66 0x66 0x66 0x90
1512         // 9: 0x66 0x66 0x90 0x66 0x66 0x90 0x66 0x66 0x90
1513         // 10: 0x66 0x66 0x66 0x90 0x66 0x66 0x90 0x66 0x66 0x90
1514         //
1515         while (i > 12) {
1516             i -= 4;
1517             emitByte(0x66); // size prefix
1518             emitByte(0x66);
1519             emitByte(0x66);
1520             emitByte(0x90); // nop
1521         }
1522         // 1 - 12 nops
1523         if (i > 8) {
1524             if (i > 9) {
1525                 i -= 1;
1526                 emitByte(0x66);
1527             }
1528             i -= 3;
1529             emitByte(0x66);
1530             emitByte(0x66);
1531             emitByte(0x90);
1532         }
1533         // 1 - 8 nops
1534         if (i > 4) {
1535             if (i > 6) {
1536                 i -= 1;
1537                 emitByte(0x66);
1538             }
1539             i -= 3;
1540             emitByte(0x66);
1541             emitByte(0x66);
1542             emitByte(0x90);
1543         }
1544         switch (i) {
1545             case 4:
1546                 emitByte(0x66);
1547                 emitByte(0x66);
1548                 emitByte(0x66);
1549                 emitByte(0x90);
1550                 break;
1551             case 3:
1552                 emitByte(0x66);
1553                 emitByte(0x66);
1554                 emitByte(0x90);
1555                 break;
1556             case 2:
1557                 emitByte(0x66);
1558                 emitByte(0x90);
1559                 break;
1560             case 1:
1561                 emitByte(0x90);
1562                 break;
1563             default:
1564                 assert i == 0;
1565         }
1566     }
1567 
1568     public final void orl(Register dst, int imm32) {
1569         emitArithImm32(1, dst, imm32);
1570     }
1571 
1572     public final void orl(Register dst, AMD64Address src) {
1573         prefix(src, dst);
1574         emitByte(0x0B);
1575         emitOperandHelper(dst, src);
1576     }
1577 
1578     public final void orl(Register dst, Register src) {
1579         int encode = prefixAndEncode(dst.encoding, src.encoding);
1580         emitByte(0x0B);
1581         emitByte(0xC0 | encode);
1582     }
1583 
1584     public final void popcntl(Register dst, AMD64Address src) {
1585         assert supports(CPUFeature.POPCNT);
1586         emitByte(0xF3);
1587         prefix(src, dst);
1588         emitByte(0x0F);
1589         emitByte(0xB8);
1590         emitOperandHelper(dst, src);
1591     }
1592 
1593     public final void popcntl(Register dst, Register src) {
1594         assert supports(CPUFeature.POPCNT);
1595         emitByte(0xF3);
1596         int encode = prefixAndEncode(dst.encoding, src.encoding);
1597         emitByte(0x0F);
1598         emitByte(0xB8);
1599         emitByte(0xC0 | encode);
1600     }
1601 
1602     public final void popcntq(Register dst, AMD64Address src) {
1603         assert supports(CPUFeature.POPCNT);
1604         emitByte(0xF3);
1605         prefixq(src, dst);
1606         emitByte(0x0F);
1607         emitByte(0xB8);
1608         emitOperandHelper(dst, src);
1609     }
1610 
1611     public final void popcntq(Register dst, Register src) {
1612         assert supports(CPUFeature.POPCNT);
1613         emitByte(0xF3);
1614         int encode = prefixqAndEncode(dst.encoding, src.encoding);
1615         emitByte(0x0F);
1616         emitByte(0xB8);
1617         emitByte(0xC0 | encode);
1618     }
1619 
1620     public final void pop(Register dst) {
1621         int encode = prefixAndEncode(dst.encoding);
1622         emitByte(0x58 | encode);
1623     }
1624 
1625     public void popfq() {
1626         emitByte(0x9D);
1627     }
1628 
1629     public final void ptest(Register dst, Register src) {
1630         assert supports(CPUFeature.SSE4_1);
1631         emitByte(0x66);
1632         int encode = prefixAndEncode(dst.encoding, src.encoding);
1633         emitByte(0x0F);
1634         emitByte(0x38);
1635         emitByte(0x17);
1636         emitByte(0xC0 | encode);
1637     }
1638 
1639     public final void push(Register src) {
1640         int encode = prefixAndEncode(src.encoding);
1641         emitByte(0x50 | encode);
1642     }
1643 
1644     public void pushfq() {
1645         emitByte(0x9c);
1646     }
1647 
1648     public final void pxor(Register dst, Register src) {
1649         emitByte(0x66);
1650         int encode = prefixAndEncode(dst.encoding, src.encoding);
1651         emitByte(0x0F);
1652         emitByte(0xEF);
1653         emitByte(0xC0 | encode);
1654     }
1655 
1656     public final void ret(int imm16) {
1657         if (imm16 == 0) {
1658             emitByte(0xC3);
1659         } else {
1660             emitByte(0xC2);
1661             emitShort(imm16);
1662         }
1663     }
1664 
1665     public final void sarl(Register dst, int imm8) {
1666         int encode = prefixAndEncode(dst.encoding);
1667         assert isShiftCount(imm8) : "illegal shift count";
1668         if (imm8 == 1) {
1669             emitByte(0xD1);
1670             emitByte(0xF8 | encode);
1671         } else {
1672             emitByte(0xC1);
1673             emitByte(0xF8 | encode);
1674             emitByte(imm8);
1675         }
1676     }
1677 
1678     public final void sarl(Register dst) {
1679         int encode = prefixAndEncode(dst.encoding);
1680         emitByte(0xD3);
1681         emitByte(0xF8 | encode);
1682     }
1683 
1684     public final void shll(Register dst, int imm8) {
1685         assert isShiftCount(imm8) : "illegal shift count";
1686         int encode = prefixAndEncode(dst.encoding);
1687         if (imm8 == 1) {
1688             emitByte(0xD1);
1689             emitByte(0xE0 | encode);
1690         } else {
1691             emitByte(0xC1);
1692             emitByte(0xE0 | encode);
1693             emitByte(imm8);
1694         }
1695     }
1696 
1697     public final void shll(Register dst) {
1698         int encode = prefixAndEncode(dst.encoding);
1699         emitByte(0xD3);
1700         emitByte(0xE0 | encode);
1701     }
1702 
1703     public final void shrl(Register dst, int imm8) {
1704         assert isShiftCount(imm8) : "illegal shift count";
1705         int encode = prefixAndEncode(dst.encoding);
1706         if (imm8 == 1) {
1707             emitByte(0xD1);
1708             emitByte(0xE8 | encode);
1709         } else {
1710             emitByte(0xC1);
1711             emitByte(0xE8 | encode);
1712             emitByte(imm8);
1713         }
1714     }
1715 
1716     public final void shrl(Register dst) {
1717         int encode = prefixAndEncode(dst.encoding);
1718         emitByte(0xD3);
1719         emitByte(0xE8 | encode);
1720     }
1721 
1722     public final void roll(Register dst, int imm8) {
1723         assert isShiftCount(imm8) : "illegal shift count";
1724         int encode = prefixAndEncode(dst.encoding);
1725         if (imm8 == 1) {
1726             emitByte(0xD1);
1727             emitByte(0xC0 | encode);
1728         } else {
1729             emitByte(0xC1);
1730             emitByte(0xC0 | encode);
1731             emitByte(imm8);
1732         }
1733     }
1734 
1735     public final void roll(Register dst) {
1736         int encode = prefixAndEncode(dst.encoding);
1737         emitByte(0xD3);
1738         emitByte(0xC0 | encode);
1739     }
1740 
1741     public final void rorl(Register dst, int imm8) {
1742         assert isShiftCount(imm8) : "illegal shift count";
1743         int encode = prefixAndEncode(dst.encoding);
1744         if (imm8 == 1) {
1745             emitByte(0xD1);
1746             emitByte(0xC8 | encode);
1747         } else {
1748             emitByte(0xC1);
1749             emitByte(0xC8 | encode);
1750             emitByte(imm8);
1751         }
1752     }
1753 
1754     public final void rorl(Register dst) {
1755         int encode = prefixAndEncode(dst.encoding);
1756         emitByte(0xD3);
1757         emitByte(0xC8 | encode);
1758     }
1759 
1760     public final void rolq(Register dst, int imm8) {
1761         assert isShiftCount(imm8) : "illegal shift count";
1762         int encode = prefixqAndEncode(dst.encoding);
1763         if (imm8 == 1) {
1764             emitByte(0xD1);
1765             emitByte(0xC0 | encode);
1766         } else {
1767             emitByte(0xC1);
1768             emitByte(0xC0 | encode);
1769             emitByte(imm8);
1770         }
1771     }
1772 
1773     public final void rolq(Register dst) {
1774         int encode = prefixqAndEncode(dst.encoding);
1775         emitByte(0xD3);
1776         emitByte(0xC0 | encode);
1777     }
1778 
1779     public final void rorq(Register dst, int imm8) {
1780         assert isShiftCount(imm8) : "illegal shift count";
1781         int encode = prefixqAndEncode(dst.encoding);
1782         if (imm8 == 1) {
1783             emitByte(0xD1);
1784             emitByte(0xC8 | encode);
1785         } else {
1786             emitByte(0xC1);
1787             emitByte(0xC8 | encode);
1788             emitByte(imm8);
1789         }
1790     }
1791 
1792     public final void rorq(Register dst) {
1793         int encode = prefixqAndEncode(dst.encoding);
1794         emitByte(0xD3);
1795         emitByte(0xC8 | encode);
1796     }
1797 
1798     public final void sqrtsd(Register dst, AMD64Address src) {
1799         assert dst.getRegisterCategory().equals(AMD64.XMM);
1800         emitByte(0xF2);
1801         prefix(src, dst);
1802         emitByte(0x0F);
1803         emitByte(0x51);
1804         emitOperandHelper(dst, src);
1805     }
1806 
1807     public final void sqrtsd(Register dst, Register src) {
1808         assert dst.getRegisterCategory().equals(AMD64.XMM);
1809         assert src.getRegisterCategory().equals(AMD64.XMM);
1810         // HMM Table D-1 says sse2
1811         // assert is64 || target.supportsSSE();
1812         emitByte(0xF2);
1813         int encode = prefixAndEncode(dst.encoding, src.encoding);
1814         emitByte(0x0F);
1815         emitByte(0x51);
1816         emitByte(0xC0 | encode);
1817     }
1818 
1819     public final void subl(AMD64Address dst, int imm32) {
1820         emitArithImm32(5, dst, imm32);
1821     }
1822 
1823     public final void subl(Register dst, int imm32) {
1824         emitArithImm32(5, dst, imm32);
1825     }
1826 
1827     public final void subl(Register dst, AMD64Address src) {
1828         prefix(src, dst);
1829         emitByte(0x2B);
1830         emitOperandHelper(dst, src);
1831     }
1832 
1833     public final void subl(Register dst, Register src) {
1834         int encode = prefixAndEncode(dst.encoding, src.encoding);
1835         emitByte(0x2B);
1836         emitByte(0xC0 | encode);
1837     }
1838 
1839     public final void subsd(Register dst, Register src) {
1840         assert dst.getRegisterCategory().equals(AMD64.XMM);
1841         assert src.getRegisterCategory().equals(AMD64.XMM);
1842         emitByte(0xF2);
1843         int encode = prefixAndEncode(dst.encoding, src.encoding);
1844         emitByte(0x0F);
1845         emitByte(0x5C);
1846         emitByte(0xC0 | encode);
1847     }
1848 
1849     public final void subsd(Register dst, AMD64Address src) {
1850         assert dst.getRegisterCategory().equals(AMD64.XMM);
1851 
1852         emitByte(0xF2);
1853         prefix(src, dst);
1854         emitByte(0x0F);
1855         emitByte(0x5C);
1856         emitOperandHelper(dst, src);
1857     }
1858 
1859     public final void subss(Register dst, Register src) {
1860         assert dst.getRegisterCategory().equals(AMD64.XMM);
1861         assert src.getRegisterCategory().equals(AMD64.XMM);
1862         emitByte(0xF3);
1863         int encode = prefixAndEncode(dst.encoding, src.encoding);
1864         emitByte(0x0F);
1865         emitByte(0x5C);
1866         emitByte(0xC0 | encode);
1867     }
1868 
1869     public final void subss(Register dst, AMD64Address src) {
1870         assert dst.getRegisterCategory().equals(AMD64.XMM);
1871 
1872         emitByte(0xF3);
1873         prefix(src, dst);
1874         emitByte(0x0F);
1875         emitByte(0x5C);
1876         emitOperandHelper(dst, src);
1877     }
1878 
1879     public final void testl(Register dst, int imm32) {
1880         // not using emitArith because test
1881         // doesn't support sign-extension of
1882         // 8bit operands
1883         int encode = dst.encoding;
1884         if (encode == 0) {
1885             emitByte(0xA9);
1886         } else {
1887             encode = prefixAndEncode(encode);
1888             emitByte(0xF7);
1889             emitByte(0xC0 | encode);
1890         }
1891         emitInt(imm32);
1892     }
1893 
1894     public final void testl(AMD64Address dst, int imm32) {
1895         prefix(dst);
1896         emitByte(0xF7);
1897         emitOperandHelper(0, dst);
1898         emitInt(imm32);
1899     }
1900 
1901     public final void testl(Register dst, Register src) {
1902         int encode = prefixAndEncode(dst.encoding, src.encoding);
1903         emitByte(0x85);
1904         emitByte(0xC0 | encode);
1905     }
1906 
1907     public final void testl(Register dst, AMD64Address src) {
1908         prefix(src, dst);
1909         emitByte(0x85);
1910         emitOperandHelper(dst, src);
1911     }
1912 
1913     public final void tzcntl(Register dst, Register src) {
1914         assert supports(CPUFeature.BMI1);
1915         emitByte(0xF3);
1916         int encode = prefixAndEncode(dst.encoding, src.encoding);
1917         emitByte(0x0F);
1918         emitByte(0xBC);
1919         emitByte(0xC0 | encode);
1920     }
1921 
1922     public final void tzcntq(Register dst, Register src) {
1923         assert supports(CPUFeature.BMI1);
1924         emitByte(0xF3);
1925         int encode = prefixqAndEncode(dst.encoding, src.encoding);
1926         emitByte(0x0F);
1927         emitByte(0xBC);
1928         emitByte(0xC0 | encode);
1929     }
1930 
1931     public final void tzcntl(Register dst, AMD64Address src) {
1932         assert supports(CPUFeature.BMI1);
1933         emitByte(0xF3);
1934         prefix(src, dst);
1935         emitByte(0x0F);
1936         emitByte(0xBC);
1937         emitOperandHelper(dst, src);
1938     }
1939 
1940     public final void tzcntq(Register dst, AMD64Address src) {
1941         assert supports(CPUFeature.BMI1);
1942         emitByte(0xF3);
1943         prefixq(src, dst);
1944         emitByte(0x0F);
1945         emitByte(0xBC);
1946         emitOperandHelper(dst, src);
1947     }
1948 
1949     public final void ucomisd(Register dst, AMD64Address src) {
1950         assert dst.getRegisterCategory().equals(AMD64.XMM);
1951         emitByte(0x66);
1952         ucomiss(dst, src);
1953     }
1954 
1955     public final void ucomisd(Register dst, Register src) {
1956         assert dst.getRegisterCategory().equals(AMD64.XMM);
1957         assert src.getRegisterCategory().equals(AMD64.XMM);
1958         emitByte(0x66);
1959         ucomiss(dst, src);
1960     }
1961 
1962     public final void ucomiss(Register dst, AMD64Address src) {
1963         assert dst.getRegisterCategory().equals(AMD64.XMM);
1964 
1965         prefix(src, dst);
1966         emitByte(0x0F);
1967         emitByte(0x2E);
1968         emitOperandHelper(dst, src);
1969     }
1970 
1971     public final void ucomiss(Register dst, Register src) {
1972         assert dst.getRegisterCategory().equals(AMD64.XMM);
1973         assert src.getRegisterCategory().equals(AMD64.XMM);
1974         int encode = prefixAndEncode(dst.encoding, src.encoding);
1975         emitByte(0x0F);
1976         emitByte(0x2E);
1977         emitByte(0xC0 | encode);
1978     }
1979 
1980     public final void xorl(Register dst, int imm32) {
1981         emitArithImm32(6, dst, imm32);
1982     }
1983 
1984     public final void xorl(Register dst, AMD64Address src) {
1985         prefix(src, dst);
1986         emitByte(0x33);
1987         emitOperandHelper(dst, src);
1988     }
1989 
1990     public final void xorl(Register dst, Register src) {
1991         int encode = prefixAndEncode(dst.encoding, src.encoding);
1992         emitByte(0x33);
1993         emitByte(0xC0 | encode);
1994     }
1995 
1996     public final void andpd(Register dst, Register src) {
1997         emitByte(0x66);
1998         andps(dst, src);
1999     }
2000 
2001     public final void andpd(Register dst, AMD64Address src) {
2002         emitByte(0x66);
2003         andps(dst, src);
2004     }
2005 
2006     public final void andps(Register dst, Register src) {
2007         assert dst.getRegisterCategory().equals(AMD64.XMM) && src.getRegisterCategory().equals(AMD64.XMM);
2008         int encode = prefixAndEncode(dst.encoding, src.encoding);
2009         emitByte(0x0F);
2010         emitByte(0x54);
2011         emitByte(0xC0 | encode);
2012     }
2013 
2014     public final void andps(Register dst, AMD64Address src) {
2015         assert dst.getRegisterCategory().equals(AMD64.XMM);
2016         prefix(src, dst);
2017         emitByte(0x0F);
2018         emitByte(0x54);
2019         emitOperandHelper(dst, src);
2020     }
2021 
2022     public final void orpd(Register dst, Register src) {
2023         emitByte(0x66);
2024         orps(dst, src);
2025     }
2026 
2027     public final void orpd(Register dst, AMD64Address src) {
2028         emitByte(0x66);
2029         orps(dst, src);
2030     }
2031 
2032     public final void orps(Register dst, Register src) {
2033         assert dst.getRegisterCategory().equals(AMD64.XMM) && src.getRegisterCategory().equals(AMD64.XMM);
2034         int encode = prefixAndEncode(dst.encoding, src.encoding);
2035         emitByte(0x0F);
2036         emitByte(0x56);
2037         emitByte(0xC0 | encode);
2038     }
2039 
2040     public final void orps(Register dst, AMD64Address src) {
2041         assert dst.getRegisterCategory().equals(AMD64.XMM);
2042         prefix(src, dst);
2043         emitByte(0x0F);
2044         emitByte(0x56);
2045         emitOperandHelper(dst, src);
2046     }
2047 
2048     public final void xorpd(Register dst, Register src) {
2049         emitByte(0x66);
2050         xorps(dst, src);
2051     }
2052 
2053     public final void xorpd(Register dst, AMD64Address src) {
2054         emitByte(0x66);
2055         xorps(dst, src);
2056     }
2057 
2058     public final void xorps(Register dst, Register src) {
2059         assert dst.getRegisterCategory().equals(AMD64.XMM) && src.getRegisterCategory().equals(AMD64.XMM);
2060         int encode = prefixAndEncode(dst.encoding, src.encoding);
2061         emitByte(0x0F);
2062         emitByte(0x57);
2063         emitByte(0xC0 | encode);
2064     }
2065 
2066     public final void xorps(Register dst, AMD64Address src) {
2067         assert dst.getRegisterCategory().equals(AMD64.XMM);
2068         prefix(src, dst);
2069         emitByte(0x0F);
2070         emitByte(0x57);
2071         emitOperandHelper(dst, src);
2072     }
2073 
2074     protected final void decl(Register dst) {
2075         // Use two-byte form (one-byte form is a REX prefix in 64-bit mode)
2076         int encode = prefixAndEncode(dst.encoding);
2077         emitByte(0xFF);
2078         emitByte(0xC8 | encode);
2079     }
2080 
2081     protected final void incl(Register dst) {
2082         // Use two-byte form (one-byte from is a REX prefix in 64-bit mode)
2083         int encode = prefixAndEncode(dst.encoding);
2084         emitByte(0xFF);
2085         emitByte(0xC0 | encode);
2086     }
2087 
2088     private int prefixAndEncode(int regEnc) {
2089         return prefixAndEncode(regEnc, false);
2090     }
2091 
2092     private int prefixAndEncode(int regEnc, boolean byteinst) {
2093         if (regEnc >= 8) {
2094             emitByte(Prefix.REXB);
2095             return regEnc - 8;
2096         } else if (byteinst && regEnc >= 4) {
2097             emitByte(Prefix.REX);
2098         }
2099         return regEnc;
2100     }
2101 
2102     private int prefixqAndEncode(int regEnc) {
2103         if (regEnc < 8) {
2104             emitByte(Prefix.REXW);
2105             return regEnc;
2106         } else {
2107             emitByte(Prefix.REXWB);
2108             return regEnc - 8;
2109         }
2110     }
2111 
2112     private int prefixAndEncode(int dstEnc, int srcEnc) {
2113         return prefixAndEncode(dstEnc, false, srcEnc, false);
2114     }
2115 
2116     private int prefixAndEncode(int dstEncoding, boolean dstIsByte, int srcEncoding, boolean srcIsByte) {
2117         int srcEnc = srcEncoding;
2118         int dstEnc = dstEncoding;
2119         if (dstEnc < 8) {
2120             if (srcEnc >= 8) {
2121                 emitByte(Prefix.REXB);
2122                 srcEnc -= 8;
2123             } else if ((srcIsByte && srcEnc >= 4) || (dstIsByte && dstEnc >= 4)) {
2124                 emitByte(Prefix.REX);
2125             }
2126         } else {
2127             if (srcEnc < 8) {
2128                 emitByte(Prefix.REXR);
2129             } else {
2130                 emitByte(Prefix.REXRB);
2131                 srcEnc -= 8;
2132             }
2133             dstEnc -= 8;
2134         }
2135         return dstEnc << 3 | srcEnc;
2136     }
2137 
2138     /**
2139      * Creates prefix and the encoding of the lower 6 bits of the ModRM-Byte. It emits an operand
2140      * prefix. If the given operands exceed 3 bits, the 4th bit is encoded in the prefix.
2141      *
2142      * @param regEncoding the encoding of the register part of the ModRM-Byte
2143      * @param rmEncoding the encoding of the r/m part of the ModRM-Byte
2144      * @return the lower 6 bits of the ModRM-Byte that should be emitted
2145      */
2146     private int prefixqAndEncode(int regEncoding, int rmEncoding) {
2147         int rmEnc = rmEncoding;
2148         int regEnc = regEncoding;
2149         if (regEnc < 8) {
2150             if (rmEnc < 8) {
2151                 emitByte(Prefix.REXW);
2152             } else {
2153                 emitByte(Prefix.REXWB);
2154                 rmEnc -= 8;
2155             }
2156         } else {
2157             if (rmEnc < 8) {
2158                 emitByte(Prefix.REXWR);
2159             } else {
2160                 emitByte(Prefix.REXWRB);
2161                 rmEnc -= 8;
2162             }
2163             regEnc -= 8;
2164         }
2165         return regEnc << 3 | rmEnc;
2166     }
2167 
2168     private static boolean needsRex(Register reg) {
2169         return reg.encoding >= MinEncodingNeedsRex;
2170     }
2171 
2172     private void prefix(AMD64Address adr) {
2173         if (needsRex(adr.getBase())) {
2174             if (needsRex(adr.getIndex())) {
2175                 emitByte(Prefix.REXXB);
2176             } else {
2177                 emitByte(Prefix.REXB);
2178             }
2179         } else {
2180             if (needsRex(adr.getIndex())) {
2181                 emitByte(Prefix.REXX);
2182             }
2183         }
2184     }
2185 
2186     private void prefixq(AMD64Address adr) {
2187         if (needsRex(adr.getBase())) {
2188             if (needsRex(adr.getIndex())) {
2189                 emitByte(Prefix.REXWXB);
2190             } else {
2191                 emitByte(Prefix.REXWB);
2192             }
2193         } else {
2194             if (needsRex(adr.getIndex())) {
2195                 emitByte(Prefix.REXWX);
2196             } else {
2197                 emitByte(Prefix.REXW);
2198             }
2199         }
2200     }
2201 
2202     private void prefix(AMD64Address adr, Register reg) {
2203         prefix(adr, reg, false);
2204     }
2205 
2206     private void prefix(AMD64Address adr, Register reg, boolean byteinst) {
2207         if (reg.encoding < 8) {
2208             if (needsRex(adr.getBase())) {
2209                 if (needsRex(adr.getIndex())) {
2210                     emitByte(Prefix.REXXB);
2211                 } else {
2212                     emitByte(Prefix.REXB);
2213                 }
2214             } else {
2215                 if (needsRex(adr.getIndex())) {
2216                     emitByte(Prefix.REXX);
2217                 } else if (byteinst && reg.encoding >= 4) {
2218                     emitByte(Prefix.REX);
2219                 }
2220             }
2221         } else {
2222             if (needsRex(adr.getBase())) {
2223                 if (needsRex(adr.getIndex())) {
2224                     emitByte(Prefix.REXRXB);
2225                 } else {
2226                     emitByte(Prefix.REXRB);
2227                 }
2228             } else {
2229                 if (needsRex(adr.getIndex())) {
2230                     emitByte(Prefix.REXRX);
2231                 } else {
2232                     emitByte(Prefix.REXR);
2233                 }
2234             }
2235         }
2236     }
2237 
2238     private void prefixq(AMD64Address adr, Register src) {
2239         if (src.encoding < 8) {
2240             if (needsRex(adr.getBase())) {
2241                 if (needsRex(adr.getIndex())) {
2242                     emitByte(Prefix.REXWXB);
2243                 } else {
2244                     emitByte(Prefix.REXWB);
2245                 }
2246             } else {
2247                 if (needsRex(adr.getIndex())) {
2248                     emitByte(Prefix.REXWX);
2249                 } else {
2250                     emitByte(Prefix.REXW);
2251                 }
2252             }
2253         } else {
2254             if (needsRex(adr.getBase())) {
2255                 if (needsRex(adr.getIndex())) {
2256                     emitByte(Prefix.REXWRXB);
2257                 } else {
2258                     emitByte(Prefix.REXWRB);
2259                 }
2260             } else {
2261                 if (needsRex(adr.getIndex())) {
2262                     emitByte(Prefix.REXWRX);
2263                 } else {
2264                     emitByte(Prefix.REXWR);
2265                 }
2266             }
2267         }
2268     }
2269 
2270     public final void addq(Register dst, int imm32) {
2271         emitArithImm32q(0, dst, imm32);
2272     }
2273 
2274     public final void addq(Register dst, AMD64Address src) {
2275         prefixq(src, dst);
2276         emitByte(0x03);
2277         emitOperandHelper(dst, src);
2278     }
2279 
2280     public final void addq(Register dst, Register src) {
2281         int encode = prefixqAndEncode(dst.encoding, src.encoding);
2282         emitByte(0x03);
2283         emitByte(0xC0 | encode);
2284     }
2285 
2286     public final void andq(Register dst, int imm32) {
2287         emitArithImm32q(4, dst, imm32);
2288     }
2289 
2290     public final void andq(Register dst, AMD64Address src) {
2291         prefixq(src, dst);
2292         emitByte(0x23);
2293         emitOperandHelper(dst, src);
2294     }
2295 
2296     public final void andq(Register dst, Register src) {
2297         int encode = prefixqAndEncode(dst.encoding, src.encoding);
2298         emitByte(0x23);
2299         emitByte(0xC0 | encode);
2300     }
2301 
2302     public final void bswapq(Register reg) {
2303         int encode = prefixqAndEncode(reg.encoding);
2304         emitByte(0x0F);
2305         emitByte(0xC8 | encode);
2306     }
2307 
2308     public final void cdqq() {
2309         emitByte(Prefix.REXW);
2310         emitByte(0x99);
2311     }
2312 
2313     public final void cmovq(ConditionFlag cc, Register dst, Register src) {
2314         int encode = prefixqAndEncode(dst.encoding, src.encoding);
2315         emitByte(0x0F);
2316         emitByte(0x40 | cc.getValue());
2317         emitByte(0xC0 | encode);
2318     }
2319 
2320     public final void cmovq(ConditionFlag cc, Register dst, AMD64Address src) {
2321         prefixq(src, dst);
2322         emitByte(0x0F);
2323         emitByte(0x40 | cc.getValue());
2324         emitOperandHelper(dst, src);
2325     }
2326 
2327     public final void cmpq(AMD64Address dst, int imm32) {
2328         prefixq(dst);
2329         emitByte(0x81);
2330         emitOperandHelper(7, dst);
2331         emitInt(imm32);
2332     }
2333 
2334     public final void cmpq(Register dst, int imm32) {
2335         emitArithImm32q(7, dst, imm32);
2336     }
2337 
2338     public final void cmpq(Register dst, Register src) {
2339         int encode = prefixqAndEncode(dst.encoding, src.encoding);
2340         emitByte(0x3B);
2341         emitByte(0xC0 | encode);
2342     }
2343 
2344     public final void cmpq(Register dst, AMD64Address src) {
2345         prefixq(src, dst);
2346         emitByte(0x3B);
2347         emitOperandHelper(dst, src);
2348     }
2349 
2350     public final void cmpxchgq(Register reg, AMD64Address adr) {
2351         prefixq(adr, reg);
2352         emitByte(0x0F);
2353         emitByte(0xB1);
2354         emitOperandHelper(reg, adr);
2355     }
2356 
2357     public final void cvtsi2sdq(Register dst, AMD64Address src) {
2358         assert dst.getRegisterCategory().equals(AMD64.XMM);
2359         emitByte(0xF2);
2360         prefixq(src, dst);
2361         emitByte(0x0F);
2362         emitByte(0x2A);
2363         emitOperandHelper(dst, src);
2364     }
2365 
2366     public final void cvtsi2sdq(Register dst, Register src) {
2367         assert dst.getRegisterCategory().equals(AMD64.XMM);
2368         emitByte(0xF2);
2369         int encode = prefixqAndEncode(dst.encoding, src.encoding);
2370         emitByte(0x0F);
2371         emitByte(0x2A);
2372         emitByte(0xC0 | encode);
2373     }
2374 
2375     public final void cvtsi2ssq(Register dst, AMD64Address src) {
2376         assert dst.getRegisterCategory().equals(AMD64.XMM);
2377         emitByte(0xF3);
2378         prefixq(src, dst);
2379         emitByte(0x0F);
2380         emitByte(0x2A);
2381         emitOperandHelper(dst, src);
2382     }
2383 
2384     public final void cvtsi2ssq(Register dst, Register src) {
2385         assert dst.getRegisterCategory().equals(AMD64.XMM);
2386         emitByte(0xF3);
2387         int encode = prefixqAndEncode(dst.encoding, src.encoding);
2388         emitByte(0x0F);
2389         emitByte(0x2A);
2390         emitByte(0xC0 | encode);
2391     }
2392 
2393     public final void cvttsd2siq(Register dst, AMD64Address src) {
2394         emitByte(0xF2);
2395         prefixq(src, dst);
2396         emitByte(0x0F);
2397         emitByte(0x2C);
2398         emitOperandHelper(dst, src);
2399     }
2400 
2401     public final void cvttsd2siq(Register dst, Register src) {
2402         assert src.getRegisterCategory().equals(AMD64.XMM);
2403         emitByte(0xF2);
2404         int encode = prefixqAndEncode(dst.encoding, src.encoding);
2405         emitByte(0x0F);
2406         emitByte(0x2C);
2407         emitByte(0xC0 | encode);
2408     }
2409 
2410     public final void cvttss2siq(Register dst, AMD64Address src) {
2411         emitByte(0xF3);
2412         prefixq(src, dst);
2413         emitByte(0x0F);
2414         emitByte(0x2C);
2415         emitOperandHelper(dst, src);
2416     }
2417 
2418     public final void cvttss2siq(Register dst, Register src) {
2419         assert src.getRegisterCategory().equals(AMD64.XMM);
2420         emitByte(0xF3);
2421         int encode = prefixqAndEncode(dst.encoding, src.encoding);
2422         emitByte(0x0F);
2423         emitByte(0x2C);
2424         emitByte(0xC0 | encode);
2425     }
2426 
2427     protected final void decq(Register dst) {
2428         // Use two-byte form (one-byte from is a REX prefix in 64-bit mode)
2429         int encode = prefixqAndEncode(dst.encoding);
2430         emitByte(0xFF);
2431         emitByte(0xC8 | encode);
2432     }
2433 
2434     protected final void decq(AMD64Address dst) {
2435         prefixq(dst);
2436         emitByte(0xFF);
2437         emitOperandHelper(1, dst);
2438     }
2439 
2440     public final void divq(Register src) {
2441         int encode = prefixqAndEncode(6, src.encoding);
2442         emitByte(0xF7);
2443         emitByte(0xC0 | encode);
2444     }
2445 
2446     public final void idivq(Register src) {
2447         int encode = prefixqAndEncode(7, src.encoding);
2448         emitByte(0xF7);
2449         emitByte(0xC0 | encode);
2450     }
2451 
2452     public final void mulq(Register src) {
2453         int encode = prefixqAndEncode(4, src.encoding);
2454         emitByte(0xF7);
2455         emitByte(0xC0 | encode);
2456     }
2457 
2458     public final void mulq(AMD64Address src) {
2459         prefixq(src);
2460         emitByte(0xF7);
2461         emitOperandHelper(4, src);
2462     }
2463 
2464     public final void imulq(Register src) {
2465         int encode = prefixqAndEncode(5, src.encoding);
2466         emitByte(0xF7);
2467         emitByte(0xC0 | encode);
2468     }
2469 
2470     public final void imulq(AMD64Address src) {
2471         prefixq(src);
2472         emitByte(0xF7);
2473         emitOperandHelper(5, src);
2474     }
2475 
2476     public final void imulq(Register dst, Register src) {
2477         int encode = prefixqAndEncode(dst.encoding, src.encoding);
2478         emitByte(0x0F);
2479         emitByte(0xAF);
2480         emitByte(0xC0 | encode);
2481     }
2482 
2483     public final void imulq(Register dst, AMD64Address src) {
2484         prefixq(src, dst);
2485         emitByte(0x0F);
2486         emitByte(0xAF);
2487         emitOperandHelper(dst, src);
2488     }
2489 
2490     public final void imulq(Register dst, Register src, int value) {
2491         int encode = prefixqAndEncode(dst.encoding, src.encoding);
2492         if (isByte(value)) {
2493             emitByte(0x6B);
2494             emitByte(0xC0 | encode);
2495             emitByte(value & 0xFF);
2496         } else {
2497             emitByte(0x69);
2498             emitByte(0xC0 | encode);
2499             emitInt(value);
2500         }
2501     }
2502 
2503     public final void incq(Register dst) {
2504         // Don't use it directly. Use Macroincrementq() instead.
2505         // Use two-byte form (one-byte from is a REX prefix in 64-bit mode)
2506         int encode = prefixqAndEncode(dst.encoding);
2507         emitByte(0xFF);
2508         emitByte(0xC0 | encode);
2509     }
2510 
2511     public final void movq(Register dst, long imm64) {
2512         int encode = prefixqAndEncode(dst.encoding);
2513         emitByte(0xB8 | encode);
2514         emitLong(imm64);
2515     }
2516 
2517     public final void movslq(Register dst, int imm32) {
2518         int encode = prefixqAndEncode(dst.encoding);
2519         emitByte(0xC7);
2520         emitByte(0xC0 | encode);
2521         emitInt(imm32);
2522     }
2523 
2524     public final void movdq(Register dst, Register src) {
2525 
2526         // table D-1 says MMX/SSE2
2527         emitByte(0x66);
2528 
2529         if (dst.getRegisterCategory().equals(AMD64.XMM)) {
2530             int encode = prefixqAndEncode(dst.encoding, src.encoding);
2531             emitByte(0x0F);
2532             emitByte(0x6E);
2533             emitByte(0xC0 | encode);
2534         } else if (src.getRegisterCategory().equals(AMD64.XMM)) {
2535 
2536             // swap src/dst to get correct prefix
2537             int encode = prefixqAndEncode(src.encoding, dst.encoding);
2538             emitByte(0x0F);
2539             emitByte(0x7E);
2540             emitByte(0xC0 | encode);
2541         } else {
2542             throw new InternalError("should not reach here");
2543         }
2544     }
2545 
2546     public final void movdqu(Register dst, AMD64Address src) {
2547         emitByte(0xF3);
2548         prefix(src, dst);
2549         emitByte(0x0F);
2550         emitByte(0x6F);
2551         emitOperandHelper(dst, src);
2552     }
2553 
2554     public final void movslq(AMD64Address dst, int imm32) {
2555         prefixq(dst);
2556         emitByte(0xC7);
2557         emitOperandHelper(0, dst);
2558         emitInt(imm32);
2559     }
2560 
2561     public final void movslq(Register dst, AMD64Address src) {
2562         prefixq(src, dst);
2563         emitByte(0x63);
2564         emitOperandHelper(dst, src);
2565     }
2566 
2567     public final void movslq(Register dst, Register src) {
2568         int encode = prefixqAndEncode(dst.encoding, src.encoding);
2569         emitByte(0x63);
2570         emitByte(0xC0 | encode);
2571     }
2572 
2573     public final void negq(Register dst) {
2574         int encode = prefixqAndEncode(dst.encoding);
2575         emitByte(0xF7);
2576         emitByte(0xD8 | encode);
2577     }
2578 
2579     public final void notq(Register dst) {
2580         int encode = prefixqAndEncode(dst.encoding);
2581         emitByte(0xF7);
2582         emitByte(0xD0 | encode);
2583     }
2584 
2585     public final void orq(Register dst, int imm32) {
2586         emitArithImm32q(1, dst, imm32);
2587     }
2588 
2589     public final void orq(Register dst, AMD64Address src) {
2590         prefixq(src, dst);
2591         emitByte(0x0B);
2592         emitOperandHelper(dst, src);
2593     }
2594 
2595     public final void orq(Register dst, Register src) {
2596         int encode = prefixqAndEncode(dst.encoding, src.encoding);
2597         emitByte(0x0B);
2598         emitByte(0xC0 | encode);
2599     }
2600 
2601     public final void sarq(Register dst, int imm8) {
2602         assert isShiftCount(imm8 >> 1) : "illegal shift count";
2603         int encode = prefixqAndEncode(dst.encoding);
2604         if (imm8 == 1) {
2605             emitByte(0xD1);
2606             emitByte(0xF8 | encode);
2607         } else {
2608             emitByte(0xC1);
2609             emitByte(0xF8 | encode);
2610             emitByte(imm8);
2611         }
2612     }
2613 
2614     public final void sarq(Register dst) {
2615         int encode = prefixqAndEncode(dst.encoding);
2616         emitByte(0xD3);
2617         emitByte(0xF8 | encode);
2618     }
2619 
2620     public final void shlq(Register dst, int imm8) {
2621         assert isShiftCount(imm8 >> 1) : "illegal shift count";
2622         int encode = prefixqAndEncode(dst.encoding);
2623         if (imm8 == 1) {
2624             emitByte(0xD1);
2625             emitByte(0xE0 | encode);
2626         } else {
2627             emitByte(0xC1);
2628             emitByte(0xE0 | encode);
2629             emitByte(imm8);
2630         }
2631     }
2632 
2633     public final void shlq(Register dst) {
2634         int encode = prefixqAndEncode(dst.encoding);
2635         emitByte(0xD3);
2636         emitByte(0xE0 | encode);
2637     }
2638 
2639     public final void shrq(Register dst, int imm8) {
2640         assert isShiftCount(imm8 >> 1) : "illegal shift count";
2641         int encode = prefixqAndEncode(dst.encoding);
2642         if (imm8 == 1) {
2643             emitByte(0xD1);
2644             emitByte(0xE8 | encode);
2645         } else {
2646             emitByte(0xC1);
2647             emitByte(0xE8 | encode);
2648             emitByte(imm8);
2649         }
2650     }
2651 
2652     public final void shrq(Register dst) {
2653         int encode = prefixqAndEncode(dst.encoding);
2654         emitByte(0xD3);
2655         emitByte(0xE8 | encode);
2656     }
2657 
2658     public final void subq(Register dst, int imm32) {
2659         subq(dst, imm32, false);
2660     }
2661 
2662     public final void subqWide(Register dst, int imm32) {
2663         subq(dst, imm32, true);
2664     }
2665 
2666     private void subq(Register dst, int imm32, boolean force32Imm) {
2667         emitArithImm32q(5, dst, imm32, force32Imm);
2668     }
2669 
2670     public final void subq(Register dst, AMD64Address src) {
2671         prefixq(src, dst);
2672         emitByte(0x2B);
2673         emitOperandHelper(dst, src);
2674     }
2675 
2676     public final void subq(Register dst, Register src) {
2677         int encode = prefixqAndEncode(dst.encoding, src.encoding);
2678         emitByte(0x2B);
2679         emitByte(0xC0 | encode);
2680     }
2681 
2682     public final void testq(Register dst, int imm32) {
2683         // not using emitArith because test
2684         // doesn't support sign-extension of
2685         // 8bit operands
2686         int encode = dst.encoding;
2687         if (encode == 0) {
2688             emitByte(Prefix.REXW);
2689             emitByte(0xA9);
2690         } else {
2691             encode = prefixqAndEncode(encode);
2692             emitByte(0xF7);
2693             emitByte(0xC0 | encode);
2694         }
2695         emitInt(imm32);
2696     }
2697 
2698     public final void testq(Register dst, Register src) {
2699         int encode = prefixqAndEncode(dst.encoding, src.encoding);
2700         emitByte(0x85);
2701         emitByte(0xC0 | encode);
2702     }
2703 
2704     public final void testq(Register dst, AMD64Address src) {
2705         prefixq(src, dst);
2706         emitByte(0x85);
2707         emitOperandHelper(dst, src);
2708     }
2709 
2710     public final void testq(AMD64Address dst, int imm32) {
2711         prefixq(dst);
2712         emitByte(0xF7);
2713         emitOperandHelper(0, dst);
2714         emitInt(imm32);
2715     }
2716 
2717     public final void xaddl(AMD64Address dst, Register src) {
2718         prefix(dst, src);
2719         emitByte(0x0F);
2720         emitByte(0xC1);
2721         emitOperandHelper(src, dst);
2722     }
2723 
2724     public final void xaddq(AMD64Address dst, Register src) {
2725         prefixq(dst, src);
2726         emitByte(0x0F);
2727         emitByte(0xC1);
2728         emitOperandHelper(src, dst);
2729     }
2730 
2731     public final void xchgl(Register dst, AMD64Address src) {
2732         prefix(src, dst);
2733         emitByte(0x87);
2734         emitOperandHelper(dst, src);
2735     }
2736 
2737     public final void xchgq(Register dst, AMD64Address src) {
2738         prefixq(src, dst);
2739         emitByte(0x87);
2740         emitOperandHelper(dst, src);
2741     }
2742 
2743     public final void xorq(Register dst, int imm32) {
2744         emitArithImm32q(6, dst, imm32);
2745     }
2746 
2747     public final void xorq(Register dst, Register src) {
2748         int encode = prefixqAndEncode(dst.encoding, src.encoding);
2749         emitByte(0x33);
2750         emitByte(0xC0 | encode);
2751     }
2752 
2753     public final void xorq(Register dst, AMD64Address src) {
2754         prefixq(src, dst);
2755         emitByte(0x33);
2756         emitOperandHelper(dst, src);
2757     }
2758 
2759     public final void membar(int barriers) {
2760         if (target.isMP) {
2761             // We only have to handle StoreLoad
2762             if ((barriers & STORE_LOAD) != 0) {
2763                 // All usable chips support "locked" instructions which suffice
2764                 // as barriers, and are much faster than the alternative of
2765                 // using cpuid instruction. We use here a locked add [rsp],0.
2766                 // This is conveniently otherwise a no-op except for blowing
2767                 // flags.
2768                 // Any change to this code may need to revisit other places in
2769                 // the code where this idiom is used, in particular the
2770                 // orderAccess code.
2771                 lock();
2772                 addl(new AMD64Address(rsp, 0), 0); // Assert the lock# signal here
2773             }
2774         }
2775     }
2776 
2777     @Override
2778     protected final void patchJumpTarget(int branch, int branchTarget) {
2779         int op = getByte(branch);
2780         assert op == 0xE8 // call
2781                         ||
2782                         op == 0x00 // jump table entry
2783                         || op == 0xE9 // jmp
2784                         || op == 0xEB // short jmp
2785                         || (op & 0xF0) == 0x70 // short jcc
2786                         || op == 0x0F && (getByte(branch + 1) & 0xF0) == 0x80 // jcc
2787         : "Invalid opcode at patch point branch=" + branch + ", branchTarget=" + branchTarget + ", op=" + op;
2788 
2789         if (op == 0x00) {
2790             int offsetToJumpTableBase = getShort(branch + 1);
2791             int jumpTableBase = branch - offsetToJumpTableBase;
2792             int imm32 = branchTarget - jumpTableBase;
2793             emitInt(imm32, branch);
2794         } else if (op == 0xEB || (op & 0xF0) == 0x70) {
2795 
2796             // short offset operators (jmp and jcc)
2797             final int imm8 = branchTarget - (branch + 2);
2798             /*
2799              * Since a wrongly patched short branch can potentially lead to working but really bad
2800              * behaving code we should always fail with an exception instead of having an assert.
2801              */
2802             if (!NumUtil.isByte(imm8)) {
2803                 throw new InternalError("branch displacement out of range: " + imm8);
2804             }
2805             emitByte(imm8, branch + 1);
2806 
2807         } else {
2808 
2809             int off = 1;
2810             if (op == 0x0F) {
2811                 off = 2;
2812             }
2813 
2814             int imm32 = branchTarget - (branch + 4 + off);
2815             emitInt(imm32, branch + off);
2816         }
2817     }
2818 
2819     public void nullCheck(Register r) {
2820         testl(AMD64.rax, new AMD64Address(r, 0));
2821     }
2822 
2823     @Override
2824     public void align(int modulus) {
2825         if (position() % modulus != 0) {
2826             nop(modulus - (position() % modulus));
2827         }
2828     }
2829 
2830     /**
2831      * Emits a direct call instruction. Note that the actual call target is not specified, because
2832      * all calls need patching anyway. Therefore, 0 is emitted as the call target, and the user is
2833      * responsible to add the call address to the appropriate patching tables.
2834      */
2835     public final void call() {
2836         emitByte(0xE8);
2837         emitInt(0);
2838     }
2839 
2840     public final void call(Register src) {
2841         int encode = prefixAndEncode(src.encoding);
2842         emitByte(0xFF);
2843         emitByte(0xD0 | encode);
2844     }
2845 
2846     public final void int3() {
2847         emitByte(0xCC);
2848     }
2849 
2850     private void emitx87(int b1, int b2, int i) {
2851         assert 0 <= i && i < 8 : "illegal stack offset";
2852         emitByte(b1);
2853         emitByte(b2 + i);
2854     }
2855 
2856     public final void fldd(AMD64Address src) {
2857         emitByte(0xDD);
2858         emitOperandHelper(0, src);
2859     }
2860 
2861     public final void flds(AMD64Address src) {
2862         emitByte(0xD9);
2863         emitOperandHelper(0, src);
2864     }
2865 
2866     public final void fldln2() {
2867         emitByte(0xD9);
2868         emitByte(0xED);
2869     }
2870 
2871     public final void fldlg2() {
2872         emitByte(0xD9);
2873         emitByte(0xEC);
2874     }
2875 
2876     public final void fyl2x() {
2877         emitByte(0xD9);
2878         emitByte(0xF1);
2879     }
2880 
2881     public final void fstps(AMD64Address src) {
2882         emitByte(0xD9);
2883         emitOperandHelper(3, src);
2884     }
2885 
2886     public final void fstpd(AMD64Address src) {
2887         emitByte(0xDD);
2888         emitOperandHelper(3, src);
2889     }
2890 
2891     private void emitFPUArith(int b1, int b2, int i) {
2892         assert 0 <= i && i < 8 : "illegal FPU register: " + i;
2893         emitByte(b1);
2894         emitByte(b2 + i);
2895     }
2896 
2897     public void ffree(int i) {
2898         emitFPUArith(0xDD, 0xC0, i);
2899     }
2900 
2901     public void fincstp() {
2902         emitByte(0xD9);
2903         emitByte(0xF7);
2904     }
2905 
2906     public void fxch(int i) {
2907         emitFPUArith(0xD9, 0xC8, i);
2908     }
2909 
2910     public void fnstswAX() {
2911         emitByte(0xDF);
2912         emitByte(0xE0);
2913     }
2914 
2915     public void fwait() {
2916         emitByte(0x9B);
2917     }
2918 
2919     public void fprem() {
2920         emitByte(0xD9);
2921         emitByte(0xF8);
2922     }
2923 
2924     public final void fsin() {
2925         emitByte(0xD9);
2926         emitByte(0xFE);
2927     }
2928 
2929     public final void fcos() {
2930         emitByte(0xD9);
2931         emitByte(0xFF);
2932     }
2933 
2934     public final void fptan() {
2935         emitByte(0xD9);
2936         emitByte(0xF2);
2937     }
2938 
2939     public final void fstp(int i) {
2940         emitx87(0xDD, 0xD8, i);
2941     }
2942 
2943     @Override
2944     public AMD64Address makeAddress(Register base, int displacement) {
2945         return new AMD64Address(base, displacement);
2946     }
2947 
2948     @Override
2949     public AMD64Address getPlaceholder() {
2950         return Placeholder;
2951     }
2952 
2953     private void prefetchPrefix(AMD64Address src) {
2954         prefix(src);
2955         emitByte(0x0F);
2956     }
2957 
2958     public void prefetchnta(AMD64Address src) {
2959         prefetchPrefix(src);
2960         emitByte(0x18);
2961         emitOperandHelper(0, src);
2962     }
2963 
2964     void prefetchr(AMD64Address src) {
2965         assert supports(CPUFeature.AMD_3DNOW_PREFETCH);
2966         prefetchPrefix(src);
2967         emitByte(0x0D);
2968         emitOperandHelper(0, src);
2969     }
2970 
2971     public void prefetcht0(AMD64Address src) {
2972         assert supports(CPUFeature.SSE);
2973         prefetchPrefix(src);
2974         emitByte(0x18);
2975         emitOperandHelper(1, src);
2976     }
2977 
2978     public void prefetcht1(AMD64Address src) {
2979         assert supports(CPUFeature.SSE);
2980         prefetchPrefix(src);
2981         emitByte(0x18);
2982         emitOperandHelper(2, src);
2983     }
2984 
2985     public void prefetcht2(AMD64Address src) {
2986         assert supports(CPUFeature.SSE);
2987         prefix(src);
2988         emitByte(0x0f);
2989         emitByte(0x18);
2990         emitOperandHelper(3, src);
2991     }
2992 
2993     public void prefetchw(AMD64Address src) {
2994         assert supports(CPUFeature.AMD_3DNOW_PREFETCH);
2995         prefix(src);
2996         emitByte(0x0f);
2997         emitByte(0x0D);
2998         emitOperandHelper(1, src);
2999     }
3000 
3001 }