1 /*
   2  * Copyright (c) 2009, 2018, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  */
  23 
  24 
  25 package org.graalvm.compiler.asm.amd64;
  26 
  27 import static jdk.vm.ci.amd64.AMD64.CPU;
  28 import static jdk.vm.ci.amd64.AMD64.MASK;
  29 import static jdk.vm.ci.amd64.AMD64.XMM;
  30 import static jdk.vm.ci.code.MemoryBarriers.STORE_LOAD;
  31 import static org.graalvm.compiler.asm.amd64.AMD64AsmOptions.UseAddressNop;
  32 import static org.graalvm.compiler.asm.amd64.AMD64AsmOptions.UseNormalNop;
  33 import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.ADD;
  34 import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.AND;
  35 import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.CMP;
  36 import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.OR;
  37 import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.SBB;
  38 import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.SUB;
  39 import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.XOR;
  40 import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64MOp.DEC;
  41 import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64MOp.INC;
  42 import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64MOp.NEG;
  43 import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64MOp.NOT;
  44 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.EVEXPrefixConfig.B0;
  45 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.EVEXPrefixConfig.Z0;
  46 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.BYTE;
  47 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.DWORD;
  48 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.PD;
  49 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.PS;
  50 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.QWORD;
  51 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.SD;
  52 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.SS;
  53 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.WORD;
  54 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.L128;
  55 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.L256;
  56 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.LZ;
  57 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.M_0F;
  58 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.M_0F38;
  59 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.M_0F3A;
  60 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.P_;
  61 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.P_66;
  62 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.P_F2;
  63 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.P_F3;
  64 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.W0;
  65 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.W1;
  66 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.WIG;
  67 import static org.graalvm.compiler.core.common.NumUtil.isByte;
  68 import static org.graalvm.compiler.core.common.NumUtil.isInt;
  69 import static org.graalvm.compiler.core.common.NumUtil.isShiftCount;
  70 import static org.graalvm.compiler.core.common.NumUtil.isUByte;
  71 
  72 import java.util.EnumSet;
  73 
  74 import org.graalvm.compiler.asm.Label;
  75 import org.graalvm.compiler.asm.amd64.AMD64Address.Scale;
  76 import org.graalvm.compiler.asm.amd64.AVXKind.AVXSize;
  77 import org.graalvm.compiler.core.common.NumUtil;
  78 import org.graalvm.compiler.core.common.calc.Condition;
  79 import org.graalvm.compiler.debug.GraalError;
  80 
  81 import jdk.vm.ci.amd64.AMD64;
  82 import jdk.vm.ci.amd64.AMD64.CPUFeature;
  83 import jdk.vm.ci.code.Register;
  84 import jdk.vm.ci.code.Register.RegisterCategory;
  85 import jdk.vm.ci.code.TargetDescription;
  86 
  87 /**
  88  * This class implements an assembler that can encode most X86 instructions.
  89  */
  90 public class AMD64Assembler extends AMD64BaseAssembler {
  91 
  92     /**
  93      * Constructs an assembler for the AMD64 architecture.
  94      */
  95     public AMD64Assembler(TargetDescription target) {
  96         super(target);
  97     }
  98 
  99     /**
 100      * The x86 condition codes used for conditional jumps/moves.
 101      */
 102     public enum ConditionFlag {
 103         Zero(0x4, "|zero|"),
 104         NotZero(0x5, "|nzero|"),
 105         Equal(0x4, "="),
 106         NotEqual(0x5, "!="),
 107         Less(0xc, "<"),
 108         LessEqual(0xe, "<="),
 109         Greater(0xf, ">"),
 110         GreaterEqual(0xd, ">="),
 111         Below(0x2, "|<|"),
 112         BelowEqual(0x6, "|<=|"),
 113         Above(0x7, "|>|"),
 114         AboveEqual(0x3, "|>=|"),
 115         Overflow(0x0, "|of|"),
 116         NoOverflow(0x1, "|nof|"),
 117         CarrySet(0x2, "|carry|"),
 118         CarryClear(0x3, "|ncarry|"),
 119         Negative(0x8, "|neg|"),
 120         Positive(0x9, "|pos|"),
 121         Parity(0xa, "|par|"),
 122         NoParity(0xb, "|npar|");
 123 
 124         private final int value;
 125         private final String operator;
 126 
 127         ConditionFlag(int value, String operator) {
 128             this.value = value;
 129             this.operator = operator;
 130         }
 131 
 132         public ConditionFlag negate() {
 133             switch (this) {
 134                 case Zero:
 135                     return NotZero;
 136                 case NotZero:
 137                     return Zero;
 138                 case Equal:
 139                     return NotEqual;
 140                 case NotEqual:
 141                     return Equal;
 142                 case Less:
 143                     return GreaterEqual;
 144                 case LessEqual:
 145                     return Greater;
 146                 case Greater:
 147                     return LessEqual;
 148                 case GreaterEqual:
 149                     return Less;
 150                 case Below:
 151                     return AboveEqual;
 152                 case BelowEqual:
 153                     return Above;
 154                 case Above:
 155                     return BelowEqual;
 156                 case AboveEqual:
 157                     return Below;
 158                 case Overflow:
 159                     return NoOverflow;
 160                 case NoOverflow:
 161                     return Overflow;
 162                 case CarrySet:
 163                     return CarryClear;
 164                 case CarryClear:
 165                     return CarrySet;
 166                 case Negative:
 167                     return Positive;
 168                 case Positive:
 169                     return Negative;
 170                 case Parity:
 171                     return NoParity;
 172                 case NoParity:
 173                     return Parity;
 174             }
 175             throw new IllegalArgumentException();
 176         }
 177 
 178         public int getValue() {
 179             return value;
 180         }
 181 
 182         @Override
 183         public String toString() {
 184             return operator;
 185         }
 186     }
 187 
 188     /**
 189      * Operand size and register type constraints.
 190      */
 191     private enum OpAssertion {
 192         ByteAssertion(CPU, CPU, BYTE),
 193         ByteOrLargerAssertion(CPU, CPU, BYTE, WORD, DWORD, QWORD),
 194         WordOrLargerAssertion(CPU, CPU, WORD, DWORD, QWORD),
 195         DwordOrLargerAssertion(CPU, CPU, DWORD, QWORD),
 196         WordOrDwordAssertion(CPU, CPU, WORD, QWORD),
 197         QwordAssertion(CPU, CPU, QWORD),
 198         FloatAssertion(XMM, XMM, SS, SD, PS, PD),
 199         PackedFloatAssertion(XMM, XMM, PS, PD),
 200         SingleAssertion(XMM, XMM, SS),
 201         DoubleAssertion(XMM, XMM, SD),
 202         PackedDoubleAssertion(XMM, XMM, PD),
 203         IntToFloatAssertion(XMM, CPU, DWORD, QWORD),
 204         FloatToIntAssertion(CPU, XMM, DWORD, QWORD);
 205 
 206         private final RegisterCategory resultCategory;
 207         private final RegisterCategory inputCategory;
 208         private final OperandSize[] allowedSizes;
 209 
 210         OpAssertion(RegisterCategory resultCategory, RegisterCategory inputCategory, OperandSize... allowedSizes) {
 211             this.resultCategory = resultCategory;
 212             this.inputCategory = inputCategory;
 213             this.allowedSizes = allowedSizes;
 214         }
 215 
 216         protected boolean checkOperands(AMD64Op op, OperandSize size, Register resultReg, Register inputReg) {
 217             assert resultReg == null || resultCategory.equals(resultReg.getRegisterCategory()) : "invalid result register " + resultReg + " used in " + op;
 218             assert inputReg == null || inputCategory.equals(inputReg.getRegisterCategory()) : "invalid input register " + inputReg + " used in " + op;
 219 
 220             for (OperandSize s : allowedSizes) {
 221                 if (size == s) {
 222                     return true;
 223                 }
 224             }
 225 
 226             assert false : "invalid operand size " + size + " used in " + op;
 227             return false;
 228         }
 229 
 230     }
 231 
 232     protected static final int P_0F = 0x0F;
 233     protected static final int P_0F38 = 0x380F;
 234     protected static final int P_0F3A = 0x3A0F;
 235 
 236     /**
 237      * Base class for AMD64 opcodes.
 238      */
 239     public static class AMD64Op {
 240 
 241         private final String opcode;
 242 
 243         protected final int prefix1;
 244         protected final int prefix2;
 245         protected final int op;
 246 
 247         private final boolean dstIsByte;
 248         private final boolean srcIsByte;
 249 
 250         private final OpAssertion assertion;
 251         private final CPUFeature feature;
 252 
 253         protected AMD64Op(String opcode, int prefix1, int prefix2, int op, OpAssertion assertion, CPUFeature feature) {
 254             this(opcode, prefix1, prefix2, op, assertion == OpAssertion.ByteAssertion, assertion == OpAssertion.ByteAssertion, assertion, feature);
 255         }
 256 
 257         protected AMD64Op(String opcode, int prefix1, int prefix2, int op, boolean dstIsByte, boolean srcIsByte, OpAssertion assertion, CPUFeature feature) {
 258             this.opcode = opcode;
 259             this.prefix1 = prefix1;
 260             this.prefix2 = prefix2;
 261             this.op = op;
 262 
 263             this.dstIsByte = dstIsByte;
 264             this.srcIsByte = srcIsByte;
 265 
 266             this.assertion = assertion;
 267             this.feature = feature;
 268         }
 269 
 270         protected final void emitOpcode(AMD64Assembler asm, OperandSize size, int rxb, int dstEnc, int srcEnc) {
 271             if (prefix1 != 0) {
 272                 asm.emitByte(prefix1);
 273             }
 274             if (size.getSizePrefix() != 0) {
 275                 asm.emitByte(size.getSizePrefix());
 276             }
 277             int rexPrefix = 0x40 | rxb;
 278             if (size == QWORD) {
 279                 rexPrefix |= 0x08;
 280             }
 281             if (rexPrefix != 0x40 || (dstIsByte && dstEnc >= 4) || (srcIsByte && srcEnc >= 4)) {
 282                 asm.emitByte(rexPrefix);
 283             }
 284             if (prefix2 > 0xFF) {
 285                 asm.emitShort(prefix2);
 286             } else if (prefix2 > 0) {
 287                 asm.emitByte(prefix2);
 288             }
 289             asm.emitByte(op);
 290         }
 291 
 292         protected final boolean verify(AMD64Assembler asm, OperandSize size, Register resultReg, Register inputReg) {
 293             assert feature == null || asm.supports(feature) : String.format("unsupported feature %s required for %s", feature, opcode);
 294             assert assertion.checkOperands(this, size, resultReg, inputReg);
 295             return true;
 296         }
 297 
 298         public OperandSize[] getAllowedSizes() {
 299             return assertion.allowedSizes;
 300         }
 301 
 302         protected final boolean isSSEInstruction() {
 303             if (feature == null) {
 304                 return false;
 305             }
 306             switch (feature) {
 307                 case SSE:
 308                 case SSE2:
 309                 case SSE3:
 310                 case SSSE3:
 311                 case SSE4A:
 312                 case SSE4_1:
 313                 case SSE4_2:
 314                     return true;
 315                 default:
 316                     return false;
 317             }
 318         }
 319 
 320         public final OpAssertion getAssertion() {
 321             return assertion;
 322         }
 323 
 324         @Override
 325         public String toString() {
 326             return opcode;
 327         }
 328     }
 329 
 330     /**
 331      * Base class for AMD64 opcodes with immediate operands.
 332      */
 333     public static class AMD64ImmOp extends AMD64Op {
 334 
 335         private final boolean immIsByte;
 336 
 337         protected AMD64ImmOp(String opcode, boolean immIsByte, int prefix, int op, OpAssertion assertion) {
 338             this(opcode, immIsByte, prefix, op, assertion, null);
 339         }
 340 
 341         protected AMD64ImmOp(String opcode, boolean immIsByte, int prefix, int op, OpAssertion assertion, CPUFeature feature) {
 342             super(opcode, 0, prefix, op, assertion, feature);
 343             this.immIsByte = immIsByte;
 344         }
 345 
 346         protected final void emitImmediate(AMD64Assembler asm, OperandSize size, int imm) {
 347             if (immIsByte) {
 348                 assert imm == (byte) imm;
 349                 asm.emitByte(imm);
 350             } else {
 351                 size.emitImmediate(asm, imm);
 352             }
 353         }
 354 
 355         protected final int immediateSize(OperandSize size) {
 356             if (immIsByte) {
 357                 return 1;
 358             } else {
 359                 return size.getBytes();
 360             }
 361         }
 362     }
 363 
 364     /**
 365      * Opcode with operand order of either RM or MR for 2 address forms.
 366      */
 367     public abstract static class AMD64RROp extends AMD64Op {
 368 
 369         protected AMD64RROp(String opcode, int prefix1, int prefix2, int op, OpAssertion assertion, CPUFeature feature) {
 370             super(opcode, prefix1, prefix2, op, assertion, feature);
 371         }
 372 
 373         protected AMD64RROp(String opcode, int prefix1, int prefix2, int op, boolean dstIsByte, boolean srcIsByte, OpAssertion assertion, CPUFeature feature) {
 374             super(opcode, prefix1, prefix2, op, dstIsByte, srcIsByte, assertion, feature);
 375         }
 376 
 377         public abstract void emit(AMD64Assembler asm, OperandSize size, Register dst, Register src);
 378     }
 379 
 380     /**
 381      * Opcode with operand order of RM.
 382      */
 383     public static class AMD64RMOp extends AMD64RROp {
 384         // @formatter:off
 385         public static final AMD64RMOp IMUL   = new AMD64RMOp("IMUL",         P_0F, 0xAF, OpAssertion.ByteOrLargerAssertion);
 386         public static final AMD64RMOp BSF    = new AMD64RMOp("BSF",          P_0F, 0xBC);
 387         public static final AMD64RMOp BSR    = new AMD64RMOp("BSR",          P_0F, 0xBD);
 388         // POPCNT, TZCNT, and LZCNT support word operation. However, the legacy size prefix should
 389         // be emitted before the mandatory prefix 0xF3. Since we are not emitting bit count for
 390         // 16-bit operands, here we simply use DwordOrLargerAssertion.
 391         public static final AMD64RMOp POPCNT = new AMD64RMOp("POPCNT", 0xF3, P_0F, 0xB8, OpAssertion.DwordOrLargerAssertion, CPUFeature.POPCNT);
 392         public static final AMD64RMOp TZCNT  = new AMD64RMOp("TZCNT",  0xF3, P_0F, 0xBC, OpAssertion.DwordOrLargerAssertion, CPUFeature.BMI1);
 393         public static final AMD64RMOp LZCNT  = new AMD64RMOp("LZCNT",  0xF3, P_0F, 0xBD, OpAssertion.DwordOrLargerAssertion, CPUFeature.LZCNT);
 394         public static final AMD64RMOp MOVZXB = new AMD64RMOp("MOVZXB",       P_0F, 0xB6, false, true, OpAssertion.WordOrLargerAssertion);
 395         public static final AMD64RMOp MOVZX  = new AMD64RMOp("MOVZX",        P_0F, 0xB7, OpAssertion.DwordOrLargerAssertion);
 396         public static final AMD64RMOp MOVSXB = new AMD64RMOp("MOVSXB",       P_0F, 0xBE, false, true, OpAssertion.WordOrLargerAssertion);
 397         public static final AMD64RMOp MOVSX  = new AMD64RMOp("MOVSX",        P_0F, 0xBF, OpAssertion.DwordOrLargerAssertion);
 398         public static final AMD64RMOp MOVSXD = new AMD64RMOp("MOVSXD",             0x63, OpAssertion.QwordAssertion);
 399         public static final AMD64RMOp MOVB   = new AMD64RMOp("MOVB",               0x8A, OpAssertion.ByteAssertion);
 400         public static final AMD64RMOp MOV    = new AMD64RMOp("MOV",                0x8B);
 401         public static final AMD64RMOp CMP    = new AMD64RMOp("CMP",                0x3B);
 402 
 403         // MOVD/MOVQ and MOVSS/MOVSD are the same opcode, just with different operand size prefix
 404         public static final AMD64RMOp MOVD   = new AMD64RMOp("MOVD",   0x66, P_0F, 0x6E, OpAssertion.IntToFloatAssertion, CPUFeature.SSE2);
 405         public static final AMD64RMOp MOVQ   = new AMD64RMOp("MOVQ",   0x66, P_0F, 0x6E, OpAssertion.IntToFloatAssertion, CPUFeature.SSE2);
 406         public static final AMD64RMOp MOVSS  = new AMD64RMOp("MOVSS",        P_0F, 0x10, OpAssertion.FloatAssertion, CPUFeature.SSE);
 407         public static final AMD64RMOp MOVSD  = new AMD64RMOp("MOVSD",        P_0F, 0x10, OpAssertion.FloatAssertion, CPUFeature.SSE);
 408 
 409         // TEST is documented as MR operation, but it's symmetric, and using it as RM operation is more convenient.
 410         public static final AMD64RMOp TESTB  = new AMD64RMOp("TEST",               0x84, OpAssertion.ByteAssertion);
 411         public static final AMD64RMOp TEST   = new AMD64RMOp("TEST",               0x85);
 412         // @formatter:on
 413 
 414         protected AMD64RMOp(String opcode, int op) {
 415             this(opcode, 0, op);
 416         }
 417 
 418         protected AMD64RMOp(String opcode, int op, OpAssertion assertion) {
 419             this(opcode, 0, op, assertion);
 420         }
 421 
 422         protected AMD64RMOp(String opcode, int prefix, int op) {
 423             this(opcode, 0, prefix, op, null);
 424         }
 425 
 426         protected AMD64RMOp(String opcode, int prefix, int op, OpAssertion assertion) {
 427             this(opcode, 0, prefix, op, assertion, null);
 428         }
 429 
 430         protected AMD64RMOp(String opcode, int prefix, int op, OpAssertion assertion, CPUFeature feature) {
 431             this(opcode, 0, prefix, op, assertion, feature);
 432         }
 433 
 434         protected AMD64RMOp(String opcode, int prefix, int op, boolean dstIsByte, boolean srcIsByte, OpAssertion assertion) {
 435             super(opcode, 0, prefix, op, dstIsByte, srcIsByte, assertion, null);
 436         }
 437 
 438         protected AMD64RMOp(String opcode, int prefix1, int prefix2, int op, CPUFeature feature) {
 439             this(opcode, prefix1, prefix2, op, OpAssertion.WordOrLargerAssertion, feature);
 440         }
 441 
 442         protected AMD64RMOp(String opcode, int prefix1, int prefix2, int op, OpAssertion assertion, CPUFeature feature) {
 443             super(opcode, prefix1, prefix2, op, assertion, feature);
 444         }
 445 
 446         @Override
 447         public final void emit(AMD64Assembler asm, OperandSize size, Register dst, Register src) {
 448             assert verify(asm, size, dst, src);
 449             if (isSSEInstruction()) {
 450                 Register nds = Register.None;
 451                 switch (op) {
 452                     case 0x10:
 453                     case 0x51:
 454                         if ((size == SS) || (size == SD)) {
 455                             nds = dst;
 456                         }
 457                         break;
 458                     case 0x2A:
 459                     case 0x54:
 460                     case 0x55:
 461                     case 0x56:
 462                     case 0x57:
 463                     case 0x58:
 464                     case 0x59:
 465                     case 0x5A:
 466                     case 0x5C:
 467                     case 0x5D:
 468                     case 0x5E:
 469                     case 0x5F:
 470                         nds = dst;
 471                         break;
 472                     default:
 473                         break;
 474                 }
 475                 asm.simdPrefix(dst, nds, src, size, prefix1, prefix2, size == QWORD);
 476                 asm.emitByte(op);
 477                 asm.emitModRM(dst, src);
 478             } else {
 479                 emitOpcode(asm, size, getRXB(dst, src), dst.encoding, src.encoding);
 480                 asm.emitModRM(dst, src);
 481             }
 482         }
 483 
 484         public final void emit(AMD64Assembler asm, OperandSize size, Register dst, AMD64Address src) {
 485             assert verify(asm, size, dst, null);
 486             if (isSSEInstruction()) {
 487                 Register nds = Register.None;
 488                 switch (op) {
 489                     case 0x51:
 490                         if ((size == SS) || (size == SD)) {
 491                             nds = dst;
 492                         }
 493                         break;
 494                     case 0x2A:
 495                     case 0x54:
 496                     case 0x55:
 497                     case 0x56:
 498                     case 0x57:
 499                     case 0x58:
 500                     case 0x59:
 501                     case 0x5A:
 502                     case 0x5C:
 503                     case 0x5D:
 504                     case 0x5E:
 505                     case 0x5F:
 506                         nds = dst;
 507                         break;
 508                     default:
 509                         break;
 510                 }
 511                 asm.simdPrefix(dst, nds, src, size, prefix1, prefix2, size == QWORD);
 512                 asm.emitByte(op);
 513                 asm.emitOperandHelper(dst, src, 0);
 514             } else {
 515                 emitOpcode(asm, size, getRXB(dst, src), dst.encoding, 0);
 516                 asm.emitOperandHelper(dst, src, 0);
 517             }
 518         }
 519     }
 520 
 521     /**
 522      * Opcode with operand order of MR.
 523      */
 524     public static class AMD64MROp extends AMD64RROp {
 525         // @formatter:off
 526         public static final AMD64MROp MOVB   = new AMD64MROp("MOVB",               0x88, OpAssertion.ByteAssertion);
 527         public static final AMD64MROp MOV    = new AMD64MROp("MOV",                0x89);
 528 
 529         // MOVD and MOVQ are the same opcode, just with different operand size prefix
 530         // Note that as MR opcodes, they have reverse operand order, so the IntToFloatingAssertion must be used.
 531         public static final AMD64MROp MOVD   = new AMD64MROp("MOVD",   0x66, P_0F, 0x7E, OpAssertion.IntToFloatAssertion, CPUFeature.SSE2);
 532         public static final AMD64MROp MOVQ   = new AMD64MROp("MOVQ",   0x66, P_0F, 0x7E, OpAssertion.IntToFloatAssertion, CPUFeature.SSE2);
 533 
 534         // MOVSS and MOVSD are the same opcode, just with different operand size prefix
 535         public static final AMD64MROp MOVSS  = new AMD64MROp("MOVSS",        P_0F, 0x11, OpAssertion.FloatAssertion, CPUFeature.SSE);
 536         public static final AMD64MROp MOVSD  = new AMD64MROp("MOVSD",        P_0F, 0x11, OpAssertion.FloatAssertion, CPUFeature.SSE);
 537         // @formatter:on
 538 
 539         protected AMD64MROp(String opcode, int op) {
 540             this(opcode, 0, op);
 541         }
 542 
 543         protected AMD64MROp(String opcode, int op, OpAssertion assertion) {
 544             this(opcode, 0, op, assertion);
 545         }
 546 
 547         protected AMD64MROp(String opcode, int prefix, int op) {
 548             this(opcode, prefix, op, OpAssertion.WordOrLargerAssertion);
 549         }
 550 
 551         protected AMD64MROp(String opcode, int prefix, int op, OpAssertion assertion) {
 552             this(opcode, prefix, op, assertion, null);
 553         }
 554 
 555         protected AMD64MROp(String opcode, int prefix, int op, OpAssertion assertion, CPUFeature feature) {
 556             this(opcode, 0, prefix, op, assertion, feature);
 557         }
 558 
 559         protected AMD64MROp(String opcode, int prefix1, int prefix2, int op, OpAssertion assertion, CPUFeature feature) {
 560             super(opcode, prefix1, prefix2, op, assertion, feature);
 561         }
 562 
 563         @Override
 564         public final void emit(AMD64Assembler asm, OperandSize size, Register dst, Register src) {
 565             assert verify(asm, size, src, dst);
 566             if (isSSEInstruction()) {
 567                 Register nds = Register.None;
 568                 switch (op) {
 569                     case 0x11:
 570                         if ((size == SS) || (size == SD)) {
 571                             nds = src;
 572                         }
 573                         break;
 574                     default:
 575                         break;
 576                 }
 577                 asm.simdPrefix(src, nds, dst, size, prefix1, prefix2, size == QWORD);
 578                 asm.emitByte(op);
 579                 asm.emitModRM(src, dst);
 580             } else {
 581                 emitOpcode(asm, size, getRXB(src, dst), src.encoding, dst.encoding);
 582                 asm.emitModRM(src, dst);
 583             }
 584         }
 585 
 586         public final void emit(AMD64Assembler asm, OperandSize size, AMD64Address dst, Register src) {
 587             assert verify(asm, size, src, null);
 588             if (isSSEInstruction()) {
 589                 asm.simdPrefix(src, Register.None, dst, size, prefix1, prefix2, size == QWORD);
 590                 asm.emitByte(op);
 591             } else {
 592                 emitOpcode(asm, size, getRXB(src, dst), src.encoding, 0);
 593             }
 594             asm.emitOperandHelper(src, dst, 0);
 595         }
 596     }
 597 
 598     /**
 599      * Opcodes with operand order of M.
 600      */
 601     public static class AMD64MOp extends AMD64Op {
 602         // @formatter:off
 603         public static final AMD64MOp NOT  = new AMD64MOp("NOT",  0xF7, 2);
 604         public static final AMD64MOp NEG  = new AMD64MOp("NEG",  0xF7, 3);
 605         public static final AMD64MOp MUL  = new AMD64MOp("MUL",  0xF7, 4);
 606         public static final AMD64MOp IMUL = new AMD64MOp("IMUL", 0xF7, 5);
 607         public static final AMD64MOp DIV  = new AMD64MOp("DIV",  0xF7, 6);
 608         public static final AMD64MOp IDIV = new AMD64MOp("IDIV", 0xF7, 7);
 609         public static final AMD64MOp INC  = new AMD64MOp("INC",  0xFF, 0);
 610         public static final AMD64MOp DEC  = new AMD64MOp("DEC",  0xFF, 1);
 611         public static final AMD64MOp PUSH = new AMD64MOp("PUSH", 0xFF, 6);
 612         public static final AMD64MOp POP  = new AMD64MOp("POP",  0x8F, 0, OpAssertion.WordOrDwordAssertion);
 613         // @formatter:on
 614 
 615         private final int ext;
 616 
 617         protected AMD64MOp(String opcode, int op, int ext) {
 618             this(opcode, 0, op, ext);
 619         }
 620 
 621         protected AMD64MOp(String opcode, int prefix, int op, int ext) {
 622             this(opcode, prefix, op, ext, OpAssertion.WordOrLargerAssertion);
 623         }
 624 
 625         protected AMD64MOp(String opcode, int op, int ext, OpAssertion assertion) {
 626             this(opcode, 0, op, ext, assertion);
 627         }
 628 
 629         protected AMD64MOp(String opcode, int prefix, int op, int ext, OpAssertion assertion) {
 630             super(opcode, 0, prefix, op, assertion, null);
 631             this.ext = ext;
 632         }
 633 
 634         public final void emit(AMD64Assembler asm, OperandSize size, Register dst) {
 635             assert verify(asm, size, dst, null);
 636             emitOpcode(asm, size, getRXB(null, dst), 0, dst.encoding);
 637             asm.emitModRM(ext, dst);
 638         }
 639 
 640         public final void emit(AMD64Assembler asm, OperandSize size, AMD64Address dst) {
 641             assert verify(asm, size, null, null);
 642             emitOpcode(asm, size, getRXB(null, dst), 0, 0);
 643             asm.emitOperandHelper(ext, dst, 0);
 644         }
 645     }
 646 
 647     /**
 648      * Opcodes with operand order of MI.
 649      */
 650     public static class AMD64MIOp extends AMD64ImmOp {
 651         // @formatter:off
 652         public static final AMD64MIOp MOVB = new AMD64MIOp("MOVB", true,  0xC6, 0, OpAssertion.ByteAssertion);
 653         public static final AMD64MIOp MOV  = new AMD64MIOp("MOV",  false, 0xC7, 0);
 654         public static final AMD64MIOp TEST = new AMD64MIOp("TEST", false, 0xF7, 0);
 655         // @formatter:on
 656 
 657         private final int ext;
 658 
 659         protected AMD64MIOp(String opcode, boolean immIsByte, int op, int ext) {
 660             this(opcode, immIsByte, op, ext, OpAssertion.WordOrLargerAssertion);
 661         }
 662 
 663         protected AMD64MIOp(String opcode, boolean immIsByte, int op, int ext, OpAssertion assertion) {
 664             this(opcode, immIsByte, 0, op, ext, assertion);
 665         }
 666 
 667         protected AMD64MIOp(String opcode, boolean immIsByte, int prefix, int op, int ext, OpAssertion assertion) {
 668             super(opcode, immIsByte, prefix, op, assertion);
 669             this.ext = ext;
 670         }
 671 
 672         public final void emit(AMD64Assembler asm, OperandSize size, Register dst, int imm) {
 673             emit(asm, size, dst, imm, false);
 674         }
 675 
 676         public final void emit(AMD64Assembler asm, OperandSize size, Register dst, int imm, boolean annotateImm) {
 677             assert verify(asm, size, dst, null);
 678             int insnPos = asm.position();
 679             emitOpcode(asm, size, getRXB(null, dst), 0, dst.encoding);
 680             asm.emitModRM(ext, dst);
 681             int immPos = asm.position();
 682             emitImmediate(asm, size, imm);
 683             int nextInsnPos = asm.position();
 684             if (annotateImm && asm.codePatchingAnnotationConsumer != null) {
 685                 asm.codePatchingAnnotationConsumer.accept(new ImmediateOperandAnnotation(insnPos, immPos, nextInsnPos - immPos, nextInsnPos));
 686             }
 687         }
 688 
 689         public final void emit(AMD64Assembler asm, OperandSize size, AMD64Address dst, int imm) {
 690             emit(asm, size, dst, imm, false);
 691         }
 692 
 693         public final void emit(AMD64Assembler asm, OperandSize size, AMD64Address dst, int imm, boolean annotateImm) {
 694             assert verify(asm, size, null, null);
 695             int insnPos = asm.position();
 696             emitOpcode(asm, size, getRXB(null, dst), 0, 0);
 697             asm.emitOperandHelper(ext, dst, immediateSize(size));
 698             int immPos = asm.position();
 699             emitImmediate(asm, size, imm);
 700             int nextInsnPos = asm.position();
 701             if (annotateImm && asm.codePatchingAnnotationConsumer != null) {
 702                 asm.codePatchingAnnotationConsumer.accept(new ImmediateOperandAnnotation(insnPos, immPos, nextInsnPos - immPos, nextInsnPos));
 703             }
 704         }
 705     }
 706 
 707     /**
 708      * Opcodes with operand order of RMI.
 709      *
 710      * We only have one form of round as the operation is always treated with single variant input,
 711      * making its extension to 3 address forms redundant.
 712      */
 713     public static class AMD64RMIOp extends AMD64ImmOp {
 714         // @formatter:off
 715         public static final AMD64RMIOp IMUL    = new AMD64RMIOp("IMUL", false, 0x69);
 716         public static final AMD64RMIOp IMUL_SX = new AMD64RMIOp("IMUL", true,  0x6B);
 717         public static final AMD64RMIOp ROUNDSS = new AMD64RMIOp("ROUNDSS", true, P_0F3A, 0x0A, OpAssertion.PackedDoubleAssertion, CPUFeature.SSE4_1);
 718         public static final AMD64RMIOp ROUNDSD = new AMD64RMIOp("ROUNDSD", true, P_0F3A, 0x0B, OpAssertion.PackedDoubleAssertion, CPUFeature.SSE4_1);
 719         // @formatter:on
 720 
 721         protected AMD64RMIOp(String opcode, boolean immIsByte, int op) {
 722             this(opcode, immIsByte, 0, op, OpAssertion.WordOrLargerAssertion, null);
 723         }
 724 
 725         protected AMD64RMIOp(String opcode, boolean immIsByte, int prefix, int op, OpAssertion assertion, CPUFeature feature) {
 726             super(opcode, immIsByte, prefix, op, assertion, feature);
 727         }
 728 
 729         public final void emit(AMD64Assembler asm, OperandSize size, Register dst, Register src, int imm) {
 730             assert verify(asm, size, dst, src);
 731             if (isSSEInstruction()) {
 732                 Register nds = Register.None;
 733                 switch (op) {
 734                     case 0x0A:
 735                     case 0x0B:
 736                         nds = dst;
 737                         break;
 738                     default:
 739                         break;
 740                 }
 741                 asm.simdPrefix(dst, nds, src, size, prefix1, prefix2, false);
 742                 asm.emitByte(op);
 743                 asm.emitModRM(dst, src);
 744             } else {
 745                 emitOpcode(asm, size, getRXB(dst, src), dst.encoding, src.encoding);
 746                 asm.emitModRM(dst, src);
 747             }
 748             emitImmediate(asm, size, imm);
 749         }
 750 
 751         public final void emit(AMD64Assembler asm, OperandSize size, Register dst, AMD64Address src, int imm) {
 752             assert verify(asm, size, dst, null);
 753             if (isSSEInstruction()) {
 754                 Register nds = Register.None;
 755                 switch (op) {
 756                     case 0x0A:
 757                     case 0x0B:
 758                         nds = dst;
 759                         break;
 760                     default:
 761                         break;
 762                 }
 763                 asm.simdPrefix(dst, nds, src, size, prefix1, prefix2, false);
 764                 asm.emitByte(op);
 765             } else {
 766                 emitOpcode(asm, size, getRXB(dst, src), dst.encoding, 0);
 767             }
 768             asm.emitOperandHelper(dst, src, immediateSize(size));
 769             emitImmediate(asm, size, imm);
 770         }
 771     }
 772 
 773     public static class SSEOp extends AMD64RMOp {
 774         // @formatter:off
 775         public static final SSEOp CVTSI2SS  = new SSEOp("CVTSI2SS",  0xF3, P_0F, 0x2A, OpAssertion.IntToFloatAssertion);
 776         public static final SSEOp CVTSI2SD  = new SSEOp("CVTSI2SD",  0xF2, P_0F, 0x2A, OpAssertion.IntToFloatAssertion);
 777         public static final SSEOp CVTTSS2SI = new SSEOp("CVTTSS2SI", 0xF3, P_0F, 0x2C, OpAssertion.FloatToIntAssertion);
 778         public static final SSEOp CVTTSD2SI = new SSEOp("CVTTSD2SI", 0xF2, P_0F, 0x2C, OpAssertion.FloatToIntAssertion);
 779         public static final SSEOp UCOMIS    = new SSEOp("UCOMIS",          P_0F, 0x2E, OpAssertion.PackedFloatAssertion);
 780         public static final SSEOp SQRT      = new SSEOp("SQRT",            P_0F, 0x51);
 781         public static final SSEOp AND       = new SSEOp("AND",             P_0F, 0x54, OpAssertion.PackedFloatAssertion);
 782         public static final SSEOp ANDN      = new SSEOp("ANDN",            P_0F, 0x55, OpAssertion.PackedFloatAssertion);
 783         public static final SSEOp OR        = new SSEOp("OR",              P_0F, 0x56, OpAssertion.PackedFloatAssertion);
 784         public static final SSEOp XOR       = new SSEOp("XOR",             P_0F, 0x57, OpAssertion.PackedFloatAssertion);
 785         public static final SSEOp ADD       = new SSEOp("ADD",             P_0F, 0x58);
 786         public static final SSEOp MUL       = new SSEOp("MUL",             P_0F, 0x59);
 787         public static final SSEOp CVTSS2SD  = new SSEOp("CVTSS2SD",        P_0F, 0x5A, OpAssertion.SingleAssertion);
 788         public static final SSEOp CVTSD2SS  = new SSEOp("CVTSD2SS",        P_0F, 0x5A, OpAssertion.DoubleAssertion);
 789         public static final SSEOp SUB       = new SSEOp("SUB",             P_0F, 0x5C);
 790         public static final SSEOp MIN       = new SSEOp("MIN",             P_0F, 0x5D);
 791         public static final SSEOp DIV       = new SSEOp("DIV",             P_0F, 0x5E);
 792         public static final SSEOp MAX       = new SSEOp("MAX",             P_0F, 0x5F);
 793         // @formatter:on
 794 
 795         protected SSEOp(String opcode, int prefix, int op) {
 796             this(opcode, prefix, op, OpAssertion.FloatAssertion);
 797         }
 798 
 799         protected SSEOp(String opcode, int prefix, int op, OpAssertion assertion) {
 800             this(opcode, 0, prefix, op, assertion);
 801         }
 802 
 803         protected SSEOp(String opcode, int mandatoryPrefix, int prefix, int op, OpAssertion assertion) {
 804             super(opcode, mandatoryPrefix, prefix, op, assertion, CPUFeature.SSE2);
 805         }
 806     }
 807 
 808     /**
 809      * Arithmetic operation with operand order of RM, MR or MI.
 810      */
 811     public static final class AMD64BinaryArithmetic {
 812         // @formatter:off
 813         public static final AMD64BinaryArithmetic ADD = new AMD64BinaryArithmetic("ADD", 0);
 814         public static final AMD64BinaryArithmetic OR  = new AMD64BinaryArithmetic("OR",  1);
 815         public static final AMD64BinaryArithmetic ADC = new AMD64BinaryArithmetic("ADC", 2);
 816         public static final AMD64BinaryArithmetic SBB = new AMD64BinaryArithmetic("SBB", 3);
 817         public static final AMD64BinaryArithmetic AND = new AMD64BinaryArithmetic("AND", 4);
 818         public static final AMD64BinaryArithmetic SUB = new AMD64BinaryArithmetic("SUB", 5);
 819         public static final AMD64BinaryArithmetic XOR = new AMD64BinaryArithmetic("XOR", 6);
 820         public static final AMD64BinaryArithmetic CMP = new AMD64BinaryArithmetic("CMP", 7);
 821         // @formatter:on
 822 
 823         private final AMD64MIOp byteImmOp;
 824         private final AMD64MROp byteMrOp;
 825         private final AMD64RMOp byteRmOp;
 826 
 827         private final AMD64MIOp immOp;
 828         private final AMD64MIOp immSxOp;
 829         private final AMD64MROp mrOp;
 830         private final AMD64RMOp rmOp;
 831 
 832         private AMD64BinaryArithmetic(String opcode, int code) {
 833             int baseOp = code << 3;
 834 
 835             byteImmOp = new AMD64MIOp(opcode, true, 0, 0x80, code, OpAssertion.ByteAssertion);
 836             byteMrOp = new AMD64MROp(opcode, 0, baseOp, OpAssertion.ByteAssertion);
 837             byteRmOp = new AMD64RMOp(opcode, 0, baseOp | 0x02, OpAssertion.ByteAssertion);
 838 
 839             immOp = new AMD64MIOp(opcode, false, 0, 0x81, code, OpAssertion.WordOrLargerAssertion);
 840             immSxOp = new AMD64MIOp(opcode, true, 0, 0x83, code, OpAssertion.WordOrLargerAssertion);
 841             mrOp = new AMD64MROp(opcode, 0, baseOp | 0x01, OpAssertion.WordOrLargerAssertion);
 842             rmOp = new AMD64RMOp(opcode, 0, baseOp | 0x03, OpAssertion.WordOrLargerAssertion);
 843         }
 844 
 845         public AMD64MIOp getMIOpcode(OperandSize size, boolean sx) {
 846             if (size == BYTE) {
 847                 return byteImmOp;
 848             } else if (sx) {
 849                 return immSxOp;
 850             } else {
 851                 return immOp;
 852             }
 853         }
 854 
 855         public AMD64MROp getMROpcode(OperandSize size) {
 856             if (size == BYTE) {
 857                 return byteMrOp;
 858             } else {
 859                 return mrOp;
 860             }
 861         }
 862 
 863         public AMD64RMOp getRMOpcode(OperandSize size) {
 864             if (size == BYTE) {
 865                 return byteRmOp;
 866             } else {
 867                 return rmOp;
 868             }
 869         }
 870     }
 871 
 872     /**
 873      * Shift operation with operand order of M1, MC or MI.
 874      */
 875     public static final class AMD64Shift {
 876         // @formatter:off
 877         public static final AMD64Shift ROL = new AMD64Shift("ROL", 0);
 878         public static final AMD64Shift ROR = new AMD64Shift("ROR", 1);
 879         public static final AMD64Shift RCL = new AMD64Shift("RCL", 2);
 880         public static final AMD64Shift RCR = new AMD64Shift("RCR", 3);
 881         public static final AMD64Shift SHL = new AMD64Shift("SHL", 4);
 882         public static final AMD64Shift SHR = new AMD64Shift("SHR", 5);
 883         public static final AMD64Shift SAR = new AMD64Shift("SAR", 7);
 884         // @formatter:on
 885 
 886         public final AMD64MOp m1Op;
 887         public final AMD64MOp mcOp;
 888         public final AMD64MIOp miOp;
 889 
 890         private AMD64Shift(String opcode, int code) {
 891             m1Op = new AMD64MOp(opcode, 0, 0xD1, code, OpAssertion.WordOrLargerAssertion);
 892             mcOp = new AMD64MOp(opcode, 0, 0xD3, code, OpAssertion.WordOrLargerAssertion);
 893             miOp = new AMD64MIOp(opcode, true, 0, 0xC1, code, OpAssertion.WordOrLargerAssertion);
 894         }
 895     }
 896 
 897     private enum VEXOpAssertion {
 898         AVX1(CPUFeature.AVX, CPUFeature.AVX),
 899         AVX1_2(CPUFeature.AVX, CPUFeature.AVX2),
 900         AVX2(CPUFeature.AVX2, CPUFeature.AVX2),
 901         AVX1_128ONLY(CPUFeature.AVX, null),
 902         AVX1_256ONLY(null, CPUFeature.AVX),
 903         AVX2_256ONLY(null, CPUFeature.AVX2),
 904         XMM_CPU(CPUFeature.AVX, null, XMM, null, CPU, null),
 905         XMM_XMM_CPU(CPUFeature.AVX, null, XMM, XMM, CPU, null),
 906         CPU_XMM(CPUFeature.AVX, null, CPU, null, XMM, null),
 907         AVX1_2_CPU_XMM(CPUFeature.AVX, CPUFeature.AVX2, CPU, null, XMM, null),
 908         BMI1(CPUFeature.BMI1, null, CPU, CPU, CPU, null),
 909         BMI2(CPUFeature.BMI2, null, CPU, CPU, CPU, null);
 910 
 911         private final CPUFeature l128feature;
 912         private final CPUFeature l256feature;
 913 
 914         private final RegisterCategory rCategory;
 915         private final RegisterCategory vCategory;
 916         private final RegisterCategory mCategory;
 917         private final RegisterCategory imm8Category;
 918 
 919         VEXOpAssertion(CPUFeature l128feature, CPUFeature l256feature) {
 920             this(l128feature, l256feature, XMM, XMM, XMM, XMM);
 921         }
 922 
 923         VEXOpAssertion(CPUFeature l128feature, CPUFeature l256feature, RegisterCategory rCategory, RegisterCategory vCategory, RegisterCategory mCategory, RegisterCategory imm8Category) {
 924             this.l128feature = l128feature;
 925             this.l256feature = l256feature;
 926             this.rCategory = rCategory;
 927             this.vCategory = vCategory;
 928             this.mCategory = mCategory;
 929             this.imm8Category = imm8Category;
 930         }
 931 
 932         public boolean check(AMD64 arch, AVXSize size, Register r, Register v, Register m) {
 933             return check(arch, getLFlag(size), r, v, m, null);
 934         }
 935 
 936         public boolean check(AMD64 arch, AVXSize size, Register r, Register v, Register m, Register imm8) {
 937             return check(arch, getLFlag(size), r, v, m, imm8);
 938         }
 939 
 940         public boolean check(AMD64 arch, int l, Register r, Register v, Register m, Register imm8) {
 941             switch (l) {
 942                 case L128:
 943                     assert l128feature != null && arch.getFeatures().contains(l128feature) : "emitting illegal 128 bit instruction";
 944                     break;
 945                 case L256:
 946                     assert l256feature != null && arch.getFeatures().contains(l256feature) : "emitting illegal 256 bit instruction";
 947                     break;
 948             }
 949             if (r != null) {
 950                 assert r.getRegisterCategory().equals(rCategory);
 951             }
 952             if (v != null) {
 953                 assert v.getRegisterCategory().equals(vCategory);
 954             }
 955             if (m != null) {
 956                 assert m.getRegisterCategory().equals(mCategory);
 957             }
 958             if (imm8 != null) {
 959                 assert imm8.getRegisterCategory().equals(imm8Category);
 960             }
 961             return true;
 962         }
 963 
 964         public boolean supports(EnumSet<CPUFeature> features, AVXSize avxSize) {
 965             switch (avxSize) {
 966                 case XMM:
 967                     return l128feature != null && features.contains(l128feature);
 968                 case YMM:
 969                     return l256feature != null && features.contains(l256feature);
 970                 default:
 971                     throw GraalError.shouldNotReachHere();
 972             }
 973         }
 974     }
 975 
 976     /**
 977      * Base class for VEX-encoded instructions.
 978      */
 979     public static class VexOp {
 980         protected final int pp;
 981         protected final int mmmmm;
 982         protected final int w;
 983         protected final int op;
 984 
 985         private final String opcode;
 986         protected final VEXOpAssertion assertion;
 987 
 988         protected VexOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
 989             this.pp = pp;
 990             this.mmmmm = mmmmm;
 991             this.w = w;
 992             this.op = op;
 993             this.opcode = opcode;
 994             this.assertion = assertion;
 995         }
 996 
 997         public final boolean isSupported(AMD64Assembler vasm, AVXSize size) {
 998             return assertion.supports(((AMD64) vasm.target.arch).getFeatures(), size);
 999         }
1000 
1001         @Override
1002         public String toString() {
1003             return opcode;
1004         }
1005     }
1006 
1007     /**
1008      * VEX-encoded instructions with an operand order of RM, but the M operand must be a register.
1009      */
1010     public static class VexRROp extends VexOp {
1011         // @formatter:off
1012         public static final VexRROp VMASKMOVDQU = new VexRROp("VMASKMOVDQU", P_66, M_0F, WIG, 0xF7, VEXOpAssertion.AVX1_128ONLY);
1013         // @formatter:on
1014 
1015         protected VexRROp(String opcode, int pp, int mmmmm, int w, int op) {
1016             this(opcode, pp, mmmmm, w, op, VEXOpAssertion.AVX1);
1017         }
1018 
1019         protected VexRROp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
1020             super(opcode, pp, mmmmm, w, op, assertion);
1021         }
1022 
1023         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src) {
1024             assert assertion.check((AMD64) asm.target.arch, size, dst, null, src);
1025             assert op != 0x1A || op != 0x5A;
1026             asm.vexPrefix(dst, Register.None, src, size, pp, mmmmm, w);
1027             asm.emitByte(op);
1028             asm.emitModRM(dst, src);
1029         }
1030     }
1031 
1032     /**
1033      * VEX-encoded instructions with an operand order of RM.
1034      */
1035     public static class VexRMOp extends VexRROp {
1036         // @formatter:off
1037         public static final VexRMOp VCVTTSS2SI      = new VexRMOp("VCVTTSS2SI",      P_F3, M_0F,   W0,  0x2C, VEXOpAssertion.CPU_XMM);
1038         public static final VexRMOp VCVTTSS2SQ      = new VexRMOp("VCVTTSS2SQ",      P_F3, M_0F,   W1,  0x2C, VEXOpAssertion.CPU_XMM);
1039         public static final VexRMOp VCVTTSD2SI      = new VexRMOp("VCVTTSD2SI",      P_F2, M_0F,   W0,  0x2C, VEXOpAssertion.CPU_XMM);
1040         public static final VexRMOp VCVTTSD2SQ      = new VexRMOp("VCVTTSD2SQ",      P_F2, M_0F,   W1,  0x2C, VEXOpAssertion.CPU_XMM);
1041         public static final VexRMOp VCVTPS2PD       = new VexRMOp("VCVTPS2PD",       P_,   M_0F,   WIG, 0x5A);
1042         public static final VexRMOp VCVTPD2PS       = new VexRMOp("VCVTPD2PS",       P_66, M_0F,   WIG, 0x5A);
1043         public static final VexRMOp VCVTDQ2PS       = new VexRMOp("VCVTDQ2PS",       P_,   M_0F,   WIG, 0x5B);
1044         public static final VexRMOp VCVTTPS2DQ      = new VexRMOp("VCVTTPS2DQ",      P_F3, M_0F,   WIG, 0x5B);
1045         public static final VexRMOp VCVTTPD2DQ      = new VexRMOp("VCVTTPD2DQ",      P_66, M_0F,   WIG, 0xE6);
1046         public static final VexRMOp VCVTDQ2PD       = new VexRMOp("VCVTDQ2PD",       P_F3, M_0F,   WIG, 0xE6);
1047         public static final VexRMOp VBROADCASTSS    = new VexRMOp("VBROADCASTSS",    P_66, M_0F38, W0,  0x18);
1048         public static final VexRMOp VBROADCASTSD    = new VexRMOp("VBROADCASTSD",    P_66, M_0F38, W0,  0x19, VEXOpAssertion.AVX1_256ONLY);
1049         public static final VexRMOp VBROADCASTF128  = new VexRMOp("VBROADCASTF128",  P_66, M_0F38, W0,  0x1A, VEXOpAssertion.AVX1_256ONLY);
1050         public static final VexRMOp VPBROADCASTI128 = new VexRMOp("VPBROADCASTI128", P_66, M_0F38, W0,  0x5A, VEXOpAssertion.AVX2_256ONLY);
1051         public static final VexRMOp VPBROADCASTB    = new VexRMOp("VPBROADCASTB",    P_66, M_0F38, W0,  0x78, VEXOpAssertion.AVX2);
1052         public static final VexRMOp VPBROADCASTW    = new VexRMOp("VPBROADCASTW",    P_66, M_0F38, W0,  0x79, VEXOpAssertion.AVX2);
1053         public static final VexRMOp VPBROADCASTD    = new VexRMOp("VPBROADCASTD",    P_66, M_0F38, W0,  0x58, VEXOpAssertion.AVX2);
1054         public static final VexRMOp VPBROADCASTQ    = new VexRMOp("VPBROADCASTQ",    P_66, M_0F38, W0,  0x59, VEXOpAssertion.AVX2);
1055         public static final VexRMOp VPMOVMSKB       = new VexRMOp("VPMOVMSKB",       P_66, M_0F,   WIG, 0xD7, VEXOpAssertion.AVX1_2_CPU_XMM);
1056         public static final VexRMOp VPMOVSXBW       = new VexRMOp("VPMOVSXBW",       P_66, M_0F38, WIG, 0x20);
1057         public static final VexRMOp VPMOVSXBD       = new VexRMOp("VPMOVSXBD",       P_66, M_0F38, WIG, 0x21);
1058         public static final VexRMOp VPMOVSXBQ       = new VexRMOp("VPMOVSXBQ",       P_66, M_0F38, WIG, 0x22);
1059         public static final VexRMOp VPMOVSXWD       = new VexRMOp("VPMOVSXWD",       P_66, M_0F38, WIG, 0x23);
1060         public static final VexRMOp VPMOVSXWQ       = new VexRMOp("VPMOVSXWQ",       P_66, M_0F38, WIG, 0x24);
1061         public static final VexRMOp VPMOVSXDQ       = new VexRMOp("VPMOVSXDQ",       P_66, M_0F38, WIG, 0x25);
1062         public static final VexRMOp VPMOVZXBW       = new VexRMOp("VPMOVZXBW",       P_66, M_0F38, WIG, 0x30);
1063         public static final VexRMOp VPMOVZXBD       = new VexRMOp("VPMOVZXBD",       P_66, M_0F38, WIG, 0x31);
1064         public static final VexRMOp VPMOVZXBQ       = new VexRMOp("VPMOVZXBQ",       P_66, M_0F38, WIG, 0x32);
1065         public static final VexRMOp VPMOVZXWD       = new VexRMOp("VPMOVZXWD",       P_66, M_0F38, WIG, 0x33);
1066         public static final VexRMOp VPMOVZXWQ       = new VexRMOp("VPMOVZXWQ",       P_66, M_0F38, WIG, 0x34);
1067         public static final VexRMOp VPMOVZXDQ       = new VexRMOp("VPMOVZXDQ",       P_66, M_0F38, WIG, 0x35);
1068         public static final VexRMOp VPTEST          = new VexRMOp("VPTEST",          P_66, M_0F38, WIG, 0x17);
1069         public static final VexRMOp VSQRTPD         = new VexRMOp("VSQRTPD",         P_66, M_0F,   WIG, 0x51);
1070         public static final VexRMOp VSQRTPS         = new VexRMOp("VSQRTPS",         P_,   M_0F,   WIG, 0x51);
1071         public static final VexRMOp VSQRTSD         = new VexRMOp("VSQRTSD",         P_F2, M_0F,   WIG, 0x51);
1072         public static final VexRMOp VSQRTSS         = new VexRMOp("VSQRTSS",         P_F3, M_0F,   WIG, 0x51);
1073         public static final VexRMOp VUCOMISS        = new VexRMOp("VUCOMISS",        P_,   M_0F,   WIG, 0x2E);
1074         public static final VexRMOp VUCOMISD        = new VexRMOp("VUCOMISD",        P_66, M_0F,   WIG, 0x2E);
1075         // @formatter:on
1076 
1077         protected VexRMOp(String opcode, int pp, int mmmmm, int w, int op) {
1078             this(opcode, pp, mmmmm, w, op, VEXOpAssertion.AVX1);
1079         }
1080 
1081         protected VexRMOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
1082             super(opcode, pp, mmmmm, w, op, assertion);
1083         }
1084 
1085         public void emit(AMD64Assembler asm, AVXSize size, Register dst, AMD64Address src) {
1086             assert assertion.check((AMD64) asm.target.arch, size, dst, null, null);
1087             asm.vexPrefix(dst, Register.None, src, size, pp, mmmmm, w);
1088             asm.emitByte(op);
1089             asm.emitOperandHelper(dst, src, 0);
1090         }
1091     }
1092 
1093     /**
1094      * VEX-encoded move instructions.
1095      * <p>
1096      * These instructions have two opcodes: op is the forward move instruction with an operand order
1097      * of RM, and opReverse is the reverse move instruction with an operand order of MR.
1098      */
1099     public static final class VexMoveOp extends VexRMOp {
1100         // @formatter:off
1101         public static final VexMoveOp VMOVDQA = new VexMoveOp("VMOVDQA", P_66, M_0F, WIG, 0x6F, 0x7F);
1102         public static final VexMoveOp VMOVDQU = new VexMoveOp("VMOVDQU", P_F3, M_0F, WIG, 0x6F, 0x7F);
1103         public static final VexMoveOp VMOVAPS = new VexMoveOp("VMOVAPS", P_,   M_0F, WIG, 0x28, 0x29);
1104         public static final VexMoveOp VMOVAPD = new VexMoveOp("VMOVAPD", P_66, M_0F, WIG, 0x28, 0x29);
1105         public static final VexMoveOp VMOVUPS = new VexMoveOp("VMOVUPS", P_,   M_0F, WIG, 0x10, 0x11);
1106         public static final VexMoveOp VMOVUPD = new VexMoveOp("VMOVUPD", P_66, M_0F, WIG, 0x10, 0x11);
1107         public static final VexMoveOp VMOVSS  = new VexMoveOp("VMOVSS",  P_F3, M_0F, WIG, 0x10, 0x11);
1108         public static final VexMoveOp VMOVSD  = new VexMoveOp("VMOVSD",  P_F2, M_0F, WIG, 0x10, 0x11);
1109         public static final VexMoveOp VMOVD   = new VexMoveOp("VMOVD",   P_66, M_0F, W0,  0x6E, 0x7E, VEXOpAssertion.XMM_CPU);
1110         public static final VexMoveOp VMOVQ   = new VexMoveOp("VMOVQ",   P_66, M_0F, W1,  0x6E, 0x7E, VEXOpAssertion.XMM_CPU);
1111         // @formatter:on
1112 
1113         private final int opReverse;
1114 
1115         private VexMoveOp(String opcode, int pp, int mmmmm, int w, int op, int opReverse) {
1116             this(opcode, pp, mmmmm, w, op, opReverse, VEXOpAssertion.AVX1);
1117         }
1118 
1119         private VexMoveOp(String opcode, int pp, int mmmmm, int w, int op, int opReverse, VEXOpAssertion assertion) {
1120             super(opcode, pp, mmmmm, w, op, assertion);
1121             this.opReverse = opReverse;
1122         }
1123 
1124         public void emit(AMD64Assembler asm, AVXSize size, AMD64Address dst, Register src) {
1125             assert assertion.check((AMD64) asm.target.arch, size, src, null, null);
1126             asm.vexPrefix(src, Register.None, dst, size, pp, mmmmm, w);
1127             asm.emitByte(opReverse);
1128             asm.emitOperandHelper(src, dst, 0);
1129         }
1130 
1131         public void emitReverse(AMD64Assembler asm, AVXSize size, Register dst, Register src) {
1132             assert assertion.check((AMD64) asm.target.arch, size, src, null, dst);
1133             asm.vexPrefix(src, Register.None, dst, size, pp, mmmmm, w);
1134             asm.emitByte(opReverse);
1135             asm.emitModRM(src, dst);
1136         }
1137     }
1138 
1139     public interface VexRRIOp {
1140         void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src, int imm8);
1141     }
1142 
1143     /**
1144      * VEX-encoded instructions with an operand order of RMI.
1145      */
1146     public static final class VexRMIOp extends VexOp implements VexRRIOp {
1147         // @formatter:off
1148         public static final VexRMIOp VPERMQ   = new VexRMIOp("VPERMQ",   P_66, M_0F3A, W1,  0x00, VEXOpAssertion.AVX2_256ONLY);
1149         public static final VexRMIOp VPSHUFLW = new VexRMIOp("VPSHUFLW", P_F2, M_0F,   WIG, 0x70, VEXOpAssertion.AVX1_2);
1150         public static final VexRMIOp VPSHUFHW = new VexRMIOp("VPSHUFHW", P_F3, M_0F,   WIG, 0x70, VEXOpAssertion.AVX1_2);
1151         public static final VexRMIOp VPSHUFD  = new VexRMIOp("VPSHUFD",  P_66, M_0F,   WIG, 0x70, VEXOpAssertion.AVX1_2);
1152         // @formatter:on
1153 
1154         private VexRMIOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
1155             super(opcode, pp, mmmmm, w, op, assertion);
1156         }
1157 
1158         @Override
1159         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src, int imm8) {
1160             assert assertion.check((AMD64) asm.target.arch, size, dst, null, src);
1161             asm.vexPrefix(dst, Register.None, src, size, pp, mmmmm, w);
1162             asm.emitByte(op);
1163             asm.emitModRM(dst, src);
1164             asm.emitByte(imm8);
1165         }
1166 
1167         public void emit(AMD64Assembler asm, AVXSize size, Register dst, AMD64Address src, int imm8) {
1168             assert assertion.check((AMD64) asm.target.arch, size, dst, null, null);
1169             asm.vexPrefix(dst, Register.None, src, size, pp, mmmmm, w);
1170             asm.emitByte(op);
1171             asm.emitOperandHelper(dst, src, 1);
1172             asm.emitByte(imm8);
1173         }
1174     }
1175 
1176     /**
1177      * VEX-encoded instructions with an operand order of MRI.
1178      */
1179     public static final class VexMRIOp extends VexOp implements VexRRIOp {
1180         // @formatter:off
1181         public static final VexMRIOp VEXTRACTF128 = new VexMRIOp("VEXTRACTF128", P_66, M_0F3A, W0, 0x19, VEXOpAssertion.AVX1_256ONLY);
1182         public static final VexMRIOp VEXTRACTI128 = new VexMRIOp("VEXTRACTI128", P_66, M_0F3A, W0, 0x39, VEXOpAssertion.AVX2_256ONLY);
1183         public static final VexMRIOp VPEXTRB      = new VexMRIOp("VPEXTRB",      P_66, M_0F3A, W0, 0x14, VEXOpAssertion.XMM_CPU);
1184         public static final VexMRIOp VPEXTRW      = new VexMRIOp("VPEXTRW",      P_66, M_0F3A, W0, 0x15, VEXOpAssertion.XMM_CPU);
1185         public static final VexMRIOp VPEXTRD      = new VexMRIOp("VPEXTRD",      P_66, M_0F3A, W0, 0x16, VEXOpAssertion.XMM_CPU);
1186         public static final VexMRIOp VPEXTRQ      = new VexMRIOp("VPEXTRQ",      P_66, M_0F3A, W1, 0x16, VEXOpAssertion.XMM_CPU);
1187         // @formatter:on
1188 
1189         private VexMRIOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
1190             super(opcode, pp, mmmmm, w, op, assertion);
1191         }
1192 
1193         @Override
1194         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src, int imm8) {
1195             assert assertion.check((AMD64) asm.target.arch, size, src, null, dst);
1196             asm.vexPrefix(src, Register.None, dst, size, pp, mmmmm, w);
1197             asm.emitByte(op);
1198             asm.emitModRM(src, dst);
1199             asm.emitByte(imm8);
1200         }
1201 
1202         public void emit(AMD64Assembler asm, AVXSize size, AMD64Address dst, Register src, int imm8) {
1203             assert assertion.check((AMD64) asm.target.arch, size, src, null, null);
1204             asm.vexPrefix(src, Register.None, dst, size, pp, mmmmm, w);
1205             asm.emitByte(op);
1206             asm.emitOperandHelper(src, dst, 1);
1207             asm.emitByte(imm8);
1208         }
1209     }
1210 
1211     /**
1212      * VEX-encoded instructions with an operand order of RVMR.
1213      */
1214     public static class VexRVMROp extends VexOp {
1215         // @formatter:off
1216         public static final VexRVMROp VPBLENDVB  = new VexRVMROp("VPBLENDVB",  P_66, M_0F3A, W0, 0x4C, VEXOpAssertion.AVX1_2);
1217         public static final VexRVMROp VPBLENDVPS = new VexRVMROp("VPBLENDVPS", P_66, M_0F3A, W0, 0x4A, VEXOpAssertion.AVX1);
1218         public static final VexRVMROp VPBLENDVPD = new VexRVMROp("VPBLENDVPD", P_66, M_0F3A, W0, 0x4B, VEXOpAssertion.AVX1);
1219         // @formatter:on
1220 
1221         protected VexRVMROp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
1222             super(opcode, pp, mmmmm, w, op, assertion);
1223         }
1224 
1225         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register mask, Register src1, Register src2) {
1226             assert assertion.check((AMD64) asm.target.arch, size, dst, mask, src1, src2);
1227             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w);
1228             asm.emitByte(op);
1229             asm.emitModRM(dst, src2);
1230             asm.emitByte(mask.encoding() << 4);
1231         }
1232 
1233         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register mask, Register src1, AMD64Address src2) {
1234             assert assertion.check((AMD64) asm.target.arch, size, dst, mask, src1, null);
1235             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w);
1236             asm.emitByte(op);
1237             asm.emitOperandHelper(dst, src2, 0);
1238             asm.emitByte(mask.encoding() << 4);
1239         }
1240     }
1241 
1242     /**
1243      * VEX-encoded instructions with an operand order of RVM.
1244      */
1245     public static class VexRVMOp extends VexOp {
1246         // @formatter:off
1247         public static final VexRVMOp VANDPS    = new VexRVMOp("VANDPS",    P_,   M_0F,   WIG, 0x54);
1248         public static final VexRVMOp VANDPD    = new VexRVMOp("VANDPD",    P_66, M_0F,   WIG, 0x54);
1249         public static final VexRVMOp VANDNPS   = new VexRVMOp("VANDNPS",   P_,   M_0F,   WIG, 0x55);
1250         public static final VexRVMOp VANDNPD   = new VexRVMOp("VANDNPD",   P_66, M_0F,   WIG, 0x55);
1251         public static final VexRVMOp VORPS     = new VexRVMOp("VORPS",     P_,   M_0F,   WIG, 0x56);
1252         public static final VexRVMOp VORPD     = new VexRVMOp("VORPD",     P_66, M_0F,   WIG, 0x56);
1253         public static final VexRVMOp VXORPS    = new VexRVMOp("VXORPS",    P_,   M_0F,   WIG, 0x57);
1254         public static final VexRVMOp VXORPD    = new VexRVMOp("VXORPD",    P_66, M_0F,   WIG, 0x57);
1255         public static final VexRVMOp VADDPS    = new VexRVMOp("VADDPS",    P_,   M_0F,   WIG, 0x58);
1256         public static final VexRVMOp VADDPD    = new VexRVMOp("VADDPD",    P_66, M_0F,   WIG, 0x58);
1257         public static final VexRVMOp VADDSS    = new VexRVMOp("VADDSS",    P_F3, M_0F,   WIG, 0x58);
1258         public static final VexRVMOp VADDSD    = new VexRVMOp("VADDSD",    P_F2, M_0F,   WIG, 0x58);
1259         public static final VexRVMOp VMULPS    = new VexRVMOp("VMULPS",    P_,   M_0F,   WIG, 0x59);
1260         public static final VexRVMOp VMULPD    = new VexRVMOp("VMULPD",    P_66, M_0F,   WIG, 0x59);
1261         public static final VexRVMOp VMULSS    = new VexRVMOp("VMULSS",    P_F3, M_0F,   WIG, 0x59);
1262         public static final VexRVMOp VMULSD    = new VexRVMOp("VMULSD",    P_F2, M_0F,   WIG, 0x59);
1263         public static final VexRVMOp VSUBPS    = new VexRVMOp("VSUBPS",    P_,   M_0F,   WIG, 0x5C);
1264         public static final VexRVMOp VSUBPD    = new VexRVMOp("VSUBPD",    P_66, M_0F,   WIG, 0x5C);
1265         public static final VexRVMOp VSUBSS    = new VexRVMOp("VSUBSS",    P_F3, M_0F,   WIG, 0x5C);
1266         public static final VexRVMOp VSUBSD    = new VexRVMOp("VSUBSD",    P_F2, M_0F,   WIG, 0x5C);
1267         public static final VexRVMOp VMINPS    = new VexRVMOp("VMINPS",    P_,   M_0F,   WIG, 0x5D);
1268         public static final VexRVMOp VMINPD    = new VexRVMOp("VMINPD",    P_66, M_0F,   WIG, 0x5D);
1269         public static final VexRVMOp VMINSS    = new VexRVMOp("VMINSS",    P_F3, M_0F,   WIG, 0x5D);
1270         public static final VexRVMOp VMINSD    = new VexRVMOp("VMINSD",    P_F2, M_0F,   WIG, 0x5D);
1271         public static final VexRVMOp VDIVPS    = new VexRVMOp("VDIVPS",    P_,   M_0F,   WIG, 0x5E);
1272         public static final VexRVMOp VDIVPD    = new VexRVMOp("VDIVPD",    P_66, M_0F,   WIG, 0x5E);
1273         public static final VexRVMOp VDIVSS    = new VexRVMOp("VDIVPS",    P_F3, M_0F,   WIG, 0x5E);
1274         public static final VexRVMOp VDIVSD    = new VexRVMOp("VDIVPD",    P_F2, M_0F,   WIG, 0x5E);
1275         public static final VexRVMOp VMAXPS    = new VexRVMOp("VMAXPS",    P_,   M_0F,   WIG, 0x5F);
1276         public static final VexRVMOp VMAXPD    = new VexRVMOp("VMAXPD",    P_66, M_0F,   WIG, 0x5F);
1277         public static final VexRVMOp VMAXSS    = new VexRVMOp("VMAXSS",    P_F3, M_0F,   WIG, 0x5F);
1278         public static final VexRVMOp VMAXSD    = new VexRVMOp("VMAXSD",    P_F2, M_0F,   WIG, 0x5F);
1279         public static final VexRVMOp VADDSUBPS = new VexRVMOp("VADDSUBPS", P_F2, M_0F,   WIG, 0xD0);
1280         public static final VexRVMOp VADDSUBPD = new VexRVMOp("VADDSUBPD", P_66, M_0F,   WIG, 0xD0);
1281         public static final VexRVMOp VPAND     = new VexRVMOp("VPAND",     P_66, M_0F,   WIG, 0xDB, VEXOpAssertion.AVX1_2);
1282         public static final VexRVMOp VPOR      = new VexRVMOp("VPOR",      P_66, M_0F,   WIG, 0xEB, VEXOpAssertion.AVX1_2);
1283         public static final VexRVMOp VPXOR     = new VexRVMOp("VPXOR",     P_66, M_0F,   WIG, 0xEF, VEXOpAssertion.AVX1_2);
1284         public static final VexRVMOp VPADDB    = new VexRVMOp("VPADDB",    P_66, M_0F,   WIG, 0xFC, VEXOpAssertion.AVX1_2);
1285         public static final VexRVMOp VPADDW    = new VexRVMOp("VPADDW",    P_66, M_0F,   WIG, 0xFD, VEXOpAssertion.AVX1_2);
1286         public static final VexRVMOp VPADDD    = new VexRVMOp("VPADDD",    P_66, M_0F,   WIG, 0xFE, VEXOpAssertion.AVX1_2);
1287         public static final VexRVMOp VPADDQ    = new VexRVMOp("VPADDQ",    P_66, M_0F,   WIG, 0xD4, VEXOpAssertion.AVX1_2);
1288         public static final VexRVMOp VPMULHUW  = new VexRVMOp("VPMULHUW",  P_66, M_0F,   WIG, 0xE4, VEXOpAssertion.AVX1_2);
1289         public static final VexRVMOp VPMULHW   = new VexRVMOp("VPMULHW",   P_66, M_0F,   WIG, 0xE5, VEXOpAssertion.AVX1_2);
1290         public static final VexRVMOp VPMULLW   = new VexRVMOp("VPMULLW",   P_66, M_0F,   WIG, 0xD5, VEXOpAssertion.AVX1_2);
1291         public static final VexRVMOp VPMULLD   = new VexRVMOp("VPMULLD",   P_66, M_0F38, WIG, 0x40, VEXOpAssertion.AVX1_2);
1292         public static final VexRVMOp VPSUBB    = new VexRVMOp("VPSUBB",    P_66, M_0F,   WIG, 0xF8, VEXOpAssertion.AVX1_2);
1293         public static final VexRVMOp VPSUBW    = new VexRVMOp("VPSUBW",    P_66, M_0F,   WIG, 0xF9, VEXOpAssertion.AVX1_2);
1294         public static final VexRVMOp VPSUBD    = new VexRVMOp("VPSUBD",    P_66, M_0F,   WIG, 0xFA, VEXOpAssertion.AVX1_2);
1295         public static final VexRVMOp VPSUBQ    = new VexRVMOp("VPSUBQ",    P_66, M_0F,   WIG, 0xFB, VEXOpAssertion.AVX1_2);
1296         public static final VexRVMOp VPSHUFB   = new VexRVMOp("VPSHUFB",   P_66, M_0F38, WIG, 0x00, VEXOpAssertion.AVX1_2);
1297         public static final VexRVMOp VCVTSD2SS = new VexRVMOp("VCVTSD2SS", P_F2, M_0F,   WIG, 0x5A);
1298         public static final VexRVMOp VCVTSS2SD = new VexRVMOp("VCVTSS2SD", P_F3, M_0F,   WIG, 0x5A);
1299         public static final VexRVMOp VCVTSI2SD = new VexRVMOp("VCVTSI2SD", P_F2, M_0F,   W0,  0x2A, VEXOpAssertion.XMM_XMM_CPU);
1300         public static final VexRVMOp VCVTSQ2SD = new VexRVMOp("VCVTSQ2SD", P_F2, M_0F,   W1,  0x2A, VEXOpAssertion.XMM_XMM_CPU);
1301         public static final VexRVMOp VCVTSI2SS = new VexRVMOp("VCVTSI2SS", P_F3, M_0F,   W0,  0x2A, VEXOpAssertion.XMM_XMM_CPU);
1302         public static final VexRVMOp VCVTSQ2SS = new VexRVMOp("VCVTSQ2SS", P_F3, M_0F,   W1,  0x2A, VEXOpAssertion.XMM_XMM_CPU);
1303         public static final VexRVMOp VPCMPEQB  = new VexRVMOp("VPCMPEQB",  P_66, M_0F,   WIG, 0x74, VEXOpAssertion.AVX1_2);
1304         public static final VexRVMOp VPCMPEQW  = new VexRVMOp("VPCMPEQW",  P_66, M_0F,   WIG, 0x75, VEXOpAssertion.AVX1_2);
1305         public static final VexRVMOp VPCMPEQD  = new VexRVMOp("VPCMPEQD",  P_66, M_0F,   WIG, 0x76, VEXOpAssertion.AVX1_2);
1306         public static final VexRVMOp VPCMPEQQ  = new VexRVMOp("VPCMPEQQ",  P_66, M_0F38, WIG, 0x29, VEXOpAssertion.AVX1_2);
1307         public static final VexRVMOp VPCMPGTB  = new VexRVMOp("VPCMPGTB",  P_66, M_0F,   WIG, 0x64, VEXOpAssertion.AVX1_2);
1308         public static final VexRVMOp VPCMPGTW  = new VexRVMOp("VPCMPGTW",  P_66, M_0F,   WIG, 0x65, VEXOpAssertion.AVX1_2);
1309         public static final VexRVMOp VPCMPGTD  = new VexRVMOp("VPCMPGTD",  P_66, M_0F,   WIG, 0x66, VEXOpAssertion.AVX1_2);
1310         public static final VexRVMOp VPCMPGTQ  = new VexRVMOp("VPCMPGTQ",  P_66, M_0F38, WIG, 0x37, VEXOpAssertion.AVX1_2);
1311         // @formatter:on
1312 
1313         private VexRVMOp(String opcode, int pp, int mmmmm, int w, int op) {
1314             this(opcode, pp, mmmmm, w, op, VEXOpAssertion.AVX1);
1315         }
1316 
1317         protected VexRVMOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
1318             super(opcode, pp, mmmmm, w, op, assertion);
1319         }
1320 
1321         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, Register src2) {
1322             assert assertion.check((AMD64) asm.target.arch, size, dst, src1, src2);
1323             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w);
1324             asm.emitByte(op);
1325             asm.emitModRM(dst, src2);
1326         }
1327 
1328         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, AMD64Address src2) {
1329             assert assertion.check((AMD64) asm.target.arch, size, dst, src1, null);
1330             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w);
1331             asm.emitByte(op);
1332             asm.emitOperandHelper(dst, src2, 0);
1333         }
1334     }
1335 
1336     public static final class VexGeneralPurposeRVMOp extends VexOp {
1337         // @formatter:off
1338         public static final VexGeneralPurposeRVMOp ANDN   = new VexGeneralPurposeRVMOp("ANDN",   P_,   M_0F38, WIG, 0xF2, VEXOpAssertion.BMI1);
1339         public static final VexGeneralPurposeRVMOp MULX   = new VexGeneralPurposeRVMOp("MULX",   P_F2, M_0F38, WIG, 0xF6, VEXOpAssertion.BMI2);
1340         public static final VexGeneralPurposeRVMOp PDEP   = new VexGeneralPurposeRVMOp("PDEP",   P_F2, M_0F38, WIG, 0xF5, VEXOpAssertion.BMI2);
1341         public static final VexGeneralPurposeRVMOp PEXT   = new VexGeneralPurposeRVMOp("PEXT",   P_F3, M_0F38, WIG, 0xF5, VEXOpAssertion.BMI2);
1342         // @formatter:on
1343 
1344         private VexGeneralPurposeRVMOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
1345             super(opcode, pp, mmmmm, w, op, assertion);
1346         }
1347 
1348         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, Register src2) {
1349             assert assertion.check((AMD64) asm.target.arch, LZ, dst, src1, src2, null);
1350             assert size == AVXSize.DWORD || size == AVXSize.QWORD;
1351             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, size == AVXSize.DWORD ? W0 : W1);
1352             asm.emitByte(op);
1353             asm.emitModRM(dst, src2);
1354         }
1355 
1356         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, AMD64Address src2) {
1357             assert assertion.check((AMD64) asm.target.arch, LZ, dst, src1, null, null);
1358             assert size == AVXSize.DWORD || size == AVXSize.QWORD;
1359             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, size == AVXSize.DWORD ? W0 : W1);
1360             asm.emitByte(op);
1361             asm.emitOperandHelper(dst, src2, 0);
1362         }
1363     }
1364 
1365     public static final class VexGeneralPurposeRMVOp extends VexOp {
1366         // @formatter:off
1367         public static final VexGeneralPurposeRMVOp BEXTR  = new VexGeneralPurposeRMVOp("BEXTR",  P_,   M_0F38, WIG, 0xF7, VEXOpAssertion.BMI1);
1368         public static final VexGeneralPurposeRMVOp BZHI   = new VexGeneralPurposeRMVOp("BZHI",   P_,   M_0F38, WIG, 0xF5, VEXOpAssertion.BMI2);
1369         public static final VexGeneralPurposeRMVOp SARX   = new VexGeneralPurposeRMVOp("SARX",   P_F3, M_0F38, WIG, 0xF7, VEXOpAssertion.BMI2);
1370         public static final VexGeneralPurposeRMVOp SHRX   = new VexGeneralPurposeRMVOp("SHRX",   P_F2, M_0F38, WIG, 0xF7, VEXOpAssertion.BMI2);
1371         public static final VexGeneralPurposeRMVOp SHLX   = new VexGeneralPurposeRMVOp("SHLX",   P_66, M_0F38, WIG, 0xF7, VEXOpAssertion.BMI2);
1372         // @formatter:on
1373 
1374         private VexGeneralPurposeRMVOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
1375             super(opcode, pp, mmmmm, w, op, assertion);
1376         }
1377 
1378         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, Register src2) {
1379             assert assertion.check((AMD64) asm.target.arch, LZ, dst, src2, src1, null);
1380             assert size == AVXSize.DWORD || size == AVXSize.QWORD;
1381             asm.vexPrefix(dst, src2, src1, size, pp, mmmmm, size == AVXSize.DWORD ? W0 : W1);
1382             asm.emitByte(op);
1383             asm.emitModRM(dst, src1);
1384         }
1385 
1386         public void emit(AMD64Assembler asm, AVXSize size, Register dst, AMD64Address src1, Register src2) {
1387             assert assertion.check((AMD64) asm.target.arch, LZ, dst, src2, null, null);
1388             assert size == AVXSize.DWORD || size == AVXSize.QWORD;
1389             asm.vexPrefix(dst, src2, src1, size, pp, mmmmm, size == AVXSize.DWORD ? W0 : W1);
1390             asm.emitByte(op);
1391             asm.emitOperandHelper(dst, src1, 0);
1392         }
1393     }
1394 
1395     /**
1396      * VEX-encoded shift instructions with an operand order of either RVM or VMI.
1397      */
1398     public static final class VexShiftOp extends VexRVMOp implements VexRRIOp {
1399         // @formatter:off
1400         public static final VexShiftOp VPSRLW = new VexShiftOp("VPSRLW", P_66, M_0F, WIG, 0xD1, 0x71, 2);
1401         public static final VexShiftOp VPSRLD = new VexShiftOp("VPSRLD", P_66, M_0F, WIG, 0xD2, 0x72, 2);
1402         public static final VexShiftOp VPSRLQ = new VexShiftOp("VPSRLQ", P_66, M_0F, WIG, 0xD3, 0x73, 2);
1403         public static final VexShiftOp VPSRAW = new VexShiftOp("VPSRAW", P_66, M_0F, WIG, 0xE1, 0x71, 4);
1404         public static final VexShiftOp VPSRAD = new VexShiftOp("VPSRAD", P_66, M_0F, WIG, 0xE2, 0x72, 4);
1405         public static final VexShiftOp VPSLLW = new VexShiftOp("VPSLLW", P_66, M_0F, WIG, 0xF1, 0x71, 6);
1406         public static final VexShiftOp VPSLLD = new VexShiftOp("VPSLLD", P_66, M_0F, WIG, 0xF2, 0x72, 6);
1407         public static final VexShiftOp VPSLLQ = new VexShiftOp("VPSLLQ", P_66, M_0F, WIG, 0xF3, 0x73, 6);
1408         // @formatter:on
1409 
1410         private final int immOp;
1411         private final int r;
1412 
1413         private VexShiftOp(String opcode, int pp, int mmmmm, int w, int op, int immOp, int r) {
1414             super(opcode, pp, mmmmm, w, op, VEXOpAssertion.AVX1_2);
1415             this.immOp = immOp;
1416             this.r = r;
1417         }
1418 
1419         @Override
1420         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src, int imm8) {
1421             assert assertion.check((AMD64) asm.target.arch, size, null, dst, src);
1422             asm.vexPrefix(null, dst, src, size, pp, mmmmm, w);
1423             asm.emitByte(immOp);
1424             asm.emitModRM(r, src);
1425             asm.emitByte(imm8);
1426         }
1427     }
1428 
1429     public static final class VexMaskMoveOp extends VexOp {
1430         // @formatter:off
1431         public static final VexMaskMoveOp VMASKMOVPS = new VexMaskMoveOp("VMASKMOVPS", P_66, M_0F38, W0, 0x2C, 0x2E);
1432         public static final VexMaskMoveOp VMASKMOVPD = new VexMaskMoveOp("VMASKMOVPD", P_66, M_0F38, W0, 0x2D, 0x2F);
1433         public static final VexMaskMoveOp VPMASKMOVD = new VexMaskMoveOp("VPMASKMOVD", P_66, M_0F38, W0, 0x8C, 0x8E, VEXOpAssertion.AVX2);
1434         public static final VexMaskMoveOp VPMASKMOVQ = new VexMaskMoveOp("VPMASKMOVQ", P_66, M_0F38, W1, 0x8C, 0x8E, VEXOpAssertion.AVX2);
1435         // @formatter:on
1436 
1437         private final int opReverse;
1438 
1439         private VexMaskMoveOp(String opcode, int pp, int mmmmm, int w, int op, int opReverse) {
1440             this(opcode, pp, mmmmm, w, op, opReverse, VEXOpAssertion.AVX1);
1441         }
1442 
1443         private VexMaskMoveOp(String opcode, int pp, int mmmmm, int w, int op, int opReverse, VEXOpAssertion assertion) {
1444             super(opcode, pp, mmmmm, w, op, assertion);
1445             this.opReverse = opReverse;
1446         }
1447 
1448         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register mask, AMD64Address src) {
1449             assert assertion.check((AMD64) asm.target.arch, size, dst, mask, null);
1450             asm.vexPrefix(dst, mask, src, size, pp, mmmmm, w);
1451             asm.emitByte(op);
1452             asm.emitOperandHelper(dst, src, 0);
1453         }
1454 
1455         public void emit(AMD64Assembler asm, AVXSize size, AMD64Address dst, Register mask, Register src) {
1456             assert assertion.check((AMD64) asm.target.arch, size, src, mask, null);
1457             asm.vexPrefix(src, mask, dst, size, pp, mmmmm, w);
1458             asm.emitByte(opReverse);
1459             asm.emitOperandHelper(src, dst, 0);
1460         }
1461     }
1462 
1463     /**
1464      * VEX-encoded instructions with an operand order of RVMI.
1465      */
1466     public static final class VexRVMIOp extends VexOp {
1467         // @formatter:off
1468         public static final VexRVMIOp VSHUFPS     = new VexRVMIOp("VSHUFPS",     P_,   M_0F,   WIG, 0xC6);
1469         public static final VexRVMIOp VSHUFPD     = new VexRVMIOp("VSHUFPD",     P_66, M_0F,   WIG, 0xC6);
1470         public static final VexRVMIOp VINSERTF128 = new VexRVMIOp("VINSERTF128", P_66, M_0F3A, W0,  0x18, VEXOpAssertion.AVX1_256ONLY);
1471         public static final VexRVMIOp VINSERTI128 = new VexRVMIOp("VINSERTI128", P_66, M_0F3A, W0,  0x38, VEXOpAssertion.AVX2_256ONLY);
1472         // @formatter:on
1473 
1474         private VexRVMIOp(String opcode, int pp, int mmmmm, int w, int op) {
1475             this(opcode, pp, mmmmm, w, op, VEXOpAssertion.AVX1);
1476         }
1477 
1478         private VexRVMIOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
1479             super(opcode, pp, mmmmm, w, op, assertion);
1480         }
1481 
1482         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, Register src2, int imm8) {
1483             assert assertion.check((AMD64) asm.target.arch, size, dst, src1, src2);
1484             assert (imm8 & 0xFF) == imm8;
1485             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w);
1486             asm.emitByte(op);
1487             asm.emitModRM(dst, src2);
1488             asm.emitByte(imm8);
1489         }
1490 
1491         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, AMD64Address src2, int imm8) {
1492             assert assertion.check((AMD64) asm.target.arch, size, dst, src1, null);
1493             assert (imm8 & 0xFF) == imm8;
1494             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w);
1495             asm.emitByte(op);
1496             asm.emitOperandHelper(dst, src2, 1);
1497             asm.emitByte(imm8);
1498         }
1499     }
1500 
1501     /**
1502      * VEX-encoded comparison operation with an operand order of RVMI. The immediate operand is a
1503      * comparison operator.
1504      */
1505     public static final class VexFloatCompareOp extends VexOp {
1506         // @formatter:off
1507         public static final VexFloatCompareOp VCMPPS = new VexFloatCompareOp("VCMPPS", P_,   M_0F, WIG, 0xC2);
1508         public static final VexFloatCompareOp VCMPPD = new VexFloatCompareOp("VCMPPD", P_66, M_0F, WIG, 0xC2);
1509         public static final VexFloatCompareOp VCMPSS = new VexFloatCompareOp("VCMPSS", P_F2, M_0F, WIG, 0xC2);
1510         public static final VexFloatCompareOp VCMPSD = new VexFloatCompareOp("VCMPSD", P_F2, M_0F, WIG, 0xC2);
1511         // @formatter:on
1512 
1513         public enum Predicate {
1514             EQ_OQ(0x00),
1515             LT_OS(0x01),
1516             LE_OS(0x02),
1517             UNORD_Q(0x03),
1518             NEQ_UQ(0x04),
1519             NLT_US(0x05),
1520             NLE_US(0x06),
1521             ORD_Q(0x07),
1522             EQ_UQ(0x08),
1523             NGE_US(0x09),
1524             NGT_US(0x0a),
1525             FALSE_OQ(0x0b),
1526             NEQ_OQ(0x0c),
1527             GE_OS(0x0d),
1528             GT_OS(0x0e),
1529             TRUE_UQ(0x0f),
1530             EQ_OS(0x10),
1531             LT_OQ(0x11),
1532             LE_OQ(0x12),
1533             UNORD_S(0x13),
1534             NEQ_US(0x14),
1535             NLT_UQ(0x15),
1536             NLE_UQ(0x16),
1537             ORD_S(0x17),
1538             EQ_US(0x18),
1539             NGE_UQ(0x19),
1540             NGT_UQ(0x1a),
1541             FALSE_OS(0x1b),
1542             NEQ_OS(0x1c),
1543             GE_OQ(0x1d),
1544             GT_OQ(0x1e),
1545             TRUE_US(0x1f);
1546 
1547             private int imm8;
1548 
1549             Predicate(int imm8) {
1550                 this.imm8 = imm8;
1551             }
1552 
1553             public static Predicate getPredicate(Condition condition, boolean unorderedIsTrue) {
1554                 if (unorderedIsTrue) {
1555                     switch (condition) {
1556                         case EQ:
1557                             return EQ_UQ;
1558                         case NE:
1559                             return NEQ_UQ;
1560                         case LT:
1561                             return NGE_UQ;
1562                         case LE:
1563                             return NGT_UQ;
1564                         case GT:
1565                             return NLE_UQ;
1566                         case GE:
1567                             return NLT_UQ;
1568                         default:
1569                             throw GraalError.shouldNotReachHere();
1570                     }
1571                 } else {
1572                     switch (condition) {
1573                         case EQ:
1574                             return EQ_OQ;
1575                         case NE:
1576                             return NEQ_OQ;
1577                         case LT:
1578                             return LT_OQ;
1579                         case LE:
1580                             return LE_OQ;
1581                         case GT:
1582                             return GT_OQ;
1583                         case GE:
1584                             return GE_OQ;
1585                         default:
1586                             throw GraalError.shouldNotReachHere();
1587                     }
1588                 }
1589             }
1590         }
1591 
1592         private VexFloatCompareOp(String opcode, int pp, int mmmmm, int w, int op) {
1593             super(opcode, pp, mmmmm, w, op, VEXOpAssertion.AVX1);
1594         }
1595 
1596         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, Register src2, Predicate p) {
1597             assert assertion.check((AMD64) asm.target.arch, size, dst, src1, src2);
1598             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w);
1599             asm.emitByte(op);
1600             asm.emitModRM(dst, src2);
1601             asm.emitByte(p.imm8);
1602         }
1603 
1604         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, AMD64Address src2, Predicate p) {
1605             assert assertion.check((AMD64) asm.target.arch, size, dst, src1, null);
1606             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w);
1607             asm.emitByte(op);
1608             asm.emitOperandHelper(dst, src2, 1);
1609             asm.emitByte(p.imm8);
1610         }
1611     }
1612 
1613     public final void addl(AMD64Address dst, int imm32) {
1614         ADD.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
1615     }
1616 
1617     public final void addl(Register dst, int imm32) {
1618         ADD.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
1619     }
1620 
1621     public final void addl(Register dst, Register src) {
1622         ADD.rmOp.emit(this, DWORD, dst, src);
1623     }
1624 
1625     public final void addpd(Register dst, Register src) {
1626         SSEOp.ADD.emit(this, PD, dst, src);
1627     }
1628 
1629     public final void addpd(Register dst, AMD64Address src) {
1630         SSEOp.ADD.emit(this, PD, dst, src);
1631     }
1632 
1633     public final void addsd(Register dst, Register src) {
1634         SSEOp.ADD.emit(this, SD, dst, src);
1635     }
1636 
1637     public final void addsd(Register dst, AMD64Address src) {
1638         SSEOp.ADD.emit(this, SD, dst, src);
1639     }
1640 
1641     private void addrNop4() {
1642         // 4 bytes: NOP DWORD PTR [EAX+0]
1643         emitByte(0x0F);
1644         emitByte(0x1F);
1645         emitByte(0x40); // emitRm(cbuf, 0x1, EAXEnc, EAXEnc);
1646         emitByte(0); // 8-bits offset (1 byte)
1647     }
1648 
1649     private void addrNop5() {
1650         // 5 bytes: NOP DWORD PTR [EAX+EAX*0+0] 8-bits offset
1651         emitByte(0x0F);
1652         emitByte(0x1F);
1653         emitByte(0x44); // emitRm(cbuf, 0x1, EAXEnc, 0x4);
1654         emitByte(0x00); // emitRm(cbuf, 0x0, EAXEnc, EAXEnc);
1655         emitByte(0); // 8-bits offset (1 byte)
1656     }
1657 
1658     private void addrNop7() {
1659         // 7 bytes: NOP DWORD PTR [EAX+0] 32-bits offset
1660         emitByte(0x0F);
1661         emitByte(0x1F);
1662         emitByte(0x80); // emitRm(cbuf, 0x2, EAXEnc, EAXEnc);
1663         emitInt(0); // 32-bits offset (4 bytes)
1664     }
1665 
1666     private void addrNop8() {
1667         // 8 bytes: NOP DWORD PTR [EAX+EAX*0+0] 32-bits offset
1668         emitByte(0x0F);
1669         emitByte(0x1F);
1670         emitByte(0x84); // emitRm(cbuf, 0x2, EAXEnc, 0x4);
1671         emitByte(0x00); // emitRm(cbuf, 0x0, EAXEnc, EAXEnc);
1672         emitInt(0); // 32-bits offset (4 bytes)
1673     }
1674 
1675     public final void andl(Register dst, int imm32) {
1676         AND.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
1677     }
1678 
1679     public final void andl(Register dst, Register src) {
1680         AND.rmOp.emit(this, DWORD, dst, src);
1681     }
1682 
1683     public final void andpd(Register dst, Register src) {
1684         SSEOp.AND.emit(this, PD, dst, src);
1685     }
1686 
1687     public final void andpd(Register dst, AMD64Address src) {
1688         SSEOp.AND.emit(this, PD, dst, src);
1689     }
1690 
1691     public final void bsfq(Register dst, Register src) {
1692         prefixq(dst, src);
1693         emitByte(0x0F);
1694         emitByte(0xBC);
1695         emitModRM(dst, src);
1696     }
1697 
1698     public final void bsrl(Register dst, Register src) {
1699         prefix(dst, src);
1700         emitByte(0x0F);
1701         emitByte(0xBD);
1702         emitModRM(dst, src);
1703     }
1704 
1705     public final void bswapl(Register reg) {
1706         prefix(reg);
1707         emitByte(0x0F);
1708         emitModRM(1, reg);
1709     }
1710 
1711     public final void cdql() {
1712         emitByte(0x99);
1713     }
1714 
1715     public final void cmovl(ConditionFlag cc, Register dst, Register src) {
1716         prefix(dst, src);
1717         emitByte(0x0F);
1718         emitByte(0x40 | cc.getValue());
1719         emitModRM(dst, src);
1720     }
1721 
1722     public final void cmovl(ConditionFlag cc, Register dst, AMD64Address src) {
1723         prefix(src, dst);
1724         emitByte(0x0F);
1725         emitByte(0x40 | cc.getValue());
1726         emitOperandHelper(dst, src, 0);
1727     }
1728 
1729     public final void cmpl(Register dst, int imm32) {
1730         CMP.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
1731     }
1732 
1733     public final void cmpl(Register dst, Register src) {
1734         CMP.rmOp.emit(this, DWORD, dst, src);
1735     }
1736 
1737     public final void cmpl(Register dst, AMD64Address src) {
1738         CMP.rmOp.emit(this, DWORD, dst, src);
1739     }
1740 
1741     public final void cmpl(AMD64Address dst, int imm32) {
1742         CMP.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
1743     }
1744 
1745     /**
1746      * The 8-bit cmpxchg compares the value at adr with the contents of X86.rax, and stores reg into
1747      * adr if so; otherwise, the value at adr is loaded into X86.rax,. The ZF is set if the compared
1748      * values were equal, and cleared otherwise.
1749      */
1750     public final void cmpxchgb(Register reg, AMD64Address adr) { // cmpxchg
1751         prefixb(adr, reg);
1752         emitByte(0x0F);
1753         emitByte(0xB0);
1754         emitOperandHelper(reg, adr, 0);
1755     }
1756 
1757     /**
1758      * The 16-bit cmpxchg compares the value at adr with the contents of X86.rax, and stores reg
1759      * into adr if so; otherwise, the value at adr is loaded into X86.rax,. The ZF is set if the
1760      * compared values were equal, and cleared otherwise.
1761      */
1762     public final void cmpxchgw(Register reg, AMD64Address adr) { // cmpxchg
1763         emitByte(0x66); // Switch to 16-bit mode.
1764         prefix(adr, reg);
1765         emitByte(0x0F);
1766         emitByte(0xB1);
1767         emitOperandHelper(reg, adr, 0);
1768     }
1769 
1770     /**
1771      * The 32-bit cmpxchg compares the value at adr with the contents of X86.rax, and stores reg
1772      * into adr if so; otherwise, the value at adr is loaded into X86.rax,. The ZF is set if the
1773      * compared values were equal, and cleared otherwise.
1774      */
1775     public final void cmpxchgl(Register reg, AMD64Address adr) { // cmpxchg
1776         prefix(adr, reg);
1777         emitByte(0x0F);
1778         emitByte(0xB1);
1779         emitOperandHelper(reg, adr, 0);
1780     }
1781 
1782     public final void cvtsi2sdl(Register dst, Register src) {
1783         SSEOp.CVTSI2SD.emit(this, DWORD, dst, src);
1784     }
1785 
1786     public final void cvttsd2sil(Register dst, Register src) {
1787         SSEOp.CVTTSD2SI.emit(this, DWORD, dst, src);
1788     }
1789 
1790     public final void decl(AMD64Address dst) {
1791         prefix(dst);
1792         emitByte(0xFF);
1793         emitOperandHelper(1, dst, 0);
1794     }
1795 
1796     public final void divsd(Register dst, Register src) {
1797         SSEOp.DIV.emit(this, SD, dst, src);
1798     }
1799 
1800     public final void hlt() {
1801         emitByte(0xF4);
1802     }
1803 
1804     public final void imull(Register dst, Register src, int value) {
1805         if (isByte(value)) {
1806             AMD64RMIOp.IMUL_SX.emit(this, DWORD, dst, src, value);
1807         } else {
1808             AMD64RMIOp.IMUL.emit(this, DWORD, dst, src, value);
1809         }
1810     }
1811 
1812     public final void incl(AMD64Address dst) {
1813         prefix(dst);
1814         emitByte(0xFF);
1815         emitOperandHelper(0, dst, 0);
1816     }
1817 
1818     public void jcc(ConditionFlag cc, int jumpTarget, boolean forceDisp32) {
1819         int shortSize = 2;
1820         int longSize = 6;
1821         long disp = jumpTarget - position();
1822         if (!forceDisp32 && isByte(disp - shortSize)) {
1823             // 0111 tttn #8-bit disp
1824             emitByte(0x70 | cc.getValue());
1825             emitByte((int) ((disp - shortSize) & 0xFF));
1826         } else {
1827             // 0000 1111 1000 tttn #32-bit disp
1828             assert isInt(disp - longSize) : "must be 32bit offset (call4)";
1829             emitByte(0x0F);
1830             emitByte(0x80 | cc.getValue());
1831             emitInt((int) (disp - longSize));
1832         }
1833     }
1834 
1835     public final void jcc(ConditionFlag cc, Label l) {
1836         assert (0 <= cc.getValue()) && (cc.getValue() < 16) : "illegal cc";
1837         if (l.isBound()) {
1838             jcc(cc, l.position(), false);
1839         } else {
1840             // Note: could eliminate cond. jumps to this jump if condition
1841             // is the same however, seems to be rather unlikely case.
1842             // Note: use jccb() if label to be bound is very close to get
1843             // an 8-bit displacement
1844             l.addPatchAt(position());
1845             emitByte(0x0F);
1846             emitByte(0x80 | cc.getValue());
1847             emitInt(0);
1848         }
1849 
1850     }
1851 
1852     public final void jccb(ConditionFlag cc, Label l) {
1853         if (l.isBound()) {
1854             int shortSize = 2;
1855             int entry = l.position();
1856             assert isByte(entry - (position() + shortSize)) : "Dispacement too large for a short jmp";
1857             long disp = entry - position();
1858             // 0111 tttn #8-bit disp
1859             emitByte(0x70 | cc.getValue());
1860             emitByte((int) ((disp - shortSize) & 0xFF));
1861         } else {
1862             l.addPatchAt(position());
1863             emitByte(0x70 | cc.getValue());
1864             emitByte(0);
1865         }
1866     }
1867 
1868     public final void jmp(int jumpTarget, boolean forceDisp32) {
1869         int shortSize = 2;
1870         int longSize = 5;
1871         long disp = jumpTarget - position();
1872         if (!forceDisp32 && isByte(disp - shortSize)) {
1873             emitByte(0xEB);
1874             emitByte((int) ((disp - shortSize) & 0xFF));
1875         } else {
1876             emitByte(0xE9);
1877             emitInt((int) (disp - longSize));
1878         }
1879     }
1880 
1881     @Override
1882     public final void jmp(Label l) {
1883         if (l.isBound()) {
1884             jmp(l.position(), false);
1885         } else {
1886             // By default, forward jumps are always 32-bit displacements, since
1887             // we can't yet know where the label will be bound. If you're sure that
1888             // the forward jump will not run beyond 256 bytes, use jmpb to
1889             // force an 8-bit displacement.
1890 
1891             l.addPatchAt(position());
1892             emitByte(0xE9);
1893             emitInt(0);
1894         }
1895     }
1896 
1897     public final void jmp(Register entry) {
1898         prefix(entry);
1899         emitByte(0xFF);
1900         emitModRM(4, entry);
1901     }
1902 
1903     public final void jmp(AMD64Address adr) {
1904         prefix(adr);
1905         emitByte(0xFF);
1906         emitOperandHelper(AMD64.rsp, adr, 0);
1907     }
1908 
1909     public final void jmpb(Label l) {
1910         if (l.isBound()) {
1911             int shortSize = 2;
1912             int entry = l.position();
1913             assert isByte((entry - position()) + shortSize) : "Dispacement too large for a short jmp";
1914             long offs = entry - position();
1915             emitByte(0xEB);
1916             emitByte((int) ((offs - shortSize) & 0xFF));
1917         } else {
1918 
1919             l.addPatchAt(position());
1920             emitByte(0xEB);
1921             emitByte(0);
1922         }
1923     }
1924 
1925     public final void lead(Register dst, AMD64Address src) {
1926         prefix(src, dst);
1927         emitByte(0x8D);
1928         emitOperandHelper(dst, src, 0);
1929     }
1930 
1931     public final void leaq(Register dst, AMD64Address src) {
1932         prefixq(src, dst);
1933         emitByte(0x8D);
1934         emitOperandHelper(dst, src, 0);
1935     }
1936 
1937     public final void leave() {
1938         emitByte(0xC9);
1939     }
1940 
1941     public final void lock() {
1942         emitByte(0xF0);
1943     }
1944 
1945     public final void movapd(Register dst, Register src) {
1946         assert dst.getRegisterCategory().equals(XMM) && src.getRegisterCategory().equals(XMM);
1947         simdPrefix(dst, Register.None, src, PD, P_0F, false);
1948         emitByte(0x28);
1949         emitModRM(dst, src);
1950     }
1951 
1952     public final void movaps(Register dst, Register src) {
1953         assert dst.getRegisterCategory().equals(XMM) && src.getRegisterCategory().equals(XMM);
1954         simdPrefix(dst, Register.None, src, PS, P_0F, false);
1955         emitByte(0x28);
1956         emitModRM(dst, src);
1957     }
1958 
1959     public final void movb(AMD64Address dst, int imm8) {
1960         prefix(dst);
1961         emitByte(0xC6);
1962         emitOperandHelper(0, dst, 1);
1963         emitByte(imm8);
1964     }
1965 
1966     public final void movb(AMD64Address dst, Register src) {
1967         assert src.getRegisterCategory().equals(CPU) : "must have byte register";
1968         prefixb(dst, src);
1969         emitByte(0x88);
1970         emitOperandHelper(src, dst, 0);
1971     }
1972 
1973     public final void movl(Register dst, int imm32) {
1974         movl(dst, imm32, false);
1975     }
1976 
1977     public final void movl(Register dst, int imm32, boolean annotateImm) {
1978         int insnPos = position();
1979         prefix(dst);
1980         emitByte(0xB8 + encode(dst));
1981         int immPos = position();
1982         emitInt(imm32);
1983         int nextInsnPos = position();
1984         if (annotateImm && codePatchingAnnotationConsumer != null) {
1985             codePatchingAnnotationConsumer.accept(new ImmediateOperandAnnotation(insnPos, immPos, nextInsnPos - immPos, nextInsnPos));
1986         }
1987     }
1988 
1989     public final void movl(Register dst, Register src) {
1990         prefix(dst, src);
1991         emitByte(0x8B);
1992         emitModRM(dst, src);
1993     }
1994 
1995     public final void movl(Register dst, AMD64Address src) {
1996         prefix(src, dst);
1997         emitByte(0x8B);
1998         emitOperandHelper(dst, src, 0);
1999     }
2000 
2001     /**
2002      * @param wide use 4 byte encoding for displacements that would normally fit in a byte
2003      */
2004     public final void movl(Register dst, AMD64Address src, boolean wide) {
2005         prefix(src, dst);
2006         emitByte(0x8B);
2007         emitOperandHelper(dst, src, wide, 0);
2008     }
2009 
2010     public final void movl(AMD64Address dst, int imm32) {
2011         prefix(dst);
2012         emitByte(0xC7);
2013         emitOperandHelper(0, dst, 4);
2014         emitInt(imm32);
2015     }
2016 
2017     public final void movl(AMD64Address dst, Register src) {
2018         prefix(dst, src);
2019         emitByte(0x89);
2020         emitOperandHelper(src, dst, 0);
2021     }
2022 
2023     /**
2024      * New CPUs require use of movsd and movss to avoid partial register stall when loading from
2025      * memory. But for old Opteron use movlpd instead of movsd. The selection is done in
2026      * {@link AMD64MacroAssembler#movdbl(Register, AMD64Address)} and
2027      * {@link AMD64MacroAssembler#movflt(Register, Register)}.
2028      */
2029     public final void movlpd(Register dst, AMD64Address src) {
2030         assert dst.getRegisterCategory().equals(XMM);
2031         simdPrefix(dst, dst, src, PD, P_0F, false);
2032         emitByte(0x12);
2033         emitOperandHelper(dst, src, 0);
2034     }
2035 
2036     public final void movlhps(Register dst, Register src) {
2037         assert dst.getRegisterCategory().equals(XMM) && src.getRegisterCategory().equals(XMM);
2038         simdPrefix(dst, src, src, PS, P_0F, false);
2039         emitByte(0x16);
2040         emitModRM(dst, src);
2041     }
2042 
2043     public final void movq(Register dst, AMD64Address src) {
2044         movq(dst, src, false);
2045     }
2046 
2047     public final void movq(Register dst, AMD64Address src, boolean wide) {
2048         if (dst.getRegisterCategory().equals(XMM)) {
2049             simdPrefix(dst, Register.None, src, SS, P_0F, false);
2050             emitByte(0x7E);
2051             emitOperandHelper(dst, src, wide, 0);
2052         } else {
2053             // gpr version of movq
2054             prefixq(src, dst);
2055             emitByte(0x8B);
2056             emitOperandHelper(dst, src, wide, 0);
2057         }
2058     }
2059 
2060     public final void movq(Register dst, Register src) {
2061         prefixq(dst, src);
2062         emitByte(0x8B);
2063         emitModRM(dst, src);
2064     }
2065 
2066     public final void movq(AMD64Address dst, Register src) {
2067         if (src.getRegisterCategory().equals(XMM)) {
2068             simdPrefix(src, Register.None, dst, PD, P_0F, true);
2069             emitByte(0xD6);
2070             emitOperandHelper(src, dst, 0);
2071         } else {
2072             // gpr version of movq
2073             prefixq(dst, src);
2074             emitByte(0x89);
2075             emitOperandHelper(src, dst, 0);
2076         }
2077     }
2078 
2079     public final void movsbl(Register dst, AMD64Address src) {
2080         prefix(src, dst);
2081         emitByte(0x0F);
2082         emitByte(0xBE);
2083         emitOperandHelper(dst, src, 0);
2084     }
2085 
2086     public final void movsbl(Register dst, Register src) {
2087         prefix(dst, false, src, true);
2088         emitByte(0x0F);
2089         emitByte(0xBE);
2090         emitModRM(dst, src);
2091     }
2092 
2093     public final void movsbq(Register dst, AMD64Address src) {
2094         prefixq(src, dst);
2095         emitByte(0x0F);
2096         emitByte(0xBE);
2097         emitOperandHelper(dst, src, 0);
2098     }
2099 
2100     public final void movsbq(Register dst, Register src) {
2101         prefixq(dst, src);
2102         emitByte(0x0F);
2103         emitByte(0xBE);
2104         emitModRM(dst, src);
2105     }
2106 
2107     public final void movsd(Register dst, Register src) {
2108         AMD64RMOp.MOVSD.emit(this, SD, dst, src);
2109     }
2110 
2111     public final void movsd(Register dst, AMD64Address src) {
2112         AMD64RMOp.MOVSD.emit(this, SD, dst, src);
2113     }
2114 
2115     public final void movsd(AMD64Address dst, Register src) {
2116         AMD64MROp.MOVSD.emit(this, SD, dst, src);
2117     }
2118 
2119     public final void movss(Register dst, Register src) {
2120         AMD64RMOp.MOVSS.emit(this, SS, dst, src);
2121     }
2122 
2123     public final void movss(Register dst, AMD64Address src) {
2124         AMD64RMOp.MOVSS.emit(this, SS, dst, src);
2125     }
2126 
2127     public final void movss(AMD64Address dst, Register src) {
2128         AMD64MROp.MOVSS.emit(this, SS, dst, src);
2129     }
2130 
2131     public final void mulpd(Register dst, Register src) {
2132         SSEOp.MUL.emit(this, PD, dst, src);
2133     }
2134 
2135     public final void mulpd(Register dst, AMD64Address src) {
2136         SSEOp.MUL.emit(this, PD, dst, src);
2137     }
2138 
2139     public final void mulsd(Register dst, Register src) {
2140         SSEOp.MUL.emit(this, SD, dst, src);
2141     }
2142 
2143     public final void mulsd(Register dst, AMD64Address src) {
2144         SSEOp.MUL.emit(this, SD, dst, src);
2145     }
2146 
2147     public final void mulss(Register dst, Register src) {
2148         SSEOp.MUL.emit(this, SS, dst, src);
2149     }
2150 
2151     public final void movswl(Register dst, AMD64Address src) {
2152         prefix(src, dst);
2153         emitByte(0x0F);
2154         emitByte(0xBF);
2155         emitOperandHelper(dst, src, 0);
2156     }
2157 
2158     public final void movw(AMD64Address dst, int imm16) {
2159         emitByte(0x66); // switch to 16-bit mode
2160         prefix(dst);
2161         emitByte(0xC7);
2162         emitOperandHelper(0, dst, 2);
2163         emitShort(imm16);
2164     }
2165 
2166     public final void movw(AMD64Address dst, Register src) {
2167         emitByte(0x66);
2168         prefix(dst, src);
2169         emitByte(0x89);
2170         emitOperandHelper(src, dst, 0);
2171     }
2172 
2173     public final void movzbl(Register dst, AMD64Address src) {
2174         prefix(src, dst);
2175         emitByte(0x0F);
2176         emitByte(0xB6);
2177         emitOperandHelper(dst, src, 0);
2178     }
2179 
2180     public final void movzbl(Register dst, Register src) {
2181         AMD64RMOp.MOVZXB.emit(this, DWORD, dst, src);
2182     }
2183 
2184     public final void movzbq(Register dst, Register src) {
2185         AMD64RMOp.MOVZXB.emit(this, QWORD, dst, src);
2186     }
2187 
2188     public final void movzwl(Register dst, AMD64Address src) {
2189         prefix(src, dst);
2190         emitByte(0x0F);
2191         emitByte(0xB7);
2192         emitOperandHelper(dst, src, 0);
2193     }
2194 
2195     public final void negl(Register dst) {
2196         NEG.emit(this, DWORD, dst);
2197     }
2198 
2199     public final void notl(Register dst) {
2200         NOT.emit(this, DWORD, dst);
2201     }
2202 
2203     public final void notq(Register dst) {
2204         NOT.emit(this, QWORD, dst);
2205     }
2206 
2207     @Override
2208     public final void ensureUniquePC() {
2209         nop();
2210     }
2211 
2212     public final void nop() {
2213         nop(1);
2214     }
2215 
2216     public void nop(int count) {
2217         int i = count;
2218         if (UseNormalNop) {
2219             assert i > 0 : " ";
2220             // The fancy nops aren't currently recognized by debuggers making it a
2221             // pain to disassemble code while debugging. If assert are on clearly
2222             // speed is not an issue so simply use the single byte traditional nop
2223             // to do alignment.
2224 
2225             for (; i > 0; i--) {
2226                 emitByte(0x90);
2227             }
2228             return;
2229         }
2230 
2231         if (UseAddressNop) {
2232             //
2233             // Using multi-bytes nops "0x0F 0x1F [Address]" for AMD.
2234             // 1: 0x90
2235             // 2: 0x66 0x90
2236             // 3: 0x66 0x66 0x90 (don't use "0x0F 0x1F 0x00" - need patching safe padding)
2237             // 4: 0x0F 0x1F 0x40 0x00
2238             // 5: 0x0F 0x1F 0x44 0x00 0x00
2239             // 6: 0x66 0x0F 0x1F 0x44 0x00 0x00
2240             // 7: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
2241             // 8: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
2242             // 9: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
2243             // 10: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
2244             // 11: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
2245 
2246             // The rest coding is AMD specific - use consecutive Address nops
2247 
2248             // 12: 0x66 0x0F 0x1F 0x44 0x00 0x00 0x66 0x0F 0x1F 0x44 0x00 0x00
2249             // 13: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00 0x66 0x0F 0x1F 0x44 0x00 0x00
2250             // 14: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
2251             // 15: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
2252             // 16: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
2253             // Size prefixes (0x66) are added for larger sizes
2254 
2255             while (i >= 22) {
2256                 i -= 11;
2257                 emitByte(0x66); // size prefix
2258                 emitByte(0x66); // size prefix
2259                 emitByte(0x66); // size prefix
2260                 addrNop8();
2261             }
2262             // Generate first nop for size between 21-12
2263             switch (i) {
2264                 case 21:
2265                     i -= 11;
2266                     emitByte(0x66); // size prefix
2267                     emitByte(0x66); // size prefix
2268                     emitByte(0x66); // size prefix
2269                     addrNop8();
2270                     break;
2271                 case 20:
2272                 case 19:
2273                     i -= 10;
2274                     emitByte(0x66); // size prefix
2275                     emitByte(0x66); // size prefix
2276                     addrNop8();
2277                     break;
2278                 case 18:
2279                 case 17:
2280                     i -= 9;
2281                     emitByte(0x66); // size prefix
2282                     addrNop8();
2283                     break;
2284                 case 16:
2285                 case 15:
2286                     i -= 8;
2287                     addrNop8();
2288                     break;
2289                 case 14:
2290                 case 13:
2291                     i -= 7;
2292                     addrNop7();
2293                     break;
2294                 case 12:
2295                     i -= 6;
2296                     emitByte(0x66); // size prefix
2297                     addrNop5();
2298                     break;
2299                 default:
2300                     assert i < 12;
2301             }
2302 
2303             // Generate second nop for size between 11-1
2304             switch (i) {
2305                 case 11:
2306                     emitByte(0x66); // size prefix
2307                     emitByte(0x66); // size prefix
2308                     emitByte(0x66); // size prefix
2309                     addrNop8();
2310                     break;
2311                 case 10:
2312                     emitByte(0x66); // size prefix
2313                     emitByte(0x66); // size prefix
2314                     addrNop8();
2315                     break;
2316                 case 9:
2317                     emitByte(0x66); // size prefix
2318                     addrNop8();
2319                     break;
2320                 case 8:
2321                     addrNop8();
2322                     break;
2323                 case 7:
2324                     addrNop7();
2325                     break;
2326                 case 6:
2327                     emitByte(0x66); // size prefix
2328                     addrNop5();
2329                     break;
2330                 case 5:
2331                     addrNop5();
2332                     break;
2333                 case 4:
2334                     addrNop4();
2335                     break;
2336                 case 3:
2337                     // Don't use "0x0F 0x1F 0x00" - need patching safe padding
2338                     emitByte(0x66); // size prefix
2339                     emitByte(0x66); // size prefix
2340                     emitByte(0x90); // nop
2341                     break;
2342                 case 2:
2343                     emitByte(0x66); // size prefix
2344                     emitByte(0x90); // nop
2345                     break;
2346                 case 1:
2347                     emitByte(0x90); // nop
2348                     break;
2349                 default:
2350                     assert i == 0;
2351             }
2352             return;
2353         }
2354 
2355         // Using nops with size prefixes "0x66 0x90".
2356         // From AMD Optimization Guide:
2357         // 1: 0x90
2358         // 2: 0x66 0x90
2359         // 3: 0x66 0x66 0x90
2360         // 4: 0x66 0x66 0x66 0x90
2361         // 5: 0x66 0x66 0x90 0x66 0x90
2362         // 6: 0x66 0x66 0x90 0x66 0x66 0x90
2363         // 7: 0x66 0x66 0x66 0x90 0x66 0x66 0x90
2364         // 8: 0x66 0x66 0x66 0x90 0x66 0x66 0x66 0x90
2365         // 9: 0x66 0x66 0x90 0x66 0x66 0x90 0x66 0x66 0x90
2366         // 10: 0x66 0x66 0x66 0x90 0x66 0x66 0x90 0x66 0x66 0x90
2367         //
2368         while (i > 12) {
2369             i -= 4;
2370             emitByte(0x66); // size prefix
2371             emitByte(0x66);
2372             emitByte(0x66);
2373             emitByte(0x90); // nop
2374         }
2375         // 1 - 12 nops
2376         if (i > 8) {
2377             if (i > 9) {
2378                 i -= 1;
2379                 emitByte(0x66);
2380             }
2381             i -= 3;
2382             emitByte(0x66);
2383             emitByte(0x66);
2384             emitByte(0x90);
2385         }
2386         // 1 - 8 nops
2387         if (i > 4) {
2388             if (i > 6) {
2389                 i -= 1;
2390                 emitByte(0x66);
2391             }
2392             i -= 3;
2393             emitByte(0x66);
2394             emitByte(0x66);
2395             emitByte(0x90);
2396         }
2397         switch (i) {
2398             case 4:
2399                 emitByte(0x66);
2400                 emitByte(0x66);
2401                 emitByte(0x66);
2402                 emitByte(0x90);
2403                 break;
2404             case 3:
2405                 emitByte(0x66);
2406                 emitByte(0x66);
2407                 emitByte(0x90);
2408                 break;
2409             case 2:
2410                 emitByte(0x66);
2411                 emitByte(0x90);
2412                 break;
2413             case 1:
2414                 emitByte(0x90);
2415                 break;
2416             default:
2417                 assert i == 0;
2418         }
2419     }
2420 
2421     public final void orl(Register dst, Register src) {
2422         OR.rmOp.emit(this, DWORD, dst, src);
2423     }
2424 
2425     public final void orl(Register dst, int imm32) {
2426         OR.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
2427     }
2428 
2429     public final void pop(Register dst) {
2430         prefix(dst);
2431         emitByte(0x58 + encode(dst));
2432     }
2433 
2434     public void popfq() {
2435         emitByte(0x9D);
2436     }
2437 
2438     public final void ptest(Register dst, Register src) {
2439         assert supports(CPUFeature.SSE4_1);
2440         assert dst.getRegisterCategory().equals(XMM) && src.getRegisterCategory().equals(XMM);
2441         simdPrefix(dst, Register.None, src, PD, P_0F38, false);
2442         emitByte(0x17);
2443         emitModRM(dst, src);
2444     }
2445 
2446     public final void pcmpeqb(Register dst, Register src) {
2447         assert supports(CPUFeature.SSE2);
2448         assert dst.getRegisterCategory().equals(XMM) && src.getRegisterCategory().equals(XMM);
2449         simdPrefix(dst, dst, src, PD, P_0F, false);
2450         emitByte(0x74);
2451         emitModRM(dst, src);
2452     }
2453 
2454     public final void pcmpeqw(Register dst, Register src) {
2455         assert supports(CPUFeature.SSE2);
2456         assert dst.getRegisterCategory().equals(XMM) && src.getRegisterCategory().equals(XMM);
2457         simdPrefix(dst, dst, src, PD, P_0F, false);
2458         emitByte(0x75);
2459         emitModRM(dst, src);
2460     }
2461 
2462     public final void pcmpestri(Register dst, AMD64Address src, int imm8) {
2463         assert supports(CPUFeature.SSE4_2);
2464         assert dst.getRegisterCategory().equals(XMM);
2465         simdPrefix(dst, Register.None, src, PD, P_0F3A, false);
2466         emitByte(0x61);
2467         emitOperandHelper(dst, src, 0);
2468         emitByte(imm8);
2469     }
2470 
2471     public final void pcmpestri(Register dst, Register src, int imm8) {
2472         assert supports(CPUFeature.SSE4_2);
2473         assert dst.getRegisterCategory().equals(XMM) && src.getRegisterCategory().equals(XMM);
2474         simdPrefix(dst, Register.None, src, PD, P_0F3A, false);
2475         emitByte(0x61);
2476         emitModRM(dst, src);
2477         emitByte(imm8);
2478     }
2479 
2480     public final void pmovmskb(Register dst, Register src) {
2481         assert supports(CPUFeature.SSE2);
2482         assert dst.getRegisterCategory().equals(CPU) && src.getRegisterCategory().equals(XMM);
2483         simdPrefix(dst, Register.None, src, PD, P_0F, false);
2484         emitByte(0xD7);
2485         emitModRM(dst, src);
2486     }
2487 
2488     public final void pmovzxbw(Register dst, AMD64Address src) {
2489         assert supports(CPUFeature.SSE4_2);
2490         assert dst.getRegisterCategory().equals(XMM);
2491         // XXX legacy_mode should be: _legacy_mode_bw
2492         simdPrefix(dst, Register.None, src, PD, P_0F38, false);
2493         emitByte(0x30);
2494         emitOperandHelper(dst, src, 0);
2495     }
2496 
2497     public final void push(Register src) {
2498         prefix(src);
2499         emitByte(0x50 + encode(src));
2500     }
2501 
2502     public void pushfq() {
2503         emitByte(0x9c);
2504     }
2505 
2506     public final void paddd(Register dst, Register src) {
2507         assert dst.getRegisterCategory().equals(XMM) && src.getRegisterCategory().equals(XMM);
2508         simdPrefix(dst, dst, src, PD, P_0F, false);
2509         emitByte(0xFE);
2510         emitModRM(dst, src);
2511     }
2512 
2513     public final void paddq(Register dst, Register src) {
2514         assert dst.getRegisterCategory().equals(XMM) && src.getRegisterCategory().equals(XMM);
2515         simdPrefix(dst, dst, src, PD, P_0F, false);
2516         emitByte(0xD4);
2517         emitModRM(dst, src);
2518     }
2519 
2520     public final void pextrw(Register dst, Register src, int imm8) {
2521         assert dst.getRegisterCategory().equals(CPU) && src.getRegisterCategory().equals(XMM);
2522         simdPrefix(dst, Register.None, src, PD, P_0F, false);
2523         emitByte(0xC5);
2524         emitModRM(dst, src);
2525         emitByte(imm8);
2526     }
2527 
2528     public final void pinsrw(Register dst, Register src, int imm8) {
2529         assert dst.getRegisterCategory().equals(XMM) && src.getRegisterCategory().equals(CPU);
2530         simdPrefix(dst, dst, src, PD, P_0F, false);
2531         emitByte(0xC4);
2532         emitModRM(dst, src);
2533         emitByte(imm8);
2534     }
2535 
2536     public final void por(Register dst, Register src) {
2537         assert dst.getRegisterCategory().equals(XMM) && src.getRegisterCategory().equals(XMM);
2538         simdPrefix(dst, dst, src, PD, P_0F, false);
2539         emitByte(0xEB);
2540         emitModRM(dst, src);
2541     }
2542 
2543     public final void pand(Register dst, Register src) {
2544         assert dst.getRegisterCategory().equals(XMM) && src.getRegisterCategory().equals(XMM);
2545         simdPrefix(dst, dst, src, PD, P_0F, false);
2546         emitByte(0xDB);
2547         emitModRM(dst, src);
2548     }
2549 
2550     public final void pxor(Register dst, Register src) {
2551         assert dst.getRegisterCategory().equals(XMM) && src.getRegisterCategory().equals(XMM);
2552         simdPrefix(dst, dst, src, PD, P_0F, false);
2553         emitByte(0xEF);
2554         emitModRM(dst, src);
2555     }
2556 
2557     public final void pslld(Register dst, int imm8) {
2558         assert isUByte(imm8) : "invalid value";
2559         assert dst.getRegisterCategory().equals(XMM);
2560         // XMM6 is for /6 encoding: 66 0F 72 /6 ib
2561         simdPrefix(AMD64.xmm6, dst, dst, PD, P_0F, false);
2562         emitByte(0x72);
2563         emitModRM(6, dst);
2564         emitByte(imm8 & 0xFF);
2565     }
2566 
2567     public final void psllq(Register dst, Register shift) {
2568         assert dst.getRegisterCategory().equals(XMM) && shift.getRegisterCategory().equals(XMM);
2569         simdPrefix(dst, dst, shift, PD, P_0F, false);
2570         emitByte(0xF3);
2571         emitModRM(dst, shift);
2572     }
2573 
2574     public final void psllq(Register dst, int imm8) {
2575         assert isUByte(imm8) : "invalid value";
2576         assert dst.getRegisterCategory().equals(XMM);
2577         // XMM6 is for /6 encoding: 66 0F 73 /6 ib
2578         simdPrefix(AMD64.xmm6, dst, dst, PD, P_0F, false);
2579         emitByte(0x73);
2580         emitModRM(6, dst);
2581         emitByte(imm8);
2582     }
2583 
2584     public final void psrad(Register dst, int imm8) {
2585         assert isUByte(imm8) : "invalid value";
2586         assert dst.getRegisterCategory().equals(XMM);
2587         // XMM4 is for /4 encoding: 66 0F 72 /4 ib
2588         simdPrefix(AMD64.xmm4, dst, dst, PD, P_0F, false);
2589         emitByte(0x72);
2590         emitModRM(4, dst);
2591         emitByte(imm8);
2592     }
2593 
2594     public final void psrld(Register dst, int imm8) {
2595         assert isUByte(imm8) : "invalid value";
2596         assert dst.getRegisterCategory().equals(XMM);
2597         // XMM2 is for /2 encoding: 66 0F 72 /2 ib
2598         simdPrefix(AMD64.xmm2, dst, dst, PD, P_0F, false);
2599         emitByte(0x72);
2600         emitModRM(2, dst);
2601         emitByte(imm8);
2602     }
2603 
2604     public final void psrlq(Register dst, int imm8) {
2605         assert isUByte(imm8) : "invalid value";
2606         assert dst.getRegisterCategory().equals(XMM);
2607         // XMM2 is for /2 encoding: 66 0F 73 /2 ib
2608         simdPrefix(AMD64.xmm2, dst, dst, PD, P_0F, false);
2609         emitByte(0x73);
2610         emitModRM(2, dst);
2611         emitByte(imm8);
2612     }
2613 
2614     public final void psrldq(Register dst, int imm8) {
2615         assert isUByte(imm8) : "invalid value";
2616         assert dst.getRegisterCategory().equals(XMM);
2617         simdPrefix(AMD64.xmm3, dst, dst, PD, P_0F, false);
2618         emitByte(0x73);
2619         emitModRM(3, dst);
2620         emitByte(imm8);
2621     }
2622 
2623     public final void pshufb(Register dst, Register src) {
2624         assert supports(CPUFeature.SSSE3);
2625         assert dst.getRegisterCategory().equals(XMM) && src.getRegisterCategory().equals(XMM);
2626         simdPrefix(dst, dst, src, PD, P_0F38, false);
2627         emitByte(0x00);
2628         emitModRM(dst, src);
2629     }
2630 
2631     public final void pshuflw(Register dst, Register src, int imm8) {
2632         assert supports(CPUFeature.SSE2);
2633         assert isUByte(imm8) : "invalid value";
2634         assert dst.getRegisterCategory().equals(XMM) && src.getRegisterCategory().equals(XMM);
2635         simdPrefix(dst, Register.None, src, SD, P_0F, false);
2636         emitByte(0x70);
2637         emitModRM(dst, src);
2638         emitByte(imm8);
2639     }
2640 
2641     public final void pshufd(Register dst, Register src, int imm8) {
2642         assert isUByte(imm8) : "invalid value";
2643         assert dst.getRegisterCategory().equals(XMM) && src.getRegisterCategory().equals(XMM);
2644         simdPrefix(dst, Register.None, src, PD, P_0F, false);
2645         emitByte(0x70);
2646         emitModRM(dst, src);
2647         emitByte(imm8);
2648     }
2649 
2650     public final void psubd(Register dst, Register src) {
2651         assert dst.getRegisterCategory().equals(XMM) && src.getRegisterCategory().equals(XMM);
2652         simdPrefix(dst, dst, src, PD, P_0F, false);
2653         emitByte(0xFA);
2654         emitModRM(dst, src);
2655     }
2656 
2657     public final void rcpps(Register dst, Register src) {
2658         assert dst.getRegisterCategory().equals(XMM) && src.getRegisterCategory().equals(XMM);
2659         simdPrefix(dst, Register.None, src, PS, P_0F, false);
2660         emitByte(0x53);
2661         emitModRM(dst, src);
2662     }
2663 
2664     public final void ret(int imm16) {
2665         if (imm16 == 0) {
2666             emitByte(0xC3);
2667         } else {
2668             emitByte(0xC2);
2669             emitShort(imm16);
2670         }
2671     }
2672 
2673     public final void sarl(Register dst, int imm8) {
2674         prefix(dst);
2675         assert isShiftCount(imm8 >> 1) : "illegal shift count";
2676         if (imm8 == 1) {
2677             emitByte(0xD1);
2678             emitModRM(7, dst);
2679         } else {
2680             emitByte(0xC1);
2681             emitModRM(7, dst);
2682             emitByte(imm8);
2683         }
2684     }
2685 
2686     public final void shll(Register dst, int imm8) {
2687         assert isShiftCount(imm8 >> 1) : "illegal shift count";
2688         prefix(dst);
2689         if (imm8 == 1) {
2690             emitByte(0xD1);
2691             emitModRM(4, dst);
2692         } else {
2693             emitByte(0xC1);
2694             emitModRM(4, dst);
2695             emitByte(imm8);
2696         }
2697     }
2698 
2699     public final void shll(Register dst) {
2700         // Multiply dst by 2, CL times.
2701         prefix(dst);
2702         emitByte(0xD3);
2703         emitModRM(4, dst);
2704     }
2705 
2706     public final void shrl(Register dst, int imm8) {
2707         assert isShiftCount(imm8 >> 1) : "illegal shift count";
2708         prefix(dst);
2709         emitByte(0xC1);
2710         emitModRM(5, dst);
2711         emitByte(imm8);
2712     }
2713 
2714     public final void shrl(Register dst) {
2715         // Unsigned divide dst by 2, CL times.
2716         prefix(dst);
2717         emitByte(0xD3);
2718         emitModRM(5, dst);
2719     }
2720 
2721     public final void subl(AMD64Address dst, int imm32) {
2722         SUB.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
2723     }
2724 
2725     public final void subl(Register dst, int imm32) {
2726         SUB.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
2727     }
2728 
2729     public final void subl(Register dst, Register src) {
2730         SUB.rmOp.emit(this, DWORD, dst, src);
2731     }
2732 
2733     public final void subpd(Register dst, Register src) {
2734         SSEOp.SUB.emit(this, PD, dst, src);
2735     }
2736 
2737     public final void subsd(Register dst, Register src) {
2738         SSEOp.SUB.emit(this, SD, dst, src);
2739     }
2740 
2741     public final void subsd(Register dst, AMD64Address src) {
2742         SSEOp.SUB.emit(this, SD, dst, src);
2743     }
2744 
2745     public final void testl(Register dst, int imm32) {
2746         // not using emitArith because test
2747         // doesn't support sign-extension of
2748         // 8bit operands
2749         if (dst.encoding == 0) {
2750             emitByte(0xA9);
2751         } else {
2752             prefix(dst);
2753             emitByte(0xF7);
2754             emitModRM(0, dst);
2755         }
2756         emitInt(imm32);
2757     }
2758 
2759     public final void testl(Register dst, Register src) {
2760         prefix(dst, src);
2761         emitByte(0x85);
2762         emitModRM(dst, src);
2763     }
2764 
2765     public final void testl(Register dst, AMD64Address src) {
2766         prefix(src, dst);
2767         emitByte(0x85);
2768         emitOperandHelper(dst, src, 0);
2769     }
2770 
2771     public final void unpckhpd(Register dst, Register src) {
2772         assert dst.getRegisterCategory().equals(XMM) && src.getRegisterCategory().equals(XMM);
2773         simdPrefix(dst, dst, src, PD, P_0F, false);
2774         emitByte(0x15);
2775         emitModRM(dst, src);
2776     }
2777 
2778     public final void unpcklpd(Register dst, Register src) {
2779         assert dst.getRegisterCategory().equals(XMM) && src.getRegisterCategory().equals(XMM);
2780         simdPrefix(dst, dst, src, PD, P_0F, false);
2781         emitByte(0x14);
2782         emitModRM(dst, src);
2783     }
2784 
2785     public final void xorl(Register dst, Register src) {
2786         XOR.rmOp.emit(this, DWORD, dst, src);
2787     }
2788 
2789     public final void xorpd(Register dst, Register src) {
2790         SSEOp.XOR.emit(this, PD, dst, src);
2791     }
2792 
2793     public final void xorps(Register dst, Register src) {
2794         SSEOp.XOR.emit(this, PS, dst, src);
2795     }
2796 
2797     protected final void decl(Register dst) {
2798         // Use two-byte form (one-byte form is a REX prefix in 64-bit mode)
2799         prefix(dst);
2800         emitByte(0xFF);
2801         emitModRM(1, dst);
2802     }
2803 
2804     protected final void incl(Register dst) {
2805         // Use two-byte form (one-byte from is a REX prefix in 64-bit mode)
2806         prefix(dst);
2807         emitByte(0xFF);
2808         emitModRM(0, dst);
2809     }
2810 
2811     public final void addq(Register dst, int imm32) {
2812         ADD.getMIOpcode(QWORD, isByte(imm32)).emit(this, QWORD, dst, imm32);
2813     }
2814 
2815     public final void addq(AMD64Address dst, int imm32) {
2816         ADD.getMIOpcode(QWORD, isByte(imm32)).emit(this, QWORD, dst, imm32);
2817     }
2818 
2819     public final void addq(Register dst, Register src) {
2820         ADD.rmOp.emit(this, QWORD, dst, src);
2821     }
2822 
2823     public final void addq(AMD64Address dst, Register src) {
2824         ADD.mrOp.emit(this, QWORD, dst, src);
2825     }
2826 
2827     public final void andq(Register dst, int imm32) {
2828         AND.getMIOpcode(QWORD, isByte(imm32)).emit(this, QWORD, dst, imm32);
2829     }
2830 
2831     public final void bsrq(Register dst, Register src) {
2832         prefixq(dst, src);
2833         emitByte(0x0F);
2834         emitByte(0xBD);
2835         emitModRM(dst, src);
2836     }
2837 
2838     public final void bswapq(Register reg) {
2839         prefixq(reg);
2840         emitByte(0x0F);
2841         emitByte(0xC8 + encode(reg));
2842     }
2843 
2844     public final void cdqq() {
2845         rexw();
2846         emitByte(0x99);
2847     }
2848 
2849     public final void cmovq(ConditionFlag cc, Register dst, Register src) {
2850         prefixq(dst, src);
2851         emitByte(0x0F);
2852         emitByte(0x40 | cc.getValue());
2853         emitModRM(dst, src);
2854     }
2855 
2856     public final void setb(ConditionFlag cc, Register dst) {
2857         prefix(dst, true);
2858         emitByte(0x0F);
2859         emitByte(0x90 | cc.getValue());
2860         emitModRM(0, dst);
2861     }
2862 
2863     public final void cmovq(ConditionFlag cc, Register dst, AMD64Address src) {
2864         prefixq(src, dst);
2865         emitByte(0x0F);
2866         emitByte(0x40 | cc.getValue());
2867         emitOperandHelper(dst, src, 0);
2868     }
2869 
2870     public final void cmpq(Register dst, int imm32) {
2871         CMP.getMIOpcode(QWORD, isByte(imm32)).emit(this, QWORD, dst, imm32);
2872     }
2873 
2874     public final void cmpq(Register dst, Register src) {
2875         CMP.rmOp.emit(this, QWORD, dst, src);
2876     }
2877 
2878     public final void cmpq(Register dst, AMD64Address src) {
2879         CMP.rmOp.emit(this, QWORD, dst, src);
2880     }
2881 
2882     public final void cmpxchgq(Register reg, AMD64Address adr) {
2883         prefixq(adr, reg);
2884         emitByte(0x0F);
2885         emitByte(0xB1);
2886         emitOperandHelper(reg, adr, 0);
2887     }
2888 
2889     public final void cvtdq2pd(Register dst, Register src) {
2890         assert dst.getRegisterCategory().equals(XMM) && src.getRegisterCategory().equals(XMM);
2891         simdPrefix(dst, Register.None, src, SS, P_0F, false);
2892         emitByte(0xE6);
2893         emitModRM(dst, src);
2894     }
2895 
2896     public final void cvtsi2sdq(Register dst, Register src) {
2897         SSEOp.CVTSI2SD.emit(this, QWORD, dst, src);
2898     }
2899 
2900     public final void cvttsd2siq(Register dst, Register src) {
2901         SSEOp.CVTTSD2SI.emit(this, QWORD, dst, src);
2902     }
2903 
2904     public final void cvttpd2dq(Register dst, Register src) {
2905         assert dst.getRegisterCategory().equals(XMM) && src.getRegisterCategory().equals(XMM);
2906         simdPrefix(dst, Register.None, src, PD, P_0F, false);
2907         emitByte(0xE6);
2908         emitModRM(dst, src);
2909     }
2910 
2911     public final void decq(Register dst) {
2912         // Use two-byte form (one-byte from is a REX prefix in 64-bit mode)
2913         prefixq(dst);
2914         emitByte(0xFF);
2915         emitModRM(1, dst);
2916     }
2917 
2918     public final void decq(AMD64Address dst) {
2919         DEC.emit(this, QWORD, dst);
2920     }
2921 
2922     public final void imulq(Register dst, Register src) {
2923         prefixq(dst, src);
2924         emitByte(0x0F);
2925         emitByte(0xAF);
2926         emitModRM(dst, src);
2927     }
2928 
2929     public final void incq(Register dst) {
2930         // Don't use it directly. Use Macroincrementq() instead.
2931         // Use two-byte form (one-byte from is a REX prefix in 64-bit mode)
2932         prefixq(dst);
2933         emitByte(0xFF);
2934         emitModRM(0, dst);
2935     }
2936 
2937     public final void incq(AMD64Address dst) {
2938         INC.emit(this, QWORD, dst);
2939     }
2940 
2941     public final void movq(Register dst, long imm64) {
2942         movq(dst, imm64, false);
2943     }
2944 
2945     public final void movq(Register dst, long imm64, boolean annotateImm) {
2946         int insnPos = position();
2947         prefixq(dst);
2948         emitByte(0xB8 + encode(dst));
2949         int immPos = position();
2950         emitLong(imm64);
2951         int nextInsnPos = position();
2952         if (annotateImm && codePatchingAnnotationConsumer != null) {
2953             codePatchingAnnotationConsumer.accept(new ImmediateOperandAnnotation(insnPos, immPos, nextInsnPos - immPos, nextInsnPos));
2954         }
2955     }
2956 
2957     public final void movslq(Register dst, int imm32) {
2958         prefixq(dst);
2959         emitByte(0xC7);
2960         emitModRM(0, dst);
2961         emitInt(imm32);
2962     }
2963 
2964     public final void movdq(Register dst, AMD64Address src) {
2965         AMD64RMOp.MOVQ.emit(this, QWORD, dst, src);
2966     }
2967 
2968     public final void movdq(AMD64Address dst, Register src) {
2969         AMD64MROp.MOVQ.emit(this, QWORD, dst, src);
2970     }
2971 
2972     public final void movdq(Register dst, Register src) {
2973         if (dst.getRegisterCategory().equals(XMM) && src.getRegisterCategory().equals(CPU)) {
2974             AMD64RMOp.MOVQ.emit(this, QWORD, dst, src);
2975         } else if (src.getRegisterCategory().equals(XMM) && dst.getRegisterCategory().equals(CPU)) {
2976             AMD64MROp.MOVQ.emit(this, QWORD, dst, src);
2977         } else {
2978             throw new InternalError("should not reach here");
2979         }
2980     }
2981 
2982     public final void movdl(Register dst, Register src) {
2983         if (dst.getRegisterCategory().equals(XMM) && src.getRegisterCategory().equals(CPU)) {
2984             AMD64RMOp.MOVD.emit(this, DWORD, dst, src);
2985         } else if (src.getRegisterCategory().equals(XMM) && dst.getRegisterCategory().equals(CPU)) {
2986             AMD64MROp.MOVD.emit(this, DWORD, dst, src);
2987         } else {
2988             throw new InternalError("should not reach here");
2989         }
2990     }
2991 
2992     public final void movdl(Register dst, AMD64Address src) {
2993         AMD64RMOp.MOVD.emit(this, DWORD, dst, src);
2994     }
2995 
2996     public final void movddup(Register dst, Register src) {
2997         assert supports(CPUFeature.SSE3);
2998         assert dst.getRegisterCategory().equals(XMM) && src.getRegisterCategory().equals(XMM);
2999         simdPrefix(dst, Register.None, src, SD, P_0F, false);
3000         emitByte(0x12);
3001         emitModRM(dst, src);
3002     }
3003 
3004     public final void movdqu(Register dst, AMD64Address src) {
3005         assert dst.getRegisterCategory().equals(XMM);
3006         simdPrefix(dst, Register.None, src, SS, P_0F, false);
3007         emitByte(0x6F);
3008         emitOperandHelper(dst, src, 0);
3009     }
3010 
3011     public final void movdqu(Register dst, Register src) {
3012         assert dst.getRegisterCategory().equals(XMM) && src.getRegisterCategory().equals(XMM);
3013         simdPrefix(dst, Register.None, src, SS, P_0F, false);
3014         emitByte(0x6F);
3015         emitModRM(dst, src);
3016     }
3017 
3018     public final void movslq(AMD64Address dst, int imm32) {
3019         prefixq(dst);
3020         emitByte(0xC7);
3021         emitOperandHelper(0, dst, 4);
3022         emitInt(imm32);
3023     }
3024 
3025     public final void movslq(Register dst, AMD64Address src) {
3026         prefixq(src, dst);
3027         emitByte(0x63);
3028         emitOperandHelper(dst, src, 0);
3029     }
3030 
3031     public final void movslq(Register dst, Register src) {
3032         prefixq(dst, src);
3033         emitByte(0x63);
3034         emitModRM(dst, src);
3035     }
3036 
3037     public final void negq(Register dst) {
3038         prefixq(dst);
3039         emitByte(0xF7);
3040         emitModRM(3, dst);
3041     }
3042 
3043     public final void orq(Register dst, Register src) {
3044         OR.rmOp.emit(this, QWORD, dst, src);
3045     }
3046 
3047     public final void shlq(Register dst, int imm8) {
3048         assert isShiftCount(imm8 >> 1) : "illegal shift count";
3049         prefixq(dst);
3050         if (imm8 == 1) {
3051             emitByte(0xD1);
3052             emitModRM(4, dst);
3053         } else {
3054             emitByte(0xC1);
3055             emitModRM(4, dst);
3056             emitByte(imm8);
3057         }
3058     }
3059 
3060     public final void shlq(Register dst) {
3061         // Multiply dst by 2, CL times.
3062         prefixq(dst);
3063         emitByte(0xD3);
3064         emitModRM(4, dst);
3065     }
3066 
3067     public final void shrq(Register dst, int imm8) {
3068         assert isShiftCount(imm8 >> 1) : "illegal shift count";
3069         prefixq(dst);
3070         if (imm8 == 1) {
3071             emitByte(0xD1);
3072             emitModRM(5, dst);
3073         } else {
3074             emitByte(0xC1);
3075             emitModRM(5, dst);
3076             emitByte(imm8);
3077         }
3078     }
3079 
3080     public final void shrq(Register dst) {
3081         prefixq(dst);
3082         emitByte(0xD3);
3083         // Unsigned divide dst by 2, CL times.
3084         emitModRM(5, dst);
3085     }
3086 
3087     public final void sbbq(Register dst, Register src) {
3088         SBB.rmOp.emit(this, QWORD, dst, src);
3089     }
3090 
3091     public final void subq(Register dst, int imm32) {
3092         SUB.getMIOpcode(QWORD, isByte(imm32)).emit(this, QWORD, dst, imm32);
3093     }
3094 
3095     public final void subq(AMD64Address dst, int imm32) {
3096         SUB.getMIOpcode(QWORD, isByte(imm32)).emit(this, QWORD, dst, imm32);
3097     }
3098 
3099     public final void subqWide(Register dst, int imm32) {
3100         // don't use the sign-extending version, forcing a 32-bit immediate
3101         SUB.getMIOpcode(QWORD, false).emit(this, QWORD, dst, imm32);
3102     }
3103 
3104     public final void subq(Register dst, Register src) {
3105         SUB.rmOp.emit(this, QWORD, dst, src);
3106     }
3107 
3108     public final void testq(Register dst, Register src) {
3109         prefixq(dst, src);
3110         emitByte(0x85);
3111         emitModRM(dst, src);
3112     }
3113 
3114     public final void btrq(Register src, int imm8) {
3115         prefixq(src);
3116         emitByte(0x0F);
3117         emitByte(0xBA);
3118         emitModRM(6, src);
3119         emitByte(imm8);
3120     }
3121 
3122     public final void xaddb(AMD64Address dst, Register src) {
3123         prefixb(dst, src);
3124         emitByte(0x0F);
3125         emitByte(0xC0);
3126         emitOperandHelper(src, dst, 0);
3127     }
3128 
3129     public final void xaddw(AMD64Address dst, Register src) {
3130         emitByte(0x66); // Switch to 16-bit mode.
3131         prefix(dst, src);
3132         emitByte(0x0F);
3133         emitByte(0xC1);
3134         emitOperandHelper(src, dst, 0);
3135     }
3136 
3137     public final void xaddl(AMD64Address dst, Register src) {
3138         prefix(dst, src);
3139         emitByte(0x0F);
3140         emitByte(0xC1);
3141         emitOperandHelper(src, dst, 0);
3142     }
3143 
3144     public final void xaddq(AMD64Address dst, Register src) {
3145         prefixq(dst, src);
3146         emitByte(0x0F);
3147         emitByte(0xC1);
3148         emitOperandHelper(src, dst, 0);
3149     }
3150 
3151     public final void xchgb(Register dst, AMD64Address src) {
3152         prefixb(src, dst);
3153         emitByte(0x86);
3154         emitOperandHelper(dst, src, 0);
3155     }
3156 
3157     public final void xchgw(Register dst, AMD64Address src) {
3158         emitByte(0x66);
3159         prefix(src, dst);
3160         emitByte(0x87);
3161         emitOperandHelper(dst, src, 0);
3162     }
3163 
3164     public final void xchgl(Register dst, AMD64Address src) {
3165         prefix(src, dst);
3166         emitByte(0x87);
3167         emitOperandHelper(dst, src, 0);
3168     }
3169 
3170     public final void xchgq(Register dst, AMD64Address src) {
3171         prefixq(src, dst);
3172         emitByte(0x87);
3173         emitOperandHelper(dst, src, 0);
3174     }
3175 
3176     public final void membar(int barriers) {
3177         if (target.isMP) {
3178             // We only have to handle StoreLoad
3179             if ((barriers & STORE_LOAD) != 0) {
3180                 // All usable chips support "locked" instructions which suffice
3181                 // as barriers, and are much faster than the alternative of
3182                 // using cpuid instruction. We use here a locked add [rsp],0.
3183                 // This is conveniently otherwise a no-op except for blowing
3184                 // flags.
3185                 // Any change to this code may need to revisit other places in
3186                 // the code where this idiom is used, in particular the
3187                 // orderAccess code.
3188                 lock();
3189                 addl(new AMD64Address(AMD64.rsp, 0), 0); // Assert the lock# signal here
3190             }
3191         }
3192     }
3193 
3194     @Override
3195     protected final void patchJumpTarget(int branch, int branchTarget) {
3196         int op = getByte(branch);
3197         assert op == 0xE8 // call
3198                         ||
3199                         op == 0x00 // jump table entry
3200                         || op == 0xE9 // jmp
3201                         || op == 0xEB // short jmp
3202                         || (op & 0xF0) == 0x70 // short jcc
3203                         || op == 0x0F && (getByte(branch + 1) & 0xF0) == 0x80 // jcc
3204         : "Invalid opcode at patch point branch=" + branch + ", branchTarget=" + branchTarget + ", op=" + op;
3205 
3206         if (op == 0x00) {
3207             int offsetToJumpTableBase = getShort(branch + 1);
3208             int jumpTableBase = branch - offsetToJumpTableBase;
3209             int imm32 = branchTarget - jumpTableBase;
3210             emitInt(imm32, branch);
3211         } else if (op == 0xEB || (op & 0xF0) == 0x70) {
3212 
3213             // short offset operators (jmp and jcc)
3214             final int imm8 = branchTarget - (branch + 2);
3215             /*
3216              * Since a wrongly patched short branch can potentially lead to working but really bad
3217              * behaving code we should always fail with an exception instead of having an assert.
3218              */
3219             if (!NumUtil.isByte(imm8)) {
3220                 throw new InternalError("branch displacement out of range: " + imm8);
3221             }
3222             emitByte(imm8, branch + 1);
3223 
3224         } else {
3225 
3226             int off = 1;
3227             if (op == 0x0F) {
3228                 off = 2;
3229             }
3230 
3231             int imm32 = branchTarget - (branch + 4 + off);
3232             emitInt(imm32, branch + off);
3233         }
3234     }
3235 
3236     public void nullCheck(AMD64Address address) {
3237         testl(AMD64.rax, address);
3238     }
3239 
3240     @Override
3241     public void align(int modulus) {
3242         if (position() % modulus != 0) {
3243             nop(modulus - (position() % modulus));
3244         }
3245     }
3246 
3247     /**
3248      * Emits a direct call instruction. Note that the actual call target is not specified, because
3249      * all calls need patching anyway. Therefore, 0 is emitted as the call target, and the user is
3250      * responsible to add the call address to the appropriate patching tables.
3251      */
3252     public final void call() {
3253         annotatePatchingImmediate(1, 4);
3254         emitByte(0xE8);
3255         emitInt(0);
3256     }
3257 
3258     public final void call(Register src) {
3259         prefix(src);
3260         emitByte(0xFF);
3261         emitModRM(2, src);
3262     }
3263 
3264     public final void int3() {
3265         emitByte(0xCC);
3266     }
3267 
3268     public final void pause() {
3269         emitByte(0xF3);
3270         emitByte(0x90);
3271     }
3272 
3273     private void emitx87(int b1, int b2, int i) {
3274         assert 0 <= i && i < 8 : "illegal stack offset";
3275         emitByte(b1);
3276         emitByte(b2 + i);
3277     }
3278 
3279     public final void fldd(AMD64Address src) {
3280         emitByte(0xDD);
3281         emitOperandHelper(0, src, 0);
3282     }
3283 
3284     public final void flds(AMD64Address src) {
3285         emitByte(0xD9);
3286         emitOperandHelper(0, src, 0);
3287     }
3288 
3289     public final void fldln2() {
3290         emitByte(0xD9);
3291         emitByte(0xED);
3292     }
3293 
3294     public final void fldlg2() {
3295         emitByte(0xD9);
3296         emitByte(0xEC);
3297     }
3298 
3299     public final void fyl2x() {
3300         emitByte(0xD9);
3301         emitByte(0xF1);
3302     }
3303 
3304     public final void fstps(AMD64Address src) {
3305         emitByte(0xD9);
3306         emitOperandHelper(3, src, 0);
3307     }
3308 
3309     public final void fstpd(AMD64Address src) {
3310         emitByte(0xDD);
3311         emitOperandHelper(3, src, 0);
3312     }
3313 
3314     private void emitFPUArith(int b1, int b2, int i) {
3315         assert 0 <= i && i < 8 : "illegal FPU register: " + i;
3316         emitByte(b1);
3317         emitByte(b2 + i);
3318     }
3319 
3320     public void ffree(int i) {
3321         emitFPUArith(0xDD, 0xC0, i);
3322     }
3323 
3324     public void fincstp() {
3325         emitByte(0xD9);
3326         emitByte(0xF7);
3327     }
3328 
3329     public void fxch(int i) {
3330         emitFPUArith(0xD9, 0xC8, i);
3331     }
3332 
3333     public void fnstswAX() {
3334         emitByte(0xDF);
3335         emitByte(0xE0);
3336     }
3337 
3338     public void fwait() {
3339         emitByte(0x9B);
3340     }
3341 
3342     public void fprem() {
3343         emitByte(0xD9);
3344         emitByte(0xF8);
3345     }
3346 
3347     public final void fsin() {
3348         emitByte(0xD9);
3349         emitByte(0xFE);
3350     }
3351 
3352     public final void fcos() {
3353         emitByte(0xD9);
3354         emitByte(0xFF);
3355     }
3356 
3357     public final void fptan() {
3358         emitByte(0xD9);
3359         emitByte(0xF2);
3360     }
3361 
3362     public final void fstp(int i) {
3363         emitx87(0xDD, 0xD8, i);
3364     }
3365 
3366     @Override
3367     public AMD64Address makeAddress(Register base, int displacement) {
3368         return new AMD64Address(base, displacement);
3369     }
3370 
3371     @Override
3372     public AMD64Address getPlaceholder(int instructionStartPosition) {
3373         return new AMD64Address(AMD64.rip, Register.None, Scale.Times1, 0, instructionStartPosition);
3374     }
3375 
3376     private void prefetchPrefix(AMD64Address src) {
3377         prefix(src);
3378         emitByte(0x0F);
3379     }
3380 
3381     public void prefetchnta(AMD64Address src) {
3382         prefetchPrefix(src);
3383         emitByte(0x18);
3384         emitOperandHelper(0, src, 0);
3385     }
3386 
3387     void prefetchr(AMD64Address src) {
3388         assert supports(CPUFeature.AMD_3DNOW_PREFETCH);
3389         prefetchPrefix(src);
3390         emitByte(0x0D);
3391         emitOperandHelper(0, src, 0);
3392     }
3393 
3394     public void prefetcht0(AMD64Address src) {
3395         assert supports(CPUFeature.SSE);
3396         prefetchPrefix(src);
3397         emitByte(0x18);
3398         emitOperandHelper(1, src, 0);
3399     }
3400 
3401     public void prefetcht1(AMD64Address src) {
3402         assert supports(CPUFeature.SSE);
3403         prefetchPrefix(src);
3404         emitByte(0x18);
3405         emitOperandHelper(2, src, 0);
3406     }
3407 
3408     public void prefetcht2(AMD64Address src) {
3409         assert supports(CPUFeature.SSE);
3410         prefix(src);
3411         emitByte(0x0f);
3412         emitByte(0x18);
3413         emitOperandHelper(3, src, 0);
3414     }
3415 
3416     public void prefetchw(AMD64Address src) {
3417         assert supports(CPUFeature.AMD_3DNOW_PREFETCH);
3418         prefix(src);
3419         emitByte(0x0f);
3420         emitByte(0x0D);
3421         emitOperandHelper(1, src, 0);
3422     }
3423 
3424     public void rdtsc() {
3425         emitByte(0x0F);
3426         emitByte(0x31);
3427     }
3428 
3429     /**
3430      * Emits an instruction which is considered to be illegal. This is used if we deliberately want
3431      * to crash the program (debugging etc.).
3432      */
3433     public void illegal() {
3434         emitByte(0x0f);
3435         emitByte(0x0b);
3436     }
3437 
3438     public void lfence() {
3439         emitByte(0x0f);
3440         emitByte(0xae);
3441         emitByte(0xe8);
3442     }
3443 
3444     public final void vptest(Register dst, Register src) {
3445         VexRMOp.VPTEST.emit(this, AVXSize.YMM, dst, src);
3446     }
3447 
3448     public final void vpxor(Register dst, Register nds, Register src) {
3449         VexRVMOp.VPXOR.emit(this, AVXSize.YMM, dst, nds, src);
3450     }
3451 
3452     public final void vpxor(Register dst, Register nds, AMD64Address src) {
3453         VexRVMOp.VPXOR.emit(this, AVXSize.YMM, dst, nds, src);
3454     }
3455 
3456     public final void vmovdqu(Register dst, AMD64Address src) {
3457         VexMoveOp.VMOVDQU.emit(this, AVXSize.YMM, dst, src);
3458     }
3459 
3460     public final void vpmovzxbw(Register dst, AMD64Address src) {
3461         VexRMOp.VPMOVZXBW.emit(this, AVXSize.YMM, dst, src);
3462     }
3463 
3464     public final void vzeroupper() {
3465         emitVEX(L128, P_, M_0F, W0, 0, 0);
3466         emitByte(0x77);
3467     }
3468 
3469     // This instruction produces ZF or CF flags
3470     public final void kortestq(Register src1, Register src2) {
3471         assert supports(CPUFeature.AVX512BW);
3472         assert src1.getRegisterCategory().equals(MASK) && src2.getRegisterCategory().equals(MASK);
3473         vexPrefix(src1, Register.None, src2, AVXSize.XMM, P_, M_0F, W1);
3474         emitByte(0x98);
3475         emitModRM(src1, src2);
3476     }
3477 
3478     public final void kmovq(Register dst, Register src) {
3479         assert supports(CPUFeature.AVX512BW);
3480         assert dst.getRegisterCategory().equals(MASK) || dst.getRegisterCategory().equals(CPU);
3481         assert src.getRegisterCategory().equals(MASK) || src.getRegisterCategory().equals(CPU);
3482         assert !(dst.getRegisterCategory().equals(CPU) && src.getRegisterCategory().equals(CPU));
3483 
3484         if (dst.getRegisterCategory().equals(MASK)) {
3485             if (src.getRegisterCategory().equals(MASK)) {
3486                 // kmovq(KRegister dst, KRegister src)
3487                 vexPrefix(dst, Register.None, src, AVXSize.XMM, P_, M_0F, W1);
3488                 emitByte(0x90);
3489                 emitModRM(dst, src);
3490             } else {
3491                 // kmovq(KRegister dst, Register src)
3492                 vexPrefix(dst, Register.None, src, AVXSize.XMM, P_F2, M_0F, W1);
3493                 emitByte(0x92);
3494                 emitModRM(dst, src);
3495             }
3496         } else {
3497             if (src.getRegisterCategory().equals(MASK)) {
3498                 // kmovq(Register dst, KRegister src)
3499                 vexPrefix(dst, Register.None, src, AVXSize.XMM, P_F2, M_0F, W1);
3500                 emitByte(0x93);
3501                 emitModRM(dst, src);
3502             } else {
3503                 throw GraalError.shouldNotReachHere();
3504             }
3505         }
3506     }
3507 
3508     public final void evmovdqu64(Register dst, AMD64Address src) {
3509         assert supports(CPUFeature.AVX512F);
3510         assert dst.getRegisterCategory().equals(XMM);
3511         evexPrefix(dst, Register.None, Register.None, src, AVXSize.ZMM, P_F3, M_0F, W1, Z0, B0);
3512         emitByte(0x6F);
3513         emitEVEXOperandHelper(dst, src, 0, EVEXTuple.FVM.getDisp8ScalingFactor(AVXSize.ZMM));
3514     }
3515 
3516     public final void evpmovzxbw(Register dst, AMD64Address src) {
3517         assert supports(CPUFeature.AVX512BW);
3518         assert dst.getRegisterCategory().equals(XMM);
3519         evexPrefix(dst, Register.None, Register.None, src, AVXSize.ZMM, P_66, M_0F38, WIG, Z0, B0);
3520         emitByte(0x30);
3521         emitEVEXOperandHelper(dst, src, 0, EVEXTuple.HVM.getDisp8ScalingFactor(AVXSize.ZMM));
3522     }
3523 
3524     public final void evpcmpeqb(Register kdst, Register nds, AMD64Address src) {
3525         assert supports(CPUFeature.AVX512BW);
3526         assert kdst.getRegisterCategory().equals(MASK) && nds.getRegisterCategory().equals(XMM);
3527         evexPrefix(kdst, Register.None, nds, src, AVXSize.ZMM, P_66, M_0F, WIG, Z0, B0);
3528         emitByte(0x74);
3529         emitEVEXOperandHelper(kdst, src, 0, EVEXTuple.FVM.getDisp8ScalingFactor(AVXSize.ZMM));
3530     }
3531 }