1 /*
   2  * Copyright (c) 2009, 2019, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  */
  23 
  24 
  25 package org.graalvm.compiler.asm.amd64;
  26 
  27 import static jdk.vm.ci.amd64.AMD64.CPU;
  28 import static jdk.vm.ci.amd64.AMD64.MASK;
  29 import static jdk.vm.ci.amd64.AMD64.XMM;
  30 import static jdk.vm.ci.code.MemoryBarriers.STORE_LOAD;
  31 import static org.graalvm.compiler.asm.amd64.AMD64AsmOptions.UseAddressNop;
  32 import static org.graalvm.compiler.asm.amd64.AMD64AsmOptions.UseNormalNop;
  33 import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.ADD;
  34 import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.AND;
  35 import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.CMP;
  36 import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.OR;
  37 import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.SBB;
  38 import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.SUB;
  39 import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.XOR;
  40 import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64MOp.DEC;
  41 import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64MOp.INC;
  42 import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64MOp.NEG;
  43 import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64MOp.NOT;
  44 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.EVEXPrefixConfig.B0;
  45 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.EVEXPrefixConfig.Z0;
  46 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.EVEXPrefixConfig.Z1;
  47 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.BYTE;
  48 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.DWORD;
  49 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.PD;
  50 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.PS;
  51 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.QWORD;
  52 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.SD;
  53 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.SS;
  54 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.WORD;
  55 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.L128;
  56 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.L256;
  57 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.LZ;
  58 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.M_0F;
  59 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.M_0F38;
  60 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.M_0F3A;
  61 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.P_;
  62 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.P_66;
  63 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.P_F2;
  64 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.P_F3;
  65 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.W0;
  66 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.W1;
  67 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.WIG;
  68 import static org.graalvm.compiler.core.common.NumUtil.isByte;
  69 import static org.graalvm.compiler.core.common.NumUtil.isInt;
  70 import static org.graalvm.compiler.core.common.NumUtil.isShiftCount;
  71 import static org.graalvm.compiler.core.common.NumUtil.isUByte;
  72 
  73 import java.util.EnumSet;
  74 
  75 import org.graalvm.compiler.asm.Label;
  76 import org.graalvm.compiler.asm.amd64.AMD64Address.Scale;
  77 import org.graalvm.compiler.asm.amd64.AVXKind.AVXSize;
  78 import org.graalvm.compiler.core.common.calc.Condition;
  79 import org.graalvm.compiler.debug.GraalError;
  80 
  81 import jdk.vm.ci.amd64.AMD64;
  82 import jdk.vm.ci.amd64.AMD64.CPUFeature;
  83 import jdk.vm.ci.code.Register;
  84 import jdk.vm.ci.code.Register.RegisterCategory;
  85 import jdk.vm.ci.code.TargetDescription;
  86 
  87 /**
  88  * This class implements an assembler that can encode most X86 instructions.
  89  */
  90 public class AMD64Assembler extends AMD64BaseAssembler {
  91 
  92     /**
  93      * Constructs an assembler for the AMD64 architecture.
  94      */
  95     public AMD64Assembler(TargetDescription target) {
  96         super(target);
  97     }
  98 
  99     /**
 100      * The x86 condition codes used for conditional jumps/moves.
 101      */
 102     public enum ConditionFlag {
 103         Zero(0x4, "|zero|"),
 104         NotZero(0x5, "|nzero|"),
 105         Equal(0x4, "="),
 106         NotEqual(0x5, "!="),
 107         Less(0xc, "<"),
 108         LessEqual(0xe, "<="),
 109         Greater(0xf, ">"),
 110         GreaterEqual(0xd, ">="),
 111         Below(0x2, "|<|"),
 112         BelowEqual(0x6, "|<=|"),
 113         Above(0x7, "|>|"),
 114         AboveEqual(0x3, "|>=|"),
 115         Overflow(0x0, "|of|"),
 116         NoOverflow(0x1, "|nof|"),
 117         CarrySet(0x2, "|carry|"),
 118         CarryClear(0x3, "|ncarry|"),
 119         Negative(0x8, "|neg|"),
 120         Positive(0x9, "|pos|"),
 121         Parity(0xa, "|par|"),
 122         NoParity(0xb, "|npar|");
 123 
 124         private final int value;
 125         private final String operator;
 126 
 127         ConditionFlag(int value, String operator) {
 128             this.value = value;
 129             this.operator = operator;
 130         }
 131 
 132         public ConditionFlag negate() {
 133             switch (this) {
 134                 case Zero:
 135                     return NotZero;
 136                 case NotZero:
 137                     return Zero;
 138                 case Equal:
 139                     return NotEqual;
 140                 case NotEqual:
 141                     return Equal;
 142                 case Less:
 143                     return GreaterEqual;
 144                 case LessEqual:
 145                     return Greater;
 146                 case Greater:
 147                     return LessEqual;
 148                 case GreaterEqual:
 149                     return Less;
 150                 case Below:
 151                     return AboveEqual;
 152                 case BelowEqual:
 153                     return Above;
 154                 case Above:
 155                     return BelowEqual;
 156                 case AboveEqual:
 157                     return Below;
 158                 case Overflow:
 159                     return NoOverflow;
 160                 case NoOverflow:
 161                     return Overflow;
 162                 case CarrySet:
 163                     return CarryClear;
 164                 case CarryClear:
 165                     return CarrySet;
 166                 case Negative:
 167                     return Positive;
 168                 case Positive:
 169                     return Negative;
 170                 case Parity:
 171                     return NoParity;
 172                 case NoParity:
 173                     return Parity;
 174             }
 175             throw new IllegalArgumentException();
 176         }
 177 
 178         public int getValue() {
 179             return value;
 180         }
 181 
 182         @Override
 183         public String toString() {
 184             return operator;
 185         }
 186     }
 187 
 188     /**
 189      * Operand size and register type constraints.
 190      */
 191     private enum OpAssertion {
 192         ByteAssertion(CPU, CPU, BYTE),
 193         ByteOrLargerAssertion(CPU, CPU, BYTE, WORD, DWORD, QWORD),
 194         WordOrLargerAssertion(CPU, CPU, WORD, DWORD, QWORD),
 195         DwordOrLargerAssertion(CPU, CPU, DWORD, QWORD),
 196         WordOrDwordAssertion(CPU, CPU, WORD, QWORD),
 197         QwordAssertion(CPU, CPU, QWORD),
 198         FloatAssertion(XMM, XMM, SS, SD, PS, PD),
 199         PackedFloatAssertion(XMM, XMM, PS, PD),
 200         SingleAssertion(XMM, XMM, SS),
 201         DoubleAssertion(XMM, XMM, SD),
 202         PackedDoubleAssertion(XMM, XMM, PD),
 203         IntToFloatAssertion(XMM, CPU, DWORD, QWORD),
 204         FloatToIntAssertion(CPU, XMM, DWORD, QWORD);
 205 
 206         private final RegisterCategory resultCategory;
 207         private final RegisterCategory inputCategory;
 208         private final OperandSize[] allowedSizes;
 209 
 210         OpAssertion(RegisterCategory resultCategory, RegisterCategory inputCategory, OperandSize... allowedSizes) {
 211             this.resultCategory = resultCategory;
 212             this.inputCategory = inputCategory;
 213             this.allowedSizes = allowedSizes;
 214         }
 215 
 216         protected boolean checkOperands(AMD64Op op, OperandSize size, Register resultReg, Register inputReg) {
 217             assert resultReg == null || resultCategory.equals(resultReg.getRegisterCategory()) : "invalid result register " + resultReg + " used in " + op;
 218             assert inputReg == null || inputCategory.equals(inputReg.getRegisterCategory()) : "invalid input register " + inputReg + " used in " + op;
 219 
 220             for (OperandSize s : allowedSizes) {
 221                 if (size == s) {
 222                     return true;
 223                 }
 224             }
 225 
 226             assert false : "invalid operand size " + size + " used in " + op;
 227             return false;
 228         }
 229 
 230     }
 231 
 232     protected static final int P_0F = 0x0F;
 233     protected static final int P_0F38 = 0x380F;
 234     protected static final int P_0F3A = 0x3A0F;
 235 
 236     /**
 237      * Base class for AMD64 opcodes.
 238      */
 239     public static class AMD64Op {
 240 
 241         private final String opcode;
 242 
 243         protected final int prefix1;
 244         protected final int prefix2;
 245         protected final int op;
 246 
 247         private final boolean dstIsByte;
 248         private final boolean srcIsByte;
 249 
 250         private final OpAssertion assertion;
 251         private final CPUFeature feature;
 252 
 253         protected AMD64Op(String opcode, int prefix1, int prefix2, int op, OpAssertion assertion, CPUFeature feature) {
 254             this(opcode, prefix1, prefix2, op, assertion == OpAssertion.ByteAssertion, assertion == OpAssertion.ByteAssertion, assertion, feature);
 255         }
 256 
 257         protected AMD64Op(String opcode, int prefix1, int prefix2, int op, boolean dstIsByte, boolean srcIsByte, OpAssertion assertion, CPUFeature feature) {
 258             this.opcode = opcode;
 259             this.prefix1 = prefix1;
 260             this.prefix2 = prefix2;
 261             this.op = op;
 262 
 263             this.dstIsByte = dstIsByte;
 264             this.srcIsByte = srcIsByte;
 265 
 266             this.assertion = assertion;
 267             this.feature = feature;
 268         }
 269 
 270         protected final void emitOpcode(AMD64Assembler asm, OperandSize size, int rxb, int dstEnc, int srcEnc) {
 271             if (prefix1 != 0) {
 272                 asm.emitByte(prefix1);
 273             }
 274             if (size.getSizePrefix() != 0) {
 275                 asm.emitByte(size.getSizePrefix());
 276             }
 277             int rexPrefix = 0x40 | rxb;
 278             if (size == QWORD) {
 279                 rexPrefix |= 0x08;
 280             }
 281             if (rexPrefix != 0x40 || (dstIsByte && dstEnc >= 4) || (srcIsByte && srcEnc >= 4)) {
 282                 asm.emitByte(rexPrefix);
 283             }
 284             if (prefix2 > 0xFF) {
 285                 asm.emitShort(prefix2);
 286             } else if (prefix2 > 0) {
 287                 asm.emitByte(prefix2);
 288             }
 289             asm.emitByte(op);
 290         }
 291 
 292         protected final boolean verify(AMD64Assembler asm, OperandSize size, Register resultReg, Register inputReg) {
 293             assert feature == null || asm.supports(feature) : String.format("unsupported feature %s required for %s", feature, opcode);
 294             assert assertion.checkOperands(this, size, resultReg, inputReg);
 295             return true;
 296         }
 297 
 298         public OperandSize[] getAllowedSizes() {
 299             return assertion.allowedSizes;
 300         }
 301 
 302         protected final boolean isSSEInstruction() {
 303             if (feature == null) {
 304                 return false;
 305             }
 306             switch (feature) {
 307                 case SSE:
 308                 case SSE2:
 309                 case SSE3:
 310                 case SSSE3:
 311                 case SSE4A:
 312                 case SSE4_1:
 313                 case SSE4_2:
 314                     return true;
 315                 default:
 316                     return false;
 317             }
 318         }
 319 
 320         public final OpAssertion getAssertion() {
 321             return assertion;
 322         }
 323 
 324         @Override
 325         public String toString() {
 326             return opcode;
 327         }
 328     }
 329 
 330     /**
 331      * Base class for AMD64 opcodes with immediate operands.
 332      */
 333     public static class AMD64ImmOp extends AMD64Op {
 334 
 335         private final boolean immIsByte;
 336 
 337         protected AMD64ImmOp(String opcode, boolean immIsByte, int prefix, int op, OpAssertion assertion) {
 338             this(opcode, immIsByte, prefix, op, assertion, null);
 339         }
 340 
 341         protected AMD64ImmOp(String opcode, boolean immIsByte, int prefix, int op, OpAssertion assertion, CPUFeature feature) {
 342             super(opcode, 0, prefix, op, assertion, feature);
 343             this.immIsByte = immIsByte;
 344         }
 345 
 346         protected final void emitImmediate(AMD64Assembler asm, OperandSize size, int imm) {
 347             if (immIsByte) {
 348                 assert imm == (byte) imm;
 349                 asm.emitByte(imm);
 350             } else {
 351                 size.emitImmediate(asm, imm);
 352             }
 353         }
 354 
 355         protected final int immediateSize(OperandSize size) {
 356             if (immIsByte) {
 357                 return 1;
 358             } else {
 359                 return size.getBytes();
 360             }
 361         }
 362     }
 363 
 364     /**
 365      * Opcode with operand order of either RM or MR for 2 address forms.
 366      */
 367     public abstract static class AMD64RROp extends AMD64Op {
 368 
 369         protected AMD64RROp(String opcode, int prefix1, int prefix2, int op, OpAssertion assertion, CPUFeature feature) {
 370             super(opcode, prefix1, prefix2, op, assertion, feature);
 371         }
 372 
 373         protected AMD64RROp(String opcode, int prefix1, int prefix2, int op, boolean dstIsByte, boolean srcIsByte, OpAssertion assertion, CPUFeature feature) {
 374             super(opcode, prefix1, prefix2, op, dstIsByte, srcIsByte, assertion, feature);
 375         }
 376 
 377         public abstract void emit(AMD64Assembler asm, OperandSize size, Register dst, Register src);
 378     }
 379 
 380     /**
 381      * Opcode with operand order of RM.
 382      */
 383     public static class AMD64RMOp extends AMD64RROp {
 384         // @formatter:off
 385         public static final AMD64RMOp IMUL   = new AMD64RMOp("IMUL",         P_0F, 0xAF, OpAssertion.ByteOrLargerAssertion);
 386         public static final AMD64RMOp BSF    = new AMD64RMOp("BSF",          P_0F, 0xBC);
 387         public static final AMD64RMOp BSR    = new AMD64RMOp("BSR",          P_0F, 0xBD);
 388         // POPCNT, TZCNT, and LZCNT support word operation. However, the legacy size prefix should
 389         // be emitted before the mandatory prefix 0xF3. Since we are not emitting bit count for
 390         // 16-bit operands, here we simply use DwordOrLargerAssertion.
 391         public static final AMD64RMOp POPCNT = new AMD64RMOp("POPCNT", 0xF3, P_0F, 0xB8, OpAssertion.DwordOrLargerAssertion, CPUFeature.POPCNT);
 392         public static final AMD64RMOp TZCNT  = new AMD64RMOp("TZCNT",  0xF3, P_0F, 0xBC, OpAssertion.DwordOrLargerAssertion, CPUFeature.BMI1);
 393         public static final AMD64RMOp LZCNT  = new AMD64RMOp("LZCNT",  0xF3, P_0F, 0xBD, OpAssertion.DwordOrLargerAssertion, CPUFeature.LZCNT);
 394         public static final AMD64RMOp MOVZXB = new AMD64RMOp("MOVZXB",       P_0F, 0xB6, false, true, OpAssertion.WordOrLargerAssertion);
 395         public static final AMD64RMOp MOVZX  = new AMD64RMOp("MOVZX",        P_0F, 0xB7, OpAssertion.DwordOrLargerAssertion);
 396         public static final AMD64RMOp MOVSXB = new AMD64RMOp("MOVSXB",       P_0F, 0xBE, false, true, OpAssertion.WordOrLargerAssertion);
 397         public static final AMD64RMOp MOVSX  = new AMD64RMOp("MOVSX",        P_0F, 0xBF, OpAssertion.DwordOrLargerAssertion);
 398         public static final AMD64RMOp MOVSXD = new AMD64RMOp("MOVSXD",             0x63, OpAssertion.QwordAssertion);
 399         public static final AMD64RMOp MOVB   = new AMD64RMOp("MOVB",               0x8A, OpAssertion.ByteAssertion);
 400         public static final AMD64RMOp MOV    = new AMD64RMOp("MOV",                0x8B);
 401         public static final AMD64RMOp CMP    = new AMD64RMOp("CMP",                0x3B);
 402 
 403         // MOVD/MOVQ and MOVSS/MOVSD are the same opcode, just with different operand size prefix
 404         public static final AMD64RMOp MOVD   = new AMD64RMOp("MOVD",   0x66, P_0F, 0x6E, OpAssertion.IntToFloatAssertion, CPUFeature.SSE2);
 405         public static final AMD64RMOp MOVQ   = new AMD64RMOp("MOVQ",   0x66, P_0F, 0x6E, OpAssertion.IntToFloatAssertion, CPUFeature.SSE2);
 406         public static final AMD64RMOp MOVSS  = new AMD64RMOp("MOVSS",        P_0F, 0x10, OpAssertion.FloatAssertion, CPUFeature.SSE);
 407         public static final AMD64RMOp MOVSD  = new AMD64RMOp("MOVSD",        P_0F, 0x10, OpAssertion.FloatAssertion, CPUFeature.SSE);
 408 
 409         // TEST is documented as MR operation, but it's symmetric, and using it as RM operation is more convenient.
 410         public static final AMD64RMOp TESTB  = new AMD64RMOp("TEST",               0x84, OpAssertion.ByteAssertion);
 411         public static final AMD64RMOp TEST   = new AMD64RMOp("TEST",               0x85);
 412         // @formatter:on
 413 
 414         protected AMD64RMOp(String opcode, int op) {
 415             this(opcode, 0, op);
 416         }
 417 
 418         protected AMD64RMOp(String opcode, int op, OpAssertion assertion) {
 419             this(opcode, 0, op, assertion);
 420         }
 421 
 422         protected AMD64RMOp(String opcode, int prefix, int op) {
 423             this(opcode, 0, prefix, op, null);
 424         }
 425 
 426         protected AMD64RMOp(String opcode, int prefix, int op, OpAssertion assertion) {
 427             this(opcode, 0, prefix, op, assertion, null);
 428         }
 429 
 430         protected AMD64RMOp(String opcode, int prefix, int op, OpAssertion assertion, CPUFeature feature) {
 431             this(opcode, 0, prefix, op, assertion, feature);
 432         }
 433 
 434         protected AMD64RMOp(String opcode, int prefix, int op, boolean dstIsByte, boolean srcIsByte, OpAssertion assertion) {
 435             super(opcode, 0, prefix, op, dstIsByte, srcIsByte, assertion, null);
 436         }
 437 
 438         protected AMD64RMOp(String opcode, int prefix1, int prefix2, int op, CPUFeature feature) {
 439             this(opcode, prefix1, prefix2, op, OpAssertion.WordOrLargerAssertion, feature);
 440         }
 441 
 442         protected AMD64RMOp(String opcode, int prefix1, int prefix2, int op, OpAssertion assertion, CPUFeature feature) {
 443             super(opcode, prefix1, prefix2, op, assertion, feature);
 444         }
 445 
 446         @Override
 447         public final void emit(AMD64Assembler asm, OperandSize size, Register dst, Register src) {
 448             assert verify(asm, size, dst, src);
 449             if (isSSEInstruction()) {
 450                 Register nds = Register.None;
 451                 switch (op) {
 452                     case 0x10:
 453                     case 0x51:
 454                         if ((size == SS) || (size == SD)) {
 455                             nds = dst;
 456                         }
 457                         break;
 458                     case 0x2A:
 459                     case 0x54:
 460                     case 0x55:
 461                     case 0x56:
 462                     case 0x57:
 463                     case 0x58:
 464                     case 0x59:
 465                     case 0x5A:
 466                     case 0x5C:
 467                     case 0x5D:
 468                     case 0x5E:
 469                     case 0x5F:
 470                         nds = dst;
 471                         break;
 472                     default:
 473                         break;
 474                 }
 475                 asm.simdPrefix(dst, nds, src, size, prefix1, prefix2, size == QWORD);
 476                 asm.emitByte(op);
 477                 asm.emitModRM(dst, src);
 478             } else {
 479                 emitOpcode(asm, size, getRXB(dst, src), dst.encoding, src.encoding);
 480                 asm.emitModRM(dst, src);
 481             }
 482         }
 483 
 484         public final void emit(AMD64Assembler asm, OperandSize size, Register dst, AMD64Address src) {
 485             assert verify(asm, size, dst, null);
 486             if (isSSEInstruction()) {
 487                 Register nds = Register.None;
 488                 switch (op) {
 489                     case 0x51:
 490                         if ((size == SS) || (size == SD)) {
 491                             nds = dst;
 492                         }
 493                         break;
 494                     case 0x2A:
 495                     case 0x54:
 496                     case 0x55:
 497                     case 0x56:
 498                     case 0x57:
 499                     case 0x58:
 500                     case 0x59:
 501                     case 0x5A:
 502                     case 0x5C:
 503                     case 0x5D:
 504                     case 0x5E:
 505                     case 0x5F:
 506                         nds = dst;
 507                         break;
 508                     default:
 509                         break;
 510                 }
 511                 asm.simdPrefix(dst, nds, src, size, prefix1, prefix2, size == QWORD);
 512                 asm.emitByte(op);
 513                 asm.emitOperandHelper(dst, src, 0);
 514             } else {
 515                 emitOpcode(asm, size, getRXB(dst, src), dst.encoding, 0);
 516                 asm.emitOperandHelper(dst, src, 0);
 517             }
 518         }
 519     }
 520 
 521     /**
 522      * Opcode with operand order of MR.
 523      */
 524     public static class AMD64MROp extends AMD64RROp {
 525         // @formatter:off
 526         public static final AMD64MROp MOVB   = new AMD64MROp("MOVB",               0x88, OpAssertion.ByteAssertion);
 527         public static final AMD64MROp MOV    = new AMD64MROp("MOV",                0x89);
 528 
 529         // MOVD and MOVQ are the same opcode, just with different operand size prefix
 530         // Note that as MR opcodes, they have reverse operand order, so the IntToFloatingAssertion must be used.
 531         public static final AMD64MROp MOVD   = new AMD64MROp("MOVD",   0x66, P_0F, 0x7E, OpAssertion.IntToFloatAssertion, CPUFeature.SSE2);
 532         public static final AMD64MROp MOVQ   = new AMD64MROp("MOVQ",   0x66, P_0F, 0x7E, OpAssertion.IntToFloatAssertion, CPUFeature.SSE2);
 533 
 534         // MOVSS and MOVSD are the same opcode, just with different operand size prefix
 535         public static final AMD64MROp MOVSS  = new AMD64MROp("MOVSS",        P_0F, 0x11, OpAssertion.FloatAssertion, CPUFeature.SSE);
 536         public static final AMD64MROp MOVSD  = new AMD64MROp("MOVSD",        P_0F, 0x11, OpAssertion.FloatAssertion, CPUFeature.SSE);
 537         // @formatter:on
 538 
 539         protected AMD64MROp(String opcode, int op) {
 540             this(opcode, 0, op);
 541         }
 542 
 543         protected AMD64MROp(String opcode, int op, OpAssertion assertion) {
 544             this(opcode, 0, op, assertion);
 545         }
 546 
 547         protected AMD64MROp(String opcode, int prefix, int op) {
 548             this(opcode, prefix, op, OpAssertion.WordOrLargerAssertion);
 549         }
 550 
 551         protected AMD64MROp(String opcode, int prefix, int op, OpAssertion assertion) {
 552             this(opcode, prefix, op, assertion, null);
 553         }
 554 
 555         protected AMD64MROp(String opcode, int prefix, int op, OpAssertion assertion, CPUFeature feature) {
 556             this(opcode, 0, prefix, op, assertion, feature);
 557         }
 558 
 559         protected AMD64MROp(String opcode, int prefix1, int prefix2, int op, OpAssertion assertion, CPUFeature feature) {
 560             super(opcode, prefix1, prefix2, op, assertion, feature);
 561         }
 562 
 563         @Override
 564         public final void emit(AMD64Assembler asm, OperandSize size, Register dst, Register src) {
 565             assert verify(asm, size, src, dst);
 566             if (isSSEInstruction()) {
 567                 Register nds = Register.None;
 568                 switch (op) {
 569                     case 0x11:
 570                         if ((size == SS) || (size == SD)) {
 571                             nds = src;
 572                         }
 573                         break;
 574                     default:
 575                         break;
 576                 }
 577                 asm.simdPrefix(src, nds, dst, size, prefix1, prefix2, size == QWORD);
 578                 asm.emitByte(op);
 579                 asm.emitModRM(src, dst);
 580             } else {
 581                 emitOpcode(asm, size, getRXB(src, dst), src.encoding, dst.encoding);
 582                 asm.emitModRM(src, dst);
 583             }
 584         }
 585 
 586         public final void emit(AMD64Assembler asm, OperandSize size, AMD64Address dst, Register src) {
 587             assert verify(asm, size, src, null);
 588             if (isSSEInstruction()) {
 589                 asm.simdPrefix(src, Register.None, dst, size, prefix1, prefix2, size == QWORD);
 590                 asm.emitByte(op);
 591             } else {
 592                 emitOpcode(asm, size, getRXB(src, dst), src.encoding, 0);
 593             }
 594             asm.emitOperandHelper(src, dst, 0);
 595         }
 596     }
 597 
 598     /**
 599      * Opcodes with operand order of M.
 600      */
 601     public static class AMD64MOp extends AMD64Op {
 602         // @formatter:off
 603         public static final AMD64MOp NOT  = new AMD64MOp("NOT",  0xF7, 2);
 604         public static final AMD64MOp NEG  = new AMD64MOp("NEG",  0xF7, 3);
 605         public static final AMD64MOp MUL  = new AMD64MOp("MUL",  0xF7, 4);
 606         public static final AMD64MOp IMUL = new AMD64MOp("IMUL", 0xF7, 5);
 607         public static final AMD64MOp DIV  = new AMD64MOp("DIV",  0xF7, 6);
 608         public static final AMD64MOp IDIV = new AMD64MOp("IDIV", 0xF7, 7);
 609         public static final AMD64MOp INC  = new AMD64MOp("INC",  0xFF, 0);
 610         public static final AMD64MOp DEC  = new AMD64MOp("DEC",  0xFF, 1);
 611         public static final AMD64MOp PUSH = new AMD64MOp("PUSH", 0xFF, 6);
 612         public static final AMD64MOp POP  = new AMD64MOp("POP",  0x8F, 0, OpAssertion.WordOrDwordAssertion);
 613         // @formatter:on
 614 
 615         private final int ext;
 616 
 617         protected AMD64MOp(String opcode, int op, int ext) {
 618             this(opcode, 0, op, ext);
 619         }
 620 
 621         protected AMD64MOp(String opcode, int prefix, int op, int ext) {
 622             this(opcode, prefix, op, ext, OpAssertion.WordOrLargerAssertion);
 623         }
 624 
 625         protected AMD64MOp(String opcode, int op, int ext, OpAssertion assertion) {
 626             this(opcode, 0, op, ext, assertion);
 627         }
 628 
 629         protected AMD64MOp(String opcode, int prefix, int op, int ext, OpAssertion assertion) {
 630             super(opcode, 0, prefix, op, assertion, null);
 631             this.ext = ext;
 632         }
 633 
 634         public final void emit(AMD64Assembler asm, OperandSize size, Register dst) {
 635             assert verify(asm, size, dst, null);
 636             emitOpcode(asm, size, getRXB(null, dst), 0, dst.encoding);
 637             asm.emitModRM(ext, dst);
 638         }
 639 
 640         public final void emit(AMD64Assembler asm, OperandSize size, AMD64Address dst) {
 641             assert verify(asm, size, null, null);
 642             emitOpcode(asm, size, getRXB(null, dst), 0, 0);
 643             asm.emitOperandHelper(ext, dst, 0);
 644         }
 645     }
 646 
 647     /**
 648      * Opcodes with operand order of MI.
 649      */
 650     public static class AMD64MIOp extends AMD64ImmOp {
 651         // @formatter:off
 652         public static final AMD64MIOp MOVB = new AMD64MIOp("MOVB", true,  0xC6, 0, OpAssertion.ByteAssertion);
 653         public static final AMD64MIOp MOV  = new AMD64MIOp("MOV",  false, 0xC7, 0);
 654         public static final AMD64MIOp TEST = new AMD64MIOp("TEST", false, 0xF7, 0);
 655         // @formatter:on
 656 
 657         private final int ext;
 658 
 659         protected AMD64MIOp(String opcode, boolean immIsByte, int op, int ext) {
 660             this(opcode, immIsByte, op, ext, OpAssertion.WordOrLargerAssertion);
 661         }
 662 
 663         protected AMD64MIOp(String opcode, boolean immIsByte, int op, int ext, OpAssertion assertion) {
 664             this(opcode, immIsByte, 0, op, ext, assertion);
 665         }
 666 
 667         protected AMD64MIOp(String opcode, boolean immIsByte, int prefix, int op, int ext, OpAssertion assertion) {
 668             super(opcode, immIsByte, prefix, op, assertion);
 669             this.ext = ext;
 670         }
 671 
 672         public final void emit(AMD64Assembler asm, OperandSize size, Register dst, int imm) {
 673             emit(asm, size, dst, imm, false);
 674         }
 675 
 676         public final void emit(AMD64Assembler asm, OperandSize size, Register dst, int imm, boolean annotateImm) {
 677             assert verify(asm, size, dst, null);
 678             int insnPos = asm.position();
 679             emitOpcode(asm, size, getRXB(null, dst), 0, dst.encoding);
 680             asm.emitModRM(ext, dst);
 681             int immPos = asm.position();
 682             emitImmediate(asm, size, imm);
 683             int nextInsnPos = asm.position();
 684             if (annotateImm && asm.codePatchingAnnotationConsumer != null) {
 685                 asm.codePatchingAnnotationConsumer.accept(new OperandDataAnnotation(insnPos, immPos, nextInsnPos - immPos, nextInsnPos));
 686             }
 687         }
 688 
 689         public final void emit(AMD64Assembler asm, OperandSize size, AMD64Address dst, int imm) {
 690             emit(asm, size, dst, imm, false);
 691         }
 692 
 693         public final void emit(AMD64Assembler asm, OperandSize size, AMD64Address dst, int imm, boolean annotateImm) {
 694             assert verify(asm, size, null, null);
 695             int insnPos = asm.position();
 696             emitOpcode(asm, size, getRXB(null, dst), 0, 0);
 697             asm.emitOperandHelper(ext, dst, immediateSize(size));
 698             int immPos = asm.position();
 699             emitImmediate(asm, size, imm);
 700             int nextInsnPos = asm.position();
 701             if (annotateImm && asm.codePatchingAnnotationConsumer != null) {
 702                 asm.codePatchingAnnotationConsumer.accept(new OperandDataAnnotation(insnPos, immPos, nextInsnPos - immPos, nextInsnPos));
 703             }
 704         }
 705     }
 706 
 707     /**
 708      * Opcodes with operand order of RMI.
 709      *
 710      * We only have one form of round as the operation is always treated with single variant input,
 711      * making its extension to 3 address forms redundant.
 712      */
 713     public static class AMD64RMIOp extends AMD64ImmOp {
 714         // @formatter:off
 715         public static final AMD64RMIOp IMUL    = new AMD64RMIOp("IMUL", false, 0x69);
 716         public static final AMD64RMIOp IMUL_SX = new AMD64RMIOp("IMUL", true,  0x6B);
 717         public static final AMD64RMIOp ROUNDSS = new AMD64RMIOp("ROUNDSS", true, P_0F3A, 0x0A, OpAssertion.PackedDoubleAssertion, CPUFeature.SSE4_1);
 718         public static final AMD64RMIOp ROUNDSD = new AMD64RMIOp("ROUNDSD", true, P_0F3A, 0x0B, OpAssertion.PackedDoubleAssertion, CPUFeature.SSE4_1);
 719         // @formatter:on
 720 
 721         protected AMD64RMIOp(String opcode, boolean immIsByte, int op) {
 722             this(opcode, immIsByte, 0, op, OpAssertion.WordOrLargerAssertion, null);
 723         }
 724 
 725         protected AMD64RMIOp(String opcode, boolean immIsByte, int prefix, int op, OpAssertion assertion, CPUFeature feature) {
 726             super(opcode, immIsByte, prefix, op, assertion, feature);
 727         }
 728 
 729         public final void emit(AMD64Assembler asm, OperandSize size, Register dst, Register src, int imm) {
 730             assert verify(asm, size, dst, src);
 731             if (isSSEInstruction()) {
 732                 Register nds = Register.None;
 733                 switch (op) {
 734                     case 0x0A:
 735                     case 0x0B:
 736                         nds = dst;
 737                         break;
 738                     default:
 739                         break;
 740                 }
 741                 asm.simdPrefix(dst, nds, src, size, prefix1, prefix2, false);
 742                 asm.emitByte(op);
 743                 asm.emitModRM(dst, src);
 744             } else {
 745                 emitOpcode(asm, size, getRXB(dst, src), dst.encoding, src.encoding);
 746                 asm.emitModRM(dst, src);
 747             }
 748             emitImmediate(asm, size, imm);
 749         }
 750 
 751         public final void emit(AMD64Assembler asm, OperandSize size, Register dst, AMD64Address src, int imm) {
 752             assert verify(asm, size, dst, null);
 753             if (isSSEInstruction()) {
 754                 Register nds = Register.None;
 755                 switch (op) {
 756                     case 0x0A:
 757                     case 0x0B:
 758                         nds = dst;
 759                         break;
 760                     default:
 761                         break;
 762                 }
 763                 asm.simdPrefix(dst, nds, src, size, prefix1, prefix2, false);
 764                 asm.emitByte(op);
 765             } else {
 766                 emitOpcode(asm, size, getRXB(dst, src), dst.encoding, 0);
 767             }
 768             asm.emitOperandHelper(dst, src, immediateSize(size));
 769             emitImmediate(asm, size, imm);
 770         }
 771     }
 772 
 773     public static class SSEOp extends AMD64RMOp {
 774         // @formatter:off
 775         public static final SSEOp CVTSI2SS  = new SSEOp("CVTSI2SS",  0xF3, P_0F, 0x2A, OpAssertion.IntToFloatAssertion);
 776         public static final SSEOp CVTSI2SD  = new SSEOp("CVTSI2SD",  0xF2, P_0F, 0x2A, OpAssertion.IntToFloatAssertion);
 777         public static final SSEOp CVTTSS2SI = new SSEOp("CVTTSS2SI", 0xF3, P_0F, 0x2C, OpAssertion.FloatToIntAssertion);
 778         public static final SSEOp CVTTSD2SI = new SSEOp("CVTTSD2SI", 0xF2, P_0F, 0x2C, OpAssertion.FloatToIntAssertion);
 779         public static final SSEOp UCOMIS    = new SSEOp("UCOMIS",          P_0F, 0x2E, OpAssertion.PackedFloatAssertion);
 780         public static final SSEOp SQRT      = new SSEOp("SQRT",            P_0F, 0x51);
 781         public static final SSEOp AND       = new SSEOp("AND",             P_0F, 0x54, OpAssertion.PackedFloatAssertion);
 782         public static final SSEOp ANDN      = new SSEOp("ANDN",            P_0F, 0x55, OpAssertion.PackedFloatAssertion);
 783         public static final SSEOp OR        = new SSEOp("OR",              P_0F, 0x56, OpAssertion.PackedFloatAssertion);
 784         public static final SSEOp XOR       = new SSEOp("XOR",             P_0F, 0x57, OpAssertion.PackedFloatAssertion);
 785         public static final SSEOp ADD       = new SSEOp("ADD",             P_0F, 0x58);
 786         public static final SSEOp MUL       = new SSEOp("MUL",             P_0F, 0x59);
 787         public static final SSEOp CVTSS2SD  = new SSEOp("CVTSS2SD",        P_0F, 0x5A, OpAssertion.SingleAssertion);
 788         public static final SSEOp CVTSD2SS  = new SSEOp("CVTSD2SS",        P_0F, 0x5A, OpAssertion.DoubleAssertion);
 789         public static final SSEOp SUB       = new SSEOp("SUB",             P_0F, 0x5C);
 790         public static final SSEOp MIN       = new SSEOp("MIN",             P_0F, 0x5D);
 791         public static final SSEOp DIV       = new SSEOp("DIV",             P_0F, 0x5E);
 792         public static final SSEOp MAX       = new SSEOp("MAX",             P_0F, 0x5F);
 793         // @formatter:on
 794 
 795         protected SSEOp(String opcode, int prefix, int op) {
 796             this(opcode, prefix, op, OpAssertion.FloatAssertion);
 797         }
 798 
 799         protected SSEOp(String opcode, int prefix, int op, OpAssertion assertion) {
 800             this(opcode, 0, prefix, op, assertion);
 801         }
 802 
 803         protected SSEOp(String opcode, int mandatoryPrefix, int prefix, int op, OpAssertion assertion) {
 804             super(opcode, mandatoryPrefix, prefix, op, assertion, CPUFeature.SSE2);
 805         }
 806     }
 807 
 808     /**
 809      * Arithmetic operation with operand order of RM, MR or MI.
 810      */
 811     public static final class AMD64BinaryArithmetic {
 812         // @formatter:off
 813         public static final AMD64BinaryArithmetic ADD = new AMD64BinaryArithmetic("ADD", 0);
 814         public static final AMD64BinaryArithmetic OR  = new AMD64BinaryArithmetic("OR",  1);
 815         public static final AMD64BinaryArithmetic ADC = new AMD64BinaryArithmetic("ADC", 2);
 816         public static final AMD64BinaryArithmetic SBB = new AMD64BinaryArithmetic("SBB", 3);
 817         public static final AMD64BinaryArithmetic AND = new AMD64BinaryArithmetic("AND", 4);
 818         public static final AMD64BinaryArithmetic SUB = new AMD64BinaryArithmetic("SUB", 5);
 819         public static final AMD64BinaryArithmetic XOR = new AMD64BinaryArithmetic("XOR", 6);
 820         public static final AMD64BinaryArithmetic CMP = new AMD64BinaryArithmetic("CMP", 7);
 821         // @formatter:on
 822 
 823         private final AMD64MIOp byteImmOp;
 824         private final AMD64MROp byteMrOp;
 825         private final AMD64RMOp byteRmOp;
 826 
 827         private final AMD64MIOp immOp;
 828         private final AMD64MIOp immSxOp;
 829         private final AMD64MROp mrOp;
 830         private final AMD64RMOp rmOp;
 831 
 832         private AMD64BinaryArithmetic(String opcode, int code) {
 833             int baseOp = code << 3;
 834 
 835             byteImmOp = new AMD64MIOp(opcode, true, 0, 0x80, code, OpAssertion.ByteAssertion);
 836             byteMrOp = new AMD64MROp(opcode, 0, baseOp, OpAssertion.ByteAssertion);
 837             byteRmOp = new AMD64RMOp(opcode, 0, baseOp | 0x02, OpAssertion.ByteAssertion);
 838 
 839             immOp = new AMD64MIOp(opcode, false, 0, 0x81, code, OpAssertion.WordOrLargerAssertion);
 840             immSxOp = new AMD64MIOp(opcode, true, 0, 0x83, code, OpAssertion.WordOrLargerAssertion);
 841             mrOp = new AMD64MROp(opcode, 0, baseOp | 0x01, OpAssertion.WordOrLargerAssertion);
 842             rmOp = new AMD64RMOp(opcode, 0, baseOp | 0x03, OpAssertion.WordOrLargerAssertion);
 843         }
 844 
 845         public AMD64MIOp getMIOpcode(OperandSize size, boolean sx) {
 846             if (size == BYTE) {
 847                 return byteImmOp;
 848             } else if (sx) {
 849                 return immSxOp;
 850             } else {
 851                 return immOp;
 852             }
 853         }
 854 
 855         public AMD64MROp getMROpcode(OperandSize size) {
 856             if (size == BYTE) {
 857                 return byteMrOp;
 858             } else {
 859                 return mrOp;
 860             }
 861         }
 862 
 863         public AMD64RMOp getRMOpcode(OperandSize size) {
 864             if (size == BYTE) {
 865                 return byteRmOp;
 866             } else {
 867                 return rmOp;
 868             }
 869         }
 870     }
 871 
 872     /**
 873      * Shift operation with operand order of M1, MC or MI.
 874      */
 875     public static final class AMD64Shift {
 876         // @formatter:off
 877         public static final AMD64Shift ROL = new AMD64Shift("ROL", 0);
 878         public static final AMD64Shift ROR = new AMD64Shift("ROR", 1);
 879         public static final AMD64Shift RCL = new AMD64Shift("RCL", 2);
 880         public static final AMD64Shift RCR = new AMD64Shift("RCR", 3);
 881         public static final AMD64Shift SHL = new AMD64Shift("SHL", 4);
 882         public static final AMD64Shift SHR = new AMD64Shift("SHR", 5);
 883         public static final AMD64Shift SAR = new AMD64Shift("SAR", 7);
 884         // @formatter:on
 885 
 886         public final AMD64MOp m1Op;
 887         public final AMD64MOp mcOp;
 888         public final AMD64MIOp miOp;
 889 
 890         private AMD64Shift(String opcode, int code) {
 891             m1Op = new AMD64MOp(opcode, 0, 0xD1, code, OpAssertion.WordOrLargerAssertion);
 892             mcOp = new AMD64MOp(opcode, 0, 0xD3, code, OpAssertion.WordOrLargerAssertion);
 893             miOp = new AMD64MIOp(opcode, true, 0, 0xC1, code, OpAssertion.WordOrLargerAssertion);
 894         }
 895     }
 896 
 897     private enum VEXOpAssertion {
 898         AVX1(CPUFeature.AVX, CPUFeature.AVX),
 899         AVX1_2(CPUFeature.AVX, CPUFeature.AVX2),
 900         AVX2(CPUFeature.AVX2, CPUFeature.AVX2),
 901         AVX1_128ONLY(CPUFeature.AVX, null),
 902         AVX1_256ONLY(null, CPUFeature.AVX),
 903         AVX2_256ONLY(null, CPUFeature.AVX2),
 904         XMM_CPU(CPUFeature.AVX, null, XMM, null, CPU, null),
 905         XMM_XMM_CPU(CPUFeature.AVX, null, XMM, XMM, CPU, null),
 906         CPU_XMM(CPUFeature.AVX, null, CPU, null, XMM, null),
 907         AVX1_2_CPU_XMM(CPUFeature.AVX, CPUFeature.AVX2, CPU, null, XMM, null),
 908         BMI1(CPUFeature.BMI1, null, CPU, CPU, CPU, null),
 909         BMI2(CPUFeature.BMI2, null, CPU, CPU, CPU, null),
 910         FMA(CPUFeature.FMA, null, XMM, XMM, XMM, null);
 911 
 912         private final CPUFeature l128feature;
 913         private final CPUFeature l256feature;
 914 
 915         private final RegisterCategory rCategory;
 916         private final RegisterCategory vCategory;
 917         private final RegisterCategory mCategory;
 918         private final RegisterCategory imm8Category;
 919 
 920         VEXOpAssertion(CPUFeature l128feature, CPUFeature l256feature) {
 921             this(l128feature, l256feature, XMM, XMM, XMM, XMM);
 922         }
 923 
 924         VEXOpAssertion(CPUFeature l128feature, CPUFeature l256feature, RegisterCategory rCategory, RegisterCategory vCategory, RegisterCategory mCategory, RegisterCategory imm8Category) {
 925             this.l128feature = l128feature;
 926             this.l256feature = l256feature;
 927             this.rCategory = rCategory;
 928             this.vCategory = vCategory;
 929             this.mCategory = mCategory;
 930             this.imm8Category = imm8Category;
 931         }
 932 
 933         public boolean check(AMD64 arch, AVXSize size, Register r, Register v, Register m) {
 934             return check(arch, getLFlag(size), r, v, m, null);
 935         }
 936 
 937         public boolean check(AMD64 arch, AVXSize size, Register r, Register v, Register m, Register imm8) {
 938             return check(arch, getLFlag(size), r, v, m, imm8);
 939         }
 940 
 941         public boolean check(AMD64 arch, int l, Register r, Register v, Register m, Register imm8) {
 942             switch (l) {
 943                 case L128:
 944                     assert l128feature != null && arch.getFeatures().contains(l128feature) : "emitting illegal 128 bit instruction";
 945                     break;
 946                 case L256:
 947                     assert l256feature != null && arch.getFeatures().contains(l256feature) : "emitting illegal 256 bit instruction";
 948                     break;
 949             }
 950             if (r != null) {
 951                 assert r.getRegisterCategory().equals(rCategory);
 952             }
 953             if (v != null) {
 954                 assert v.getRegisterCategory().equals(vCategory);
 955             }
 956             if (m != null) {
 957                 assert m.getRegisterCategory().equals(mCategory);
 958             }
 959             if (imm8 != null) {
 960                 assert imm8.getRegisterCategory().equals(imm8Category);
 961             }
 962             return true;
 963         }
 964 
 965         public boolean supports(EnumSet<CPUFeature> features, AVXSize avxSize) {
 966             switch (avxSize) {
 967                 case XMM:
 968                     return l128feature != null && features.contains(l128feature);
 969                 case YMM:
 970                     return l256feature != null && features.contains(l256feature);
 971                 default:
 972                     throw GraalError.shouldNotReachHere();
 973             }
 974         }
 975     }
 976 
 977     /**
 978      * Base class for VEX-encoded instructions.
 979      */
 980     public static class VexOp {
 981         protected final int pp;
 982         protected final int mmmmm;
 983         protected final int w;
 984         protected final int op;
 985 
 986         private final String opcode;
 987         protected final VEXOpAssertion assertion;
 988 
 989         protected VexOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
 990             this.pp = pp;
 991             this.mmmmm = mmmmm;
 992             this.w = w;
 993             this.op = op;
 994             this.opcode = opcode;
 995             this.assertion = assertion;
 996         }
 997 
 998         public final boolean isSupported(AMD64Assembler vasm, AVXSize size) {
 999             return assertion.supports(((AMD64) vasm.target.arch).getFeatures(), size);
1000         }
1001 
1002         @Override
1003         public String toString() {
1004             return opcode;
1005         }
1006     }
1007 
1008     /**
1009      * VEX-encoded instructions with an operand order of RM, but the M operand must be a register.
1010      */
1011     public static class VexRROp extends VexOp {
1012         // @formatter:off
1013         public static final VexRROp VMASKMOVDQU = new VexRROp("VMASKMOVDQU", P_66, M_0F, WIG, 0xF7, VEXOpAssertion.AVX1_128ONLY);
1014         // @formatter:on
1015 
1016         protected VexRROp(String opcode, int pp, int mmmmm, int w, int op) {
1017             this(opcode, pp, mmmmm, w, op, VEXOpAssertion.AVX1);
1018         }
1019 
1020         protected VexRROp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
1021             super(opcode, pp, mmmmm, w, op, assertion);
1022         }
1023 
1024         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src) {
1025             assert assertion.check((AMD64) asm.target.arch, size, dst, null, src);
1026             assert op != 0x1A || op != 0x5A;
1027             asm.vexPrefix(dst, Register.None, src, size, pp, mmmmm, w, false);
1028             asm.emitByte(op);
1029             asm.emitModRM(dst, src);
1030         }
1031     }
1032 
1033     /**
1034      * VEX-encoded instructions with an operand order of RM.
1035      */
1036     public static class VexRMOp extends VexRROp {
1037         // @formatter:off
1038         public static final VexRMOp VCVTTSS2SI      = new VexRMOp("VCVTTSS2SI",      P_F3, M_0F,   W0,  0x2C, VEXOpAssertion.CPU_XMM);
1039         public static final VexRMOp VCVTTSS2SQ      = new VexRMOp("VCVTTSS2SQ",      P_F3, M_0F,   W1,  0x2C, VEXOpAssertion.CPU_XMM);
1040         public static final VexRMOp VCVTTSD2SI      = new VexRMOp("VCVTTSD2SI",      P_F2, M_0F,   W0,  0x2C, VEXOpAssertion.CPU_XMM);
1041         public static final VexRMOp VCVTTSD2SQ      = new VexRMOp("VCVTTSD2SQ",      P_F2, M_0F,   W1,  0x2C, VEXOpAssertion.CPU_XMM);
1042         public static final VexRMOp VCVTPS2PD       = new VexRMOp("VCVTPS2PD",       P_,   M_0F,   WIG, 0x5A);
1043         public static final VexRMOp VCVTPD2PS       = new VexRMOp("VCVTPD2PS",       P_66, M_0F,   WIG, 0x5A);
1044         public static final VexRMOp VCVTDQ2PS       = new VexRMOp("VCVTDQ2PS",       P_,   M_0F,   WIG, 0x5B);
1045         public static final VexRMOp VCVTTPS2DQ      = new VexRMOp("VCVTTPS2DQ",      P_F3, M_0F,   WIG, 0x5B);
1046         public static final VexRMOp VCVTTPD2DQ      = new VexRMOp("VCVTTPD2DQ",      P_66, M_0F,   WIG, 0xE6);
1047         public static final VexRMOp VCVTDQ2PD       = new VexRMOp("VCVTDQ2PD",       P_F3, M_0F,   WIG, 0xE6);
1048         public static final VexRMOp VBROADCASTSS    = new VexRMOp("VBROADCASTSS",    P_66, M_0F38, W0,  0x18);
1049         public static final VexRMOp VBROADCASTSD    = new VexRMOp("VBROADCASTSD",    P_66, M_0F38, W0,  0x19, VEXOpAssertion.AVX1_256ONLY);
1050         public static final VexRMOp VBROADCASTF128  = new VexRMOp("VBROADCASTF128",  P_66, M_0F38, W0,  0x1A, VEXOpAssertion.AVX1_256ONLY);
1051         public static final VexRMOp VPBROADCASTI128 = new VexRMOp("VPBROADCASTI128", P_66, M_0F38, W0,  0x5A, VEXOpAssertion.AVX2_256ONLY);
1052         public static final VexRMOp VPBROADCASTB    = new VexRMOp("VPBROADCASTB",    P_66, M_0F38, W0,  0x78, VEXOpAssertion.AVX2);
1053         public static final VexRMOp VPBROADCASTW    = new VexRMOp("VPBROADCASTW",    P_66, M_0F38, W0,  0x79, VEXOpAssertion.AVX2);
1054         public static final VexRMOp VPBROADCASTD    = new VexRMOp("VPBROADCASTD",    P_66, M_0F38, W0,  0x58, VEXOpAssertion.AVX2);
1055         public static final VexRMOp VPBROADCASTQ    = new VexRMOp("VPBROADCASTQ",    P_66, M_0F38, W0,  0x59, VEXOpAssertion.AVX2);
1056         public static final VexRMOp VPMOVMSKB       = new VexRMOp("VPMOVMSKB",       P_66, M_0F,   WIG, 0xD7, VEXOpAssertion.AVX1_2_CPU_XMM);
1057         public static final VexRMOp VPMOVSXBW       = new VexRMOp("VPMOVSXBW",       P_66, M_0F38, WIG, 0x20);
1058         public static final VexRMOp VPMOVSXBD       = new VexRMOp("VPMOVSXBD",       P_66, M_0F38, WIG, 0x21);
1059         public static final VexRMOp VPMOVSXBQ       = new VexRMOp("VPMOVSXBQ",       P_66, M_0F38, WIG, 0x22);
1060         public static final VexRMOp VPMOVSXWD       = new VexRMOp("VPMOVSXWD",       P_66, M_0F38, WIG, 0x23);
1061         public static final VexRMOp VPMOVSXWQ       = new VexRMOp("VPMOVSXWQ",       P_66, M_0F38, WIG, 0x24);
1062         public static final VexRMOp VPMOVSXDQ       = new VexRMOp("VPMOVSXDQ",       P_66, M_0F38, WIG, 0x25);
1063         public static final VexRMOp VPMOVZXBW       = new VexRMOp("VPMOVZXBW",       P_66, M_0F38, WIG, 0x30);
1064         public static final VexRMOp VPMOVZXBD       = new VexRMOp("VPMOVZXBD",       P_66, M_0F38, WIG, 0x31);
1065         public static final VexRMOp VPMOVZXBQ       = new VexRMOp("VPMOVZXBQ",       P_66, M_0F38, WIG, 0x32);
1066         public static final VexRMOp VPMOVZXWD       = new VexRMOp("VPMOVZXWD",       P_66, M_0F38, WIG, 0x33);
1067         public static final VexRMOp VPMOVZXWQ       = new VexRMOp("VPMOVZXWQ",       P_66, M_0F38, WIG, 0x34);
1068         public static final VexRMOp VPMOVZXDQ       = new VexRMOp("VPMOVZXDQ",       P_66, M_0F38, WIG, 0x35);
1069         public static final VexRMOp VPTEST          = new VexRMOp("VPTEST",          P_66, M_0F38, WIG, 0x17);
1070         public static final VexRMOp VSQRTPD         = new VexRMOp("VSQRTPD",         P_66, M_0F,   WIG, 0x51);
1071         public static final VexRMOp VSQRTPS         = new VexRMOp("VSQRTPS",         P_,   M_0F,   WIG, 0x51);
1072         public static final VexRMOp VSQRTSD         = new VexRMOp("VSQRTSD",         P_F2, M_0F,   WIG, 0x51);
1073         public static final VexRMOp VSQRTSS         = new VexRMOp("VSQRTSS",         P_F3, M_0F,   WIG, 0x51);
1074         public static final VexRMOp VUCOMISS        = new VexRMOp("VUCOMISS",        P_,   M_0F,   WIG, 0x2E);
1075         public static final VexRMOp VUCOMISD        = new VexRMOp("VUCOMISD",        P_66, M_0F,   WIG, 0x2E);
1076         // @formatter:on
1077 
1078         protected VexRMOp(String opcode, int pp, int mmmmm, int w, int op) {
1079             this(opcode, pp, mmmmm, w, op, VEXOpAssertion.AVX1);
1080         }
1081 
1082         protected VexRMOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
1083             super(opcode, pp, mmmmm, w, op, assertion);
1084         }
1085 
1086         public void emit(AMD64Assembler asm, AVXSize size, Register dst, AMD64Address src) {
1087             assert assertion.check((AMD64) asm.target.arch, size, dst, null, null);
1088             asm.vexPrefix(dst, Register.None, src, size, pp, mmmmm, w, false);
1089             asm.emitByte(op);
1090             asm.emitOperandHelper(dst, src, 0);
1091         }
1092     }
1093 
1094     /**
1095      * VEX-encoded move instructions.
1096      * <p>
1097      * These instructions have two opcodes: op is the forward move instruction with an operand order
1098      * of RM, and opReverse is the reverse move instruction with an operand order of MR.
1099      */
1100     public static final class VexMoveOp extends VexRMOp {
1101         // @formatter:off
1102         public static final VexMoveOp VMOVDQA = new VexMoveOp("VMOVDQA", P_66, M_0F, WIG, 0x6F, 0x7F);
1103         public static final VexMoveOp VMOVDQU = new VexMoveOp("VMOVDQU", P_F3, M_0F, WIG, 0x6F, 0x7F);
1104         public static final VexMoveOp VMOVAPS = new VexMoveOp("VMOVAPS", P_,   M_0F, WIG, 0x28, 0x29);
1105         public static final VexMoveOp VMOVAPD = new VexMoveOp("VMOVAPD", P_66, M_0F, WIG, 0x28, 0x29);
1106         public static final VexMoveOp VMOVUPS = new VexMoveOp("VMOVUPS", P_,   M_0F, WIG, 0x10, 0x11);
1107         public static final VexMoveOp VMOVUPD = new VexMoveOp("VMOVUPD", P_66, M_0F, WIG, 0x10, 0x11);
1108         public static final VexMoveOp VMOVSS  = new VexMoveOp("VMOVSS",  P_F3, M_0F, WIG, 0x10, 0x11);
1109         public static final VexMoveOp VMOVSD  = new VexMoveOp("VMOVSD",  P_F2, M_0F, WIG, 0x10, 0x11);
1110         public static final VexMoveOp VMOVD   = new VexMoveOp("VMOVD",   P_66, M_0F, W0,  0x6E, 0x7E, VEXOpAssertion.XMM_CPU);
1111         public static final VexMoveOp VMOVQ   = new VexMoveOp("VMOVQ",   P_66, M_0F, W1,  0x6E, 0x7E, VEXOpAssertion.XMM_CPU);
1112         // @formatter:on
1113 
1114         private final int opReverse;
1115 
1116         private VexMoveOp(String opcode, int pp, int mmmmm, int w, int op, int opReverse) {
1117             this(opcode, pp, mmmmm, w, op, opReverse, VEXOpAssertion.AVX1);
1118         }
1119 
1120         private VexMoveOp(String opcode, int pp, int mmmmm, int w, int op, int opReverse, VEXOpAssertion assertion) {
1121             super(opcode, pp, mmmmm, w, op, assertion);
1122             this.opReverse = opReverse;
1123         }
1124 
1125         public void emit(AMD64Assembler asm, AVXSize size, AMD64Address dst, Register src) {
1126             assert assertion.check((AMD64) asm.target.arch, size, src, null, null);
1127             asm.vexPrefix(src, Register.None, dst, size, pp, mmmmm, w, false);
1128             asm.emitByte(opReverse);
1129             asm.emitOperandHelper(src, dst, 0);
1130         }
1131 
1132         public void emitReverse(AMD64Assembler asm, AVXSize size, Register dst, Register src) {
1133             assert assertion.check((AMD64) asm.target.arch, size, src, null, dst);
1134             asm.vexPrefix(src, Register.None, dst, size, pp, mmmmm, w, false);
1135             asm.emitByte(opReverse);
1136             asm.emitModRM(src, dst);
1137         }
1138     }
1139 
1140     public interface VexRRIOp {
1141         void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src, int imm8);
1142     }
1143 
1144     /**
1145      * VEX-encoded instructions with an operand order of RMI.
1146      */
1147     public static final class VexRMIOp extends VexOp implements VexRRIOp {
1148         // @formatter:off
1149         public static final VexRMIOp VPERMQ   = new VexRMIOp("VPERMQ",   P_66, M_0F3A, W1,  0x00, VEXOpAssertion.AVX2_256ONLY);
1150         public static final VexRMIOp VPSHUFLW = new VexRMIOp("VPSHUFLW", P_F2, M_0F,   WIG, 0x70, VEXOpAssertion.AVX1_2);
1151         public static final VexRMIOp VPSHUFHW = new VexRMIOp("VPSHUFHW", P_F3, M_0F,   WIG, 0x70, VEXOpAssertion.AVX1_2);
1152         public static final VexRMIOp VPSHUFD  = new VexRMIOp("VPSHUFD",  P_66, M_0F,   WIG, 0x70, VEXOpAssertion.AVX1_2);
1153         // @formatter:on
1154 
1155         private VexRMIOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
1156             super(opcode, pp, mmmmm, w, op, assertion);
1157         }
1158 
1159         @Override
1160         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src, int imm8) {
1161             assert assertion.check((AMD64) asm.target.arch, size, dst, null, src);
1162             asm.vexPrefix(dst, Register.None, src, size, pp, mmmmm, w, false);
1163             asm.emitByte(op);
1164             asm.emitModRM(dst, src);
1165             asm.emitByte(imm8);
1166         }
1167 
1168         public void emit(AMD64Assembler asm, AVXSize size, Register dst, AMD64Address src, int imm8) {
1169             assert assertion.check((AMD64) asm.target.arch, size, dst, null, null);
1170             asm.vexPrefix(dst, Register.None, src, size, pp, mmmmm, w, false);
1171             asm.emitByte(op);
1172             asm.emitOperandHelper(dst, src, 1);
1173             asm.emitByte(imm8);
1174         }
1175     }
1176 
1177     /**
1178      * VEX-encoded instructions with an operand order of MRI.
1179      */
1180     public static final class VexMRIOp extends VexOp implements VexRRIOp {
1181         // @formatter:off
1182         public static final VexMRIOp VEXTRACTF128 = new VexMRIOp("VEXTRACTF128", P_66, M_0F3A, W0, 0x19, VEXOpAssertion.AVX1_256ONLY);
1183         public static final VexMRIOp VEXTRACTI128 = new VexMRIOp("VEXTRACTI128", P_66, M_0F3A, W0, 0x39, VEXOpAssertion.AVX2_256ONLY);
1184         public static final VexMRIOp VPEXTRB      = new VexMRIOp("VPEXTRB",      P_66, M_0F3A, W0, 0x14, VEXOpAssertion.XMM_CPU);
1185         public static final VexMRIOp VPEXTRW      = new VexMRIOp("VPEXTRW",      P_66, M_0F3A, W0, 0x15, VEXOpAssertion.XMM_CPU);
1186         public static final VexMRIOp VPEXTRD      = new VexMRIOp("VPEXTRD",      P_66, M_0F3A, W0, 0x16, VEXOpAssertion.XMM_CPU);
1187         public static final VexMRIOp VPEXTRQ      = new VexMRIOp("VPEXTRQ",      P_66, M_0F3A, W1, 0x16, VEXOpAssertion.XMM_CPU);
1188         // @formatter:on
1189 
1190         private VexMRIOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
1191             super(opcode, pp, mmmmm, w, op, assertion);
1192         }
1193 
1194         @Override
1195         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src, int imm8) {
1196             assert assertion.check((AMD64) asm.target.arch, size, src, null, dst);
1197             asm.vexPrefix(src, Register.None, dst, size, pp, mmmmm, w, false);
1198             asm.emitByte(op);
1199             asm.emitModRM(src, dst);
1200             asm.emitByte(imm8);
1201         }
1202 
1203         public void emit(AMD64Assembler asm, AVXSize size, AMD64Address dst, Register src, int imm8) {
1204             assert assertion.check((AMD64) asm.target.arch, size, src, null, null);
1205             asm.vexPrefix(src, Register.None, dst, size, pp, mmmmm, w, false);
1206             asm.emitByte(op);
1207             asm.emitOperandHelper(src, dst, 1);
1208             asm.emitByte(imm8);
1209         }
1210     }
1211 
1212     /**
1213      * VEX-encoded instructions with an operand order of RVMR.
1214      */
1215     public static class VexRVMROp extends VexOp {
1216         // @formatter:off
1217         public static final VexRVMROp VPBLENDVB  = new VexRVMROp("VPBLENDVB",  P_66, M_0F3A, W0, 0x4C, VEXOpAssertion.AVX1_2);
1218         public static final VexRVMROp VPBLENDVPS = new VexRVMROp("VPBLENDVPS", P_66, M_0F3A, W0, 0x4A, VEXOpAssertion.AVX1);
1219         public static final VexRVMROp VPBLENDVPD = new VexRVMROp("VPBLENDVPD", P_66, M_0F3A, W0, 0x4B, VEXOpAssertion.AVX1);
1220         // @formatter:on
1221 
1222         protected VexRVMROp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
1223             super(opcode, pp, mmmmm, w, op, assertion);
1224         }
1225 
1226         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register mask, Register src1, Register src2) {
1227             assert assertion.check((AMD64) asm.target.arch, size, dst, mask, src1, src2);
1228             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w, false);
1229             asm.emitByte(op);
1230             asm.emitModRM(dst, src2);
1231             asm.emitByte(mask.encoding() << 4);
1232         }
1233 
1234         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register mask, Register src1, AMD64Address src2) {
1235             assert assertion.check((AMD64) asm.target.arch, size, dst, mask, src1, null);
1236             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w, false);
1237             asm.emitByte(op);
1238             asm.emitOperandHelper(dst, src2, 0);
1239             asm.emitByte(mask.encoding() << 4);
1240         }
1241     }
1242 
1243     /**
1244      * VEX-encoded instructions with an operand order of RVM.
1245      */
1246     public static class VexRVMOp extends VexOp {
1247         // @formatter:off
1248         public static final VexRVMOp VANDPS    = new VexRVMOp("VANDPS",    P_,   M_0F,   WIG, 0x54);
1249         public static final VexRVMOp VANDPD    = new VexRVMOp("VANDPD",    P_66, M_0F,   WIG, 0x54);
1250         public static final VexRVMOp VANDNPS   = new VexRVMOp("VANDNPS",   P_,   M_0F,   WIG, 0x55);
1251         public static final VexRVMOp VANDNPD   = new VexRVMOp("VANDNPD",   P_66, M_0F,   WIG, 0x55);
1252         public static final VexRVMOp VORPS     = new VexRVMOp("VORPS",     P_,   M_0F,   WIG, 0x56);
1253         public static final VexRVMOp VORPD     = new VexRVMOp("VORPD",     P_66, M_0F,   WIG, 0x56);
1254         public static final VexRVMOp VXORPS    = new VexRVMOp("VXORPS",    P_,   M_0F,   WIG, 0x57);
1255         public static final VexRVMOp VXORPD    = new VexRVMOp("VXORPD",    P_66, M_0F,   WIG, 0x57);
1256         public static final VexRVMOp VADDPS    = new VexRVMOp("VADDPS",    P_,   M_0F,   WIG, 0x58);
1257         public static final VexRVMOp VADDPD    = new VexRVMOp("VADDPD",    P_66, M_0F,   WIG, 0x58);
1258         public static final VexRVMOp VADDSS    = new VexRVMOp("VADDSS",    P_F3, M_0F,   WIG, 0x58);
1259         public static final VexRVMOp VADDSD    = new VexRVMOp("VADDSD",    P_F2, M_0F,   WIG, 0x58);
1260         public static final VexRVMOp VMULPS    = new VexRVMOp("VMULPS",    P_,   M_0F,   WIG, 0x59);
1261         public static final VexRVMOp VMULPD    = new VexRVMOp("VMULPD",    P_66, M_0F,   WIG, 0x59);
1262         public static final VexRVMOp VMULSS    = new VexRVMOp("VMULSS",    P_F3, M_0F,   WIG, 0x59);
1263         public static final VexRVMOp VMULSD    = new VexRVMOp("VMULSD",    P_F2, M_0F,   WIG, 0x59);
1264         public static final VexRVMOp VSUBPS    = new VexRVMOp("VSUBPS",    P_,   M_0F,   WIG, 0x5C);
1265         public static final VexRVMOp VSUBPD    = new VexRVMOp("VSUBPD",    P_66, M_0F,   WIG, 0x5C);
1266         public static final VexRVMOp VSUBSS    = new VexRVMOp("VSUBSS",    P_F3, M_0F,   WIG, 0x5C);
1267         public static final VexRVMOp VSUBSD    = new VexRVMOp("VSUBSD",    P_F2, M_0F,   WIG, 0x5C);
1268         public static final VexRVMOp VMINPS    = new VexRVMOp("VMINPS",    P_,   M_0F,   WIG, 0x5D);
1269         public static final VexRVMOp VMINPD    = new VexRVMOp("VMINPD",    P_66, M_0F,   WIG, 0x5D);
1270         public static final VexRVMOp VMINSS    = new VexRVMOp("VMINSS",    P_F3, M_0F,   WIG, 0x5D);
1271         public static final VexRVMOp VMINSD    = new VexRVMOp("VMINSD",    P_F2, M_0F,   WIG, 0x5D);
1272         public static final VexRVMOp VDIVPS    = new VexRVMOp("VDIVPS",    P_,   M_0F,   WIG, 0x5E);
1273         public static final VexRVMOp VDIVPD    = new VexRVMOp("VDIVPD",    P_66, M_0F,   WIG, 0x5E);
1274         public static final VexRVMOp VDIVSS    = new VexRVMOp("VDIVPS",    P_F3, M_0F,   WIG, 0x5E);
1275         public static final VexRVMOp VDIVSD    = new VexRVMOp("VDIVPD",    P_F2, M_0F,   WIG, 0x5E);
1276         public static final VexRVMOp VMAXPS    = new VexRVMOp("VMAXPS",    P_,   M_0F,   WIG, 0x5F);
1277         public static final VexRVMOp VMAXPD    = new VexRVMOp("VMAXPD",    P_66, M_0F,   WIG, 0x5F);
1278         public static final VexRVMOp VMAXSS    = new VexRVMOp("VMAXSS",    P_F3, M_0F,   WIG, 0x5F);
1279         public static final VexRVMOp VMAXSD    = new VexRVMOp("VMAXSD",    P_F2, M_0F,   WIG, 0x5F);
1280         public static final VexRVMOp VADDSUBPS = new VexRVMOp("VADDSUBPS", P_F2, M_0F,   WIG, 0xD0);
1281         public static final VexRVMOp VADDSUBPD = new VexRVMOp("VADDSUBPD", P_66, M_0F,   WIG, 0xD0);
1282         public static final VexRVMOp VPAND     = new VexRVMOp("VPAND",     P_66, M_0F,   WIG, 0xDB, VEXOpAssertion.AVX1_2);
1283         public static final VexRVMOp VPOR      = new VexRVMOp("VPOR",      P_66, M_0F,   WIG, 0xEB, VEXOpAssertion.AVX1_2);
1284         public static final VexRVMOp VPXOR     = new VexRVMOp("VPXOR",     P_66, M_0F,   WIG, 0xEF, VEXOpAssertion.AVX1_2);
1285         public static final VexRVMOp VPADDB    = new VexRVMOp("VPADDB",    P_66, M_0F,   WIG, 0xFC, VEXOpAssertion.AVX1_2);
1286         public static final VexRVMOp VPADDW    = new VexRVMOp("VPADDW",    P_66, M_0F,   WIG, 0xFD, VEXOpAssertion.AVX1_2);
1287         public static final VexRVMOp VPADDD    = new VexRVMOp("VPADDD",    P_66, M_0F,   WIG, 0xFE, VEXOpAssertion.AVX1_2);
1288         public static final VexRVMOp VPADDQ    = new VexRVMOp("VPADDQ",    P_66, M_0F,   WIG, 0xD4, VEXOpAssertion.AVX1_2);
1289         public static final VexRVMOp VPMULHUW  = new VexRVMOp("VPMULHUW",  P_66, M_0F,   WIG, 0xE4, VEXOpAssertion.AVX1_2);
1290         public static final VexRVMOp VPMULHW   = new VexRVMOp("VPMULHW",   P_66, M_0F,   WIG, 0xE5, VEXOpAssertion.AVX1_2);
1291         public static final VexRVMOp VPMULLW   = new VexRVMOp("VPMULLW",   P_66, M_0F,   WIG, 0xD5, VEXOpAssertion.AVX1_2);
1292         public static final VexRVMOp VPMULLD   = new VexRVMOp("VPMULLD",   P_66, M_0F38, WIG, 0x40, VEXOpAssertion.AVX1_2);
1293         public static final VexRVMOp VPSUBB    = new VexRVMOp("VPSUBB",    P_66, M_0F,   WIG, 0xF8, VEXOpAssertion.AVX1_2);
1294         public static final VexRVMOp VPSUBW    = new VexRVMOp("VPSUBW",    P_66, M_0F,   WIG, 0xF9, VEXOpAssertion.AVX1_2);
1295         public static final VexRVMOp VPSUBD    = new VexRVMOp("VPSUBD",    P_66, M_0F,   WIG, 0xFA, VEXOpAssertion.AVX1_2);
1296         public static final VexRVMOp VPSUBQ    = new VexRVMOp("VPSUBQ",    P_66, M_0F,   WIG, 0xFB, VEXOpAssertion.AVX1_2);
1297         public static final VexRVMOp VPSHUFB   = new VexRVMOp("VPSHUFB",   P_66, M_0F38, WIG, 0x00, VEXOpAssertion.AVX1_2);
1298         public static final VexRVMOp VCVTSD2SS = new VexRVMOp("VCVTSD2SS", P_F2, M_0F,   WIG, 0x5A);
1299         public static final VexRVMOp VCVTSS2SD = new VexRVMOp("VCVTSS2SD", P_F3, M_0F,   WIG, 0x5A);
1300         public static final VexRVMOp VCVTSI2SD = new VexRVMOp("VCVTSI2SD", P_F2, M_0F,   W0,  0x2A, VEXOpAssertion.XMM_XMM_CPU);
1301         public static final VexRVMOp VCVTSQ2SD = new VexRVMOp("VCVTSQ2SD", P_F2, M_0F,   W1,  0x2A, VEXOpAssertion.XMM_XMM_CPU);
1302         public static final VexRVMOp VCVTSI2SS = new VexRVMOp("VCVTSI2SS", P_F3, M_0F,   W0,  0x2A, VEXOpAssertion.XMM_XMM_CPU);
1303         public static final VexRVMOp VCVTSQ2SS = new VexRVMOp("VCVTSQ2SS", P_F3, M_0F,   W1,  0x2A, VEXOpAssertion.XMM_XMM_CPU);
1304         public static final VexRVMOp VPCMPEQB  = new VexRVMOp("VPCMPEQB",  P_66, M_0F,   WIG, 0x74, VEXOpAssertion.AVX1_2);
1305         public static final VexRVMOp VPCMPEQW  = new VexRVMOp("VPCMPEQW",  P_66, M_0F,   WIG, 0x75, VEXOpAssertion.AVX1_2);
1306         public static final VexRVMOp VPCMPEQD  = new VexRVMOp("VPCMPEQD",  P_66, M_0F,   WIG, 0x76, VEXOpAssertion.AVX1_2);
1307         public static final VexRVMOp VPCMPEQQ  = new VexRVMOp("VPCMPEQQ",  P_66, M_0F38, WIG, 0x29, VEXOpAssertion.AVX1_2);
1308         public static final VexRVMOp VPCMPGTB  = new VexRVMOp("VPCMPGTB",  P_66, M_0F,   WIG, 0x64, VEXOpAssertion.AVX1_2);
1309         public static final VexRVMOp VPCMPGTW  = new VexRVMOp("VPCMPGTW",  P_66, M_0F,   WIG, 0x65, VEXOpAssertion.AVX1_2);
1310         public static final VexRVMOp VPCMPGTD  = new VexRVMOp("VPCMPGTD",  P_66, M_0F,   WIG, 0x66, VEXOpAssertion.AVX1_2);
1311         public static final VexRVMOp VPCMPGTQ  = new VexRVMOp("VPCMPGTQ",  P_66, M_0F38, WIG, 0x37, VEXOpAssertion.AVX1_2);
1312         public static final VexRVMOp VFMADD231SS = new VexRVMOp("VFMADD231SS", P_66, M_0F38, W0, 0xB9, VEXOpAssertion.FMA);
1313         public static final VexRVMOp VFMADD231SD = new VexRVMOp("VFMADD231SD", P_66, M_0F38, W1, 0xB9, VEXOpAssertion.FMA);
1314         // @formatter:on
1315 
1316         private VexRVMOp(String opcode, int pp, int mmmmm, int w, int op) {
1317             this(opcode, pp, mmmmm, w, op, VEXOpAssertion.AVX1);
1318         }
1319 
1320         protected VexRVMOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
1321             super(opcode, pp, mmmmm, w, op, assertion);
1322         }
1323 
1324         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, Register src2) {
1325             assert assertion.check((AMD64) asm.target.arch, size, dst, src1, src2);
1326             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w, false);
1327             asm.emitByte(op);
1328             asm.emitModRM(dst, src2);
1329         }
1330 
1331         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, AMD64Address src2) {
1332             assert assertion.check((AMD64) asm.target.arch, size, dst, src1, null);
1333             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w, false);
1334             asm.emitByte(op);
1335             asm.emitOperandHelper(dst, src2, 0);
1336         }
1337     }
1338 
1339     public static final class VexGeneralPurposeRVMOp extends VexRVMOp {
1340         // @formatter:off
1341         public static final VexGeneralPurposeRVMOp ANDN   = new VexGeneralPurposeRVMOp("ANDN",   P_,   M_0F38, WIG, 0xF2, VEXOpAssertion.BMI1);
1342         public static final VexGeneralPurposeRVMOp MULX   = new VexGeneralPurposeRVMOp("MULX",   P_F2, M_0F38, WIG, 0xF6, VEXOpAssertion.BMI2);
1343         public static final VexGeneralPurposeRVMOp PDEP   = new VexGeneralPurposeRVMOp("PDEP",   P_F2, M_0F38, WIG, 0xF5, VEXOpAssertion.BMI2);
1344         public static final VexGeneralPurposeRVMOp PEXT   = new VexGeneralPurposeRVMOp("PEXT",   P_F3, M_0F38, WIG, 0xF5, VEXOpAssertion.BMI2);
1345         // @formatter:on
1346 
1347         private VexGeneralPurposeRVMOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
1348             super(opcode, pp, mmmmm, w, op, assertion);
1349         }
1350 
1351         @Override
1352         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, Register src2) {
1353             assert assertion.check((AMD64) asm.target.arch, LZ, dst, src1, src2, null);
1354             assert size == AVXSize.DWORD || size == AVXSize.QWORD;
1355             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, size == AVXSize.DWORD ? W0 : W1, false);
1356             asm.emitByte(op);
1357             asm.emitModRM(dst, src2);
1358         }
1359 
1360         @Override
1361         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, AMD64Address src2) {
1362             assert assertion.check((AMD64) asm.target.arch, LZ, dst, src1, null, null);
1363             assert size == AVXSize.DWORD || size == AVXSize.QWORD;
1364             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, size == AVXSize.DWORD ? W0 : W1, false);
1365             asm.emitByte(op);
1366             asm.emitOperandHelper(dst, src2, 0);
1367         }
1368     }
1369 
1370     public static final class VexGeneralPurposeRMVOp extends VexOp {
1371         // @formatter:off
1372         public static final VexGeneralPurposeRMVOp BEXTR  = new VexGeneralPurposeRMVOp("BEXTR",  P_,   M_0F38, WIG, 0xF7, VEXOpAssertion.BMI1);
1373         public static final VexGeneralPurposeRMVOp BZHI   = new VexGeneralPurposeRMVOp("BZHI",   P_,   M_0F38, WIG, 0xF5, VEXOpAssertion.BMI2);
1374         public static final VexGeneralPurposeRMVOp SARX   = new VexGeneralPurposeRMVOp("SARX",   P_F3, M_0F38, WIG, 0xF7, VEXOpAssertion.BMI2);
1375         public static final VexGeneralPurposeRMVOp SHRX   = new VexGeneralPurposeRMVOp("SHRX",   P_F2, M_0F38, WIG, 0xF7, VEXOpAssertion.BMI2);
1376         public static final VexGeneralPurposeRMVOp SHLX   = new VexGeneralPurposeRMVOp("SHLX",   P_66, M_0F38, WIG, 0xF7, VEXOpAssertion.BMI2);
1377         // @formatter:on
1378 
1379         private VexGeneralPurposeRMVOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
1380             super(opcode, pp, mmmmm, w, op, assertion);
1381         }
1382 
1383         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, Register src2) {
1384             assert assertion.check((AMD64) asm.target.arch, LZ, dst, src2, src1, null);
1385             assert size == AVXSize.DWORD || size == AVXSize.QWORD;
1386             asm.vexPrefix(dst, src2, src1, size, pp, mmmmm, size == AVXSize.DWORD ? W0 : W1, false);
1387             asm.emitByte(op);
1388             asm.emitModRM(dst, src1);
1389         }
1390 
1391         public void emit(AMD64Assembler asm, AVXSize size, Register dst, AMD64Address src1, Register src2) {
1392             assert assertion.check((AMD64) asm.target.arch, LZ, dst, src2, null, null);
1393             assert size == AVXSize.DWORD || size == AVXSize.QWORD;
1394             asm.vexPrefix(dst, src2, src1, size, pp, mmmmm, size == AVXSize.DWORD ? W0 : W1, false);
1395             asm.emitByte(op);
1396             asm.emitOperandHelper(dst, src1, 0);
1397         }
1398     }
1399 
1400     public static final class VexGeneralPurposeRMOp extends VexRMOp {
1401         // @formatter:off
1402         public static final VexGeneralPurposeRMOp BLSI    = new VexGeneralPurposeRMOp("BLSI",   P_,    M_0F38, WIG, 0xF3, 3, VEXOpAssertion.BMI1);
1403         public static final VexGeneralPurposeRMOp BLSMSK  = new VexGeneralPurposeRMOp("BLSMSK", P_,    M_0F38, WIG, 0xF3, 2, VEXOpAssertion.BMI1);
1404         public static final VexGeneralPurposeRMOp BLSR    = new VexGeneralPurposeRMOp("BLSR",   P_,    M_0F38, WIG, 0xF3, 1, VEXOpAssertion.BMI1);
1405         // @formatter:on
1406         private final int ext;
1407 
1408         private VexGeneralPurposeRMOp(String opcode, int pp, int mmmmm, int w, int op, int ext, VEXOpAssertion assertion) {
1409             super(opcode, pp, mmmmm, w, op, assertion);
1410             this.ext = ext;
1411         }
1412 
1413         @Override
1414         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src) {
1415             assert assertion.check((AMD64) asm.target.arch, size, dst, null, null);
1416             asm.vexPrefix(AMD64.cpuRegisters[ext], dst, src, size, pp, mmmmm, size == AVXSize.DWORD ? W0 : W1, false);
1417             asm.emitByte(op);
1418             asm.emitModRM(ext, src);
1419         }
1420 
1421         @Override
1422         public void emit(AMD64Assembler asm, AVXSize size, Register dst, AMD64Address src) {
1423             assert assertion.check((AMD64) asm.target.arch, size, dst, null, null);
1424             asm.vexPrefix(AMD64.cpuRegisters[ext], dst, src, size, pp, mmmmm, size == AVXSize.DWORD ? W0 : W1, false);
1425             asm.emitByte(op);
1426             asm.emitOperandHelper(ext, src, 0);
1427         }
1428     }
1429 
1430     /**
1431      * VEX-encoded shift instructions with an operand order of either RVM or VMI.
1432      */
1433     public static final class VexShiftOp extends VexRVMOp implements VexRRIOp {
1434         // @formatter:off
1435         public static final VexShiftOp VPSRLW = new VexShiftOp("VPSRLW", P_66, M_0F, WIG, 0xD1, 0x71, 2);
1436         public static final VexShiftOp VPSRLD = new VexShiftOp("VPSRLD", P_66, M_0F, WIG, 0xD2, 0x72, 2);
1437         public static final VexShiftOp VPSRLQ = new VexShiftOp("VPSRLQ", P_66, M_0F, WIG, 0xD3, 0x73, 2);
1438         public static final VexShiftOp VPSRAW = new VexShiftOp("VPSRAW", P_66, M_0F, WIG, 0xE1, 0x71, 4);
1439         public static final VexShiftOp VPSRAD = new VexShiftOp("VPSRAD", P_66, M_0F, WIG, 0xE2, 0x72, 4);
1440         public static final VexShiftOp VPSLLW = new VexShiftOp("VPSLLW", P_66, M_0F, WIG, 0xF1, 0x71, 6);
1441         public static final VexShiftOp VPSLLD = new VexShiftOp("VPSLLD", P_66, M_0F, WIG, 0xF2, 0x72, 6);
1442         public static final VexShiftOp VPSLLQ = new VexShiftOp("VPSLLQ", P_66, M_0F, WIG, 0xF3, 0x73, 6);
1443         // @formatter:on
1444 
1445         private final int immOp;
1446         private final int r;
1447 
1448         private VexShiftOp(String opcode, int pp, int mmmmm, int w, int op, int immOp, int r) {
1449             super(opcode, pp, mmmmm, w, op, VEXOpAssertion.AVX1_2);
1450             this.immOp = immOp;
1451             this.r = r;
1452         }
1453 
1454         @Override
1455         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src, int imm8) {
1456             assert assertion.check((AMD64) asm.target.arch, size, null, dst, src);
1457             asm.vexPrefix(null, dst, src, size, pp, mmmmm, w, false);
1458             asm.emitByte(immOp);
1459             asm.emitModRM(r, src);
1460             asm.emitByte(imm8);
1461         }
1462     }
1463 
1464     public static final class VexMaskMoveOp extends VexOp {
1465         // @formatter:off
1466         public static final VexMaskMoveOp VMASKMOVPS = new VexMaskMoveOp("VMASKMOVPS", P_66, M_0F38, W0, 0x2C, 0x2E);
1467         public static final VexMaskMoveOp VMASKMOVPD = new VexMaskMoveOp("VMASKMOVPD", P_66, M_0F38, W0, 0x2D, 0x2F);
1468         public static final VexMaskMoveOp VPMASKMOVD = new VexMaskMoveOp("VPMASKMOVD", P_66, M_0F38, W0, 0x8C, 0x8E, VEXOpAssertion.AVX2);
1469         public static final VexMaskMoveOp VPMASKMOVQ = new VexMaskMoveOp("VPMASKMOVQ", P_66, M_0F38, W1, 0x8C, 0x8E, VEXOpAssertion.AVX2);
1470         // @formatter:on
1471 
1472         private final int opReverse;
1473 
1474         private VexMaskMoveOp(String opcode, int pp, int mmmmm, int w, int op, int opReverse) {
1475             this(opcode, pp, mmmmm, w, op, opReverse, VEXOpAssertion.AVX1);
1476         }
1477 
1478         private VexMaskMoveOp(String opcode, int pp, int mmmmm, int w, int op, int opReverse, VEXOpAssertion assertion) {
1479             super(opcode, pp, mmmmm, w, op, assertion);
1480             this.opReverse = opReverse;
1481         }
1482 
1483         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register mask, AMD64Address src) {
1484             assert assertion.check((AMD64) asm.target.arch, size, dst, mask, null);
1485             asm.vexPrefix(dst, mask, src, size, pp, mmmmm, w, false);
1486             asm.emitByte(op);
1487             asm.emitOperandHelper(dst, src, 0);
1488         }
1489 
1490         public void emit(AMD64Assembler asm, AVXSize size, AMD64Address dst, Register mask, Register src) {
1491             assert assertion.check((AMD64) asm.target.arch, size, src, mask, null);
1492             asm.vexPrefix(src, mask, dst, size, pp, mmmmm, w, false);
1493             asm.emitByte(opReverse);
1494             asm.emitOperandHelper(src, dst, 0);
1495         }
1496     }
1497 
1498     /**
1499      * VEX-encoded instructions with an operand order of RVMI.
1500      */
1501     public static final class VexRVMIOp extends VexOp {
1502         // @formatter:off
1503         public static final VexRVMIOp VSHUFPS     = new VexRVMIOp("VSHUFPS",     P_,   M_0F,   WIG, 0xC6);
1504         public static final VexRVMIOp VSHUFPD     = new VexRVMIOp("VSHUFPD",     P_66, M_0F,   WIG, 0xC6);
1505         public static final VexRVMIOp VINSERTF128 = new VexRVMIOp("VINSERTF128", P_66, M_0F3A, W0,  0x18, VEXOpAssertion.AVX1_256ONLY);
1506         public static final VexRVMIOp VINSERTI128 = new VexRVMIOp("VINSERTI128", P_66, M_0F3A, W0,  0x38, VEXOpAssertion.AVX2_256ONLY);
1507         // @formatter:on
1508 
1509         private VexRVMIOp(String opcode, int pp, int mmmmm, int w, int op) {
1510             this(opcode, pp, mmmmm, w, op, VEXOpAssertion.AVX1);
1511         }
1512 
1513         private VexRVMIOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
1514             super(opcode, pp, mmmmm, w, op, assertion);
1515         }
1516 
1517         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, Register src2, int imm8) {
1518             assert assertion.check((AMD64) asm.target.arch, size, dst, src1, src2);
1519             assert (imm8 & 0xFF) == imm8;
1520             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w, false);
1521             asm.emitByte(op);
1522             asm.emitModRM(dst, src2);
1523             asm.emitByte(imm8);
1524         }
1525 
1526         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, AMD64Address src2, int imm8) {
1527             assert assertion.check((AMD64) asm.target.arch, size, dst, src1, null);
1528             assert (imm8 & 0xFF) == imm8;
1529             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w, false);
1530             asm.emitByte(op);
1531             asm.emitOperandHelper(dst, src2, 1);
1532             asm.emitByte(imm8);
1533         }
1534     }
1535 
1536     /**
1537      * VEX-encoded comparison operation with an operand order of RVMI. The immediate operand is a
1538      * comparison operator.
1539      */
1540     public static final class VexFloatCompareOp extends VexOp {
1541         // @formatter:off
1542         public static final VexFloatCompareOp VCMPPS = new VexFloatCompareOp("VCMPPS", P_,   M_0F, WIG, 0xC2);
1543         public static final VexFloatCompareOp VCMPPD = new VexFloatCompareOp("VCMPPD", P_66, M_0F, WIG, 0xC2);
1544         public static final VexFloatCompareOp VCMPSS = new VexFloatCompareOp("VCMPSS", P_F2, M_0F, WIG, 0xC2);
1545         public static final VexFloatCompareOp VCMPSD = new VexFloatCompareOp("VCMPSD", P_F2, M_0F, WIG, 0xC2);
1546         // @formatter:on
1547 
1548         public enum Predicate {
1549             EQ_OQ(0x00),
1550             LT_OS(0x01),
1551             LE_OS(0x02),
1552             UNORD_Q(0x03),
1553             NEQ_UQ(0x04),
1554             NLT_US(0x05),
1555             NLE_US(0x06),
1556             ORD_Q(0x07),
1557             EQ_UQ(0x08),
1558             NGE_US(0x09),
1559             NGT_US(0x0a),
1560             FALSE_OQ(0x0b),
1561             NEQ_OQ(0x0c),
1562             GE_OS(0x0d),
1563             GT_OS(0x0e),
1564             TRUE_UQ(0x0f),
1565             EQ_OS(0x10),
1566             LT_OQ(0x11),
1567             LE_OQ(0x12),
1568             UNORD_S(0x13),
1569             NEQ_US(0x14),
1570             NLT_UQ(0x15),
1571             NLE_UQ(0x16),
1572             ORD_S(0x17),
1573             EQ_US(0x18),
1574             NGE_UQ(0x19),
1575             NGT_UQ(0x1a),
1576             FALSE_OS(0x1b),
1577             NEQ_OS(0x1c),
1578             GE_OQ(0x1d),
1579             GT_OQ(0x1e),
1580             TRUE_US(0x1f);
1581 
1582             private int imm8;
1583 
1584             Predicate(int imm8) {
1585                 this.imm8 = imm8;
1586             }
1587 
1588             public static Predicate getPredicate(Condition condition, boolean unorderedIsTrue) {
1589                 if (unorderedIsTrue) {
1590                     switch (condition) {
1591                         case EQ:
1592                             return EQ_UQ;
1593                         case NE:
1594                             return NEQ_UQ;
1595                         case LT:
1596                             return NGE_UQ;
1597                         case LE:
1598                             return NGT_UQ;
1599                         case GT:
1600                             return NLE_UQ;
1601                         case GE:
1602                             return NLT_UQ;
1603                         default:
1604                             throw GraalError.shouldNotReachHere();
1605                     }
1606                 } else {
1607                     switch (condition) {
1608                         case EQ:
1609                             return EQ_OQ;
1610                         case NE:
1611                             return NEQ_OQ;
1612                         case LT:
1613                             return LT_OQ;
1614                         case LE:
1615                             return LE_OQ;
1616                         case GT:
1617                             return GT_OQ;
1618                         case GE:
1619                             return GE_OQ;
1620                         default:
1621                             throw GraalError.shouldNotReachHere();
1622                     }
1623                 }
1624             }
1625         }
1626 
1627         private VexFloatCompareOp(String opcode, int pp, int mmmmm, int w, int op) {
1628             super(opcode, pp, mmmmm, w, op, VEXOpAssertion.AVX1);
1629         }
1630 
1631         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, Register src2, Predicate p) {
1632             assert assertion.check((AMD64) asm.target.arch, size, dst, src1, src2);
1633             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w, false);
1634             asm.emitByte(op);
1635             asm.emitModRM(dst, src2);
1636             asm.emitByte(p.imm8);
1637         }
1638 
1639         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, AMD64Address src2, Predicate p) {
1640             assert assertion.check((AMD64) asm.target.arch, size, dst, src1, null);
1641             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w, false);
1642             asm.emitByte(op);
1643             asm.emitOperandHelper(dst, src2, 1);
1644             asm.emitByte(p.imm8);
1645         }
1646     }
1647 
1648     public final void addl(AMD64Address dst, int imm32) {
1649         ADD.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
1650     }
1651 
1652     public final void addl(Register dst, int imm32) {
1653         ADD.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
1654     }
1655 
1656     public final void addl(Register dst, Register src) {
1657         ADD.rmOp.emit(this, DWORD, dst, src);
1658     }
1659 
1660     public final void addpd(Register dst, Register src) {
1661         SSEOp.ADD.emit(this, PD, dst, src);
1662     }
1663 
1664     public final void addpd(Register dst, AMD64Address src) {
1665         SSEOp.ADD.emit(this, PD, dst, src);
1666     }
1667 
1668     public final void addsd(Register dst, Register src) {
1669         SSEOp.ADD.emit(this, SD, dst, src);
1670     }
1671 
1672     public final void addsd(Register dst, AMD64Address src) {
1673         SSEOp.ADD.emit(this, SD, dst, src);
1674     }
1675 
1676     private void addrNop4() {
1677         // 4 bytes: NOP DWORD PTR [EAX+0]
1678         emitByte(0x0F);
1679         emitByte(0x1F);
1680         emitByte(0x40); // emitRm(cbuf, 0x1, EAXEnc, EAXEnc);
1681         emitByte(0); // 8-bits offset (1 byte)
1682     }
1683 
1684     private void addrNop5() {
1685         // 5 bytes: NOP DWORD PTR [EAX+EAX*0+0] 8-bits offset
1686         emitByte(0x0F);
1687         emitByte(0x1F);
1688         emitByte(0x44); // emitRm(cbuf, 0x1, EAXEnc, 0x4);
1689         emitByte(0x00); // emitRm(cbuf, 0x0, EAXEnc, EAXEnc);
1690         emitByte(0); // 8-bits offset (1 byte)
1691     }
1692 
1693     private void addrNop7() {
1694         // 7 bytes: NOP DWORD PTR [EAX+0] 32-bits offset
1695         emitByte(0x0F);
1696         emitByte(0x1F);
1697         emitByte(0x80); // emitRm(cbuf, 0x2, EAXEnc, EAXEnc);
1698         emitInt(0); // 32-bits offset (4 bytes)
1699     }
1700 
1701     private void addrNop8() {
1702         // 8 bytes: NOP DWORD PTR [EAX+EAX*0+0] 32-bits offset
1703         emitByte(0x0F);
1704         emitByte(0x1F);
1705         emitByte(0x84); // emitRm(cbuf, 0x2, EAXEnc, 0x4);
1706         emitByte(0x00); // emitRm(cbuf, 0x0, EAXEnc, EAXEnc);
1707         emitInt(0); // 32-bits offset (4 bytes)
1708     }
1709 
1710     public final void andl(Register dst, int imm32) {
1711         AND.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
1712     }
1713 
1714     public final void andl(Register dst, Register src) {
1715         AND.rmOp.emit(this, DWORD, dst, src);
1716     }
1717 
1718     public final void andpd(Register dst, Register src) {
1719         SSEOp.AND.emit(this, PD, dst, src);
1720     }
1721 
1722     public final void andpd(Register dst, AMD64Address src) {
1723         SSEOp.AND.emit(this, PD, dst, src);
1724     }
1725 
1726     public final void bsfq(Register dst, Register src) {
1727         prefixq(dst, src);
1728         emitByte(0x0F);
1729         emitByte(0xBC);
1730         emitModRM(dst, src);
1731     }
1732 
1733     public final void bsrl(Register dst, Register src) {
1734         prefix(dst, src);
1735         emitByte(0x0F);
1736         emitByte(0xBD);
1737         emitModRM(dst, src);
1738     }
1739 
1740     public final void bswapl(Register reg) {
1741         prefix(reg);
1742         emitByte(0x0F);
1743         emitModRM(1, reg);
1744     }
1745 
1746     public final void cdql() {
1747         emitByte(0x99);
1748     }
1749 
1750     public final void cmovl(ConditionFlag cc, Register dst, Register src) {
1751         prefix(dst, src);
1752         emitByte(0x0F);
1753         emitByte(0x40 | cc.getValue());
1754         emitModRM(dst, src);
1755     }
1756 
1757     public final void cmovl(ConditionFlag cc, Register dst, AMD64Address src) {
1758         prefix(src, dst);
1759         emitByte(0x0F);
1760         emitByte(0x40 | cc.getValue());
1761         emitOperandHelper(dst, src, 0);
1762     }
1763 
1764     public final void cmpb(Register dst, Register src) {
1765         CMP.byteRmOp.emit(this, BYTE, dst, src);
1766     }
1767 
1768     public final void cmpw(Register dst, Register src) {
1769         CMP.rmOp.emit(this, WORD, dst, src);
1770     }
1771 
1772     public final void cmpl(Register dst, int imm32) {
1773         CMP.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
1774     }
1775 
1776     public final void cmpl(Register dst, Register src) {
1777         CMP.rmOp.emit(this, DWORD, dst, src);
1778     }
1779 
1780     public final void cmpl(Register dst, AMD64Address src) {
1781         CMP.rmOp.emit(this, DWORD, dst, src);
1782     }
1783 
1784     public final void cmpl(AMD64Address dst, int imm32) {
1785         CMP.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
1786     }
1787 
1788     /**
1789      * The 8-bit cmpxchg compares the value at adr with the contents of X86.rax, and stores reg into
1790      * adr if so; otherwise, the value at adr is loaded into X86.rax,. The ZF is set if the compared
1791      * values were equal, and cleared otherwise.
1792      */
1793     public final void cmpxchgb(Register reg, AMD64Address adr) { // cmpxchg
1794         prefixb(adr, reg);
1795         emitByte(0x0F);
1796         emitByte(0xB0);
1797         emitOperandHelper(reg, adr, 0);
1798     }
1799 
1800     /**
1801      * The 16-bit cmpxchg compares the value at adr with the contents of X86.rax, and stores reg
1802      * into adr if so; otherwise, the value at adr is loaded into X86.rax,. The ZF is set if the
1803      * compared values were equal, and cleared otherwise.
1804      */
1805     public final void cmpxchgw(Register reg, AMD64Address adr) { // cmpxchg
1806         emitByte(0x66); // Switch to 16-bit mode.
1807         prefix(adr, reg);
1808         emitByte(0x0F);
1809         emitByte(0xB1);
1810         emitOperandHelper(reg, adr, 0);
1811     }
1812 
1813     /**
1814      * The 32-bit cmpxchg compares the value at adr with the contents of X86.rax, and stores reg
1815      * into adr if so; otherwise, the value at adr is loaded into X86.rax,. The ZF is set if the
1816      * compared values were equal, and cleared otherwise.
1817      */
1818     public final void cmpxchgl(Register reg, AMD64Address adr) { // cmpxchg
1819         prefix(adr, reg);
1820         emitByte(0x0F);
1821         emitByte(0xB1);
1822         emitOperandHelper(reg, adr, 0);
1823     }
1824 
1825     public final void cvtsi2sdl(Register dst, Register src) {
1826         SSEOp.CVTSI2SD.emit(this, DWORD, dst, src);
1827     }
1828 
1829     public final void cvttsd2sil(Register dst, Register src) {
1830         SSEOp.CVTTSD2SI.emit(this, DWORD, dst, src);
1831     }
1832 
1833     public final void decl(AMD64Address dst) {
1834         prefix(dst);
1835         emitByte(0xFF);
1836         emitOperandHelper(1, dst, 0);
1837     }
1838 
1839     public final void divsd(Register dst, Register src) {
1840         SSEOp.DIV.emit(this, SD, dst, src);
1841     }
1842 
1843     public final void hlt() {
1844         emitByte(0xF4);
1845     }
1846 
1847     public final void imull(Register dst, Register src, int value) {
1848         if (isByte(value)) {
1849             AMD64RMIOp.IMUL_SX.emit(this, DWORD, dst, src, value);
1850         } else {
1851             AMD64RMIOp.IMUL.emit(this, DWORD, dst, src, value);
1852         }
1853     }
1854 
1855     public final void incl(AMD64Address dst) {
1856         prefix(dst);
1857         emitByte(0xFF);
1858         emitOperandHelper(0, dst, 0);
1859     }
1860 
1861     public void jcc(ConditionFlag cc, int jumpTarget, boolean forceDisp32) {
1862         int shortSize = 2;
1863         int longSize = 6;
1864         long disp = jumpTarget - position();
1865         if (!forceDisp32 && isByte(disp - shortSize)) {
1866             // 0111 tttn #8-bit disp
1867             emitByte(0x70 | cc.getValue());
1868             emitByte((int) ((disp - shortSize) & 0xFF));
1869         } else {
1870             // 0000 1111 1000 tttn #32-bit disp
1871             assert isInt(disp - longSize) : "must be 32bit offset (call4)";
1872             emitByte(0x0F);
1873             emitByte(0x80 | cc.getValue());
1874             emitInt((int) (disp - longSize));
1875         }
1876     }
1877 
1878     public final void jcc(ConditionFlag cc, Label l) {
1879         assert (0 <= cc.getValue()) && (cc.getValue() < 16) : "illegal cc";
1880         if (l.isBound()) {
1881             jcc(cc, l.position(), false);
1882         } else {
1883             // Note: could eliminate cond. jumps to this jump if condition
1884             // is the same however, seems to be rather unlikely case.
1885             // Note: use jccb() if label to be bound is very close to get
1886             // an 8-bit displacement
1887             l.addPatchAt(position(), this);
1888             emitByte(0x0F);
1889             emitByte(0x80 | cc.getValue());
1890             emitInt(0);
1891         }
1892 
1893     }
1894 
1895     public final void jccb(ConditionFlag cc, Label l) {
1896         if (l.isBound()) {
1897             int shortSize = 2;
1898             int entry = l.position();
1899             assert isByte(entry - (position() + shortSize)) : "Dispacement too large for a short jmp";
1900             long disp = entry - position();
1901             // 0111 tttn #8-bit disp
1902             emitByte(0x70 | cc.getValue());
1903             emitByte((int) ((disp - shortSize) & 0xFF));
1904         } else {
1905             l.addPatchAt(position(), this);
1906             emitByte(0x70 | cc.getValue());
1907             emitByte(0);
1908         }
1909     }
1910 
1911     public final void jmp(int jumpTarget, boolean forceDisp32) {
1912         int shortSize = 2;
1913         int longSize = 5;
1914         long disp = jumpTarget - position();
1915         if (!forceDisp32 && isByte(disp - shortSize)) {
1916             emitByte(0xEB);
1917             emitByte((int) ((disp - shortSize) & 0xFF));
1918         } else {
1919             emitByte(0xE9);
1920             emitInt((int) (disp - longSize));
1921         }
1922     }
1923 
1924     @Override
1925     public final void jmp(Label l) {
1926         if (l.isBound()) {
1927             jmp(l.position(), false);
1928         } else {
1929             // By default, forward jumps are always 32-bit displacements, since
1930             // we can't yet know where the label will be bound. If you're sure that
1931             // the forward jump will not run beyond 256 bytes, use jmpb to
1932             // force an 8-bit displacement.
1933 
1934             l.addPatchAt(position(), this);
1935             emitByte(0xE9);
1936             emitInt(0);
1937         }
1938     }
1939 
1940     public final void jmp(Register entry) {
1941         prefix(entry);
1942         emitByte(0xFF);
1943         emitModRM(4, entry);
1944     }
1945 
1946     public final void jmp(AMD64Address adr) {
1947         prefix(adr);
1948         emitByte(0xFF);
1949         emitOperandHelper(AMD64.rsp, adr, 0);
1950     }
1951 
1952     public final void jmpb(Label l) {
1953         if (l.isBound()) {
1954             int shortSize = 2;
1955             // Displacement is relative to byte just after jmpb instruction
1956             int displacement = l.position() - position() - shortSize;
1957             GraalError.guarantee(isByte(displacement), "Displacement too large to be encoded as a byte: %d", displacement);
1958             emitByte(0xEB);
1959             emitByte(displacement & 0xFF);
1960         } else {
1961             l.addPatchAt(position(), this);
1962             emitByte(0xEB);
1963             emitByte(0);
1964         }
1965     }
1966 
1967     public final void lead(Register dst, AMD64Address src) {
1968         prefix(src, dst);
1969         emitByte(0x8D);
1970         emitOperandHelper(dst, src, 0);
1971     }
1972 
1973     public final void leaq(Register dst, AMD64Address src) {
1974         prefixq(src, dst);
1975         emitByte(0x8D);
1976         emitOperandHelper(dst, src, 0);
1977     }
1978 
1979     public final void leave() {
1980         emitByte(0xC9);
1981     }
1982 
1983     public final void lock() {
1984         emitByte(0xF0);
1985     }
1986 
1987     public final void movapd(Register dst, Register src) {
1988         assert inRC(XMM, dst) && inRC(XMM, src);
1989         simdPrefix(dst, Register.None, src, PD, P_0F, false);
1990         emitByte(0x28);
1991         emitModRM(dst, src);
1992     }
1993 
1994     public final void movaps(Register dst, Register src) {
1995         assert inRC(XMM, dst) && inRC(XMM, src);
1996         simdPrefix(dst, Register.None, src, PS, P_0F, false);
1997         emitByte(0x28);
1998         emitModRM(dst, src);
1999     }
2000 
2001     public final void movb(AMD64Address dst, int imm8) {
2002         prefix(dst);
2003         emitByte(0xC6);
2004         emitOperandHelper(0, dst, 1);
2005         emitByte(imm8);
2006     }
2007 
2008     public final void movb(AMD64Address dst, Register src) {
2009         assert inRC(CPU, src) : "must have byte register";
2010         prefixb(dst, src);
2011         emitByte(0x88);
2012         emitOperandHelper(src, dst, 0);
2013     }
2014 
2015     public final void movl(Register dst, int imm32) {
2016         movl(dst, imm32, false);
2017     }
2018 
2019     public final void movl(Register dst, int imm32, boolean annotateImm) {
2020         int insnPos = position();
2021         prefix(dst);
2022         emitByte(0xB8 + encode(dst));
2023         int immPos = position();
2024         emitInt(imm32);
2025         int nextInsnPos = position();
2026         if (annotateImm && codePatchingAnnotationConsumer != null) {
2027             codePatchingAnnotationConsumer.accept(new OperandDataAnnotation(insnPos, immPos, nextInsnPos - immPos, nextInsnPos));
2028         }
2029     }
2030 
2031     public final void movl(Register dst, Register src) {
2032         prefix(dst, src);
2033         emitByte(0x8B);
2034         emitModRM(dst, src);
2035     }
2036 
2037     public final void movl(Register dst, AMD64Address src) {
2038         prefix(src, dst);
2039         emitByte(0x8B);
2040         emitOperandHelper(dst, src, 0);
2041     }
2042 
2043     /**
2044      * @param wide use 4 byte encoding for displacements that would normally fit in a byte
2045      */
2046     public final void movl(Register dst, AMD64Address src, boolean wide) {
2047         prefix(src, dst);
2048         emitByte(0x8B);
2049         emitOperandHelper(dst, src, wide, 0);
2050     }
2051 
2052     public final void movl(AMD64Address dst, int imm32) {
2053         prefix(dst);
2054         emitByte(0xC7);
2055         emitOperandHelper(0, dst, 4);
2056         emitInt(imm32);
2057     }
2058 
2059     public final void movl(AMD64Address dst, Register src) {
2060         prefix(dst, src);
2061         emitByte(0x89);
2062         emitOperandHelper(src, dst, 0);
2063     }
2064 
2065     /**
2066      * New CPUs require use of movsd and movss to avoid partial register stall when loading from
2067      * memory. But for old Opteron use movlpd instead of movsd. The selection is done in
2068      * {@link AMD64MacroAssembler#movdbl(Register, AMD64Address)} and
2069      * {@link AMD64MacroAssembler#movflt(Register, Register)}.
2070      */
2071     public final void movlpd(Register dst, AMD64Address src) {
2072         assert inRC(XMM, dst);
2073         simdPrefix(dst, dst, src, PD, P_0F, false);
2074         emitByte(0x12);
2075         emitOperandHelper(dst, src, 0);
2076     }
2077 
2078     public final void movlhps(Register dst, Register src) {
2079         assert inRC(XMM, dst) && inRC(XMM, src);
2080         simdPrefix(dst, src, src, PS, P_0F, false);
2081         emitByte(0x16);
2082         emitModRM(dst, src);
2083     }
2084 
2085     public final void movq(Register dst, AMD64Address src) {
2086         movq(dst, src, false);
2087     }
2088 
2089     public final void movq(Register dst, AMD64Address src, boolean force4BytesDisplacement) {
2090         if (inRC(XMM, dst)) {
2091             // Insn: MOVQ xmm, r/m64
2092             // Code: F3 0F 7E /r
2093             // An alternative instruction would be 66 REX.W 0F 6E /r. We prefer the REX.W free
2094             // format, because it would allow us to emit 2-bytes-prefixed vex-encoding instruction
2095             // when applicable.
2096             simdPrefix(dst, Register.None, src, SS, P_0F, false);
2097             emitByte(0x7E);
2098             emitOperandHelper(dst, src, force4BytesDisplacement, 0);
2099         } else {
2100             // gpr version of movq
2101             prefixq(src, dst);
2102             emitByte(0x8B);
2103             emitOperandHelper(dst, src, force4BytesDisplacement, 0);
2104         }
2105     }
2106 
2107     public final void movq(Register dst, Register src) {
2108         assert inRC(CPU, dst) && inRC(CPU, src);
2109         prefixq(dst, src);
2110         emitByte(0x8B);
2111         emitModRM(dst, src);
2112     }
2113 
2114     public final void movq(AMD64Address dst, Register src) {
2115         if (inRC(XMM, src)) {
2116             // Insn: MOVQ r/m64, xmm
2117             // Code: 66 0F D6 /r
2118             // An alternative instruction would be 66 REX.W 0F 7E /r. We prefer the REX.W free
2119             // format, because it would allow us to emit 2-bytes-prefixed vex-encoding instruction
2120             // when applicable.
2121             simdPrefix(src, Register.None, dst, PD, P_0F, false);
2122             emitByte(0xD6);
2123             emitOperandHelper(src, dst, 0);
2124         } else {
2125             // gpr version of movq
2126             prefixq(dst, src);
2127             emitByte(0x89);
2128             emitOperandHelper(src, dst, 0);
2129         }
2130     }
2131 
2132     public final void movsbl(Register dst, AMD64Address src) {
2133         prefix(src, dst);
2134         emitByte(0x0F);
2135         emitByte(0xBE);
2136         emitOperandHelper(dst, src, 0);
2137     }
2138 
2139     public final void movsbl(Register dst, Register src) {
2140         prefix(dst, false, src, true);
2141         emitByte(0x0F);
2142         emitByte(0xBE);
2143         emitModRM(dst, src);
2144     }
2145 
2146     public final void movsbq(Register dst, AMD64Address src) {
2147         prefixq(src, dst);
2148         emitByte(0x0F);
2149         emitByte(0xBE);
2150         emitOperandHelper(dst, src, 0);
2151     }
2152 
2153     public final void movsbq(Register dst, Register src) {
2154         prefixq(dst, src);
2155         emitByte(0x0F);
2156         emitByte(0xBE);
2157         emitModRM(dst, src);
2158     }
2159 
2160     public final void movsd(Register dst, Register src) {
2161         AMD64RMOp.MOVSD.emit(this, SD, dst, src);
2162     }
2163 
2164     public final void movsd(Register dst, AMD64Address src) {
2165         AMD64RMOp.MOVSD.emit(this, SD, dst, src);
2166     }
2167 
2168     public final void movsd(AMD64Address dst, Register src) {
2169         AMD64MROp.MOVSD.emit(this, SD, dst, src);
2170     }
2171 
2172     public final void movss(Register dst, Register src) {
2173         AMD64RMOp.MOVSS.emit(this, SS, dst, src);
2174     }
2175 
2176     public final void movss(Register dst, AMD64Address src) {
2177         AMD64RMOp.MOVSS.emit(this, SS, dst, src);
2178     }
2179 
2180     public final void movss(AMD64Address dst, Register src) {
2181         AMD64MROp.MOVSS.emit(this, SS, dst, src);
2182     }
2183 
2184     public final void mulpd(Register dst, Register src) {
2185         SSEOp.MUL.emit(this, PD, dst, src);
2186     }
2187 
2188     public final void mulpd(Register dst, AMD64Address src) {
2189         SSEOp.MUL.emit(this, PD, dst, src);
2190     }
2191 
2192     public final void mulsd(Register dst, Register src) {
2193         SSEOp.MUL.emit(this, SD, dst, src);
2194     }
2195 
2196     public final void mulsd(Register dst, AMD64Address src) {
2197         SSEOp.MUL.emit(this, SD, dst, src);
2198     }
2199 
2200     public final void mulss(Register dst, Register src) {
2201         SSEOp.MUL.emit(this, SS, dst, src);
2202     }
2203 
2204     public final void movswl(Register dst, AMD64Address src) {
2205         AMD64RMOp.MOVSX.emit(this, DWORD, dst, src);
2206     }
2207 
2208     public final void movswq(Register dst, AMD64Address src) {
2209         AMD64RMOp.MOVSX.emit(this, QWORD, dst, src);
2210     }
2211 
2212     public final void movw(AMD64Address dst, int imm16) {
2213         emitByte(0x66); // switch to 16-bit mode
2214         prefix(dst);
2215         emitByte(0xC7);
2216         emitOperandHelper(0, dst, 2);
2217         emitShort(imm16);
2218     }
2219 
2220     public final void movw(AMD64Address dst, Register src) {
2221         emitByte(0x66);
2222         prefix(dst, src);
2223         emitByte(0x89);
2224         emitOperandHelper(src, dst, 0);
2225     }
2226 
2227     public final void movw(Register dst, AMD64Address src) {
2228         emitByte(0x66);
2229         prefix(src, dst);
2230         emitByte(0x8B);
2231         emitOperandHelper(dst, src, 0);
2232     }
2233 
2234     public final void movzbl(Register dst, AMD64Address src) {
2235         prefix(src, dst);
2236         emitByte(0x0F);
2237         emitByte(0xB6);
2238         emitOperandHelper(dst, src, 0);
2239     }
2240 
2241     public final void movzbl(Register dst, Register src) {
2242         AMD64RMOp.MOVZXB.emit(this, DWORD, dst, src);
2243     }
2244 
2245     public final void movzbq(Register dst, Register src) {
2246         AMD64RMOp.MOVZXB.emit(this, QWORD, dst, src);
2247     }
2248 
2249     public final void movzbq(Register dst, AMD64Address src) {
2250         AMD64RMOp.MOVZXB.emit(this, QWORD, dst, src);
2251     }
2252 
2253     public final void movzwl(Register dst, AMD64Address src) {
2254         AMD64RMOp.MOVZX.emit(this, DWORD, dst, src);
2255     }
2256 
2257     public final void movzwq(Register dst, AMD64Address src) {
2258         AMD64RMOp.MOVZX.emit(this, QWORD, dst, src);
2259     }
2260 
2261     public final void negl(Register dst) {
2262         NEG.emit(this, DWORD, dst);
2263     }
2264 
2265     public final void notl(Register dst) {
2266         NOT.emit(this, DWORD, dst);
2267     }
2268 
2269     public final void notq(Register dst) {
2270         NOT.emit(this, QWORD, dst);
2271     }
2272 
2273     @Override
2274     public final void ensureUniquePC() {
2275         nop();
2276     }
2277 
2278     public final void nop() {
2279         nop(1);
2280     }
2281 
2282     public void nop(int count) {
2283         int i = count;
2284         if (UseNormalNop) {
2285             assert i > 0 : " ";
2286             // The fancy nops aren't currently recognized by debuggers making it a
2287             // pain to disassemble code while debugging. If assert are on clearly
2288             // speed is not an issue so simply use the single byte traditional nop
2289             // to do alignment.
2290 
2291             for (; i > 0; i--) {
2292                 emitByte(0x90);
2293             }
2294             return;
2295         }
2296 
2297         if (UseAddressNop) {
2298             //
2299             // Using multi-bytes nops "0x0F 0x1F [Address]" for AMD.
2300             // 1: 0x90
2301             // 2: 0x66 0x90
2302             // 3: 0x66 0x66 0x90 (don't use "0x0F 0x1F 0x00" - need patching safe padding)
2303             // 4: 0x0F 0x1F 0x40 0x00
2304             // 5: 0x0F 0x1F 0x44 0x00 0x00
2305             // 6: 0x66 0x0F 0x1F 0x44 0x00 0x00
2306             // 7: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
2307             // 8: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
2308             // 9: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
2309             // 10: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
2310             // 11: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
2311 
2312             // The rest coding is AMD specific - use consecutive Address nops
2313 
2314             // 12: 0x66 0x0F 0x1F 0x44 0x00 0x00 0x66 0x0F 0x1F 0x44 0x00 0x00
2315             // 13: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00 0x66 0x0F 0x1F 0x44 0x00 0x00
2316             // 14: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
2317             // 15: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
2318             // 16: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
2319             // Size prefixes (0x66) are added for larger sizes
2320 
2321             while (i >= 22) {
2322                 i -= 11;
2323                 emitByte(0x66); // size prefix
2324                 emitByte(0x66); // size prefix
2325                 emitByte(0x66); // size prefix
2326                 addrNop8();
2327             }
2328             // Generate first nop for size between 21-12
2329             switch (i) {
2330                 case 21:
2331                     i -= 11;
2332                     emitByte(0x66); // size prefix
2333                     emitByte(0x66); // size prefix
2334                     emitByte(0x66); // size prefix
2335                     addrNop8();
2336                     break;
2337                 case 20:
2338                 case 19:
2339                     i -= 10;
2340                     emitByte(0x66); // size prefix
2341                     emitByte(0x66); // size prefix
2342                     addrNop8();
2343                     break;
2344                 case 18:
2345                 case 17:
2346                     i -= 9;
2347                     emitByte(0x66); // size prefix
2348                     addrNop8();
2349                     break;
2350                 case 16:
2351                 case 15:
2352                     i -= 8;
2353                     addrNop8();
2354                     break;
2355                 case 14:
2356                 case 13:
2357                     i -= 7;
2358                     addrNop7();
2359                     break;
2360                 case 12:
2361                     i -= 6;
2362                     emitByte(0x66); // size prefix
2363                     addrNop5();
2364                     break;
2365                 default:
2366                     assert i < 12;
2367             }
2368 
2369             // Generate second nop for size between 11-1
2370             switch (i) {
2371                 case 11:
2372                     emitByte(0x66); // size prefix
2373                     emitByte(0x66); // size prefix
2374                     emitByte(0x66); // size prefix
2375                     addrNop8();
2376                     break;
2377                 case 10:
2378                     emitByte(0x66); // size prefix
2379                     emitByte(0x66); // size prefix
2380                     addrNop8();
2381                     break;
2382                 case 9:
2383                     emitByte(0x66); // size prefix
2384                     addrNop8();
2385                     break;
2386                 case 8:
2387                     addrNop8();
2388                     break;
2389                 case 7:
2390                     addrNop7();
2391                     break;
2392                 case 6:
2393                     emitByte(0x66); // size prefix
2394                     addrNop5();
2395                     break;
2396                 case 5:
2397                     addrNop5();
2398                     break;
2399                 case 4:
2400                     addrNop4();
2401                     break;
2402                 case 3:
2403                     // Don't use "0x0F 0x1F 0x00" - need patching safe padding
2404                     emitByte(0x66); // size prefix
2405                     emitByte(0x66); // size prefix
2406                     emitByte(0x90); // nop
2407                     break;
2408                 case 2:
2409                     emitByte(0x66); // size prefix
2410                     emitByte(0x90); // nop
2411                     break;
2412                 case 1:
2413                     emitByte(0x90); // nop
2414                     break;
2415                 default:
2416                     assert i == 0;
2417             }
2418             return;
2419         }
2420 
2421         // Using nops with size prefixes "0x66 0x90".
2422         // From AMD Optimization Guide:
2423         // 1: 0x90
2424         // 2: 0x66 0x90
2425         // 3: 0x66 0x66 0x90
2426         // 4: 0x66 0x66 0x66 0x90
2427         // 5: 0x66 0x66 0x90 0x66 0x90
2428         // 6: 0x66 0x66 0x90 0x66 0x66 0x90
2429         // 7: 0x66 0x66 0x66 0x90 0x66 0x66 0x90
2430         // 8: 0x66 0x66 0x66 0x90 0x66 0x66 0x66 0x90
2431         // 9: 0x66 0x66 0x90 0x66 0x66 0x90 0x66 0x66 0x90
2432         // 10: 0x66 0x66 0x66 0x90 0x66 0x66 0x90 0x66 0x66 0x90
2433         //
2434         while (i > 12) {
2435             i -= 4;
2436             emitByte(0x66); // size prefix
2437             emitByte(0x66);
2438             emitByte(0x66);
2439             emitByte(0x90); // nop
2440         }
2441         // 1 - 12 nops
2442         if (i > 8) {
2443             if (i > 9) {
2444                 i -= 1;
2445                 emitByte(0x66);
2446             }
2447             i -= 3;
2448             emitByte(0x66);
2449             emitByte(0x66);
2450             emitByte(0x90);
2451         }
2452         // 1 - 8 nops
2453         if (i > 4) {
2454             if (i > 6) {
2455                 i -= 1;
2456                 emitByte(0x66);
2457             }
2458             i -= 3;
2459             emitByte(0x66);
2460             emitByte(0x66);
2461             emitByte(0x90);
2462         }
2463         switch (i) {
2464             case 4:
2465                 emitByte(0x66);
2466                 emitByte(0x66);
2467                 emitByte(0x66);
2468                 emitByte(0x90);
2469                 break;
2470             case 3:
2471                 emitByte(0x66);
2472                 emitByte(0x66);
2473                 emitByte(0x90);
2474                 break;
2475             case 2:
2476                 emitByte(0x66);
2477                 emitByte(0x90);
2478                 break;
2479             case 1:
2480                 emitByte(0x90);
2481                 break;
2482             default:
2483                 assert i == 0;
2484         }
2485     }
2486 
2487     public final void orl(Register dst, Register src) {
2488         OR.rmOp.emit(this, DWORD, dst, src);
2489     }
2490 
2491     public final void orl(Register dst, int imm32) {
2492         OR.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
2493     }
2494 
2495     // Insn: VPACKUSWB xmm1, xmm2, xmm3/m128
2496     // -----
2497     // Insn: VPACKUSWB xmm1, xmm1, xmm2
2498 
2499     public final void packuswb(Register dst, Register src) {
2500         assert inRC(XMM, dst) && inRC(XMM, src);
2501         // Code: VEX.NDS.128.66.0F.WIG 67 /r
2502         simdPrefix(dst, dst, src, PD, P_0F, false);
2503         emitByte(0x67);
2504         emitModRM(dst, src);
2505     }
2506 
2507     public final void pop(Register dst) {
2508         prefix(dst);
2509         emitByte(0x58 + encode(dst));
2510     }
2511 
2512     public void popfq() {
2513         emitByte(0x9D);
2514     }
2515 
2516     public final void ptest(Register dst, Register src) {
2517         assert supports(CPUFeature.SSE4_1);
2518         assert inRC(XMM, dst) && inRC(XMM, src);
2519         simdPrefix(dst, Register.None, src, PD, P_0F38, false);
2520         emitByte(0x17);
2521         emitModRM(dst, src);
2522     }
2523 
2524     public final void pcmpeqb(Register dst, Register src) {
2525         assert supports(CPUFeature.SSE2);
2526         assert inRC(XMM, dst) && inRC(XMM, src);
2527         simdPrefix(dst, dst, src, PD, P_0F, false);
2528         emitByte(0x74);
2529         emitModRM(dst, src);
2530     }
2531 
2532     public final void pcmpeqw(Register dst, Register src) {
2533         assert supports(CPUFeature.SSE2);
2534         assert inRC(XMM, dst) && inRC(XMM, src);
2535         simdPrefix(dst, dst, src, PD, P_0F, false);
2536         emitByte(0x75);
2537         emitModRM(dst, src);
2538     }
2539 
2540     public final void pcmpeqd(Register dst, Register src) {
2541         assert supports(CPUFeature.SSE2);
2542         assert dst.getRegisterCategory().equals(XMM) && src.getRegisterCategory().equals(XMM);
2543         simdPrefix(dst, dst, src, PD, P_0F, false);
2544         emitByte(0x76);
2545         emitModRM(dst, src);
2546     }
2547 
2548     public final void pcmpestri(Register dst, AMD64Address src, int imm8) {
2549         assert supports(CPUFeature.SSE4_2);
2550         assert inRC(XMM, dst);
2551         simdPrefix(dst, Register.None, src, PD, P_0F3A, false);
2552         emitByte(0x61);
2553         emitOperandHelper(dst, src, 0);
2554         emitByte(imm8);
2555     }
2556 
2557     public final void pcmpestri(Register dst, Register src, int imm8) {
2558         assert supports(CPUFeature.SSE4_2);
2559         assert inRC(XMM, dst) && inRC(XMM, src);
2560         simdPrefix(dst, Register.None, src, PD, P_0F3A, false);
2561         emitByte(0x61);
2562         emitModRM(dst, src);
2563         emitByte(imm8);
2564     }
2565 
2566     public final void pmovmskb(Register dst, Register src) {
2567         assert supports(CPUFeature.SSE2);
2568         assert inRC(CPU, dst) && inRC(XMM, src);
2569         simdPrefix(dst, Register.None, src, PD, P_0F, false);
2570         emitByte(0xD7);
2571         emitModRM(dst, src);
2572     }
2573 
2574     private void pmovSZx(Register dst, AMD64Address src, int op) {
2575         assert supports(CPUFeature.SSE4_1);
2576         assert inRC(XMM, dst);
2577         simdPrefix(dst, Register.None, src, PD, P_0F38, false);
2578         emitByte(op);
2579         emitOperandHelper(dst, src, 0);
2580     }
2581 
2582     public final void pmovsxbw(Register dst, AMD64Address src) {
2583         pmovSZx(dst, src, 0x20);
2584     }
2585 
2586     public final void pmovsxbd(Register dst, AMD64Address src) {
2587         pmovSZx(dst, src, 0x21);
2588     }
2589 
2590     public final void pmovsxbq(Register dst, AMD64Address src) {
2591         pmovSZx(dst, src, 0x22);
2592     }
2593 
2594     public final void pmovsxwd(Register dst, AMD64Address src) {
2595         pmovSZx(dst, src, 0x23);
2596     }
2597 
2598     public final void pmovsxwq(Register dst, AMD64Address src) {
2599         pmovSZx(dst, src, 0x24);
2600     }
2601 
2602     public final void pmovsxdq(Register dst, AMD64Address src) {
2603         pmovSZx(dst, src, 0x25);
2604     }
2605 
2606     // Insn: VPMOVZXBW xmm1, xmm2/m64
2607     public final void pmovzxbw(Register dst, AMD64Address src) {
2608         pmovSZx(dst, src, 0x30);
2609     }
2610 
2611     public final void pmovzxbd(Register dst, AMD64Address src) {
2612         pmovSZx(dst, src, 0x31);
2613     }
2614 
2615     public final void pmovzxbq(Register dst, AMD64Address src) {
2616         pmovSZx(dst, src, 0x32);
2617     }
2618 
2619     public final void pmovzxwd(Register dst, AMD64Address src) {
2620         pmovSZx(dst, src, 0x33);
2621     }
2622 
2623     public final void pmovzxwq(Register dst, AMD64Address src) {
2624         pmovSZx(dst, src, 0x34);
2625     }
2626 
2627     public final void pmovzxdq(Register dst, AMD64Address src) {
2628         pmovSZx(dst, src, 0x35);
2629     }
2630 
2631     public final void pmovzxbw(Register dst, Register src) {
2632         assert supports(CPUFeature.SSE4_1);
2633         assert inRC(XMM, dst) && inRC(XMM, src);
2634         simdPrefix(dst, Register.None, src, PD, P_0F38, false);
2635         emitByte(0x30);
2636         emitModRM(dst, src);
2637     }
2638 
2639     public final void push(Register src) {
2640         prefix(src);
2641         emitByte(0x50 + encode(src));
2642     }
2643 
2644     public void pushfq() {
2645         emitByte(0x9c);
2646     }
2647 
2648     public final void paddd(Register dst, Register src) {
2649         assert inRC(XMM, dst) && inRC(XMM, src);
2650         simdPrefix(dst, dst, src, PD, P_0F, false);
2651         emitByte(0xFE);
2652         emitModRM(dst, src);
2653     }
2654 
2655     public final void paddq(Register dst, Register src) {
2656         assert inRC(XMM, dst) && inRC(XMM, src);
2657         simdPrefix(dst, dst, src, PD, P_0F, false);
2658         emitByte(0xD4);
2659         emitModRM(dst, src);
2660     }
2661 
2662     public final void pextrw(Register dst, Register src, int imm8) {
2663         assert inRC(CPU, dst) && inRC(XMM, src);
2664         simdPrefix(dst, Register.None, src, PD, P_0F, false);
2665         emitByte(0xC5);
2666         emitModRM(dst, src);
2667         emitByte(imm8);
2668     }
2669 
2670     public final void pinsrw(Register dst, Register src, int imm8) {
2671         assert inRC(XMM, dst) && inRC(CPU, src);
2672         simdPrefix(dst, dst, src, PD, P_0F, false);
2673         emitByte(0xC4);
2674         emitModRM(dst, src);
2675         emitByte(imm8);
2676     }
2677 
2678     public final void por(Register dst, Register src) {
2679         assert inRC(XMM, dst) && inRC(XMM, src);
2680         simdPrefix(dst, dst, src, PD, P_0F, false);
2681         emitByte(0xEB);
2682         emitModRM(dst, src);
2683     }
2684 
2685     public final void pand(Register dst, Register src) {
2686         assert inRC(XMM, dst) && inRC(XMM, src);
2687         simdPrefix(dst, dst, src, PD, P_0F, false);
2688         emitByte(0xDB);
2689         emitModRM(dst, src);
2690     }
2691 
2692     public final void pxor(Register dst, Register src) {
2693         assert inRC(XMM, dst) && inRC(XMM, src);
2694         simdPrefix(dst, dst, src, PD, P_0F, false);
2695         emitByte(0xEF);
2696         emitModRM(dst, src);
2697     }
2698 
2699     public final void pslld(Register dst, int imm8) {
2700         assert isUByte(imm8) : "invalid value";
2701         assert inRC(XMM, dst);
2702         // XMM6 is for /6 encoding: 66 0F 72 /6 ib
2703         simdPrefix(AMD64.xmm6, dst, dst, PD, P_0F, false);
2704         emitByte(0x72);
2705         emitModRM(6, dst);
2706         emitByte(imm8 & 0xFF);
2707     }
2708 
2709     public final void psllq(Register dst, Register shift) {
2710         assert inRC(XMM, dst) && inRC(XMM, shift);
2711         simdPrefix(dst, dst, shift, PD, P_0F, false);
2712         emitByte(0xF3);
2713         emitModRM(dst, shift);
2714     }
2715 
2716     public final void psllq(Register dst, int imm8) {
2717         assert isUByte(imm8) : "invalid value";
2718         assert inRC(XMM, dst);
2719         // XMM6 is for /6 encoding: 66 0F 73 /6 ib
2720         simdPrefix(AMD64.xmm6, dst, dst, PD, P_0F, false);
2721         emitByte(0x73);
2722         emitModRM(6, dst);
2723         emitByte(imm8);
2724     }
2725 
2726     public final void psrad(Register dst, int imm8) {
2727         assert isUByte(imm8) : "invalid value";
2728         assert inRC(XMM, dst);
2729         // XMM4 is for /4 encoding: 66 0F 72 /4 ib
2730         simdPrefix(AMD64.xmm4, dst, dst, PD, P_0F, false);
2731         emitByte(0x72);
2732         emitModRM(4, dst);
2733         emitByte(imm8);
2734     }
2735 
2736     public final void psrld(Register dst, int imm8) {
2737         assert isUByte(imm8) : "invalid value";
2738         assert inRC(XMM, dst);
2739         // XMM2 is for /2 encoding: 66 0F 72 /2 ib
2740         simdPrefix(AMD64.xmm2, dst, dst, PD, P_0F, false);
2741         emitByte(0x72);
2742         emitModRM(2, dst);
2743         emitByte(imm8);
2744     }
2745 
2746     public final void psrlq(Register dst, int imm8) {
2747         assert isUByte(imm8) : "invalid value";
2748         assert inRC(XMM, dst);
2749         // XMM2 is for /2 encoding: 66 0F 73 /2 ib
2750         simdPrefix(AMD64.xmm2, dst, dst, PD, P_0F, false);
2751         emitByte(0x73);
2752         emitModRM(2, dst);
2753         emitByte(imm8);
2754     }
2755 
2756     public final void psrldq(Register dst, int imm8) {
2757         assert isUByte(imm8) : "invalid value";
2758         assert inRC(XMM, dst);
2759         simdPrefix(AMD64.xmm3, dst, dst, PD, P_0F, false);
2760         emitByte(0x73);
2761         emitModRM(3, dst);
2762         emitByte(imm8);
2763     }
2764 
2765     public final void pshufb(Register dst, Register src) {
2766         assert supports(CPUFeature.SSSE3);
2767         assert inRC(XMM, dst) && inRC(XMM, src);
2768         simdPrefix(dst, dst, src, PD, P_0F38, false);
2769         emitByte(0x00);
2770         emitModRM(dst, src);
2771     }
2772 
2773     public final void pshuflw(Register dst, Register src, int imm8) {
2774         assert supports(CPUFeature.SSE2);
2775         assert isUByte(imm8) : "invalid value";
2776         assert inRC(XMM, dst) && inRC(XMM, src);
2777         simdPrefix(dst, Register.None, src, SD, P_0F, false);
2778         emitByte(0x70);
2779         emitModRM(dst, src);
2780         emitByte(imm8);
2781     }
2782 
2783     public final void pshufd(Register dst, Register src, int imm8) {
2784         assert isUByte(imm8) : "invalid value";
2785         assert inRC(XMM, dst) && inRC(XMM, src);
2786         simdPrefix(dst, Register.None, src, PD, P_0F, false);
2787         emitByte(0x70);
2788         emitModRM(dst, src);
2789         emitByte(imm8);
2790     }
2791 
2792     public final void psubd(Register dst, Register src) {
2793         assert inRC(XMM, dst) && inRC(XMM, src);
2794         simdPrefix(dst, dst, src, PD, P_0F, false);
2795         emitByte(0xFA);
2796         emitModRM(dst, src);
2797     }
2798 
2799     public final void punpcklbw(Register dst, Register src) {
2800         assert supports(CPUFeature.SSE2);
2801         assert inRC(XMM, dst) && inRC(XMM, src);
2802         simdPrefix(dst, dst, src, PD, P_0F, false);
2803         emitByte(0x60);
2804         emitModRM(dst, src);
2805     }
2806 
2807     public final void rcpps(Register dst, Register src) {
2808         assert inRC(XMM, dst) && inRC(XMM, src);
2809         simdPrefix(dst, Register.None, src, PS, P_0F, false);
2810         emitByte(0x53);
2811         emitModRM(dst, src);
2812     }
2813 
2814     public final void ret(int imm16) {
2815         if (imm16 == 0) {
2816             emitByte(0xC3);
2817         } else {
2818             emitByte(0xC2);
2819             emitShort(imm16);
2820         }
2821     }
2822 
2823     public final void sarl(Register dst, int imm8) {
2824         prefix(dst);
2825         assert isShiftCount(imm8 >> 1) : "illegal shift count";
2826         if (imm8 == 1) {
2827             emitByte(0xD1);
2828             emitModRM(7, dst);
2829         } else {
2830             emitByte(0xC1);
2831             emitModRM(7, dst);
2832             emitByte(imm8);
2833         }
2834     }
2835 
2836     public final void shll(Register dst, int imm8) {
2837         assert isShiftCount(imm8 >> 1) : "illegal shift count";
2838         prefix(dst);
2839         if (imm8 == 1) {
2840             emitByte(0xD1);
2841             emitModRM(4, dst);
2842         } else {
2843             emitByte(0xC1);
2844             emitModRM(4, dst);
2845             emitByte(imm8);
2846         }
2847     }
2848 
2849     public final void shll(Register dst) {
2850         // Multiply dst by 2, CL times.
2851         prefix(dst);
2852         emitByte(0xD3);
2853         emitModRM(4, dst);
2854     }
2855 
2856     // Insn: SHLX r32a, r/m32, r32b
2857 
2858     public final void shlxl(Register dst, Register src1, Register src2) {
2859         VexGeneralPurposeRMVOp.SHLX.emit(this, AVXSize.DWORD, dst, src1, src2);
2860     }
2861 
2862     public final void shrl(Register dst, int imm8) {
2863         assert isShiftCount(imm8 >> 1) : "illegal shift count";
2864         prefix(dst);
2865         emitByte(0xC1);
2866         emitModRM(5, dst);
2867         emitByte(imm8);
2868     }
2869 
2870     public final void shrl(Register dst) {
2871         // Unsigned divide dst by 2, CL times.
2872         prefix(dst);
2873         emitByte(0xD3);
2874         emitModRM(5, dst);
2875     }
2876 
2877     public final void subl(AMD64Address dst, int imm32) {
2878         SUB.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
2879     }
2880 
2881     public final void subl(Register dst, int imm32) {
2882         SUB.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
2883     }
2884 
2885     public final void subl(Register dst, Register src) {
2886         SUB.rmOp.emit(this, DWORD, dst, src);
2887     }
2888 
2889     public final void subpd(Register dst, Register src) {
2890         SSEOp.SUB.emit(this, PD, dst, src);
2891     }
2892 
2893     public final void subsd(Register dst, Register src) {
2894         SSEOp.SUB.emit(this, SD, dst, src);
2895     }
2896 
2897     public final void subsd(Register dst, AMD64Address src) {
2898         SSEOp.SUB.emit(this, SD, dst, src);
2899     }
2900 
2901     public final void testl(Register dst, int imm32) {
2902         // not using emitArith because test
2903         // doesn't support sign-extension of
2904         // 8bit operands
2905         if (dst.encoding == 0) {
2906             emitByte(0xA9);
2907         } else {
2908             prefix(dst);
2909             emitByte(0xF7);
2910             emitModRM(0, dst);
2911         }
2912         emitInt(imm32);
2913     }
2914 
2915     public final void testl(Register dst, Register src) {
2916         prefix(dst, src);
2917         emitByte(0x85);
2918         emitModRM(dst, src);
2919     }
2920 
2921     public final void testl(Register dst, AMD64Address src) {
2922         prefix(src, dst);
2923         emitByte(0x85);
2924         emitOperandHelper(dst, src, 0);
2925     }
2926 
2927     public final void unpckhpd(Register dst, Register src) {
2928         assert inRC(XMM, dst) && inRC(XMM, src);
2929         simdPrefix(dst, dst, src, PD, P_0F, false);
2930         emitByte(0x15);
2931         emitModRM(dst, src);
2932     }
2933 
2934     public final void unpcklpd(Register dst, Register src) {
2935         assert inRC(XMM, dst) && inRC(XMM, src);
2936         simdPrefix(dst, dst, src, PD, P_0F, false);
2937         emitByte(0x14);
2938         emitModRM(dst, src);
2939     }
2940 
2941     public final void xorl(Register dst, Register src) {
2942         XOR.rmOp.emit(this, DWORD, dst, src);
2943     }
2944 
2945     public final void xorq(Register dst, Register src) {
2946         XOR.rmOp.emit(this, QWORD, dst, src);
2947     }
2948 
2949     public final void xorpd(Register dst, Register src) {
2950         SSEOp.XOR.emit(this, PD, dst, src);
2951     }
2952 
2953     public final void xorps(Register dst, Register src) {
2954         SSEOp.XOR.emit(this, PS, dst, src);
2955     }
2956 
2957     protected final void decl(Register dst) {
2958         // Use two-byte form (one-byte form is a REX prefix in 64-bit mode)
2959         prefix(dst);
2960         emitByte(0xFF);
2961         emitModRM(1, dst);
2962     }
2963 
2964     protected final void incl(Register dst) {
2965         // Use two-byte form (one-byte from is a REX prefix in 64-bit mode)
2966         prefix(dst);
2967         emitByte(0xFF);
2968         emitModRM(0, dst);
2969     }
2970 
2971     public final void addq(Register dst, int imm32) {
2972         ADD.getMIOpcode(QWORD, isByte(imm32)).emit(this, QWORD, dst, imm32);
2973     }
2974 
2975     public final void addq(AMD64Address dst, int imm32) {
2976         ADD.getMIOpcode(QWORD, isByte(imm32)).emit(this, QWORD, dst, imm32);
2977     }
2978 
2979     public final void addq(Register dst, Register src) {
2980         ADD.rmOp.emit(this, QWORD, dst, src);
2981     }
2982 
2983     public final void addq(AMD64Address dst, Register src) {
2984         ADD.mrOp.emit(this, QWORD, dst, src);
2985     }
2986 
2987     public final void andq(Register dst, int imm32) {
2988         AND.getMIOpcode(QWORD, isByte(imm32)).emit(this, QWORD, dst, imm32);
2989     }
2990 
2991     public final void bsrq(Register dst, Register src) {
2992         prefixq(dst, src);
2993         emitByte(0x0F);
2994         emitByte(0xBD);
2995         emitModRM(dst, src);
2996     }
2997 
2998     public final void bswapq(Register reg) {
2999         prefixq(reg);
3000         emitByte(0x0F);
3001         emitByte(0xC8 + encode(reg));
3002     }
3003 
3004     public final void cdqq() {
3005         rexw();
3006         emitByte(0x99);
3007     }
3008 
3009     public final void repStosb() {
3010         emitByte(0xf3);
3011         rexw();
3012         emitByte(0xaa);
3013     }
3014 
3015     public final void repStosq() {
3016         emitByte(0xf3);
3017         rexw();
3018         emitByte(0xab);
3019     }
3020 
3021     public final void cmovq(ConditionFlag cc, Register dst, Register src) {
3022         prefixq(dst, src);
3023         emitByte(0x0F);
3024         emitByte(0x40 | cc.getValue());
3025         emitModRM(dst, src);
3026     }
3027 
3028     public final void setb(ConditionFlag cc, Register dst) {
3029         prefix(dst, true);
3030         emitByte(0x0F);
3031         emitByte(0x90 | cc.getValue());
3032         emitModRM(0, dst);
3033     }
3034 
3035     public final void cmovq(ConditionFlag cc, Register dst, AMD64Address src) {
3036         prefixq(src, dst);
3037         emitByte(0x0F);
3038         emitByte(0x40 | cc.getValue());
3039         emitOperandHelper(dst, src, 0);
3040     }
3041 
3042     public final void cmpq(Register dst, int imm32) {
3043         CMP.getMIOpcode(QWORD, isByte(imm32)).emit(this, QWORD, dst, imm32);
3044     }
3045 
3046     public final void cmpq(Register dst, Register src) {
3047         CMP.rmOp.emit(this, QWORD, dst, src);
3048     }
3049 
3050     public final void cmpq(Register dst, AMD64Address src) {
3051         CMP.rmOp.emit(this, QWORD, dst, src);
3052     }
3053 
3054     public final void cmpxchgq(Register reg, AMD64Address adr) {
3055         prefixq(adr, reg);
3056         emitByte(0x0F);
3057         emitByte(0xB1);
3058         emitOperandHelper(reg, adr, 0);
3059     }
3060 
3061     public final void cvtdq2pd(Register dst, Register src) {
3062         assert inRC(XMM, dst) && inRC(XMM, src);
3063         simdPrefix(dst, Register.None, src, SS, P_0F, false);
3064         emitByte(0xE6);
3065         emitModRM(dst, src);
3066     }
3067 
3068     public final void cvtsi2sdq(Register dst, Register src) {
3069         SSEOp.CVTSI2SD.emit(this, QWORD, dst, src);
3070     }
3071 
3072     public final void cvttsd2siq(Register dst, Register src) {
3073         SSEOp.CVTTSD2SI.emit(this, QWORD, dst, src);
3074     }
3075 
3076     public final void cvttpd2dq(Register dst, Register src) {
3077         assert inRC(XMM, dst) && inRC(XMM, src);
3078         simdPrefix(dst, Register.None, src, PD, P_0F, false);
3079         emitByte(0xE6);
3080         emitModRM(dst, src);
3081     }
3082 
3083     public final void decq(Register dst) {
3084         // Use two-byte form (one-byte form is a REX prefix in 64-bit mode)
3085         prefixq(dst);
3086         emitByte(0xFF);
3087         emitModRM(1, dst);
3088     }
3089 
3090     public final void decq(AMD64Address dst) {
3091         DEC.emit(this, QWORD, dst);
3092     }
3093 
3094     public final void imulq(Register dst, Register src) {
3095         prefixq(dst, src);
3096         emitByte(0x0F);
3097         emitByte(0xAF);
3098         emitModRM(dst, src);
3099     }
3100 
3101     public final void incq(Register dst) {
3102         // Don't use it directly. Use Macroincrementq() instead.
3103         // Use two-byte form (one-byte from is a REX prefix in 64-bit mode)
3104         prefixq(dst);
3105         emitByte(0xFF);
3106         emitModRM(0, dst);
3107     }
3108 
3109     public final void incq(AMD64Address dst) {
3110         INC.emit(this, QWORD, dst);
3111     }
3112 
3113     public final void movq(Register dst, long imm64) {
3114         movq(dst, imm64, false);
3115     }
3116 
3117     public final void movq(Register dst, long imm64, boolean annotateImm) {
3118         int insnPos = position();
3119         prefixq(dst);
3120         emitByte(0xB8 + encode(dst));
3121         int immPos = position();
3122         emitLong(imm64);
3123         int nextInsnPos = position();
3124         if (annotateImm && codePatchingAnnotationConsumer != null) {
3125             codePatchingAnnotationConsumer.accept(new OperandDataAnnotation(insnPos, immPos, nextInsnPos - immPos, nextInsnPos));
3126         }
3127     }
3128 
3129     public final void movslq(Register dst, int imm32) {
3130         prefixq(dst);
3131         emitByte(0xC7);
3132         emitModRM(0, dst);
3133         emitInt(imm32);
3134     }
3135 
3136     public final void movdq(Register dst, AMD64Address src) {
3137         AMD64RMOp.MOVQ.emit(this, QWORD, dst, src);
3138     }
3139 
3140     public final void movdq(AMD64Address dst, Register src) {
3141         AMD64MROp.MOVQ.emit(this, QWORD, dst, src);
3142     }
3143 
3144     public final void movdq(Register dst, Register src) {
3145         if (inRC(XMM, dst) && inRC(CPU, src)) {
3146             AMD64RMOp.MOVQ.emit(this, QWORD, dst, src);
3147         } else if (inRC(XMM, src) && inRC(CPU, dst)) {
3148             AMD64MROp.MOVQ.emit(this, QWORD, dst, src);
3149         } else {
3150             throw new InternalError("should not reach here");
3151         }
3152     }
3153 
3154     public final void movdl(Register dst, Register src) {
3155         if (inRC(XMM, dst) && inRC(CPU, src)) {
3156             AMD64RMOp.MOVD.emit(this, DWORD, dst, src);
3157         } else if (inRC(XMM, src) && inRC(CPU, dst)) {
3158             AMD64MROp.MOVD.emit(this, DWORD, dst, src);
3159         } else {
3160             throw new InternalError("should not reach here");
3161         }
3162     }
3163 
3164     public final void movdl(Register dst, AMD64Address src) {
3165         AMD64RMOp.MOVD.emit(this, DWORD, dst, src);
3166     }
3167 
3168     public final void movddup(Register dst, Register src) {
3169         assert supports(CPUFeature.SSE3);
3170         assert inRC(XMM, dst) && inRC(XMM, src);
3171         simdPrefix(dst, Register.None, src, SD, P_0F, false);
3172         emitByte(0x12);
3173         emitModRM(dst, src);
3174     }
3175 
3176     public final void movdqu(Register dst, AMD64Address src) {
3177         assert inRC(XMM, dst);
3178         simdPrefix(dst, Register.None, src, SS, P_0F, false);
3179         emitByte(0x6F);
3180         emitOperandHelper(dst, src, 0);
3181     }
3182 
3183     public final void movdqu(Register dst, Register src) {
3184         assert inRC(XMM, dst) && inRC(XMM, src);
3185         simdPrefix(dst, Register.None, src, SS, P_0F, false);
3186         emitByte(0x6F);
3187         emitModRM(dst, src);
3188     }
3189 
3190     // Insn: VMOVDQU xmm2/m128, xmm1
3191 
3192     public final void movdqu(AMD64Address dst, Register src) {
3193         assert inRC(XMM, src);
3194         // Code: VEX.128.F3.0F.WIG 7F /r
3195         simdPrefix(src, Register.None, dst, SS, P_0F, false);
3196         emitByte(0x7F);
3197         emitOperandHelper(src, dst, 0);
3198     }
3199 
3200     public final void movslq(AMD64Address dst, int imm32) {
3201         prefixq(dst);
3202         emitByte(0xC7);
3203         emitOperandHelper(0, dst, 4);
3204         emitInt(imm32);
3205     }
3206 
3207     public final void movslq(Register dst, AMD64Address src) {
3208         prefixq(src, dst);
3209         emitByte(0x63);
3210         emitOperandHelper(dst, src, 0);
3211     }
3212 
3213     public final void movslq(Register dst, Register src) {
3214         prefixq(dst, src);
3215         emitByte(0x63);
3216         emitModRM(dst, src);
3217     }
3218 
3219     public final void negq(Register dst) {
3220         prefixq(dst);
3221         emitByte(0xF7);
3222         emitModRM(3, dst);
3223     }
3224 
3225     public final void orq(Register dst, Register src) {
3226         OR.rmOp.emit(this, QWORD, dst, src);
3227     }
3228 
3229     public final void shlq(Register dst, int imm8) {
3230         assert isShiftCount(imm8 >> 1) : "illegal shift count";
3231         prefixq(dst);
3232         if (imm8 == 1) {
3233             emitByte(0xD1);
3234             emitModRM(4, dst);
3235         } else {
3236             emitByte(0xC1);
3237             emitModRM(4, dst);
3238             emitByte(imm8);
3239         }
3240     }
3241 
3242     public final void shlq(Register dst) {
3243         // Multiply dst by 2, CL times.
3244         prefixq(dst);
3245         emitByte(0xD3);
3246         emitModRM(4, dst);
3247     }
3248 
3249     public final void shrq(Register dst, int imm8) {
3250         assert isShiftCount(imm8 >> 1) : "illegal shift count";
3251         prefixq(dst);
3252         if (imm8 == 1) {
3253             emitByte(0xD1);
3254             emitModRM(5, dst);
3255         } else {
3256             emitByte(0xC1);
3257             emitModRM(5, dst);
3258             emitByte(imm8);
3259         }
3260     }
3261 
3262     public final void shrq(Register dst) {
3263         prefixq(dst);
3264         emitByte(0xD3);
3265         // Unsigned divide dst by 2, CL times.
3266         emitModRM(5, dst);
3267     }
3268 
3269     public final void sarq(Register dst, int imm8) {
3270         assert isShiftCount(imm8 >> 1) : "illegal shift count";
3271         prefixq(dst);
3272         if (imm8 == 1) {
3273             emitByte(0xD1);
3274             emitModRM(7, dst);
3275         } else {
3276             emitByte(0xC1);
3277             emitModRM(7, dst);
3278             emitByte(imm8);
3279         }
3280     }
3281 
3282     public final void sbbq(Register dst, Register src) {
3283         SBB.rmOp.emit(this, QWORD, dst, src);
3284     }
3285 
3286     public final void subq(Register dst, int imm32) {
3287         SUB.getMIOpcode(QWORD, isByte(imm32)).emit(this, QWORD, dst, imm32);
3288     }
3289 
3290     public final void subq(AMD64Address dst, int imm32) {
3291         SUB.getMIOpcode(QWORD, isByte(imm32)).emit(this, QWORD, dst, imm32);
3292     }
3293 
3294     public final void subqWide(Register dst, int imm32) {
3295         // don't use the sign-extending version, forcing a 32-bit immediate
3296         SUB.getMIOpcode(QWORD, false).emit(this, QWORD, dst, imm32);
3297     }
3298 
3299     public final void subq(Register dst, Register src) {
3300         SUB.rmOp.emit(this, QWORD, dst, src);
3301     }
3302 
3303     public final void testq(Register dst, Register src) {
3304         prefixq(dst, src);
3305         emitByte(0x85);
3306         emitModRM(dst, src);
3307     }
3308 
3309     public final void btrq(Register src, int imm8) {
3310         prefixq(src);
3311         emitByte(0x0F);
3312         emitByte(0xBA);
3313         emitModRM(6, src);
3314         emitByte(imm8);
3315     }
3316 
3317     public final void xaddb(AMD64Address dst, Register src) {
3318         prefixb(dst, src);
3319         emitByte(0x0F);
3320         emitByte(0xC0);
3321         emitOperandHelper(src, dst, 0);
3322     }
3323 
3324     public final void xaddw(AMD64Address dst, Register src) {
3325         emitByte(0x66); // Switch to 16-bit mode.
3326         prefix(dst, src);
3327         emitByte(0x0F);
3328         emitByte(0xC1);
3329         emitOperandHelper(src, dst, 0);
3330     }
3331 
3332     public final void xaddl(AMD64Address dst, Register src) {
3333         prefix(dst, src);
3334         emitByte(0x0F);
3335         emitByte(0xC1);
3336         emitOperandHelper(src, dst, 0);
3337     }
3338 
3339     public final void xaddq(AMD64Address dst, Register src) {
3340         prefixq(dst, src);
3341         emitByte(0x0F);
3342         emitByte(0xC1);
3343         emitOperandHelper(src, dst, 0);
3344     }
3345 
3346     public final void xchgb(Register dst, AMD64Address src) {
3347         prefixb(src, dst);
3348         emitByte(0x86);
3349         emitOperandHelper(dst, src, 0);
3350     }
3351 
3352     public final void xchgw(Register dst, AMD64Address src) {
3353         emitByte(0x66);
3354         prefix(src, dst);
3355         emitByte(0x87);
3356         emitOperandHelper(dst, src, 0);
3357     }
3358 
3359     public final void xchgl(Register dst, AMD64Address src) {
3360         prefix(src, dst);
3361         emitByte(0x87);
3362         emitOperandHelper(dst, src, 0);
3363     }
3364 
3365     public final void xchgq(Register dst, AMD64Address src) {
3366         prefixq(src, dst);
3367         emitByte(0x87);
3368         emitOperandHelper(dst, src, 0);
3369     }
3370 
3371     public final void membar(int barriers) {
3372         if (target.isMP) {
3373             // We only have to handle StoreLoad
3374             if ((barriers & STORE_LOAD) != 0) {
3375                 // All usable chips support "locked" instructions which suffice
3376                 // as barriers, and are much faster than the alternative of
3377                 // using cpuid instruction. We use here a locked add [rsp],0.
3378                 // This is conveniently otherwise a no-op except for blowing
3379                 // flags.
3380                 // Any change to this code may need to revisit other places in
3381                 // the code where this idiom is used, in particular the
3382                 // orderAccess code.
3383                 lock();
3384                 addl(new AMD64Address(AMD64.rsp, 0), 0); // Assert the lock# signal here
3385             }
3386         }
3387     }
3388 
3389     @Override
3390     protected final void patchJumpTarget(int branch, int branchTarget) {
3391         int op = getByte(branch);
3392         assert op == 0xE8 // call
3393                         || op == 0x00 // jump table entry
3394                         || op == 0xE9 // jmp
3395                         || op == 0xEB // short jmp
3396                         || (op & 0xF0) == 0x70 // short jcc
3397                         || op == 0x0F && (getByte(branch + 1) & 0xF0) == 0x80 // jcc
3398         : "Invalid opcode at patch point branch=" + branch + ", branchTarget=" + branchTarget + ", op=" + op;
3399 
3400         if (op == 0x00) {
3401             int offsetToJumpTableBase = getShort(branch + 1);
3402             int jumpTableBase = branch - offsetToJumpTableBase;
3403             int imm32 = branchTarget - jumpTableBase;
3404             emitInt(imm32, branch);
3405         } else if (op == 0xEB || (op & 0xF0) == 0x70) {
3406 
3407             // short offset operators (jmp and jcc)
3408             final int imm8 = branchTarget - (branch + 2);
3409             /*
3410              * Since a wrongly patched short branch can potentially lead to working but really bad
3411              * behaving code we should always fail with an exception instead of having an assert.
3412              */
3413             GraalError.guarantee(isByte(imm8), "Displacement too large to be encoded as a byte: %d", imm8);
3414             emitByte(imm8, branch + 1);
3415 
3416         } else {
3417 
3418             int off = 1;
3419             if (op == 0x0F) {
3420                 off = 2;
3421             }
3422 
3423             int imm32 = branchTarget - (branch + 4 + off);
3424             emitInt(imm32, branch + off);
3425         }
3426     }
3427 
3428     public void nullCheck(AMD64Address address) {
3429         testl(AMD64.rax, address);
3430     }
3431 
3432     @Override
3433     public void align(int modulus) {
3434         if (position() % modulus != 0) {
3435             nop(modulus - (position() % modulus));
3436         }
3437     }
3438 
3439     /**
3440      * Emits a direct call instruction. Note that the actual call target is not specified, because
3441      * all calls need patching anyway. Therefore, 0 is emitted as the call target, and the user is
3442      * responsible to add the call address to the appropriate patching tables.
3443      */
3444     public final void call() {
3445         annotatePatchingImmediate(1, 4);
3446         emitByte(0xE8);
3447         emitInt(0);
3448     }
3449 
3450     public final void call(Register src) {
3451         prefix(src);
3452         emitByte(0xFF);
3453         emitModRM(2, src);
3454     }
3455 
3456     public final void int3() {
3457         emitByte(0xCC);
3458     }
3459 
3460     public final void pause() {
3461         emitByte(0xF3);
3462         emitByte(0x90);
3463     }
3464 
3465     private void emitx87(int b1, int b2, int i) {
3466         assert 0 <= i && i < 8 : "illegal stack offset";
3467         emitByte(b1);
3468         emitByte(b2 + i);
3469     }
3470 
3471     public final void fldd(AMD64Address src) {
3472         emitByte(0xDD);
3473         emitOperandHelper(0, src, 0);
3474     }
3475 
3476     public final void flds(AMD64Address src) {
3477         emitByte(0xD9);
3478         emitOperandHelper(0, src, 0);
3479     }
3480 
3481     public final void fldln2() {
3482         emitByte(0xD9);
3483         emitByte(0xED);
3484     }
3485 
3486     public final void fldlg2() {
3487         emitByte(0xD9);
3488         emitByte(0xEC);
3489     }
3490 
3491     public final void fyl2x() {
3492         emitByte(0xD9);
3493         emitByte(0xF1);
3494     }
3495 
3496     public final void fstps(AMD64Address src) {
3497         emitByte(0xD9);
3498         emitOperandHelper(3, src, 0);
3499     }
3500 
3501     public final void fstpd(AMD64Address src) {
3502         emitByte(0xDD);
3503         emitOperandHelper(3, src, 0);
3504     }
3505 
3506     private void emitFPUArith(int b1, int b2, int i) {
3507         assert 0 <= i && i < 8 : "illegal FPU register: " + i;
3508         emitByte(b1);
3509         emitByte(b2 + i);
3510     }
3511 
3512     public void ffree(int i) {
3513         emitFPUArith(0xDD, 0xC0, i);
3514     }
3515 
3516     public void fincstp() {
3517         emitByte(0xD9);
3518         emitByte(0xF7);
3519     }
3520 
3521     public void fxch(int i) {
3522         emitFPUArith(0xD9, 0xC8, i);
3523     }
3524 
3525     public void fnstswAX() {
3526         emitByte(0xDF);
3527         emitByte(0xE0);
3528     }
3529 
3530     public void fwait() {
3531         emitByte(0x9B);
3532     }
3533 
3534     public void fprem() {
3535         emitByte(0xD9);
3536         emitByte(0xF8);
3537     }
3538 
3539     public final void fsin() {
3540         emitByte(0xD9);
3541         emitByte(0xFE);
3542     }
3543 
3544     public final void fcos() {
3545         emitByte(0xD9);
3546         emitByte(0xFF);
3547     }
3548 
3549     public final void fptan() {
3550         emitByte(0xD9);
3551         emitByte(0xF2);
3552     }
3553 
3554     public final void fstp(int i) {
3555         emitx87(0xDD, 0xD8, i);
3556     }
3557 
3558     @Override
3559     public AMD64Address makeAddress(Register base, int displacement) {
3560         return new AMD64Address(base, displacement);
3561     }
3562 
3563     @Override
3564     public AMD64Address getPlaceholder(int instructionStartPosition) {
3565         return new AMD64Address(AMD64.rip, Register.None, Scale.Times1, 0, instructionStartPosition);
3566     }
3567 
3568     private void prefetchPrefix(AMD64Address src) {
3569         prefix(src);
3570         emitByte(0x0F);
3571     }
3572 
3573     public void prefetchnta(AMD64Address src) {
3574         prefetchPrefix(src);
3575         emitByte(0x18);
3576         emitOperandHelper(0, src, 0);
3577     }
3578 
3579     void prefetchr(AMD64Address src) {
3580         assert supports(CPUFeature.AMD_3DNOW_PREFETCH);
3581         prefetchPrefix(src);
3582         emitByte(0x0D);
3583         emitOperandHelper(0, src, 0);
3584     }
3585 
3586     public void prefetcht0(AMD64Address src) {
3587         assert supports(CPUFeature.SSE);
3588         prefetchPrefix(src);
3589         emitByte(0x18);
3590         emitOperandHelper(1, src, 0);
3591     }
3592 
3593     public void prefetcht1(AMD64Address src) {
3594         assert supports(CPUFeature.SSE);
3595         prefetchPrefix(src);
3596         emitByte(0x18);
3597         emitOperandHelper(2, src, 0);
3598     }
3599 
3600     public void prefetcht2(AMD64Address src) {
3601         assert supports(CPUFeature.SSE);
3602         prefix(src);
3603         emitByte(0x0f);
3604         emitByte(0x18);
3605         emitOperandHelper(3, src, 0);
3606     }
3607 
3608     public void prefetchw(AMD64Address src) {
3609         assert supports(CPUFeature.AMD_3DNOW_PREFETCH);
3610         prefix(src);
3611         emitByte(0x0f);
3612         emitByte(0x0D);
3613         emitOperandHelper(1, src, 0);
3614     }
3615 
3616     public void rdtsc() {
3617         emitByte(0x0F);
3618         emitByte(0x31);
3619     }
3620 
3621     /**
3622      * Emits an instruction which is considered to be illegal. This is used if we deliberately want
3623      * to crash the program (debugging etc.).
3624      */
3625     public void illegal() {
3626         emitByte(0x0f);
3627         emitByte(0x0b);
3628     }
3629 
3630     public void lfence() {
3631         emitByte(0x0f);
3632         emitByte(0xae);
3633         emitByte(0xe8);
3634     }
3635 
3636     public final void vptest(Register dst, Register src) {
3637         VexRMOp.VPTEST.emit(this, AVXSize.YMM, dst, src);
3638     }
3639 
3640     public final void vpxor(Register dst, Register nds, Register src) {
3641         VexRVMOp.VPXOR.emit(this, AVXSize.YMM, dst, nds, src);
3642     }
3643 
3644     public final void vpxor(Register dst, Register nds, AMD64Address src) {
3645         VexRVMOp.VPXOR.emit(this, AVXSize.YMM, dst, nds, src);
3646     }
3647 
3648     public final void vmovdqu(Register dst, AMD64Address src) {
3649         VexMoveOp.VMOVDQU.emit(this, AVXSize.YMM, dst, src);
3650     }
3651 
3652     public final void vmovdqu(AMD64Address dst, Register src) {
3653         assert inRC(XMM, src);
3654         VexMoveOp.VMOVDQU.emit(this, AVXSize.YMM, dst, src);
3655     }
3656 
3657     public final void vpmovzxbw(Register dst, AMD64Address src) {
3658         assert supports(CPUFeature.AVX2);
3659         VexRMOp.VPMOVZXBW.emit(this, AVXSize.YMM, dst, src);
3660     }
3661 
3662     public final void vzeroupper() {
3663         emitVEX(L128, P_, M_0F, W0, 0, 0, true);
3664         emitByte(0x77);
3665     }
3666 
3667     // Insn: KORTESTD k1, k2
3668 
3669     // This instruction produces ZF or CF flags
3670     public final void kortestd(Register src1, Register src2) {
3671         assert supports(CPUFeature.AVX512BW);
3672         assert inRC(MASK, src1) && inRC(MASK, src2);
3673         // Code: VEX.L0.66.0F.W1 98 /r
3674         vexPrefix(src1, Register.None, src2, AVXSize.XMM, P_66, M_0F, W1, true);
3675         emitByte(0x98);
3676         emitModRM(src1, src2);
3677     }
3678 
3679     // Insn: KORTESTQ k1, k2
3680 
3681     // This instruction produces ZF or CF flags
3682     public final void kortestq(Register src1, Register src2) {
3683         assert supports(CPUFeature.AVX512BW);
3684         assert inRC(MASK, src1) && inRC(MASK, src2);
3685         // Code: VEX.L0.0F.W1 98 /r
3686         vexPrefix(src1, Register.None, src2, AVXSize.XMM, P_, M_0F, W1, true);
3687         emitByte(0x98);
3688         emitModRM(src1, src2);
3689     }
3690 
3691     public final void kmovd(Register dst, Register src) {
3692         assert supports(CPUFeature.AVX512BW);
3693         assert inRC(MASK, dst) || inRC(CPU, dst);
3694         assert inRC(MASK, src) || inRC(CPU, src);
3695         assert !(inRC(CPU, dst) && inRC(CPU, src));
3696 
3697         if (inRC(MASK, dst)) {
3698             if (inRC(MASK, src)) {
3699                 // kmovd(KRegister dst, KRegister src):
3700                 // Insn: KMOVD k1, k2/m32
3701                 // Code: VEX.L0.66.0F.W1 90 /r
3702                 vexPrefix(dst, Register.None, src, AVXSize.XMM, P_66, M_0F, W1, true);
3703                 emitByte(0x90);
3704                 emitModRM(dst, src);
3705             } else {
3706                 // kmovd(KRegister dst, Register src)
3707                 // Insn: KMOVD k1, r32
3708                 // Code: VEX.L0.F2.0F.W0 92 /r
3709                 vexPrefix(dst, Register.None, src, AVXSize.XMM, P_F2, M_0F, W0, true);
3710                 emitByte(0x92);
3711                 emitModRM(dst, src);
3712             }
3713         } else {
3714             if (inRC(MASK, src)) {
3715                 // kmovd(Register dst, KRegister src)
3716                 // Insn: KMOVD r32, k1
3717                 // Code: VEX.L0.F2.0F.W0 93 /r
3718                 vexPrefix(dst, Register.None, src, AVXSize.XMM, P_F2, M_0F, W0, true);
3719                 emitByte(0x93);
3720                 emitModRM(dst, src);
3721             } else {
3722                 throw GraalError.shouldNotReachHere();
3723             }
3724         }
3725     }
3726 
3727     public final void kmovq(Register dst, Register src) {
3728         assert supports(CPUFeature.AVX512BW);
3729         assert inRC(MASK, dst) || inRC(CPU, dst);
3730         assert inRC(MASK, src) || inRC(CPU, src);
3731         assert !(inRC(CPU, dst) && inRC(CPU, src));
3732 
3733         if (inRC(MASK, dst)) {
3734             if (inRC(MASK, src)) {
3735                 // kmovq(KRegister dst, KRegister src):
3736                 // Insn: KMOVQ k1, k2/m64
3737                 // Code: VEX.L0.0F.W1 90 /r
3738                 vexPrefix(dst, Register.None, src, AVXSize.XMM, P_, M_0F, W1, true);
3739                 emitByte(0x90);
3740                 emitModRM(dst, src);
3741             } else {
3742                 // kmovq(KRegister dst, Register src)
3743                 // Insn: KMOVQ k1, r64
3744                 // Code: VEX.L0.F2.0F.W1 92 /r
3745                 vexPrefix(dst, Register.None, src, AVXSize.XMM, P_F2, M_0F, W1, true);
3746                 emitByte(0x92);
3747                 emitModRM(dst, src);
3748             }
3749         } else {
3750             if (inRC(MASK, src)) {
3751                 // kmovq(Register dst, KRegister src)
3752                 // Insn: KMOVQ r64, k1
3753                 // Code: VEX.L0.F2.0F.W1 93 /r
3754                 vexPrefix(dst, Register.None, src, AVXSize.XMM, P_F2, M_0F, W1, true);
3755                 emitByte(0x93);
3756                 emitModRM(dst, src);
3757             } else {
3758                 throw GraalError.shouldNotReachHere();
3759             }
3760         }
3761     }
3762 
3763     // Insn: KTESTD k1, k2
3764 
3765     public final void ktestd(Register src1, Register src2) {
3766         assert supports(CPUFeature.AVX512BW);
3767         assert inRC(MASK, src1) && inRC(MASK, src2);
3768         // Code: VEX.L0.66.0F.W1 99 /r
3769         vexPrefix(src1, Register.None, src2, AVXSize.XMM, P_66, M_0F, W1, true);
3770         emitByte(0x99);
3771         emitModRM(src1, src2);
3772     }
3773 
3774     public final void evmovdqu64(Register dst, AMD64Address src) {
3775         assert supports(CPUFeature.AVX512F);
3776         assert inRC(XMM, dst);
3777         evexPrefix(dst, Register.None, Register.None, src, AVXSize.ZMM, P_F3, M_0F, W1, Z0, B0);
3778         emitByte(0x6F);
3779         emitEVEXOperandHelper(dst, src, 0, EVEXTuple.FVM.getDisp8ScalingFactor(AVXSize.ZMM));
3780     }
3781 
3782     // Insn: VPMOVZXBW zmm1, m256
3783 
3784     public final void evpmovzxbw(Register dst, AMD64Address src) {
3785         assert supports(CPUFeature.AVX512BW);
3786         assert inRC(XMM, dst);
3787         // Code: EVEX.512.66.0F38.WIG 30 /r
3788         evexPrefix(dst, Register.None, Register.None, src, AVXSize.ZMM, P_66, M_0F38, WIG, Z0, B0);
3789         emitByte(0x30);
3790         emitEVEXOperandHelper(dst, src, 0, EVEXTuple.HVM.getDisp8ScalingFactor(AVXSize.ZMM));
3791     }
3792 
3793     public final void evpcmpeqb(Register kdst, Register nds, AMD64Address src) {
3794         assert supports(CPUFeature.AVX512BW);
3795         assert inRC(MASK, kdst) && inRC(XMM, nds);
3796         evexPrefix(kdst, Register.None, nds, src, AVXSize.ZMM, P_66, M_0F, WIG, Z0, B0);
3797         emitByte(0x74);
3798         emitEVEXOperandHelper(kdst, src, 0, EVEXTuple.FVM.getDisp8ScalingFactor(AVXSize.ZMM));
3799     }
3800 
3801     // Insn: VMOVDQU16 zmm1 {k1}{z}, zmm2/m512
3802     // -----
3803     // Insn: VMOVDQU16 zmm1, m512
3804 
3805     public final void evmovdqu16(Register dst, AMD64Address src) {
3806         assert supports(CPUFeature.AVX512BW);
3807         assert inRC(XMM, dst);
3808         // Code: EVEX.512.F2.0F.W1 6F /r
3809         evexPrefix(dst, Register.None, Register.None, src, AVXSize.ZMM, P_F2, M_0F, W1, Z0, B0);
3810         emitByte(0x6F);
3811         emitEVEXOperandHelper(dst, src, 0, EVEXTuple.FVM.getDisp8ScalingFactor(AVXSize.ZMM));
3812     }
3813 
3814     // Insn: VMOVDQU16 zmm1, k1:z, m512
3815 
3816     public final void evmovdqu16(Register dst, Register mask, AMD64Address src) {
3817         assert supports(CPUFeature.AVX512BW);
3818         assert inRC(XMM, dst) && inRC(MASK, mask);
3819         // Code: EVEX.512.F2.0F.W1 6F /r
3820         evexPrefix(dst, mask, Register.None, src, AVXSize.ZMM, P_F2, M_0F, W1, Z1, B0);
3821         emitByte(0x6F);
3822         emitEVEXOperandHelper(dst, src, 0, EVEXTuple.FVM.getDisp8ScalingFactor(AVXSize.ZMM));
3823     }
3824 
3825     // Insn: VMOVDQU16 zmm2/m512 {k1}{z}, zmm1
3826     // -----
3827     // Insn: VMOVDQU16 m512, zmm1
3828 
3829     public final void evmovdqu16(AMD64Address dst, Register src) {
3830         assert supports(CPUFeature.AVX512BW);
3831         assert inRC(XMM, src);
3832         // Code: EVEX.512.F2.0F.W1 7F /r
3833         evexPrefix(src, Register.None, Register.None, dst, AVXSize.ZMM, P_F2, M_0F, W1, Z0, B0);
3834         emitByte(0x7F);
3835         emitEVEXOperandHelper(src, dst, 0, EVEXTuple.FVM.getDisp8ScalingFactor(AVXSize.ZMM));
3836     }
3837 
3838     // Insn: VMOVDQU16 m512, k1, zmm1
3839 
3840     public final void evmovdqu16(AMD64Address dst, Register mask, Register src) {
3841         assert supports(CPUFeature.AVX512BW);
3842         assert inRC(MASK, mask) && inRC(XMM, src);
3843         // Code: EVEX.512.F2.0F.W1 7F /r
3844         evexPrefix(src, mask, Register.None, dst, AVXSize.ZMM, P_F2, M_0F, W1, Z0, B0);
3845         emitByte(0x7F);
3846         emitEVEXOperandHelper(src, dst, 0, EVEXTuple.FVM.getDisp8ScalingFactor(AVXSize.ZMM));
3847     }
3848 
3849     // Insn: VPBROADCASTW zmm1 {k1}{z}, reg
3850     // -----
3851     // Insn: VPBROADCASTW zmm1, reg
3852 
3853     public final void evpbroadcastw(Register dst, Register src) {
3854         assert supports(CPUFeature.AVX512BW);
3855         assert inRC(XMM, dst) && inRC(CPU, src);
3856         // Code: EVEX.512.66.0F38.W0 7B /r
3857         evexPrefix(dst, Register.None, Register.None, src, AVXSize.ZMM, P_66, M_0F38, W0, Z0, B0);
3858         emitByte(0x7B);
3859         emitModRM(dst, src);
3860     }
3861 
3862     // Insn: VPCMPUW k1 {k2}, zmm2, zmm3/m512, imm8
3863     // -----
3864     // Insn: VPCMPUW k1, zmm2, zmm3, imm8
3865 
3866     public final void evpcmpuw(Register kdst, Register nds, Register src, int vcc) {
3867         assert supports(CPUFeature.AVX512BW);
3868         assert inRC(MASK, kdst) && inRC(XMM, nds) && inRC(XMM, src);
3869         // Code: EVEX.NDS.512.66.0F3A.W1 3E /r ib
3870         evexPrefix(kdst, Register.None, nds, src, AVXSize.ZMM, P_66, M_0F3A, W1, Z0, B0);
3871         emitByte(0x3E);
3872         emitModRM(kdst, src);
3873         emitByte(vcc);
3874     }
3875 
3876     // Insn: VPCMPUW k1 {k2}, zmm2, zmm3/m512, imm8
3877     // -----
3878     // Insn: VPCMPUW k1, k2, zmm2, zmm3, imm8
3879 
3880     public final void evpcmpuw(Register kdst, Register mask, Register nds, Register src, int vcc) {
3881         assert supports(CPUFeature.AVX512BW);
3882         assert inRC(MASK, kdst) && inRC(MASK, mask);
3883         assert inRC(XMM, nds) && inRC(XMM, src);
3884         // Code: EVEX.NDS.512.66.0F3A.W1 3E /r ib
3885         evexPrefix(kdst, mask, nds, src, AVXSize.ZMM, P_66, M_0F3A, W1, Z0, B0);
3886         emitByte(0x3E);
3887         emitModRM(kdst, src);
3888         emitByte(vcc);
3889     }
3890 
3891     // Insn: VPMOVWB ymm1/m256 {k1}{z}, zmm2
3892     // -----
3893     // Insn: VPMOVWB m256, zmm2
3894 
3895     public final void evpmovwb(AMD64Address dst, Register src) {
3896         assert supports(CPUFeature.AVX512BW);
3897         assert inRC(XMM, src);
3898         // Code: EVEX.512.F3.0F38.W0 30 /r
3899         evexPrefix(src, Register.None, Register.None, dst, AVXSize.ZMM, P_F3, M_0F38, W0, Z0, B0);
3900         emitByte(0x30);
3901         emitEVEXOperandHelper(src, dst, 0, EVEXTuple.HVM.getDisp8ScalingFactor(AVXSize.ZMM));
3902     }
3903 
3904     // Insn: VPMOVWB m256, k1, zmm2
3905 
3906     public final void evpmovwb(AMD64Address dst, Register mask, Register src) {
3907         assert supports(CPUFeature.AVX512BW);
3908         assert inRC(MASK, mask) && inRC(XMM, src);
3909         // Code: EVEX.512.F3.0F38.W0 30 /r
3910         evexPrefix(src, mask, Register.None, dst, AVXSize.ZMM, P_F3, M_0F38, W0, Z0, B0);
3911         emitByte(0x30);
3912         emitEVEXOperandHelper(src, dst, 0, EVEXTuple.HVM.getDisp8ScalingFactor(AVXSize.ZMM));
3913     }
3914 
3915     // Insn: VPMOVZXBW zmm1 {k1}{z}, ymm2/m256
3916     // -----
3917     // Insn: VPMOVZXBW zmm1, k1, m256
3918 
3919     public final void evpmovzxbw(Register dst, Register mask, AMD64Address src) {
3920         assert supports(CPUFeature.AVX512BW);
3921         assert inRC(MASK, mask) && inRC(XMM, dst);
3922         // Code: EVEX.512.66.0F38.WIG 30 /r
3923         evexPrefix(dst, mask, Register.None, src, AVXSize.ZMM, P_66, M_0F38, WIG, Z0, B0);
3924         emitByte(0x30);
3925         emitEVEXOperandHelper(dst, src, 0, EVEXTuple.HVM.getDisp8ScalingFactor(AVXSize.ZMM));
3926     }
3927 
3928 }