/*
 * Copyright (c) 2009, 2019, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */


package org.graalvm.compiler.asm.amd64;

import static jdk.vm.ci.amd64.AMD64.CPU;
import static jdk.vm.ci.amd64.AMD64.MASK;
import static jdk.vm.ci.amd64.AMD64.XMM;
import static jdk.vm.ci.code.MemoryBarriers.STORE_LOAD;
import static org.graalvm.compiler.asm.amd64.AMD64AsmOptions.UseAddressNop;
import static org.graalvm.compiler.asm.amd64.AMD64AsmOptions.UseNormalNop;
import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.ADD;
import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.AND;
import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.CMP;
import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.OR;
import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.SBB;
import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.SUB;
import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.XOR;
import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64MOp.DEC;
import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64MOp.INC;
import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64MOp.NEG;
import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64MOp.NOT;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.EVEXPrefixConfig.B0;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.EVEXPrefixConfig.Z0;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.EVEXPrefixConfig.Z1;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.BYTE;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.DWORD;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.PD;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.PS;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.QWORD;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.SD;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.SS;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.WORD;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.L128;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.L256;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.LZ;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.M_0F;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.M_0F38;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.M_0F3A;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.P_;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.P_66;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.P_F2;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.P_F3;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.W0;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.W1;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.WIG;
import static org.graalvm.compiler.core.common.NumUtil.isByte;
import static org.graalvm.compiler.core.common.NumUtil.isInt;
import static org.graalvm.compiler.core.common.NumUtil.isShiftCount;
import static org.graalvm.compiler.core.common.NumUtil.isUByte;

import java.util.EnumSet;

import org.graalvm.compiler.asm.Label;
import org.graalvm.compiler.asm.amd64.AMD64Address.Scale;
import org.graalvm.compiler.asm.amd64.AVXKind.AVXSize;
import org.graalvm.compiler.core.common.calc.Condition;
import org.graalvm.compiler.debug.GraalError;

import jdk.vm.ci.amd64.AMD64;
import jdk.vm.ci.amd64.AMD64.CPUFeature;
import jdk.vm.ci.code.Register;
import jdk.vm.ci.code.Register.RegisterCategory;
import jdk.vm.ci.code.TargetDescription;

/**
 * This class implements an assembler that can encode most X86 instructions.
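 * <p>
 * A minimal usage sketch (assuming a suitable {@code TargetDescription}; the registers come
 * from {@link jdk.vm.ci.amd64.AMD64}):
 *
 * <pre>
 * AMD64Assembler asm = new AMD64Assembler(target);
 * // mov eax, ebx, emitted through the opcode table
 * AMD64RMOp.MOV.emit(asm, OperandSize.DWORD, AMD64.rax, AMD64.rbx);
 * </pre>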
 */
public class AMD64Assembler extends AMD64BaseAssembler {

    /**
     * Constructs an assembler for the AMD64 architecture.
     */
    public AMD64Assembler(TargetDescription target) {
        super(target);
    }

    /**
     * The x86 condition codes used for conditional jumps/moves.
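     * <p>
     * The numeric value is the condition-code nibble that is combined with the base opcode of
     * the {@code Jcc}, {@code SETcc} and {@code CMOVcc} families (e.g. {@code 0x70 | value} for
     * a short conditional jump).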
     */
    public enum ConditionFlag {
        Zero(0x4, "|zero|"),
        NotZero(0x5, "|nzero|"),
        Equal(0x4, "="),
        NotEqual(0x5, "!="),
        Less(0xc, "<"),
        LessEqual(0xe, "<="),
        Greater(0xf, ">"),
        GreaterEqual(0xd, ">="),
        Below(0x2, "|<|"),
        BelowEqual(0x6, "|<=|"),
        Above(0x7, "|>|"),
        AboveEqual(0x3, "|>=|"),
        Overflow(0x0, "|of|"),
        NoOverflow(0x1, "|nof|"),
        CarrySet(0x2, "|carry|"),
        CarryClear(0x3, "|ncarry|"),
        Negative(0x8, "|neg|"),
        Positive(0x9, "|pos|"),
        Parity(0xa, "|par|"),
        NoParity(0xb, "|npar|");

        private final int value;
        private final String operator;

        ConditionFlag(int value, String operator) {
            this.value = value;
            this.operator = operator;
        }

        public ConditionFlag negate() {
            switch (this) {
                case Zero:
                    return NotZero;
                case NotZero:
                    return Zero;
                case Equal:
                    return NotEqual;
                case NotEqual:
                    return Equal;
                case Less:
                    return GreaterEqual;
                case LessEqual:
                    return Greater;
                case Greater:
                    return LessEqual;
                case GreaterEqual:
                    return Less;
                case Below:
                    return AboveEqual;
                case BelowEqual:
                    return Above;
                case Above:
                    return BelowEqual;
                case AboveEqual:
                    return Below;
                case Overflow:
                    return NoOverflow;
                case NoOverflow:
                    return Overflow;
                case CarrySet:
                    return CarryClear;
                case CarryClear:
                    return CarrySet;
                case Negative:
                    return Positive;
                case Positive:
                    return Negative;
                case Parity:
                    return NoParity;
                case NoParity:
                    return Parity;
            }
            throw new IllegalArgumentException();
        }

        public int getValue() {
            return value;
        }

        @Override
        public String toString() {
            return operator;
        }
    }

    /**
     * Operand size and register type constraints.
     */
    private enum OpAssertion {
        ByteAssertion(CPU, CPU, BYTE),
        ByteOrLargerAssertion(CPU, CPU, BYTE, WORD, DWORD, QWORD),
        WordOrLargerAssertion(CPU, CPU, WORD, DWORD, QWORD),
        DwordOrLargerAssertion(CPU, CPU, DWORD, QWORD),
        WordOrDwordAssertion(CPU, CPU, WORD, QWORD), // despite the name: used by POP, which only allows WORD and QWORD operands in 64-bit mode
        QwordAssertion(CPU, CPU, QWORD),
        FloatAssertion(XMM, XMM, SS, SD, PS, PD),
        PackedFloatAssertion(XMM, XMM, PS, PD),
        SingleAssertion(XMM, XMM, SS),
        DoubleAssertion(XMM, XMM, SD),
        PackedDoubleAssertion(XMM, XMM, PD),
        IntToFloatAssertion(XMM, CPU, DWORD, QWORD),
        FloatToIntAssertion(CPU, XMM, DWORD, QWORD);

        private final RegisterCategory resultCategory;
        private final RegisterCategory inputCategory;
        private final OperandSize[] allowedSizes;

        OpAssertion(RegisterCategory resultCategory, RegisterCategory inputCategory, OperandSize... allowedSizes) {
            this.resultCategory = resultCategory;
            this.inputCategory = inputCategory;
            this.allowedSizes = allowedSizes;
        }

        protected boolean checkOperands(AMD64Op op, OperandSize size, Register resultReg, Register inputReg) {
            assert resultReg == null || resultCategory.equals(resultReg.getRegisterCategory()) : "invalid result register " + resultReg + " used in " + op;
            assert inputReg == null || inputCategory.equals(inputReg.getRegisterCategory()) : "invalid input register " + inputReg + " used in " + op;

            for (OperandSize s : allowedSizes) {
                if (size == s) {
                    return true;
                }
            }

            assert false : "invalid operand size " + size + " used in " + op;
            return false;
        }

    }

    protected static final int P_0F = 0x0F;
    protected static final int P_0F38 = 0x380F;
    protected static final int P_0F3A = 0x3A0F;
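    // The two-byte opcode map prefixes above are stored byte-swapped because emitShort writes
    // little-endian: 0x380F is emitted as 0x0F 0x38, and 0x3A0F as 0x0F 0x3A.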

    /**
     * Base class for AMD64 opcodes.
     */
    public static class AMD64Op {

        private final String opcode;

        protected final int prefix1;
        protected final int prefix2;
        protected final int op;

        private final boolean dstIsByte;
        private final boolean srcIsByte;

        private final OpAssertion assertion;
        private final CPUFeature feature;

        protected AMD64Op(String opcode, int prefix1, int prefix2, int op, OpAssertion assertion, CPUFeature feature) {
            this(opcode, prefix1, prefix2, op, assertion == OpAssertion.ByteAssertion, assertion == OpAssertion.ByteAssertion, assertion, feature);
        }

        protected AMD64Op(String opcode, int prefix1, int prefix2, int op, boolean dstIsByte, boolean srcIsByte, OpAssertion assertion, CPUFeature feature) {
            this.opcode = opcode;
            this.prefix1 = prefix1;
            this.prefix2 = prefix2;
            this.op = op;

            this.dstIsByte = dstIsByte;
            this.srcIsByte = srcIsByte;

            this.assertion = assertion;
            this.feature = feature;
        }

        protected final void emitOpcode(AMD64Assembler asm, OperandSize size, int rxb, int dstEnc, int srcEnc) {
            if (prefix1 != 0) {
                asm.emitByte(prefix1);
            }
            if (size.getSizePrefix() != 0) {
                asm.emitByte(size.getSizePrefix());
            }
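            // The REX prefix has the layout 0b0100WRXB: rxb supplies the R, X and B
            // register-extension bits, and the W bit (0x08) is set below for 64-bit operand size.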
            int rexPrefix = 0x40 | rxb;
            if (size == QWORD) {
                rexPrefix |= 0x08;
            }
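            // A REX prefix (even a plain 0x40) is also required when a byte register with
            // encoding >= 4 is used, since without REX those encodings select AH, CH, DH and BH
            // rather than SPL, BPL, SIL and DIL.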
            if (rexPrefix != 0x40 || (dstIsByte && dstEnc >= 4) || (srcIsByte && srcEnc >= 4)) {
                asm.emitByte(rexPrefix);
            }
            if (prefix2 > 0xFF) {
                asm.emitShort(prefix2);
            } else if (prefix2 > 0) {
                asm.emitByte(prefix2);
            }
            asm.emitByte(op);
        }

        protected final boolean verify(AMD64Assembler asm, OperandSize size, Register resultReg, Register inputReg) {
            assert feature == null || asm.supports(feature) : String.format("unsupported feature %s required for %s", feature, opcode);
            assert assertion.checkOperands(this, size, resultReg, inputReg);
            return true;
        }

        public OperandSize[] getAllowedSizes() {
            return assertion.allowedSizes;
        }

        protected final boolean isSSEInstruction() {
            if (feature == null) {
                return false;
            }
            switch (feature) {
                case SSE:
                case SSE2:
                case SSE3:
                case SSSE3:
                case SSE4A:
                case SSE4_1:
                case SSE4_2:
                    return true;
                default:
                    return false;
            }
        }

        public final OpAssertion getAssertion() {
            return assertion;
        }

        @Override
        public String toString() {
            return opcode;
        }
    }

    /**
     * Base class for AMD64 opcodes with immediate operands.
     */
    public static class AMD64ImmOp extends AMD64Op {

        private final boolean immIsByte;

        protected AMD64ImmOp(String opcode, boolean immIsByte, int prefix, int op, OpAssertion assertion) {
            this(opcode, immIsByte, prefix, op, assertion, null);
        }

        protected AMD64ImmOp(String opcode, boolean immIsByte, int prefix, int op, OpAssertion assertion, CPUFeature feature) {
            super(opcode, 0, prefix, op, assertion, feature);
            this.immIsByte = immIsByte;
        }

        protected final void emitImmediate(AMD64Assembler asm, OperandSize size, int imm) {
            if (immIsByte) {
                assert imm == (byte) imm;
                asm.emitByte(imm);
            } else {
                size.emitImmediate(asm, imm);
            }
        }

        protected final int immediateSize(OperandSize size) {
            if (immIsByte) {
                return 1;
            } else {
                return size.getBytes();
            }
        }
    }

    /**
     * Opcode with operand order of either RM or MR for 2 address forms.
     */
    public abstract static class AMD64RROp extends AMD64Op {

        protected AMD64RROp(String opcode, int prefix1, int prefix2, int op, OpAssertion assertion, CPUFeature feature) {
            super(opcode, prefix1, prefix2, op, assertion, feature);
        }

        protected AMD64RROp(String opcode, int prefix1, int prefix2, int op, boolean dstIsByte, boolean srcIsByte, OpAssertion assertion, CPUFeature feature) {
            super(opcode, prefix1, prefix2, op, dstIsByte, srcIsByte, assertion, feature);
        }

        public abstract void emit(AMD64Assembler asm, OperandSize size, Register dst, Register src);
    }

    /**
     * Opcode with operand order of RM.
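     * <p>
     * In RM order the reg field of the ModRM byte is the destination; for example (a sketch,
     * with registers of the caller's choosing), {@code AMD64RMOp.MOV.emit(asm, DWORD, dst, src)}
     * assembles a register-to-register {@code mov dst, src}.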
     */
    public static class AMD64RMOp extends AMD64RROp {
        // @formatter:off
        public static final AMD64RMOp IMUL   = new AMD64RMOp("IMUL",         P_0F, 0xAF, OpAssertion.ByteOrLargerAssertion);
        public static final AMD64RMOp BSF    = new AMD64RMOp("BSF",          P_0F, 0xBC);
        public static final AMD64RMOp BSR    = new AMD64RMOp("BSR",          P_0F, 0xBD);
        // POPCNT, TZCNT, and LZCNT support word operation. However, the legacy size prefix should
        // be emitted before the mandatory prefix 0xF3. Since we are not emitting bit count for
        // 16-bit operands, here we simply use DwordOrLargerAssertion.
        public static final AMD64RMOp POPCNT = new AMD64RMOp("POPCNT", 0xF3, P_0F, 0xB8, OpAssertion.DwordOrLargerAssertion, CPUFeature.POPCNT);
        public static final AMD64RMOp TZCNT  = new AMD64RMOp("TZCNT",  0xF3, P_0F, 0xBC, OpAssertion.DwordOrLargerAssertion, CPUFeature.BMI1);
        public static final AMD64RMOp LZCNT  = new AMD64RMOp("LZCNT",  0xF3, P_0F, 0xBD, OpAssertion.DwordOrLargerAssertion, CPUFeature.LZCNT);
        public static final AMD64RMOp MOVZXB = new AMD64RMOp("MOVZXB",       P_0F, 0xB6, false, true, OpAssertion.WordOrLargerAssertion);
        public static final AMD64RMOp MOVZX  = new AMD64RMOp("MOVZX",        P_0F, 0xB7, OpAssertion.DwordOrLargerAssertion);
        public static final AMD64RMOp MOVSXB = new AMD64RMOp("MOVSXB",       P_0F, 0xBE, false, true, OpAssertion.WordOrLargerAssertion);
        public static final AMD64RMOp MOVSX  = new AMD64RMOp("MOVSX",        P_0F, 0xBF, OpAssertion.DwordOrLargerAssertion);
        public static final AMD64RMOp MOVSXD = new AMD64RMOp("MOVSXD",             0x63, OpAssertion.QwordAssertion);
        public static final AMD64RMOp MOVB   = new AMD64RMOp("MOVB",               0x8A, OpAssertion.ByteAssertion);
        public static final AMD64RMOp MOV    = new AMD64RMOp("MOV",                0x8B);
        public static final AMD64RMOp CMP    = new AMD64RMOp("CMP",                0x3B);

        // MOVD/MOVQ and MOVSS/MOVSD are the same opcode, just with different operand size prefix
        public static final AMD64RMOp MOVD   = new AMD64RMOp("MOVD",   0x66, P_0F, 0x6E, OpAssertion.IntToFloatAssertion, CPUFeature.SSE2);
        public static final AMD64RMOp MOVQ   = new AMD64RMOp("MOVQ",   0x66, P_0F, 0x6E, OpAssertion.IntToFloatAssertion, CPUFeature.SSE2);
        public static final AMD64RMOp MOVSS  = new AMD64RMOp("MOVSS",        P_0F, 0x10, OpAssertion.FloatAssertion, CPUFeature.SSE);
        public static final AMD64RMOp MOVSD  = new AMD64RMOp("MOVSD",        P_0F, 0x10, OpAssertion.FloatAssertion, CPUFeature.SSE);

        // TEST is documented as MR operation, but it's symmetric, and using it as RM operation is more convenient.
        public static final AMD64RMOp TESTB  = new AMD64RMOp("TEST",               0x84, OpAssertion.ByteAssertion);
        public static final AMD64RMOp TEST   = new AMD64RMOp("TEST",               0x85);
        // @formatter:on

        protected AMD64RMOp(String opcode, int op) {
            this(opcode, 0, op);
        }

        protected AMD64RMOp(String opcode, int op, OpAssertion assertion) {
            this(opcode, 0, op, assertion);
        }

        protected AMD64RMOp(String opcode, int prefix, int op) {
            this(opcode, 0, prefix, op, null);
        }

        protected AMD64RMOp(String opcode, int prefix, int op, OpAssertion assertion) {
            this(opcode, 0, prefix, op, assertion, null);
        }

        protected AMD64RMOp(String opcode, int prefix, int op, OpAssertion assertion, CPUFeature feature) {
            this(opcode, 0, prefix, op, assertion, feature);
        }

        protected AMD64RMOp(String opcode, int prefix, int op, boolean dstIsByte, boolean srcIsByte, OpAssertion assertion) {
            super(opcode, 0, prefix, op, dstIsByte, srcIsByte, assertion, null);
        }

        protected AMD64RMOp(String opcode, int prefix1, int prefix2, int op, CPUFeature feature) {
            this(opcode, prefix1, prefix2, op, OpAssertion.WordOrLargerAssertion, feature);
        }

        protected AMD64RMOp(String opcode, int prefix1, int prefix2, int op, OpAssertion assertion, CPUFeature feature) {
            super(opcode, prefix1, prefix2, op, assertion, feature);
        }

        @Override
        public final void emit(AMD64Assembler asm, OperandSize size, Register dst, Register src) {
            assert verify(asm, size, dst, src);
            if (isSSEInstruction()) {
                Register nds = Register.None;
                switch (op) {
                    case 0x10:
                    case 0x51:
                        if ((size == SS) || (size == SD)) {
                            nds = dst;
                        }
                        break;
                    case 0x2A:
                    case 0x54:
                    case 0x55:
                    case 0x56:
                    case 0x57:
                    case 0x58:
                    case 0x59:
                    case 0x5A:
                    case 0x5C:
                    case 0x5D:
                    case 0x5E:
                    case 0x5F:
                        nds = dst;
                        break;
                    default:
                        break;
                }
                asm.simdPrefix(dst, nds, src, size, prefix1, prefix2, size == QWORD);
                asm.emitByte(op);
                asm.emitModRM(dst, src);
            } else {
                emitOpcode(asm, size, getRXB(dst, src), dst.encoding, src.encoding);
                asm.emitModRM(dst, src);
            }
        }

        public final void emit(AMD64Assembler asm, OperandSize size, Register dst, AMD64Address src) {
            assert verify(asm, size, dst, null);
            if (isSSEInstruction()) {
                Register nds = Register.None;
                switch (op) {
                    case 0x51:
                        if ((size == SS) || (size == SD)) {
                            nds = dst;
                        }
                        break;
                    case 0x2A:
                    case 0x54:
                    case 0x55:
                    case 0x56:
                    case 0x57:
                    case 0x58:
                    case 0x59:
                    case 0x5A:
                    case 0x5C:
                    case 0x5D:
                    case 0x5E:
                    case 0x5F:
                        nds = dst;
                        break;
                    default:
                        break;
                }
                asm.simdPrefix(dst, nds, src, size, prefix1, prefix2, size == QWORD);
                asm.emitByte(op);
                asm.emitOperandHelper(dst, src, 0);
            } else {
                emitOpcode(asm, size, getRXB(dst, src), dst.encoding, 0);
                asm.emitOperandHelper(dst, src, 0);
            }
        }
    }

    /**
     * Opcode with operand order of MR.
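     * <p>
     * In MR order the ModRM r/m operand is the destination; for example (a sketch),
     * {@code AMD64MROp.MOV.emit(asm, DWORD, address, src)} assembles a store of {@code src} to
     * {@code address}.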
     */
    public static class AMD64MROp extends AMD64RROp {
        // @formatter:off
        public static final AMD64MROp MOVB   = new AMD64MROp("MOVB",               0x88, OpAssertion.ByteAssertion);
        public static final AMD64MROp MOV    = new AMD64MROp("MOV",                0x89);

        // MOVD and MOVQ are the same opcode, just with different operand size prefix
        // Note that as MR opcodes, they have reverse operand order, so the IntToFloatAssertion must be used.
        public static final AMD64MROp MOVD   = new AMD64MROp("MOVD",   0x66, P_0F, 0x7E, OpAssertion.IntToFloatAssertion, CPUFeature.SSE2);
        public static final AMD64MROp MOVQ   = new AMD64MROp("MOVQ",   0x66, P_0F, 0x7E, OpAssertion.IntToFloatAssertion, CPUFeature.SSE2);

        // MOVSS and MOVSD are the same opcode, just with different operand size prefix
        public static final AMD64MROp MOVSS  = new AMD64MROp("MOVSS",        P_0F, 0x11, OpAssertion.FloatAssertion, CPUFeature.SSE);
        public static final AMD64MROp MOVSD  = new AMD64MROp("MOVSD",        P_0F, 0x11, OpAssertion.FloatAssertion, CPUFeature.SSE);
        // @formatter:on

        protected AMD64MROp(String opcode, int op) {
            this(opcode, 0, op);
        }

        protected AMD64MROp(String opcode, int op, OpAssertion assertion) {
            this(opcode, 0, op, assertion);
        }

        protected AMD64MROp(String opcode, int prefix, int op) {
            this(opcode, prefix, op, OpAssertion.WordOrLargerAssertion);
        }

        protected AMD64MROp(String opcode, int prefix, int op, OpAssertion assertion) {
            this(opcode, prefix, op, assertion, null);
        }

        protected AMD64MROp(String opcode, int prefix, int op, OpAssertion assertion, CPUFeature feature) {
            this(opcode, 0, prefix, op, assertion, feature);
        }

        protected AMD64MROp(String opcode, int prefix1, int prefix2, int op, OpAssertion assertion, CPUFeature feature) {
            super(opcode, prefix1, prefix2, op, assertion, feature);
        }

        @Override
        public final void emit(AMD64Assembler asm, OperandSize size, Register dst, Register src) {
            assert verify(asm, size, src, dst);
            if (isSSEInstruction()) {
                Register nds = Register.None;
                switch (op) {
                    case 0x11:
                        if ((size == SS) || (size == SD)) {
                            nds = src;
                        }
                        break;
                    default:
                        break;
                }
                asm.simdPrefix(src, nds, dst, size, prefix1, prefix2, size == QWORD);
                asm.emitByte(op);
                asm.emitModRM(src, dst);
            } else {
                emitOpcode(asm, size, getRXB(src, dst), src.encoding, dst.encoding);
                asm.emitModRM(src, dst);
            }
        }

        public final void emit(AMD64Assembler asm, OperandSize size, AMD64Address dst, Register src) {
            assert verify(asm, size, src, null);
            if (isSSEInstruction()) {
                asm.simdPrefix(src, Register.None, dst, size, prefix1, prefix2, size == QWORD);
                asm.emitByte(op);
            } else {
                emitOpcode(asm, size, getRXB(src, dst), src.encoding, 0);
            }
            asm.emitOperandHelper(src, dst, 0);
        }
    }

    /**
     * Opcodes with operand order of M.
     */
    public static class AMD64MOp extends AMD64Op {
        // @formatter:off
        public static final AMD64MOp NOT  = new AMD64MOp("NOT",  0xF7, 2);
        public static final AMD64MOp NEG  = new AMD64MOp("NEG",  0xF7, 3);
        public static final AMD64MOp MUL  = new AMD64MOp("MUL",  0xF7, 4);
        public static final AMD64MOp IMUL = new AMD64MOp("IMUL", 0xF7, 5);
        public static final AMD64MOp DIV  = new AMD64MOp("DIV",  0xF7, 6);
        public static final AMD64MOp IDIV = new AMD64MOp("IDIV", 0xF7, 7);
        public static final AMD64MOp INC  = new AMD64MOp("INC",  0xFF, 0);
        public static final AMD64MOp DEC  = new AMD64MOp("DEC",  0xFF, 1);
        public static final AMD64MOp PUSH = new AMD64MOp("PUSH", 0xFF, 6);
        public static final AMD64MOp POP  = new AMD64MOp("POP",  0x8F, 0, OpAssertion.WordOrDwordAssertion);
        // @formatter:on

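        // The opcode extension encoded in the reg field of the ModRM byte (the "/digit" of the
        // Intel manuals).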
        private final int ext;

        protected AMD64MOp(String opcode, int op, int ext) {
            this(opcode, 0, op, ext);
        }

        protected AMD64MOp(String opcode, int prefix, int op, int ext) {
            this(opcode, prefix, op, ext, OpAssertion.WordOrLargerAssertion);
        }

        protected AMD64MOp(String opcode, int op, int ext, OpAssertion assertion) {
            this(opcode, 0, op, ext, assertion);
        }

        protected AMD64MOp(String opcode, int prefix, int op, int ext, OpAssertion assertion) {
            super(opcode, 0, prefix, op, assertion, null);
            this.ext = ext;
        }

        public final void emit(AMD64Assembler asm, OperandSize size, Register dst) {
            assert verify(asm, size, dst, null);
            emitOpcode(asm, size, getRXB(null, dst), 0, dst.encoding);
            asm.emitModRM(ext, dst);
        }

        public final void emit(AMD64Assembler asm, OperandSize size, AMD64Address dst) {
            assert verify(asm, size, null, null);
            emitOpcode(asm, size, getRXB(null, dst), 0, 0);
            asm.emitOperandHelper(ext, dst, 0);
        }
    }

    /**
     * Opcodes with operand order of MI.
     */
    public static class AMD64MIOp extends AMD64ImmOp {
        // @formatter:off
        public static final AMD64MIOp MOVB = new AMD64MIOp("MOVB", true,  0xC6, 0, OpAssertion.ByteAssertion);
        public static final AMD64MIOp MOV  = new AMD64MIOp("MOV",  false, 0xC7, 0);
        public static final AMD64MIOp TEST = new AMD64MIOp("TEST", false, 0xF7, 0);
        // @formatter:on

        private final int ext;

        protected AMD64MIOp(String opcode, boolean immIsByte, int op, int ext) {
            this(opcode, immIsByte, op, ext, OpAssertion.WordOrLargerAssertion);
        }

        protected AMD64MIOp(String opcode, boolean immIsByte, int op, int ext, OpAssertion assertion) {
            this(opcode, immIsByte, 0, op, ext, assertion);
        }

        protected AMD64MIOp(String opcode, boolean immIsByte, int prefix, int op, int ext, OpAssertion assertion) {
            super(opcode, immIsByte, prefix, op, assertion);
            this.ext = ext;
        }

        public final void emit(AMD64Assembler asm, OperandSize size, Register dst, int imm) {
            emit(asm, size, dst, imm, false);
        }

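        // If annotateImm is set, the position and size of the immediate are reported to the
        // code patching annotation consumer so that the emitted value can be patched later.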
        public final void emit(AMD64Assembler asm, OperandSize size, Register dst, int imm, boolean annotateImm) {
            assert verify(asm, size, dst, null);
            int insnPos = asm.position();
            emitOpcode(asm, size, getRXB(null, dst), 0, dst.encoding);
            asm.emitModRM(ext, dst);
            int immPos = asm.position();
            emitImmediate(asm, size, imm);
            int nextInsnPos = asm.position();
            if (annotateImm && asm.codePatchingAnnotationConsumer != null) {
                asm.codePatchingAnnotationConsumer.accept(new OperandDataAnnotation(insnPos, immPos, nextInsnPos - immPos, nextInsnPos));
            }
        }

        public final void emit(AMD64Assembler asm, OperandSize size, AMD64Address dst, int imm) {
            emit(asm, size, dst, imm, false);
        }

        public final void emit(AMD64Assembler asm, OperandSize size, AMD64Address dst, int imm, boolean annotateImm) {
            assert verify(asm, size, null, null);
            int insnPos = asm.position();
            emitOpcode(asm, size, getRXB(null, dst), 0, 0);
            asm.emitOperandHelper(ext, dst, immediateSize(size));
            int immPos = asm.position();
            emitImmediate(asm, size, imm);
            int nextInsnPos = asm.position();
            if (annotateImm && asm.codePatchingAnnotationConsumer != null) {
                asm.codePatchingAnnotationConsumer.accept(new OperandDataAnnotation(insnPos, immPos, nextInsnPos - immPos, nextInsnPos));
            }
        }
    }

    /**
     * Opcodes with operand order of RMI.
     *
     * We provide only one form of the ROUND instructions: the operation always consumes a single
     * source operand, so extending it to a three-address form would be redundant.
     */
    public static class AMD64RMIOp extends AMD64ImmOp {
        // @formatter:off
        public static final AMD64RMIOp IMUL    = new AMD64RMIOp("IMUL", false, 0x69);
        public static final AMD64RMIOp IMUL_SX = new AMD64RMIOp("IMUL", true,  0x6B);
        public static final AMD64RMIOp ROUNDSS = new AMD64RMIOp("ROUNDSS", true, P_0F3A, 0x0A, OpAssertion.PackedDoubleAssertion, CPUFeature.SSE4_1);
        public static final AMD64RMIOp ROUNDSD = new AMD64RMIOp("ROUNDSD", true, P_0F3A, 0x0B, OpAssertion.PackedDoubleAssertion, CPUFeature.SSE4_1);
        // @formatter:on

        protected AMD64RMIOp(String opcode, boolean immIsByte, int op) {
            this(opcode, immIsByte, 0, op, OpAssertion.WordOrLargerAssertion, null);
        }

        protected AMD64RMIOp(String opcode, boolean immIsByte, int prefix, int op, OpAssertion assertion, CPUFeature feature) {
            super(opcode, immIsByte, prefix, op, assertion, feature);
        }

        public final void emit(AMD64Assembler asm, OperandSize size, Register dst, Register src, int imm) {
            assert verify(asm, size, dst, src);
            if (isSSEInstruction()) {
                Register nds = Register.None;
                switch (op) {
                    case 0x0A:
                    case 0x0B:
                        nds = dst;
                        break;
                    default:
                        break;
                }
                asm.simdPrefix(dst, nds, src, size, prefix1, prefix2, false);
                asm.emitByte(op);
                asm.emitModRM(dst, src);
            } else {
                emitOpcode(asm, size, getRXB(dst, src), dst.encoding, src.encoding);
                asm.emitModRM(dst, src);
            }
            emitImmediate(asm, size, imm);
        }

        public final void emit(AMD64Assembler asm, OperandSize size, Register dst, AMD64Address src, int imm) {
            assert verify(asm, size, dst, null);
            if (isSSEInstruction()) {
                Register nds = Register.None;
                switch (op) {
                    case 0x0A:
                    case 0x0B:
                        nds = dst;
                        break;
                    default:
                        break;
                }
                asm.simdPrefix(dst, nds, src, size, prefix1, prefix2, false);
                asm.emitByte(op);
            } else {
                emitOpcode(asm, size, getRXB(dst, src), dst.encoding, 0);
            }
            asm.emitOperandHelper(dst, src, immediateSize(size));
            emitImmediate(asm, size, imm);
        }
    }

    public static class SSEOp extends AMD64RMOp {
        // @formatter:off
        public static final SSEOp CVTSI2SS  = new SSEOp("CVTSI2SS",  0xF3, P_0F, 0x2A, OpAssertion.IntToFloatAssertion);
        public static final SSEOp CVTSI2SD  = new SSEOp("CVTSI2SD",  0xF2, P_0F, 0x2A, OpAssertion.IntToFloatAssertion);
        public static final SSEOp CVTTSS2SI = new SSEOp("CVTTSS2SI", 0xF3, P_0F, 0x2C, OpAssertion.FloatToIntAssertion);
        public static final SSEOp CVTTSD2SI = new SSEOp("CVTTSD2SI", 0xF2, P_0F, 0x2C, OpAssertion.FloatToIntAssertion);
        public static final SSEOp UCOMIS    = new SSEOp("UCOMIS",          P_0F, 0x2E, OpAssertion.PackedFloatAssertion);
        public static final SSEOp SQRT      = new SSEOp("SQRT",            P_0F, 0x51);
        public static final SSEOp AND       = new SSEOp("AND",             P_0F, 0x54, OpAssertion.PackedFloatAssertion);
        public static final SSEOp ANDN      = new SSEOp("ANDN",            P_0F, 0x55, OpAssertion.PackedFloatAssertion);
        public static final SSEOp OR        = new SSEOp("OR",              P_0F, 0x56, OpAssertion.PackedFloatAssertion);
        public static final SSEOp XOR       = new SSEOp("XOR",             P_0F, 0x57, OpAssertion.PackedFloatAssertion);
        public static final SSEOp ADD       = new SSEOp("ADD",             P_0F, 0x58);
        public static final SSEOp MUL       = new SSEOp("MUL",             P_0F, 0x59);
        public static final SSEOp CVTSS2SD  = new SSEOp("CVTSS2SD",        P_0F, 0x5A, OpAssertion.SingleAssertion);
        public static final SSEOp CVTSD2SS  = new SSEOp("CVTSD2SS",        P_0F, 0x5A, OpAssertion.DoubleAssertion);
        public static final SSEOp SUB       = new SSEOp("SUB",             P_0F, 0x5C);
        public static final SSEOp MIN       = new SSEOp("MIN",             P_0F, 0x5D);
        public static final SSEOp DIV       = new SSEOp("DIV",             P_0F, 0x5E);
        public static final SSEOp MAX       = new SSEOp("MAX",             P_0F, 0x5F);
        // @formatter:on

        protected SSEOp(String opcode, int prefix, int op) {
            this(opcode, prefix, op, OpAssertion.FloatAssertion);
        }

        protected SSEOp(String opcode, int prefix, int op, OpAssertion assertion) {
            this(opcode, 0, prefix, op, assertion);
        }

        protected SSEOp(String opcode, int mandatoryPrefix, int prefix, int op, OpAssertion assertion) {
            super(opcode, mandatoryPrefix, prefix, op, assertion, CPUFeature.SSE2);
        }
    }

    /**
     * Arithmetic operation with operand order of RM, MR or MI.
     */
    public static final class AMD64BinaryArithmetic {
        // @formatter:off
        public static final AMD64BinaryArithmetic ADD = new AMD64BinaryArithmetic("ADD", 0);
        public static final AMD64BinaryArithmetic OR  = new AMD64BinaryArithmetic("OR",  1);
        public static final AMD64BinaryArithmetic ADC = new AMD64BinaryArithmetic("ADC", 2);
        public static final AMD64BinaryArithmetic SBB = new AMD64BinaryArithmetic("SBB", 3);
        public static final AMD64BinaryArithmetic AND = new AMD64BinaryArithmetic("AND", 4);
        public static final AMD64BinaryArithmetic SUB = new AMD64BinaryArithmetic("SUB", 5);
        public static final AMD64BinaryArithmetic XOR = new AMD64BinaryArithmetic("XOR", 6);
        public static final AMD64BinaryArithmetic CMP = new AMD64BinaryArithmetic("CMP", 7);
        // @formatter:on

        private final AMD64MIOp byteImmOp;
        private final AMD64MROp byteMrOp;
        private final AMD64RMOp byteRmOp;

        private final AMD64MIOp immOp;
        private final AMD64MIOp immSxOp;
        private final AMD64MROp mrOp;
        private final AMD64RMOp rmOp;

        private AMD64BinaryArithmetic(String opcode, int code) {
            int baseOp = code << 3;
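            // The eight ALU operations share a regular encoding: (code << 3) + 0x00/0x01 is the
            // MR form (byte/full size), + 0x02/0x03 the RM form, while the immediate forms use
            // opcode 0x80 (byte), 0x81 (full) or 0x83 (sign-extended byte) with the operation
            // selected by the reg field of the ModRM byte.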

            byteImmOp = new AMD64MIOp(opcode, true, 0, 0x80, code, OpAssertion.ByteAssertion);
            byteMrOp = new AMD64MROp(opcode, 0, baseOp, OpAssertion.ByteAssertion);
            byteRmOp = new AMD64RMOp(opcode, 0, baseOp | 0x02, OpAssertion.ByteAssertion);

            immOp = new AMD64MIOp(opcode, false, 0, 0x81, code, OpAssertion.WordOrLargerAssertion);
            immSxOp = new AMD64MIOp(opcode, true, 0, 0x83, code, OpAssertion.WordOrLargerAssertion);
            mrOp = new AMD64MROp(opcode, 0, baseOp | 0x01, OpAssertion.WordOrLargerAssertion);
            rmOp = new AMD64RMOp(opcode, 0, baseOp | 0x03, OpAssertion.WordOrLargerAssertion);
        }

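        // sx selects the sign-extended 8-bit immediate form (opcode 0x83) for non-byte sizes.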
        public AMD64MIOp getMIOpcode(OperandSize size, boolean sx) {
            if (size == BYTE) {
                return byteImmOp;
            } else if (sx) {
                return immSxOp;
            } else {
                return immOp;
            }
        }

        public AMD64MROp getMROpcode(OperandSize size) {
            if (size == BYTE) {
                return byteMrOp;
            } else {
                return mrOp;
            }
        }

        public AMD64RMOp getRMOpcode(OperandSize size) {
            if (size == BYTE) {
                return byteRmOp;
            } else {
                return rmOp;
            }
        }
    }

    /**
     * Shift operation with operand order of M1 (shift by one), MC (shift by the CL register) or
     * MI (shift by an immediate byte).
     */
    public static final class AMD64Shift {
        // @formatter:off
        public static final AMD64Shift ROL = new AMD64Shift("ROL", 0);
        public static final AMD64Shift ROR = new AMD64Shift("ROR", 1);
        public static final AMD64Shift RCL = new AMD64Shift("RCL", 2);
        public static final AMD64Shift RCR = new AMD64Shift("RCR", 3);
        public static final AMD64Shift SHL = new AMD64Shift("SHL", 4);
        public static final AMD64Shift SHR = new AMD64Shift("SHR", 5);
        public static final AMD64Shift SAR = new AMD64Shift("SAR", 7);
        // @formatter:on

        public final AMD64MOp m1Op;
        public final AMD64MOp mcOp;
        public final AMD64MIOp miOp;

        private AMD64Shift(String opcode, int code) {
            m1Op = new AMD64MOp(opcode, 0, 0xD1, code, OpAssertion.WordOrLargerAssertion);
            mcOp = new AMD64MOp(opcode, 0, 0xD3, code, OpAssertion.WordOrLargerAssertion);
            miOp = new AMD64MIOp(opcode, true, 0, 0xC1, code, OpAssertion.WordOrLargerAssertion);
        }
    }

    private enum VEXOpAssertion {
        AVX1(CPUFeature.AVX, CPUFeature.AVX),
        AVX1_2(CPUFeature.AVX, CPUFeature.AVX2),
        AVX2(CPUFeature.AVX2, CPUFeature.AVX2),
        AVX1_128ONLY(CPUFeature.AVX, null),
        AVX1_256ONLY(null, CPUFeature.AVX),
        AVX2_256ONLY(null, CPUFeature.AVX2),
        XMM_CPU(CPUFeature.AVX, null, XMM, null, CPU, null),
        XMM_XMM_CPU(CPUFeature.AVX, null, XMM, XMM, CPU, null),
        CPU_XMM(CPUFeature.AVX, null, CPU, null, XMM, null),
        AVX1_2_CPU_XMM(CPUFeature.AVX, CPUFeature.AVX2, CPU, null, XMM, null),
        BMI1(CPUFeature.BMI1, null, CPU, CPU, CPU, null),
        BMI2(CPUFeature.BMI2, null, CPU, CPU, CPU, null);

        private final CPUFeature l128feature;
        private final CPUFeature l256feature;

        private final RegisterCategory rCategory;
        private final RegisterCategory vCategory;
        private final RegisterCategory mCategory;
        private final RegisterCategory imm8Category;

        VEXOpAssertion(CPUFeature l128feature, CPUFeature l256feature) {
            this(l128feature, l256feature, XMM, XMM, XMM, XMM);
        }

        VEXOpAssertion(CPUFeature l128feature, CPUFeature l256feature, RegisterCategory rCategory, RegisterCategory vCategory, RegisterCategory mCategory, RegisterCategory imm8Category) {
            this.l128feature = l128feature;
            this.l256feature = l256feature;
            this.rCategory = rCategory;
            this.vCategory = vCategory;
            this.mCategory = mCategory;
            this.imm8Category = imm8Category;
        }

        public boolean check(AMD64 arch, AVXSize size, Register r, Register v, Register m) {
            return check(arch, getLFlag(size), r, v, m, null);
        }

        public boolean check(AMD64 arch, AVXSize size, Register r, Register v, Register m, Register imm8) {
            return check(arch, getLFlag(size), r, v, m, imm8);
        }

        public boolean check(AMD64 arch, int l, Register r, Register v, Register m, Register imm8) {
            switch (l) {
                case L128:
                    assert l128feature != null && arch.getFeatures().contains(l128feature) : "emitting illegal 128 bit instruction";
                    break;
                case L256:
                    assert l256feature != null && arch.getFeatures().contains(l256feature) : "emitting illegal 256 bit instruction";
                    break;
            }
            if (r != null) {
                assert r.getRegisterCategory().equals(rCategory);
            }
            if (v != null) {
                assert v.getRegisterCategory().equals(vCategory);
            }
            if (m != null) {
                assert m.getRegisterCategory().equals(mCategory);
            }
            if (imm8 != null) {
                assert imm8.getRegisterCategory().equals(imm8Category);
            }
            return true;
        }

        public boolean supports(EnumSet<CPUFeature> features, AVXSize avxSize) {
            switch (avxSize) {
                case XMM:
                    return l128feature != null && features.contains(l128feature);
                case YMM:
                    return l256feature != null && features.contains(l256feature);
                default:
                    throw GraalError.shouldNotReachHere();
            }
        }
    }

    /**
     * Base class for VEX-encoded instructions.
     */
    public static class VexOp {
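        // VEX prefix fields: pp selects the implied SIMD prefix (none, 0x66, 0xF2 or 0xF3),
        // mmmmm selects the opcode map (0F, 0F38 or 0F3A), and w is the VEX analog of REX.W
        // (WIG if the bit is ignored).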
        protected final int pp;
        protected final int mmmmm;
        protected final int w;
        protected final int op;

        private final String opcode;
        protected final VEXOpAssertion assertion;

        protected VexOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
            this.pp = pp;
            this.mmmmm = mmmmm;
            this.w = w;
            this.op = op;
            this.opcode = opcode;
            this.assertion = assertion;
        }

        public final boolean isSupported(AMD64Assembler vasm, AVXSize size) {
            return assertion.supports(((AMD64) vasm.target.arch).getFeatures(), size);
        }

        @Override
        public String toString() {
            return opcode;
        }
    }

    /**
     * VEX-encoded instructions with an operand order of RM, but the M operand must be a register.
     */
    public static class VexRROp extends VexOp {
        // @formatter:off
        public static final VexRROp VMASKMOVDQU = new VexRROp("VMASKMOVDQU", P_66, M_0F, WIG, 0xF7, VEXOpAssertion.AVX1_128ONLY);
        // @formatter:on

        protected VexRROp(String opcode, int pp, int mmmmm, int w, int op) {
            this(opcode, pp, mmmmm, w, op, VEXOpAssertion.AVX1);
        }

        protected VexRROp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
            super(opcode, pp, mmmmm, w, op, assertion);
        }

        public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src) {
            assert assertion.check((AMD64) asm.target.arch, size, dst, null, src);
            // VBROADCASTF128 (0x1A) and VPBROADCASTI128 (0x5A) do not support register sources.
            assert op != 0x1A && op != 0x5A;
            asm.vexPrefix(dst, Register.None, src, size, pp, mmmmm, w, false);
            asm.emitByte(op);
            asm.emitModRM(dst, src);
        }
    }

    /**
     * VEX-encoded instructions with an operand order of RM.
     */
    public static class VexRMOp extends VexRROp {
        // @formatter:off
        public static final VexRMOp VCVTTSS2SI      = new VexRMOp("VCVTTSS2SI",      P_F3, M_0F,   W0,  0x2C, VEXOpAssertion.CPU_XMM);
        public static final VexRMOp VCVTTSS2SQ      = new VexRMOp("VCVTTSS2SQ",      P_F3, M_0F,   W1,  0x2C, VEXOpAssertion.CPU_XMM);
        public static final VexRMOp VCVTTSD2SI      = new VexRMOp("VCVTTSD2SI",      P_F2, M_0F,   W0,  0x2C, VEXOpAssertion.CPU_XMM);
        public static final VexRMOp VCVTTSD2SQ      = new VexRMOp("VCVTTSD2SQ",      P_F2, M_0F,   W1,  0x2C, VEXOpAssertion.CPU_XMM);
        public static final VexRMOp VCVTPS2PD       = new VexRMOp("VCVTPS2PD",       P_,   M_0F,   WIG, 0x5A);
        public static final VexRMOp VCVTPD2PS       = new VexRMOp("VCVTPD2PS",       P_66, M_0F,   WIG, 0x5A);
        public static final VexRMOp VCVTDQ2PS       = new VexRMOp("VCVTDQ2PS",       P_,   M_0F,   WIG, 0x5B);
        public static final VexRMOp VCVTTPS2DQ      = new VexRMOp("VCVTTPS2DQ",      P_F3, M_0F,   WIG, 0x5B);
        public static final VexRMOp VCVTTPD2DQ      = new VexRMOp("VCVTTPD2DQ",      P_66, M_0F,   WIG, 0xE6);
        public static final VexRMOp VCVTDQ2PD       = new VexRMOp("VCVTDQ2PD",       P_F3, M_0F,   WIG, 0xE6);
        public static final VexRMOp VBROADCASTSS    = new VexRMOp("VBROADCASTSS",    P_66, M_0F38, W0,  0x18);
        public static final VexRMOp VBROADCASTSD    = new VexRMOp("VBROADCASTSD",    P_66, M_0F38, W0,  0x19, VEXOpAssertion.AVX1_256ONLY);
        public static final VexRMOp VBROADCASTF128  = new VexRMOp("VBROADCASTF128",  P_66, M_0F38, W0,  0x1A, VEXOpAssertion.AVX1_256ONLY);
        public static final VexRMOp VPBROADCASTI128 = new VexRMOp("VPBROADCASTI128", P_66, M_0F38, W0,  0x5A, VEXOpAssertion.AVX2_256ONLY);
        public static final VexRMOp VPBROADCASTB    = new VexRMOp("VPBROADCASTB",    P_66, M_0F38, W0,  0x78, VEXOpAssertion.AVX2);
        public static final VexRMOp VPBROADCASTW    = new VexRMOp("VPBROADCASTW",    P_66, M_0F38, W0,  0x79, VEXOpAssertion.AVX2);
        public static final VexRMOp VPBROADCASTD    = new VexRMOp("VPBROADCASTD",    P_66, M_0F38, W0,  0x58, VEXOpAssertion.AVX2);
        public static final VexRMOp VPBROADCASTQ    = new VexRMOp("VPBROADCASTQ",    P_66, M_0F38, W0,  0x59, VEXOpAssertion.AVX2);
        public static final VexRMOp VPMOVMSKB       = new VexRMOp("VPMOVMSKB",       P_66, M_0F,   WIG, 0xD7, VEXOpAssertion.AVX1_2_CPU_XMM);
        public static final VexRMOp VPMOVSXBW       = new VexRMOp("VPMOVSXBW",       P_66, M_0F38, WIG, 0x20);
        public static final VexRMOp VPMOVSXBD       = new VexRMOp("VPMOVSXBD",       P_66, M_0F38, WIG, 0x21);
        public static final VexRMOp VPMOVSXBQ       = new VexRMOp("VPMOVSXBQ",       P_66, M_0F38, WIG, 0x22);
        public static final VexRMOp VPMOVSXWD       = new VexRMOp("VPMOVSXWD",       P_66, M_0F38, WIG, 0x23);
        public static final VexRMOp VPMOVSXWQ       = new VexRMOp("VPMOVSXWQ",       P_66, M_0F38, WIG, 0x24);
        public static final VexRMOp VPMOVSXDQ       = new VexRMOp("VPMOVSXDQ",       P_66, M_0F38, WIG, 0x25);
        public static final VexRMOp VPMOVZXBW       = new VexRMOp("VPMOVZXBW",       P_66, M_0F38, WIG, 0x30);
        public static final VexRMOp VPMOVZXBD       = new VexRMOp("VPMOVZXBD",       P_66, M_0F38, WIG, 0x31);
        public static final VexRMOp VPMOVZXBQ       = new VexRMOp("VPMOVZXBQ",       P_66, M_0F38, WIG, 0x32);
        public static final VexRMOp VPMOVZXWD       = new VexRMOp("VPMOVZXWD",       P_66, M_0F38, WIG, 0x33);
        public static final VexRMOp VPMOVZXWQ       = new VexRMOp("VPMOVZXWQ",       P_66, M_0F38, WIG, 0x34);
        public static final VexRMOp VPMOVZXDQ       = new VexRMOp("VPMOVZXDQ",       P_66, M_0F38, WIG, 0x35);
        public static final VexRMOp VPTEST          = new VexRMOp("VPTEST",          P_66, M_0F38, WIG, 0x17);
        public static final VexRMOp VSQRTPD         = new VexRMOp("VSQRTPD",         P_66, M_0F,   WIG, 0x51);
        public static final VexRMOp VSQRTPS         = new VexRMOp("VSQRTPS",         P_,   M_0F,   WIG, 0x51);
        public static final VexRMOp VSQRTSD         = new VexRMOp("VSQRTSD",         P_F2, M_0F,   WIG, 0x51);
        public static final VexRMOp VSQRTSS         = new VexRMOp("VSQRTSS",         P_F3, M_0F,   WIG, 0x51);
        public static final VexRMOp VUCOMISS        = new VexRMOp("VUCOMISS",        P_,   M_0F,   WIG, 0x2E);
        public static final VexRMOp VUCOMISD        = new VexRMOp("VUCOMISD",        P_66, M_0F,   WIG, 0x2E);
        // @formatter:on

        protected VexRMOp(String opcode, int pp, int mmmmm, int w, int op) {
            this(opcode, pp, mmmmm, w, op, VEXOpAssertion.AVX1);
        }

        protected VexRMOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
            super(opcode, pp, mmmmm, w, op, assertion);
        }

        public void emit(AMD64Assembler asm, AVXSize size, Register dst, AMD64Address src) {
            assert assertion.check((AMD64) asm.target.arch, size, dst, null, null);
            asm.vexPrefix(dst, Register.None, src, size, pp, mmmmm, w, false);
            asm.emitByte(op);
            asm.emitOperandHelper(dst, src, 0);
        }
    }

    /**
     * VEX-encoded move instructions.
     * <p>
     * These instructions have two opcodes: op is the forward move instruction with an operand order
     * of RM, and opReverse is the reverse move instruction with an operand order of MR.
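     * <p>
     * For example, {@code VMOVUPS} uses opcode 0x10 for the load form {@code vmovups xmm, m} and
     * 0x11 for the store form {@code vmovups m, xmm}.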
1098      */
1099     public static final class VexMoveOp extends VexRMOp {
1100         // @formatter:off
1101         public static final VexMoveOp VMOVDQA = new VexMoveOp("VMOVDQA", P_66, M_0F, WIG, 0x6F, 0x7F);
1102         public static final VexMoveOp VMOVDQU = new VexMoveOp("VMOVDQU", P_F3, M_0F, WIG, 0x6F, 0x7F);
1103         public static final VexMoveOp VMOVAPS = new VexMoveOp("VMOVAPS", P_,   M_0F, WIG, 0x28, 0x29);
1104         public static final VexMoveOp VMOVAPD = new VexMoveOp("VMOVAPD", P_66, M_0F, WIG, 0x28, 0x29);
1105         public static final VexMoveOp VMOVUPS = new VexMoveOp("VMOVUPS", P_,   M_0F, WIG, 0x10, 0x11);
1106         public static final VexMoveOp VMOVUPD = new VexMoveOp("VMOVUPD", P_66, M_0F, WIG, 0x10, 0x11);
1107         public static final VexMoveOp VMOVSS  = new VexMoveOp("VMOVSS",  P_F3, M_0F, WIG, 0x10, 0x11);
1108         public static final VexMoveOp VMOVSD  = new VexMoveOp("VMOVSD",  P_F2, M_0F, WIG, 0x10, 0x11);
1109         public static final VexMoveOp VMOVD   = new VexMoveOp("VMOVD",   P_66, M_0F, W0,  0x6E, 0x7E, VEXOpAssertion.XMM_CPU);
1110         public static final VexMoveOp VMOVQ   = new VexMoveOp("VMOVQ",   P_66, M_0F, W1,  0x6E, 0x7E, VEXOpAssertion.XMM_CPU);
1111         // @formatter:on
1112 
1113         private final int opReverse;
1114 
1115         private VexMoveOp(String opcode, int pp, int mmmmm, int w, int op, int opReverse) {
1116             this(opcode, pp, mmmmm, w, op, opReverse, VEXOpAssertion.AVX1);
1117         }
1118 
1119         private VexMoveOp(String opcode, int pp, int mmmmm, int w, int op, int opReverse, VEXOpAssertion assertion) {
1120             super(opcode, pp, mmmmm, w, op, assertion);
1121             this.opReverse = opReverse;
1122         }
1123 
1124         public void emit(AMD64Assembler asm, AVXSize size, AMD64Address dst, Register src) {
1125             assert assertion.check((AMD64) asm.target.arch, size, src, null, null);
1126             asm.vexPrefix(src, Register.None, dst, size, pp, mmmmm, w, false);
1127             asm.emitByte(opReverse);
1128             asm.emitOperandHelper(src, dst, 0);
1129         }
1130 
1131         public void emitReverse(AMD64Assembler asm, AVXSize size, Register dst, Register src) {
1132             assert assertion.check((AMD64) asm.target.arch, size, src, null, dst);
1133             asm.vexPrefix(src, Register.None, dst, size, pp, mmmmm, w, false);
1134             asm.emitByte(opReverse);
1135             asm.emitModRM(src, dst);
1136         }
1137     }
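
    // Usage sketch (illustrative only, not part of the assembler API): the same VexMoveOp
    // serves as both load and store, selecting op or opReverse from the operand kinds.
    // Assuming an AMD64Assembler `asm`, a vector register `ymm0` and an address `addr`:
    //
    //   VexMoveOp.VMOVDQU.emit(asm, AVXSize.YMM, ymm0, addr); // RM form: vmovdqu ymm0, [addr]
    //   VexMoveOp.VMOVDQU.emit(asm, AVXSize.YMM, addr, ymm0); // MR form: vmovdqu [addr], ymm0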
1138 
1139     public interface VexRRIOp {
1140         void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src, int imm8);
1141     }
1142 
1143     /**
1144      * VEX-encoded instructions with an operand order of RMI.
1145      */
1146     public static final class VexRMIOp extends VexOp implements VexRRIOp {
1147         // @formatter:off
1148         public static final VexRMIOp VPERMQ   = new VexRMIOp("VPERMQ",   P_66, M_0F3A, W1,  0x00, VEXOpAssertion.AVX2_256ONLY);
1149         public static final VexRMIOp VPSHUFLW = new VexRMIOp("VPSHUFLW", P_F2, M_0F,   WIG, 0x70, VEXOpAssertion.AVX1_2);
1150         public static final VexRMIOp VPSHUFHW = new VexRMIOp("VPSHUFHW", P_F3, M_0F,   WIG, 0x70, VEXOpAssertion.AVX1_2);
1151         public static final VexRMIOp VPSHUFD  = new VexRMIOp("VPSHUFD",  P_66, M_0F,   WIG, 0x70, VEXOpAssertion.AVX1_2);
1152         // @formatter:on
1153 
1154         private VexRMIOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
1155             super(opcode, pp, mmmmm, w, op, assertion);
1156         }
1157 
1158         @Override
1159         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src, int imm8) {
1160             assert assertion.check((AMD64) asm.target.arch, size, dst, null, src);
1161             asm.vexPrefix(dst, Register.None, src, size, pp, mmmmm, w, false);
1162             asm.emitByte(op);
1163             asm.emitModRM(dst, src);
1164             asm.emitByte(imm8);
1165         }
1166 
1167         public void emit(AMD64Assembler asm, AVXSize size, Register dst, AMD64Address src, int imm8) {
1168             assert assertion.check((AMD64) asm.target.arch, size, dst, null, null);
1169             asm.vexPrefix(dst, Register.None, src, size, pp, mmmmm, w, false);
1170             asm.emitByte(op);
1171             asm.emitOperandHelper(dst, src, 1);
1172             asm.emitByte(imm8);
1173         }
1174     }
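
    // Usage sketch (illustrative only; register names are assumptions): RMI ops read the
    // second operand and write the first, with the immediate selecting the shuffle pattern.
    //
    //   VexRMIOp.VPSHUFD.emit(asm, AVXSize.XMM, xmm0, xmm1, 0x1B); // vpshufd xmm0, xmm1, 0x1B (reverses the four dwords)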
1175 
1176     /**
1177      * VEX-encoded instructions with an operand order of MRI.
1178      */
1179     public static final class VexMRIOp extends VexOp implements VexRRIOp {
1180         // @formatter:off
1181         public static final VexMRIOp VEXTRACTF128 = new VexMRIOp("VEXTRACTF128", P_66, M_0F3A, W0, 0x19, VEXOpAssertion.AVX1_256ONLY);
1182         public static final VexMRIOp VEXTRACTI128 = new VexMRIOp("VEXTRACTI128", P_66, M_0F3A, W0, 0x39, VEXOpAssertion.AVX2_256ONLY);
1183         public static final VexMRIOp VPEXTRB      = new VexMRIOp("VPEXTRB",      P_66, M_0F3A, W0, 0x14, VEXOpAssertion.XMM_CPU);
1184         public static final VexMRIOp VPEXTRW      = new VexMRIOp("VPEXTRW",      P_66, M_0F3A, W0, 0x15, VEXOpAssertion.XMM_CPU);
1185         public static final VexMRIOp VPEXTRD      = new VexMRIOp("VPEXTRD",      P_66, M_0F3A, W0, 0x16, VEXOpAssertion.XMM_CPU);
1186         public static final VexMRIOp VPEXTRQ      = new VexMRIOp("VPEXTRQ",      P_66, M_0F3A, W1, 0x16, VEXOpAssertion.XMM_CPU);
1187         // @formatter:on
1188 
1189         private VexMRIOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
1190             super(opcode, pp, mmmmm, w, op, assertion);
1191         }
1192 
1193         @Override
1194         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src, int imm8) {
1195             assert assertion.check((AMD64) asm.target.arch, size, src, null, dst);
1196             asm.vexPrefix(src, Register.None, dst, size, pp, mmmmm, w, false);
1197             asm.emitByte(op);
1198             asm.emitModRM(src, dst);
1199             asm.emitByte(imm8);
1200         }
1201 
1202         public void emit(AMD64Assembler asm, AVXSize size, AMD64Address dst, Register src, int imm8) {
1203             assert assertion.check((AMD64) asm.target.arch, size, src, null, null);
1204             asm.vexPrefix(src, Register.None, dst, size, pp, mmmmm, w, false);
1205             asm.emitByte(op);
1206             asm.emitOperandHelper(src, dst, 1);
1207             asm.emitByte(imm8);
1208         }
1209     }
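
    // Usage sketch (illustrative only; register names are assumptions): MRI ops write the
    // first operand, so VEXTRACTF128 stores the lane selected by the immediate into an XMM
    // register or memory location.
    //
    //   VexMRIOp.VEXTRACTF128.emit(asm, AVXSize.YMM, xmm0, ymm1, 1); // vextractf128 xmm0, ymm1, 1 (upper 128-bit lane)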
1210 
1211     /**
1212      * VEX-encoded instructions with an operand order of RVMR.
1213      */
1214     public static class VexRVMROp extends VexOp {
1215         // @formatter:off
1216         public static final VexRVMROp VPBLENDVB  = new VexRVMROp("VPBLENDVB",  P_66, M_0F3A, W0, 0x4C, VEXOpAssertion.AVX1_2);
        public static final VexRVMROp VBLENDVPS  = new VexRVMROp("VBLENDVPS",  P_66, M_0F3A, W0, 0x4A, VEXOpAssertion.AVX1);
        public static final VexRVMROp VBLENDVPD  = new VexRVMROp("VBLENDVPD",  P_66, M_0F3A, W0, 0x4B, VEXOpAssertion.AVX1);
1219         // @formatter:on
1220 
1221         protected VexRVMROp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
1222             super(opcode, pp, mmmmm, w, op, assertion);
1223         }
1224 
1225         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register mask, Register src1, Register src2) {
1226             assert assertion.check((AMD64) asm.target.arch, size, dst, mask, src1, src2);
1227             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w, false);
1228             asm.emitByte(op);
1229             asm.emitModRM(dst, src2);
1230             asm.emitByte(mask.encoding() << 4);
1231         }
1232 
1233         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register mask, Register src1, AMD64Address src2) {
1234             assert assertion.check((AMD64) asm.target.arch, size, dst, mask, src1, null);
1235             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w, false);
1236             asm.emitByte(op);
1237             asm.emitOperandHelper(dst, src2, 0);
1238             asm.emitByte(mask.encoding() << 4);
1239         }
1240     }
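
    // Encoding note and usage sketch (illustrative only; register names are assumptions):
    // the fourth (mask) register does not fit in the ModRM byte, so it travels in the upper
    // four bits of a trailing immediate-style byte, hence emitByte(mask.encoding() << 4).
    //
    //   VexRVMROp.VPBLENDVB.emit(asm, AVXSize.XMM, xmm0, xmm3, xmm1, xmm2);
    //   // vpblendvb xmm0, xmm1, xmm2, xmm3 -- xmm3 selects per byte between xmm1 and xmm2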
1241 
1242     /**
1243      * VEX-encoded instructions with an operand order of RVM.
1244      */
1245     public static class VexRVMOp extends VexOp {
1246         // @formatter:off
1247         public static final VexRVMOp VANDPS    = new VexRVMOp("VANDPS",    P_,   M_0F,   WIG, 0x54);
1248         public static final VexRVMOp VANDPD    = new VexRVMOp("VANDPD",    P_66, M_0F,   WIG, 0x54);
1249         public static final VexRVMOp VANDNPS   = new VexRVMOp("VANDNPS",   P_,   M_0F,   WIG, 0x55);
1250         public static final VexRVMOp VANDNPD   = new VexRVMOp("VANDNPD",   P_66, M_0F,   WIG, 0x55);
1251         public static final VexRVMOp VORPS     = new VexRVMOp("VORPS",     P_,   M_0F,   WIG, 0x56);
1252         public static final VexRVMOp VORPD     = new VexRVMOp("VORPD",     P_66, M_0F,   WIG, 0x56);
1253         public static final VexRVMOp VXORPS    = new VexRVMOp("VXORPS",    P_,   M_0F,   WIG, 0x57);
1254         public static final VexRVMOp VXORPD    = new VexRVMOp("VXORPD",    P_66, M_0F,   WIG, 0x57);
1255         public static final VexRVMOp VADDPS    = new VexRVMOp("VADDPS",    P_,   M_0F,   WIG, 0x58);
1256         public static final VexRVMOp VADDPD    = new VexRVMOp("VADDPD",    P_66, M_0F,   WIG, 0x58);
1257         public static final VexRVMOp VADDSS    = new VexRVMOp("VADDSS",    P_F3, M_0F,   WIG, 0x58);
1258         public static final VexRVMOp VADDSD    = new VexRVMOp("VADDSD",    P_F2, M_0F,   WIG, 0x58);
1259         public static final VexRVMOp VMULPS    = new VexRVMOp("VMULPS",    P_,   M_0F,   WIG, 0x59);
1260         public static final VexRVMOp VMULPD    = new VexRVMOp("VMULPD",    P_66, M_0F,   WIG, 0x59);
1261         public static final VexRVMOp VMULSS    = new VexRVMOp("VMULSS",    P_F3, M_0F,   WIG, 0x59);
1262         public static final VexRVMOp VMULSD    = new VexRVMOp("VMULSD",    P_F2, M_0F,   WIG, 0x59);
1263         public static final VexRVMOp VSUBPS    = new VexRVMOp("VSUBPS",    P_,   M_0F,   WIG, 0x5C);
1264         public static final VexRVMOp VSUBPD    = new VexRVMOp("VSUBPD",    P_66, M_0F,   WIG, 0x5C);
1265         public static final VexRVMOp VSUBSS    = new VexRVMOp("VSUBSS",    P_F3, M_0F,   WIG, 0x5C);
1266         public static final VexRVMOp VSUBSD    = new VexRVMOp("VSUBSD",    P_F2, M_0F,   WIG, 0x5C);
1267         public static final VexRVMOp VMINPS    = new VexRVMOp("VMINPS",    P_,   M_0F,   WIG, 0x5D);
1268         public static final VexRVMOp VMINPD    = new VexRVMOp("VMINPD",    P_66, M_0F,   WIG, 0x5D);
1269         public static final VexRVMOp VMINSS    = new VexRVMOp("VMINSS",    P_F3, M_0F,   WIG, 0x5D);
1270         public static final VexRVMOp VMINSD    = new VexRVMOp("VMINSD",    P_F2, M_0F,   WIG, 0x5D);
1271         public static final VexRVMOp VDIVPS    = new VexRVMOp("VDIVPS",    P_,   M_0F,   WIG, 0x5E);
1272         public static final VexRVMOp VDIVPD    = new VexRVMOp("VDIVPD",    P_66, M_0F,   WIG, 0x5E);
        public static final VexRVMOp VDIVSS    = new VexRVMOp("VDIVSS",    P_F3, M_0F,   WIG, 0x5E);
        public static final VexRVMOp VDIVSD    = new VexRVMOp("VDIVSD",    P_F2, M_0F,   WIG, 0x5E);
1275         public static final VexRVMOp VMAXPS    = new VexRVMOp("VMAXPS",    P_,   M_0F,   WIG, 0x5F);
1276         public static final VexRVMOp VMAXPD    = new VexRVMOp("VMAXPD",    P_66, M_0F,   WIG, 0x5F);
1277         public static final VexRVMOp VMAXSS    = new VexRVMOp("VMAXSS",    P_F3, M_0F,   WIG, 0x5F);
1278         public static final VexRVMOp VMAXSD    = new VexRVMOp("VMAXSD",    P_F2, M_0F,   WIG, 0x5F);
1279         public static final VexRVMOp VADDSUBPS = new VexRVMOp("VADDSUBPS", P_F2, M_0F,   WIG, 0xD0);
1280         public static final VexRVMOp VADDSUBPD = new VexRVMOp("VADDSUBPD", P_66, M_0F,   WIG, 0xD0);
1281         public static final VexRVMOp VPAND     = new VexRVMOp("VPAND",     P_66, M_0F,   WIG, 0xDB, VEXOpAssertion.AVX1_2);
1282         public static final VexRVMOp VPOR      = new VexRVMOp("VPOR",      P_66, M_0F,   WIG, 0xEB, VEXOpAssertion.AVX1_2);
1283         public static final VexRVMOp VPXOR     = new VexRVMOp("VPXOR",     P_66, M_0F,   WIG, 0xEF, VEXOpAssertion.AVX1_2);
1284         public static final VexRVMOp VPADDB    = new VexRVMOp("VPADDB",    P_66, M_0F,   WIG, 0xFC, VEXOpAssertion.AVX1_2);
1285         public static final VexRVMOp VPADDW    = new VexRVMOp("VPADDW",    P_66, M_0F,   WIG, 0xFD, VEXOpAssertion.AVX1_2);
1286         public static final VexRVMOp VPADDD    = new VexRVMOp("VPADDD",    P_66, M_0F,   WIG, 0xFE, VEXOpAssertion.AVX1_2);
1287         public static final VexRVMOp VPADDQ    = new VexRVMOp("VPADDQ",    P_66, M_0F,   WIG, 0xD4, VEXOpAssertion.AVX1_2);
1288         public static final VexRVMOp VPMULHUW  = new VexRVMOp("VPMULHUW",  P_66, M_0F,   WIG, 0xE4, VEXOpAssertion.AVX1_2);
1289         public static final VexRVMOp VPMULHW   = new VexRVMOp("VPMULHW",   P_66, M_0F,   WIG, 0xE5, VEXOpAssertion.AVX1_2);
1290         public static final VexRVMOp VPMULLW   = new VexRVMOp("VPMULLW",   P_66, M_0F,   WIG, 0xD5, VEXOpAssertion.AVX1_2);
1291         public static final VexRVMOp VPMULLD   = new VexRVMOp("VPMULLD",   P_66, M_0F38, WIG, 0x40, VEXOpAssertion.AVX1_2);
1292         public static final VexRVMOp VPSUBB    = new VexRVMOp("VPSUBB",    P_66, M_0F,   WIG, 0xF8, VEXOpAssertion.AVX1_2);
1293         public static final VexRVMOp VPSUBW    = new VexRVMOp("VPSUBW",    P_66, M_0F,   WIG, 0xF9, VEXOpAssertion.AVX1_2);
1294         public static final VexRVMOp VPSUBD    = new VexRVMOp("VPSUBD",    P_66, M_0F,   WIG, 0xFA, VEXOpAssertion.AVX1_2);
1295         public static final VexRVMOp VPSUBQ    = new VexRVMOp("VPSUBQ",    P_66, M_0F,   WIG, 0xFB, VEXOpAssertion.AVX1_2);
1296         public static final VexRVMOp VPSHUFB   = new VexRVMOp("VPSHUFB",   P_66, M_0F38, WIG, 0x00, VEXOpAssertion.AVX1_2);
1297         public static final VexRVMOp VCVTSD2SS = new VexRVMOp("VCVTSD2SS", P_F2, M_0F,   WIG, 0x5A);
1298         public static final VexRVMOp VCVTSS2SD = new VexRVMOp("VCVTSS2SD", P_F3, M_0F,   WIG, 0x5A);
1299         public static final VexRVMOp VCVTSI2SD = new VexRVMOp("VCVTSI2SD", P_F2, M_0F,   W0,  0x2A, VEXOpAssertion.XMM_XMM_CPU);
1300         public static final VexRVMOp VCVTSQ2SD = new VexRVMOp("VCVTSQ2SD", P_F2, M_0F,   W1,  0x2A, VEXOpAssertion.XMM_XMM_CPU);
1301         public static final VexRVMOp VCVTSI2SS = new VexRVMOp("VCVTSI2SS", P_F3, M_0F,   W0,  0x2A, VEXOpAssertion.XMM_XMM_CPU);
1302         public static final VexRVMOp VCVTSQ2SS = new VexRVMOp("VCVTSQ2SS", P_F3, M_0F,   W1,  0x2A, VEXOpAssertion.XMM_XMM_CPU);
1303         public static final VexRVMOp VPCMPEQB  = new VexRVMOp("VPCMPEQB",  P_66, M_0F,   WIG, 0x74, VEXOpAssertion.AVX1_2);
1304         public static final VexRVMOp VPCMPEQW  = new VexRVMOp("VPCMPEQW",  P_66, M_0F,   WIG, 0x75, VEXOpAssertion.AVX1_2);
1305         public static final VexRVMOp VPCMPEQD  = new VexRVMOp("VPCMPEQD",  P_66, M_0F,   WIG, 0x76, VEXOpAssertion.AVX1_2);
1306         public static final VexRVMOp VPCMPEQQ  = new VexRVMOp("VPCMPEQQ",  P_66, M_0F38, WIG, 0x29, VEXOpAssertion.AVX1_2);
1307         public static final VexRVMOp VPCMPGTB  = new VexRVMOp("VPCMPGTB",  P_66, M_0F,   WIG, 0x64, VEXOpAssertion.AVX1_2);
1308         public static final VexRVMOp VPCMPGTW  = new VexRVMOp("VPCMPGTW",  P_66, M_0F,   WIG, 0x65, VEXOpAssertion.AVX1_2);
1309         public static final VexRVMOp VPCMPGTD  = new VexRVMOp("VPCMPGTD",  P_66, M_0F,   WIG, 0x66, VEXOpAssertion.AVX1_2);
1310         public static final VexRVMOp VPCMPGTQ  = new VexRVMOp("VPCMPGTQ",  P_66, M_0F38, WIG, 0x37, VEXOpAssertion.AVX1_2);
1311         // @formatter:on
1312 
1313         private VexRVMOp(String opcode, int pp, int mmmmm, int w, int op) {
1314             this(opcode, pp, mmmmm, w, op, VEXOpAssertion.AVX1);
1315         }
1316 
1317         protected VexRVMOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
1318             super(opcode, pp, mmmmm, w, op, assertion);
1319         }
1320 
1321         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, Register src2) {
1322             assert assertion.check((AMD64) asm.target.arch, size, dst, src1, src2);
1323             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w, false);
1324             asm.emitByte(op);
1325             asm.emitModRM(dst, src2);
1326         }
1327 
1328         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, AMD64Address src2) {
1329             assert assertion.check((AMD64) asm.target.arch, size, dst, src1, null);
1330             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w, false);
1331             asm.emitByte(op);
1332             asm.emitOperandHelper(dst, src2, 0);
1333         }
1334     }
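
    // Usage sketch (illustrative only; register names are assumptions): RVM ops are the
    // classic three-operand AVX form with two sources and a non-destructive destination.
    //
    //   VexRVMOp.VADDPS.emit(asm, AVXSize.YMM, ymm0, ymm1, ymm2); // vaddps ymm0, ymm1, ymm2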
1335 
1336     public static final class VexGeneralPurposeRVMOp extends VexRVMOp {
1337         // @formatter:off
1338         public static final VexGeneralPurposeRVMOp ANDN   = new VexGeneralPurposeRVMOp("ANDN",   P_,   M_0F38, WIG, 0xF2, VEXOpAssertion.BMI1);
1339         public static final VexGeneralPurposeRVMOp MULX   = new VexGeneralPurposeRVMOp("MULX",   P_F2, M_0F38, WIG, 0xF6, VEXOpAssertion.BMI2);
1340         public static final VexGeneralPurposeRVMOp PDEP   = new VexGeneralPurposeRVMOp("PDEP",   P_F2, M_0F38, WIG, 0xF5, VEXOpAssertion.BMI2);
1341         public static final VexGeneralPurposeRVMOp PEXT   = new VexGeneralPurposeRVMOp("PEXT",   P_F3, M_0F38, WIG, 0xF5, VEXOpAssertion.BMI2);
1342         // @formatter:on
1343 
1344         private VexGeneralPurposeRVMOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
1345             super(opcode, pp, mmmmm, w, op, assertion);
1346         }
1347 
1348         @Override
1349         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, Register src2) {
1350             assert assertion.check((AMD64) asm.target.arch, LZ, dst, src1, src2, null);
1351             assert size == AVXSize.DWORD || size == AVXSize.QWORD;
1352             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, size == AVXSize.DWORD ? W0 : W1, false);
1353             asm.emitByte(op);
1354             asm.emitModRM(dst, src2);
1355         }
1356 
1357         @Override
1358         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, AMD64Address src2) {
1359             assert assertion.check((AMD64) asm.target.arch, LZ, dst, src1, null, null);
1360             assert size == AVXSize.DWORD || size == AVXSize.QWORD;
1361             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, size == AVXSize.DWORD ? W0 : W1, false);
1362             asm.emitByte(op);
1363             asm.emitOperandHelper(dst, src2, 0);
1364         }
1365     }
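
    // Usage sketch (illustrative only; register names are assumptions): these BMI ops work
    // on general-purpose registers, and the requested size picks VEX.W (W0 for 32-bit, W1
    // for 64-bit operation).
    //
    //   VexGeneralPurposeRVMOp.ANDN.emit(asm, AVXSize.QWORD, rax, rbx, rcx); // andn rax, rbx, rcx: rax = ~rbx & rcx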
1366 
1367     public static final class VexGeneralPurposeRMVOp extends VexOp {
1368         // @formatter:off
1369         public static final VexGeneralPurposeRMVOp BEXTR  = new VexGeneralPurposeRMVOp("BEXTR",  P_,   M_0F38, WIG, 0xF7, VEXOpAssertion.BMI1);
1370         public static final VexGeneralPurposeRMVOp BZHI   = new VexGeneralPurposeRMVOp("BZHI",   P_,   M_0F38, WIG, 0xF5, VEXOpAssertion.BMI2);
1371         public static final VexGeneralPurposeRMVOp SARX   = new VexGeneralPurposeRMVOp("SARX",   P_F3, M_0F38, WIG, 0xF7, VEXOpAssertion.BMI2);
1372         public static final VexGeneralPurposeRMVOp SHRX   = new VexGeneralPurposeRMVOp("SHRX",   P_F2, M_0F38, WIG, 0xF7, VEXOpAssertion.BMI2);
1373         public static final VexGeneralPurposeRMVOp SHLX   = new VexGeneralPurposeRMVOp("SHLX",   P_66, M_0F38, WIG, 0xF7, VEXOpAssertion.BMI2);
1374         // @formatter:on
1375 
1376         private VexGeneralPurposeRMVOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
1377             super(opcode, pp, mmmmm, w, op, assertion);
1378         }
1379 
1380         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, Register src2) {
1381             assert assertion.check((AMD64) asm.target.arch, LZ, dst, src2, src1, null);
1382             assert size == AVXSize.DWORD || size == AVXSize.QWORD;
1383             asm.vexPrefix(dst, src2, src1, size, pp, mmmmm, size == AVXSize.DWORD ? W0 : W1, false);
1384             asm.emitByte(op);
1385             asm.emitModRM(dst, src1);
1386         }
1387 
1388         public void emit(AMD64Assembler asm, AVXSize size, Register dst, AMD64Address src1, Register src2) {
1389             assert assertion.check((AMD64) asm.target.arch, LZ, dst, src2, null, null);
1390             assert size == AVXSize.DWORD || size == AVXSize.QWORD;
1391             asm.vexPrefix(dst, src2, src1, size, pp, mmmmm, size == AVXSize.DWORD ? W0 : W1, false);
1392             asm.emitByte(op);
1393             asm.emitOperandHelper(dst, src1, 0);
1394         }
1395     }
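
    // Usage sketch (illustrative only; register names are assumptions): RMV ops carry the
    // second source (e.g. the shift count) in VEX.vvvv, which is why emit swaps src1 and
    // src2 when building the prefix.
    //
    //   VexGeneralPurposeRMVOp.SHLX.emit(asm, AVXSize.QWORD, rax, rbx, rcx); // shlx rax, rbx, rcx: rax = rbx << (rcx & 63)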
1396 
1397     public static final class VexGeneralPurposeRMOp extends VexRMOp {
1398         // @formatter:off
1399         public static final VexGeneralPurposeRMOp BLSI    = new VexGeneralPurposeRMOp("BLSI",   P_,    M_0F38, WIG, 0xF3, 3, VEXOpAssertion.BMI1);
1400         public static final VexGeneralPurposeRMOp BLSMSK  = new VexGeneralPurposeRMOp("BLSMSK", P_,    M_0F38, WIG, 0xF3, 2, VEXOpAssertion.BMI1);
1401         public static final VexGeneralPurposeRMOp BLSR    = new VexGeneralPurposeRMOp("BLSR",   P_,    M_0F38, WIG, 0xF3, 1, VEXOpAssertion.BMI1);
1402         // @formatter:on
1403         private final int ext;
1404 
1405         private VexGeneralPurposeRMOp(String opcode, int pp, int mmmmm, int w, int op, int ext, VEXOpAssertion assertion) {
1406             super(opcode, pp, mmmmm, w, op, assertion);
1407             this.ext = ext;
1408         }
1409 
1410         @Override
1411         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src) {
1412             assert assertion.check((AMD64) asm.target.arch, size, dst, null, null);
1413             asm.vexPrefix(AMD64.cpuRegisters[ext], dst, src, size, pp, mmmmm, size == AVXSize.DWORD ? W0 : W1, false);
1414             asm.emitByte(op);
1415             asm.emitModRM(ext, src);
1416         }
1417 
1418         @Override
1419         public void emit(AMD64Assembler asm, AVXSize size, Register dst, AMD64Address src) {
1420             assert assertion.check((AMD64) asm.target.arch, size, dst, null, null);
1421             asm.vexPrefix(AMD64.cpuRegisters[ext], dst, src, size, pp, mmmmm, size == AVXSize.DWORD ? W0 : W1, false);
1422             asm.emitByte(op);
1423             asm.emitOperandHelper(ext, src, 0);
1424         }
1425     }
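
    // Usage sketch (illustrative only; register names are assumptions): these BMI1 ops
    // encode the destination in VEX.vvvv and use `ext` as the opcode extension in the
    // ModRM reg field; cpuRegisters[ext] is passed to vexPrefix purely for its encoding.
    //
    //   VexGeneralPurposeRMOp.BLSR.emit(asm, AVXSize.QWORD, rax, rbx); // blsr rax, rbx: rax = rbx & (rbx - 1)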
1426 
1427     /**
1428      * VEX-encoded shift instructions with an operand order of either RVM or VMI.
1429      */
1430     public static final class VexShiftOp extends VexRVMOp implements VexRRIOp {
1431         // @formatter:off
1432         public static final VexShiftOp VPSRLW = new VexShiftOp("VPSRLW", P_66, M_0F, WIG, 0xD1, 0x71, 2);
1433         public static final VexShiftOp VPSRLD = new VexShiftOp("VPSRLD", P_66, M_0F, WIG, 0xD2, 0x72, 2);
1434         public static final VexShiftOp VPSRLQ = new VexShiftOp("VPSRLQ", P_66, M_0F, WIG, 0xD3, 0x73, 2);
1435         public static final VexShiftOp VPSRAW = new VexShiftOp("VPSRAW", P_66, M_0F, WIG, 0xE1, 0x71, 4);
1436         public static final VexShiftOp VPSRAD = new VexShiftOp("VPSRAD", P_66, M_0F, WIG, 0xE2, 0x72, 4);
1437         public static final VexShiftOp VPSLLW = new VexShiftOp("VPSLLW", P_66, M_0F, WIG, 0xF1, 0x71, 6);
1438         public static final VexShiftOp VPSLLD = new VexShiftOp("VPSLLD", P_66, M_0F, WIG, 0xF2, 0x72, 6);
1439         public static final VexShiftOp VPSLLQ = new VexShiftOp("VPSLLQ", P_66, M_0F, WIG, 0xF3, 0x73, 6);
1440         // @formatter:on
1441 
1442         private final int immOp;
1443         private final int r;
1444 
1445         private VexShiftOp(String opcode, int pp, int mmmmm, int w, int op, int immOp, int r) {
1446             super(opcode, pp, mmmmm, w, op, VEXOpAssertion.AVX1_2);
1447             this.immOp = immOp;
1448             this.r = r;
1449         }
1450 
1451         @Override
1452         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src, int imm8) {
1453             assert assertion.check((AMD64) asm.target.arch, size, null, dst, src);
1454             asm.vexPrefix(null, dst, src, size, pp, mmmmm, w, false);
1455             asm.emitByte(immOp);
1456             asm.emitModRM(r, src);
1457             asm.emitByte(imm8);
1458         }
1459     }
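
    // Usage sketch (illustrative only; register names are assumptions): one VexShiftOp
    // covers both forms. A register count uses the RVM opcode; an immediate switches to
    // the VMI opcode, where the destination lives in VEX.vvvv and `r` is the opcode
    // extension in the ModRM reg field.
    //
    //   VexShiftOp.VPSLLD.emit(asm, AVXSize.XMM, xmm0, xmm1, xmm2); // vpslld xmm0, xmm1, xmm2
    //   VexShiftOp.VPSLLD.emit(asm, AVXSize.XMM, xmm0, xmm1, 4);    // vpslld xmm0, xmm1, 4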
1460 
1461     public static final class VexMaskMoveOp extends VexOp {
1462         // @formatter:off
1463         public static final VexMaskMoveOp VMASKMOVPS = new VexMaskMoveOp("VMASKMOVPS", P_66, M_0F38, W0, 0x2C, 0x2E);
1464         public static final VexMaskMoveOp VMASKMOVPD = new VexMaskMoveOp("VMASKMOVPD", P_66, M_0F38, W0, 0x2D, 0x2F);
1465         public static final VexMaskMoveOp VPMASKMOVD = new VexMaskMoveOp("VPMASKMOVD", P_66, M_0F38, W0, 0x8C, 0x8E, VEXOpAssertion.AVX2);
1466         public static final VexMaskMoveOp VPMASKMOVQ = new VexMaskMoveOp("VPMASKMOVQ", P_66, M_0F38, W1, 0x8C, 0x8E, VEXOpAssertion.AVX2);
1467         // @formatter:on
1468 
1469         private final int opReverse;
1470 
1471         private VexMaskMoveOp(String opcode, int pp, int mmmmm, int w, int op, int opReverse) {
1472             this(opcode, pp, mmmmm, w, op, opReverse, VEXOpAssertion.AVX1);
1473         }
1474 
1475         private VexMaskMoveOp(String opcode, int pp, int mmmmm, int w, int op, int opReverse, VEXOpAssertion assertion) {
1476             super(opcode, pp, mmmmm, w, op, assertion);
1477             this.opReverse = opReverse;
1478         }
1479 
1480         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register mask, AMD64Address src) {
1481             assert assertion.check((AMD64) asm.target.arch, size, dst, mask, null);
1482             asm.vexPrefix(dst, mask, src, size, pp, mmmmm, w, false);
1483             asm.emitByte(op);
1484             asm.emitOperandHelper(dst, src, 0);
1485         }
1486 
1487         public void emit(AMD64Assembler asm, AVXSize size, AMD64Address dst, Register mask, Register src) {
1488             assert assertion.check((AMD64) asm.target.arch, size, src, mask, null);
1489             asm.vexPrefix(src, mask, dst, size, pp, mmmmm, w, false);
1490             asm.emitByte(opReverse);
1491             asm.emitOperandHelper(src, dst, 0);
1492         }
1493     }
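
    // Usage sketch (illustrative only; register names are assumptions): mask moves touch
    // only the lanes whose mask element has its sign bit set; on a load the remaining
    // destination lanes are zeroed, on a store the remaining memory lanes are untouched.
    //
    //   VexMaskMoveOp.VMASKMOVPS.emit(asm, AVXSize.YMM, ymm0, ymm1, addr); // masked load
    //   VexMaskMoveOp.VMASKMOVPS.emit(asm, AVXSize.YMM, addr, ymm1, ymm0); // masked store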
1494 
1495     /**
1496      * VEX-encoded instructions with an operand order of RVMI.
1497      */
1498     public static final class VexRVMIOp extends VexOp {
1499         // @formatter:off
1500         public static final VexRVMIOp VSHUFPS     = new VexRVMIOp("VSHUFPS",     P_,   M_0F,   WIG, 0xC6);
1501         public static final VexRVMIOp VSHUFPD     = new VexRVMIOp("VSHUFPD",     P_66, M_0F,   WIG, 0xC6);
1502         public static final VexRVMIOp VINSERTF128 = new VexRVMIOp("VINSERTF128", P_66, M_0F3A, W0,  0x18, VEXOpAssertion.AVX1_256ONLY);
1503         public static final VexRVMIOp VINSERTI128 = new VexRVMIOp("VINSERTI128", P_66, M_0F3A, W0,  0x38, VEXOpAssertion.AVX2_256ONLY);
1504         // @formatter:on
1505 
1506         private VexRVMIOp(String opcode, int pp, int mmmmm, int w, int op) {
1507             this(opcode, pp, mmmmm, w, op, VEXOpAssertion.AVX1);
1508         }
1509 
1510         private VexRVMIOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
1511             super(opcode, pp, mmmmm, w, op, assertion);
1512         }
1513 
1514         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, Register src2, int imm8) {
1515             assert assertion.check((AMD64) asm.target.arch, size, dst, src1, src2);
1516             assert (imm8 & 0xFF) == imm8;
1517             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w, false);
1518             asm.emitByte(op);
1519             asm.emitModRM(dst, src2);
1520             asm.emitByte(imm8);
1521         }
1522 
1523         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, AMD64Address src2, int imm8) {
1524             assert assertion.check((AMD64) asm.target.arch, size, dst, src1, null);
1525             assert (imm8 & 0xFF) == imm8;
1526             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w, false);
1527             asm.emitByte(op);
1528             asm.emitOperandHelper(dst, src2, 1);
1529             asm.emitByte(imm8);
1530         }
1531     }
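
    // Usage sketch (illustrative only; register names are assumptions): VINSERTF128 copies
    // a 128-bit value into the lane selected by the immediate, taking the other lane from
    // the first source.
    //
    //   VexRVMIOp.VINSERTF128.emit(asm, AVXSize.YMM, ymm0, ymm1, xmm2, 1);
    //   // vinsertf128 ymm0, ymm1, xmm2, 1 -- upper lane from xmm2, lower lane from ymm1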
1532 
1533     /**
1534      * VEX-encoded comparison operation with an operand order of RVMI. The immediate operand is a
1535      * comparison operator.
1536      */
1537     public static final class VexFloatCompareOp extends VexOp {
1538         // @formatter:off
1539         public static final VexFloatCompareOp VCMPPS = new VexFloatCompareOp("VCMPPS", P_,   M_0F, WIG, 0xC2);
1540         public static final VexFloatCompareOp VCMPPD = new VexFloatCompareOp("VCMPPD", P_66, M_0F, WIG, 0xC2);
        public static final VexFloatCompareOp VCMPSS = new VexFloatCompareOp("VCMPSS", P_F3, M_0F, WIG, 0xC2);
1542         public static final VexFloatCompareOp VCMPSD = new VexFloatCompareOp("VCMPSD", P_F2, M_0F, WIG, 0xC2);
1543         // @formatter:on
1544 
1545         public enum Predicate {
1546             EQ_OQ(0x00),
1547             LT_OS(0x01),
1548             LE_OS(0x02),
1549             UNORD_Q(0x03),
1550             NEQ_UQ(0x04),
1551             NLT_US(0x05),
1552             NLE_US(0x06),
1553             ORD_Q(0x07),
1554             EQ_UQ(0x08),
1555             NGE_US(0x09),
1556             NGT_US(0x0a),
1557             FALSE_OQ(0x0b),
1558             NEQ_OQ(0x0c),
1559             GE_OS(0x0d),
1560             GT_OS(0x0e),
1561             TRUE_UQ(0x0f),
1562             EQ_OS(0x10),
1563             LT_OQ(0x11),
1564             LE_OQ(0x12),
1565             UNORD_S(0x13),
1566             NEQ_US(0x14),
1567             NLT_UQ(0x15),
1568             NLE_UQ(0x16),
1569             ORD_S(0x17),
1570             EQ_US(0x18),
1571             NGE_UQ(0x19),
1572             NGT_UQ(0x1a),
1573             FALSE_OS(0x1b),
1574             NEQ_OS(0x1c),
1575             GE_OQ(0x1d),
1576             GT_OQ(0x1e),
1577             TRUE_US(0x1f);
1578 
            private final int imm8;
1580 
1581             Predicate(int imm8) {
1582                 this.imm8 = imm8;
1583             }
1584 
1585             public static Predicate getPredicate(Condition condition, boolean unorderedIsTrue) {
1586                 if (unorderedIsTrue) {
1587                     switch (condition) {
1588                         case EQ:
1589                             return EQ_UQ;
1590                         case NE:
1591                             return NEQ_UQ;
1592                         case LT:
1593                             return NGE_UQ;
1594                         case LE:
1595                             return NGT_UQ;
1596                         case GT:
1597                             return NLE_UQ;
1598                         case GE:
1599                             return NLT_UQ;
1600                         default:
1601                             throw GraalError.shouldNotReachHere();
1602                     }
1603                 } else {
1604                     switch (condition) {
1605                         case EQ:
1606                             return EQ_OQ;
1607                         case NE:
1608                             return NEQ_OQ;
1609                         case LT:
1610                             return LT_OQ;
1611                         case LE:
1612                             return LE_OQ;
1613                         case GT:
1614                             return GT_OQ;
1615                         case GE:
1616                             return GE_OQ;
1617                         default:
1618                             throw GraalError.shouldNotReachHere();
1619                     }
1620                 }
1621             }
1622         }
1623 
1624         private VexFloatCompareOp(String opcode, int pp, int mmmmm, int w, int op) {
1625             super(opcode, pp, mmmmm, w, op, VEXOpAssertion.AVX1);
1626         }
1627 
1628         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, Register src2, Predicate p) {
1629             assert assertion.check((AMD64) asm.target.arch, size, dst, src1, src2);
1630             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w, false);
1631             asm.emitByte(op);
1632             asm.emitModRM(dst, src2);
1633             asm.emitByte(p.imm8);
1634         }
1635 
1636         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, AMD64Address src2, Predicate p) {
1637             assert assertion.check((AMD64) asm.target.arch, size, dst, src1, null);
1638             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w, false);
1639             asm.emitByte(op);
1640             asm.emitOperandHelper(dst, src2, 1);
1641             asm.emitByte(p.imm8);
1642         }
1643     }
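
    // Usage sketch (illustrative only; register names are assumptions): the comparison
    // operator is the trailing immediate byte, and getPredicate maps a Graal Condition to
    // the matching ordered/unordered variant.
    //
    //   VexFloatCompareOp.Predicate p = VexFloatCompareOp.Predicate.getPredicate(Condition.LT, false); // LT_OQ, imm8 = 0x11
    //   VexFloatCompareOp.VCMPPS.emit(asm, AVXSize.XMM, xmm0, xmm1, xmm2, p);
    //   // vcmpps xmm0, xmm1, xmm2, 0x11 -- per-lane xmm1 < xmm2, all-ones lanes on true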
1644 
1645     public final void addl(AMD64Address dst, int imm32) {
1646         ADD.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
1647     }
1648 
1649     public final void addl(Register dst, int imm32) {
1650         ADD.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
1651     }
1652 
1653     public final void addl(Register dst, Register src) {
1654         ADD.rmOp.emit(this, DWORD, dst, src);
1655     }
1656 
1657     public final void addpd(Register dst, Register src) {
1658         SSEOp.ADD.emit(this, PD, dst, src);
1659     }
1660 
1661     public final void addpd(Register dst, AMD64Address src) {
1662         SSEOp.ADD.emit(this, PD, dst, src);
1663     }
1664 
1665     public final void addsd(Register dst, Register src) {
1666         SSEOp.ADD.emit(this, SD, dst, src);
1667     }
1668 
1669     public final void addsd(Register dst, AMD64Address src) {
1670         SSEOp.ADD.emit(this, SD, dst, src);
1671     }
1672 
    private void addrNop4() {
        // 4 bytes: NOP DWORD PTR [EAX+0]
        emitByte(0x0F);
        emitByte(0x1F);
        emitByte(0x40); // emitRm(cbuf, 0x1, EAXEnc, EAXEnc);
        emitByte(0); // 8-bit offset (1 byte)
    }

    private void addrNop5() {
        // 5 bytes: NOP DWORD PTR [EAX+EAX*0+0] with 8-bit offset
        emitByte(0x0F);
        emitByte(0x1F);
        emitByte(0x44); // emitRm(cbuf, 0x1, EAXEnc, 0x4);
        emitByte(0x00); // emitRm(cbuf, 0x0, EAXEnc, EAXEnc);
        emitByte(0); // 8-bit offset (1 byte)
    }

    private void addrNop7() {
        // 7 bytes: NOP DWORD PTR [EAX+0] with 32-bit offset
        emitByte(0x0F);
        emitByte(0x1F);
        emitByte(0x80); // emitRm(cbuf, 0x2, EAXEnc, EAXEnc);
        emitInt(0); // 32-bit offset (4 bytes)
    }

    private void addrNop8() {
        // 8 bytes: NOP DWORD PTR [EAX+EAX*0+0] with 32-bit offset
        emitByte(0x0F);
        emitByte(0x1F);
        emitByte(0x84); // emitRm(cbuf, 0x2, EAXEnc, 0x4);
        emitByte(0x00); // emitRm(cbuf, 0x0, EAXEnc, EAXEnc);
        emitInt(0); // 32-bit offset (4 bytes)
    }
1706 
1707     public final void andl(Register dst, int imm32) {
1708         AND.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
1709     }
1710 
1711     public final void andl(Register dst, Register src) {
1712         AND.rmOp.emit(this, DWORD, dst, src);
1713     }
1714 
1715     public final void andpd(Register dst, Register src) {
1716         SSEOp.AND.emit(this, PD, dst, src);
1717     }
1718 
1719     public final void andpd(Register dst, AMD64Address src) {
1720         SSEOp.AND.emit(this, PD, dst, src);
1721     }
1722 
1723     public final void bsfq(Register dst, Register src) {
1724         prefixq(dst, src);
1725         emitByte(0x0F);
1726         emitByte(0xBC);
1727         emitModRM(dst, src);
1728     }
1729 
1730     public final void bsrl(Register dst, Register src) {
1731         prefix(dst, src);
1732         emitByte(0x0F);
1733         emitByte(0xBD);
1734         emitModRM(dst, src);
1735     }
1736 
1737     public final void bswapl(Register reg) {
1738         prefix(reg);
1739         emitByte(0x0F);
1740         emitModRM(1, reg);
1741     }
1742 
1743     public final void cdql() {
1744         emitByte(0x99);
1745     }
1746 
1747     public final void cmovl(ConditionFlag cc, Register dst, Register src) {
1748         prefix(dst, src);
1749         emitByte(0x0F);
1750         emitByte(0x40 | cc.getValue());
1751         emitModRM(dst, src);
1752     }
1753 
1754     public final void cmovl(ConditionFlag cc, Register dst, AMD64Address src) {
1755         prefix(src, dst);
1756         emitByte(0x0F);
1757         emitByte(0x40 | cc.getValue());
1758         emitOperandHelper(dst, src, 0);
1759     }
1760 
1761     public final void cmpb(Register dst, Register src) {
1762         CMP.byteRmOp.emit(this, BYTE, dst, src);
1763     }
1764 
1765     public final void cmpw(Register dst, Register src) {
1766         CMP.rmOp.emit(this, WORD, dst, src);
1767     }
1768 
1769     public final void cmpl(Register dst, int imm32) {
1770         CMP.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
1771     }
1772 
1773     public final void cmpl(Register dst, Register src) {
1774         CMP.rmOp.emit(this, DWORD, dst, src);
1775     }
1776 
1777     public final void cmpl(Register dst, AMD64Address src) {
1778         CMP.rmOp.emit(this, DWORD, dst, src);
1779     }
1780 
1781     public final void cmpl(AMD64Address dst, int imm32) {
1782         CMP.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
1783     }
1784 
    /**
     * The 8-bit cmpxchg compares the value at adr with the low byte of rax (al); if they are
     * equal, reg is stored into adr, otherwise the value at adr is loaded into al. The ZF flag is
     * set if the compared values were equal, and cleared otherwise.
     */
1790     public final void cmpxchgb(Register reg, AMD64Address adr) { // cmpxchg
1791         prefixb(adr, reg);
1792         emitByte(0x0F);
1793         emitByte(0xB0);
1794         emitOperandHelper(reg, adr, 0);
1795     }
1796 
    /**
     * The 16-bit cmpxchg compares the value at adr with the low word of rax (ax); if they are
     * equal, reg is stored into adr, otherwise the value at adr is loaded into ax. The ZF flag is
     * set if the compared values were equal, and cleared otherwise.
     */
1802     public final void cmpxchgw(Register reg, AMD64Address adr) { // cmpxchg
1803         emitByte(0x66); // Switch to 16-bit mode.
1804         prefix(adr, reg);
1805         emitByte(0x0F);
1806         emitByte(0xB1);
1807         emitOperandHelper(reg, adr, 0);
1808     }
1809 
    /**
     * The 32-bit cmpxchg compares the value at adr with the low dword of rax (eax); if they are
     * equal, reg is stored into adr, otherwise the value at adr is loaded into eax. The ZF flag is
     * set if the compared values were equal, and cleared otherwise.
     */
1815     public final void cmpxchgl(Register reg, AMD64Address adr) { // cmpxchg
1816         prefix(adr, reg);
1817         emitByte(0x0F);
1818         emitByte(0xB1);
1819         emitOperandHelper(reg, adr, 0);
1820     }
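
    // Usage sketch (illustrative only; `asm`, the registers, the address and the label are
    // assumptions): a typical atomic compare-and-swap loads the expected value into rax,
    // prefixes the cmpxchg with lock, and branches on ZF afterwards.
    //
    //   asm.movl(rax, expectedValue);               // eax = expected old value
    //   asm.lock();                                 // make the following cmpxchg atomic
    //   asm.cmpxchgl(newValueReg, addr);            // if [addr] == eax: [addr] = newValueReg
    //   asm.jcc(ConditionFlag.Equal, successLabel); // ZF set on success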
1821 
1822     public final void cvtsi2sdl(Register dst, Register src) {
1823         SSEOp.CVTSI2SD.emit(this, DWORD, dst, src);
1824     }
1825 
1826     public final void cvttsd2sil(Register dst, Register src) {
1827         SSEOp.CVTTSD2SI.emit(this, DWORD, dst, src);
1828     }
1829 
1830     public final void decl(AMD64Address dst) {
1831         prefix(dst);
1832         emitByte(0xFF);
1833         emitOperandHelper(1, dst, 0);
1834     }
1835 
1836     public final void divsd(Register dst, Register src) {
1837         SSEOp.DIV.emit(this, SD, dst, src);
1838     }
1839 
1840     public final void hlt() {
1841         emitByte(0xF4);
1842     }
1843 
1844     public final void imull(Register dst, Register src, int value) {
1845         if (isByte(value)) {
1846             AMD64RMIOp.IMUL_SX.emit(this, DWORD, dst, src, value);
1847         } else {
1848             AMD64RMIOp.IMUL.emit(this, DWORD, dst, src, value);
1849         }
1850     }
1851 
1852     public final void incl(AMD64Address dst) {
1853         prefix(dst);
1854         emitByte(0xFF);
1855         emitOperandHelper(0, dst, 0);
1856     }
1857 
1858     public void jcc(ConditionFlag cc, int jumpTarget, boolean forceDisp32) {
1859         int shortSize = 2;
1860         int longSize = 6;
1861         long disp = jumpTarget - position();
1862         if (!forceDisp32 && isByte(disp - shortSize)) {
1863             // 0111 tttn #8-bit disp
1864             emitByte(0x70 | cc.getValue());
1865             emitByte((int) ((disp - shortSize) & 0xFF));
1866         } else {
1867             // 0000 1111 1000 tttn #32-bit disp
1868             assert isInt(disp - longSize) : "must be 32bit offset (call4)";
1869             emitByte(0x0F);
1870             emitByte(0x80 | cc.getValue());
1871             emitInt((int) (disp - longSize));
1872         }
1873     }
1874 
1875     public final void jcc(ConditionFlag cc, Label l) {
1876         assert (0 <= cc.getValue()) && (cc.getValue() < 16) : "illegal cc";
1877         if (l.isBound()) {
1878             jcc(cc, l.position(), false);
1879         } else {
            // Note: we could eliminate conditional jumps to this jump if the condition
            // is the same; however, that seems to be a rather unlikely case.
            // Note: use jccb() if the label to be bound is very close, to get
            // an 8-bit displacement.
1884             l.addPatchAt(position(), this);
1885             emitByte(0x0F);
1886             emitByte(0x80 | cc.getValue());
1887             emitInt(0);
1888         }
    }
1891 
1892     public final void jccb(ConditionFlag cc, Label l) {
1893         if (l.isBound()) {
1894             int shortSize = 2;
1895             int entry = l.position();
            assert isByte(entry - (position() + shortSize)) : "Displacement too large for a short jmp";
1897             long disp = entry - position();
1898             // 0111 tttn #8-bit disp
1899             emitByte(0x70 | cc.getValue());
1900             emitByte((int) ((disp - shortSize) & 0xFF));
1901         } else {
1902             l.addPatchAt(position(), this);
1903             emitByte(0x70 | cc.getValue());
1904             emitByte(0);
1905         }
1906     }
1907 
1908     public final void jmp(int jumpTarget, boolean forceDisp32) {
1909         int shortSize = 2;
1910         int longSize = 5;
1911         long disp = jumpTarget - position();
1912         if (!forceDisp32 && isByte(disp - shortSize)) {
1913             emitByte(0xEB);
1914             emitByte((int) ((disp - shortSize) & 0xFF));
1915         } else {
1916             emitByte(0xE9);
1917             emitInt((int) (disp - longSize));
1918         }
1919     }
1920 
1921     @Override
1922     public final void jmp(Label l) {
1923         if (l.isBound()) {
1924             jmp(l.position(), false);
1925         } else {
            // By default, forward jumps use 32-bit displacements, since we can't yet
            // know where the label will be bound. If you're sure the forward jump
            // stays within the -128..+127 byte range of an 8-bit displacement, use
            // jmpb to force the short form.
1930 
1931             l.addPatchAt(position(), this);
1932             emitByte(0xE9);
1933             emitInt(0);
1934         }
1935     }
1936 
1937     public final void jmp(Register entry) {
1938         prefix(entry);
1939         emitByte(0xFF);
1940         emitModRM(4, entry);
1941     }
1942 
1943     public final void jmp(AMD64Address adr) {
1944         prefix(adr);
1945         emitByte(0xFF);
1946         emitOperandHelper(AMD64.rsp, adr, 0);
1947     }
1948 
1949     public final void jmpb(Label l) {
1950         if (l.isBound()) {
1951             int shortSize = 2;
1952             // Displacement is relative to byte just after jmpb instruction
1953             int displacement = l.position() - position() - shortSize;
1954             GraalError.guarantee(isByte(displacement), "Displacement too large to be encoded as a byte: %d", displacement);
1955             emitByte(0xEB);
1956             emitByte(displacement & 0xFF);
1957         } else {
1958             l.addPatchAt(position(), this);
1959             emitByte(0xEB);
1960             emitByte(0);
1961         }
1962     }
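
    // Usage sketch (illustrative only; the counter address and `asm` are assumptions): for
    // a tight backward loop the 2-byte short forms are safe, since the target is already
    // bound and known to be close.
    //
    //   Label loop = new Label();
    //   asm.bind(loop);
    //   asm.decl(counterAddr);                 // decrement the loop counter in memory
    //   asm.jccb(ConditionFlag.NotZero, loop); // 8-bit displacement backward branch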
1963 
1964     public final void lead(Register dst, AMD64Address src) {
1965         prefix(src, dst);
1966         emitByte(0x8D);
1967         emitOperandHelper(dst, src, 0);
1968     }
1969 
1970     public final void leaq(Register dst, AMD64Address src) {
1971         prefixq(src, dst);
1972         emitByte(0x8D);
1973         emitOperandHelper(dst, src, 0);
1974     }
1975 
1976     public final void leave() {
1977         emitByte(0xC9);
1978     }
1979 
1980     public final void lock() {
1981         emitByte(0xF0);
1982     }
1983 
1984     public final void movapd(Register dst, Register src) {
1985         assert inRC(XMM, dst) && inRC(XMM, src);
1986         simdPrefix(dst, Register.None, src, PD, P_0F, false);
1987         emitByte(0x28);
1988         emitModRM(dst, src);
1989     }
1990 
1991     public final void movaps(Register dst, Register src) {
1992         assert inRC(XMM, dst) && inRC(XMM, src);
1993         simdPrefix(dst, Register.None, src, PS, P_0F, false);
1994         emitByte(0x28);
1995         emitModRM(dst, src);
1996     }
1997 
1998     public final void movb(AMD64Address dst, int imm8) {
1999         prefix(dst);
2000         emitByte(0xC6);
2001         emitOperandHelper(0, dst, 1);
2002         emitByte(imm8);
2003     }
2004 
2005     public final void movb(AMD64Address dst, Register src) {
2006         assert inRC(CPU, src) : "must have byte register";
2007         prefixb(dst, src);
2008         emitByte(0x88);
2009         emitOperandHelper(src, dst, 0);
2010     }
2011 
2012     public final void movl(Register dst, int imm32) {
2013         movl(dst, imm32, false);
2014     }
2015 
2016     public final void movl(Register dst, int imm32, boolean annotateImm) {
2017         int insnPos = position();
2018         prefix(dst);
2019         emitByte(0xB8 + encode(dst));
2020         int immPos = position();
2021         emitInt(imm32);
2022         int nextInsnPos = position();
2023         if (annotateImm && codePatchingAnnotationConsumer != null) {
2024             codePatchingAnnotationConsumer.accept(new OperandDataAnnotation(insnPos, immPos, nextInsnPos - immPos, nextInsnPos));
2025         }
2026     }
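
    // Usage sketch (illustrative only): passing annotateImm = true records an
    // OperandDataAnnotation describing where the 32-bit immediate lives inside the
    // instruction, so that the runtime can patch it in place later.
    //
    //   asm.movl(someReg, 0 /* patched later */, true);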
2027 
2028     public final void movl(Register dst, Register src) {
2029         prefix(dst, src);
2030         emitByte(0x8B);
2031         emitModRM(dst, src);
2032     }
2033 
2034     public final void movl(Register dst, AMD64Address src) {
2035         prefix(src, dst);
2036         emitByte(0x8B);
2037         emitOperandHelper(dst, src, 0);
2038     }
2039 
    /**
     * @param wide use a 4-byte encoding even for displacements that would fit in a byte
     */
2043     public final void movl(Register dst, AMD64Address src, boolean wide) {
2044         prefix(src, dst);
2045         emitByte(0x8B);
2046         emitOperandHelper(dst, src, wide, 0);
2047     }
2048 
2049     public final void movl(AMD64Address dst, int imm32) {
2050         prefix(dst);
2051         emitByte(0xC7);
2052         emitOperandHelper(0, dst, 4);
2053         emitInt(imm32);
2054     }
2055 
2056     public final void movl(AMD64Address dst, Register src) {
2057         prefix(dst, src);
2058         emitByte(0x89);
2059         emitOperandHelper(src, dst, 0);
2060     }
2061 
    /**
     * Newer CPUs require the use of movsd and movss to avoid a partial register stall when
     * loading from memory, whereas old Opterons should use movlpd instead of movsd. The selection
     * is done in {@link AMD64MacroAssembler#movdbl(Register, AMD64Address)} and
     * {@link AMD64MacroAssembler#movflt(Register, Register)}.
     */
2068     public final void movlpd(Register dst, AMD64Address src) {
2069         assert inRC(XMM, dst);
2070         simdPrefix(dst, dst, src, PD, P_0F, false);
2071         emitByte(0x12);
2072         emitOperandHelper(dst, src, 0);
2073     }
2074 
2075     public final void movlhps(Register dst, Register src) {
2076         assert inRC(XMM, dst) && inRC(XMM, src);
2077         simdPrefix(dst, src, src, PS, P_0F, false);
2078         emitByte(0x16);
2079         emitModRM(dst, src);
2080     }
2081 
2082     public final void movq(Register dst, AMD64Address src) {
2083         movq(dst, src, false);
2084     }
2085 
2086     public final void movq(Register dst, AMD64Address src, boolean force4BytesDisplacement) {
2087         if (inRC(XMM, dst)) {
2088             // Insn: MOVQ xmm, r/m64
2089             // Code: F3 0F 7E /r
            // An alternative instruction would be 66 REX.W 0F 6E /r. We prefer the REX.W-free
            // format because it allows the instruction to be emitted with the 2-byte VEX prefix
            // where applicable.
2093             simdPrefix(dst, Register.None, src, SS, P_0F, false);
2094             emitByte(0x7E);
2095             emitOperandHelper(dst, src, force4BytesDisplacement, 0);
2096         } else {
2097             // gpr version of movq
2098             prefixq(src, dst);
2099             emitByte(0x8B);
2100             emitOperandHelper(dst, src, force4BytesDisplacement, 0);
2101         }
2102     }
2103 
2104     public final void movq(Register dst, Register src) {
2105         assert inRC(CPU, dst) && inRC(CPU, src);
2106         prefixq(dst, src);
2107         emitByte(0x8B);
2108         emitModRM(dst, src);
2109     }
2110 
2111     public final void movq(AMD64Address dst, Register src) {
2112         if (inRC(XMM, src)) {
2113             // Insn: MOVQ r/m64, xmm
2114             // Code: 66 0F D6 /r
            // An alternative instruction would be 66 REX.W 0F 7E /r. We prefer the REX.W-free
            // format because it allows the instruction to be emitted with the 2-byte VEX prefix
            // where applicable.
2118             simdPrefix(src, Register.None, dst, PD, P_0F, false);
2119             emitByte(0xD6);
2120             emitOperandHelper(src, dst, 0);
2121         } else {
2122             // gpr version of movq
2123             prefixq(dst, src);
2124             emitByte(0x89);
2125             emitOperandHelper(src, dst, 0);
2126         }
2127     }
2128 
2129     public final void movsbl(Register dst, AMD64Address src) {
2130         prefix(src, dst);
2131         emitByte(0x0F);
2132         emitByte(0xBE);
2133         emitOperandHelper(dst, src, 0);
2134     }
2135 
2136     public final void movsbl(Register dst, Register src) {
2137         prefix(dst, false, src, true);
2138         emitByte(0x0F);
2139         emitByte(0xBE);
2140         emitModRM(dst, src);
2141     }
2142 
2143     public final void movsbq(Register dst, AMD64Address src) {
2144         prefixq(src, dst);
2145         emitByte(0x0F);
2146         emitByte(0xBE);
2147         emitOperandHelper(dst, src, 0);
2148     }
2149 
2150     public final void movsbq(Register dst, Register src) {
2151         prefixq(dst, src);
2152         emitByte(0x0F);
2153         emitByte(0xBE);
2154         emitModRM(dst, src);
2155     }
2156 
2157     public final void movsd(Register dst, Register src) {
2158         AMD64RMOp.MOVSD.emit(this, SD, dst, src);
2159     }
2160 
2161     public final void movsd(Register dst, AMD64Address src) {
2162         AMD64RMOp.MOVSD.emit(this, SD, dst, src);
2163     }
2164 
2165     public final void movsd(AMD64Address dst, Register src) {
2166         AMD64MROp.MOVSD.emit(this, SD, dst, src);
2167     }
2168 
2169     public final void movss(Register dst, Register src) {
2170         AMD64RMOp.MOVSS.emit(this, SS, dst, src);
2171     }
2172 
2173     public final void movss(Register dst, AMD64Address src) {
2174         AMD64RMOp.MOVSS.emit(this, SS, dst, src);
2175     }
2176 
2177     public final void movss(AMD64Address dst, Register src) {
2178         AMD64MROp.MOVSS.emit(this, SS, dst, src);
2179     }
2180 
2181     public final void mulpd(Register dst, Register src) {
2182         SSEOp.MUL.emit(this, PD, dst, src);
2183     }
2184 
2185     public final void mulpd(Register dst, AMD64Address src) {
2186         SSEOp.MUL.emit(this, PD, dst, src);
2187     }
2188 
2189     public final void mulsd(Register dst, Register src) {
2190         SSEOp.MUL.emit(this, SD, dst, src);
2191     }
2192 
2193     public final void mulsd(Register dst, AMD64Address src) {
2194         SSEOp.MUL.emit(this, SD, dst, src);
2195     }
2196 
2197     public final void mulss(Register dst, Register src) {
2198         SSEOp.MUL.emit(this, SS, dst, src);
2199     }
2200 
2201     public final void movswl(Register dst, AMD64Address src) {
2202         AMD64RMOp.MOVSX.emit(this, DWORD, dst, src);
2203     }
2204 
2205     public final void movswq(Register dst, AMD64Address src) {
2206         AMD64RMOp.MOVSX.emit(this, QWORD, dst, src);
2207     }
2208 
2209     public final void movw(AMD64Address dst, int imm16) {
2210         emitByte(0x66); // switch to 16-bit mode
2211         prefix(dst);
2212         emitByte(0xC7);
2213         emitOperandHelper(0, dst, 2);
2214         emitShort(imm16);
2215     }
2216 
2217     public final void movw(AMD64Address dst, Register src) {
2218         emitByte(0x66);
2219         prefix(dst, src);
2220         emitByte(0x89);
2221         emitOperandHelper(src, dst, 0);
2222     }
2223 
2224     public final void movw(Register dst, AMD64Address src) {
2225         emitByte(0x66);
2226         prefix(src, dst);
2227         emitByte(0x8B);
2228         emitOperandHelper(dst, src, 0);
2229     }
2230 
2231     public final void movzbl(Register dst, AMD64Address src) {
2232         prefix(src, dst);
2233         emitByte(0x0F);
2234         emitByte(0xB6);
2235         emitOperandHelper(dst, src, 0);
2236     }
2237 
2238     public final void movzbl(Register dst, Register src) {
2239         AMD64RMOp.MOVZXB.emit(this, DWORD, dst, src);
2240     }
2241 
2242     public final void movzbq(Register dst, Register src) {
2243         AMD64RMOp.MOVZXB.emit(this, QWORD, dst, src);
2244     }
2245 
2246     public final void movzbq(Register dst, AMD64Address src) {
2247         AMD64RMOp.MOVZXB.emit(this, QWORD, dst, src);
2248     }
2249 
2250     public final void movzwl(Register dst, AMD64Address src) {
2251         AMD64RMOp.MOVZX.emit(this, DWORD, dst, src);
2252     }
2253 
2254     public final void movzwq(Register dst, AMD64Address src) {
2255         AMD64RMOp.MOVZX.emit(this, QWORD, dst, src);
2256     }
2257 
2258     public final void negl(Register dst) {
2259         NEG.emit(this, DWORD, dst);
2260     }
2261 
2262     public final void notl(Register dst) {
2263         NOT.emit(this, DWORD, dst);
2264     }
2265 
2266     public final void notq(Register dst) {
2267         NOT.emit(this, QWORD, dst);
2268     }
2269 
2270     @Override
2271     public final void ensureUniquePC() {
2272         nop();
2273     }
2274 
2275     public final void nop() {
2276         nop(1);
2277     }
2278 
2279     public void nop(int count) {
2280         int i = count;
2281         if (UseNormalNop) {
            assert i > 0 : "nop count must be positive";
            // The fancy nops aren't currently recognized by debuggers, making it a
            // pain to disassemble code while debugging. If asserts are on, speed is
            // clearly not an issue, so simply use the traditional single-byte nop
            // for alignment.
2287 
2288             for (; i > 0; i--) {
2289                 emitByte(0x90);
2290             }
2291             return;
2292         }
2293 
2294         if (UseAddressNop) {
2295             //
            // Using multi-byte nops "0x0F 0x1F [Address]" for AMD.
2297             // 1: 0x90
2298             // 2: 0x66 0x90
2299             // 3: 0x66 0x66 0x90 (don't use "0x0F 0x1F 0x00" - need patching safe padding)
2300             // 4: 0x0F 0x1F 0x40 0x00
2301             // 5: 0x0F 0x1F 0x44 0x00 0x00
2302             // 6: 0x66 0x0F 0x1F 0x44 0x00 0x00
2303             // 7: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
2304             // 8: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
2305             // 9: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
2306             // 10: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
2307             // 11: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
2308 
            // The remaining encodings are AMD-specific: use consecutive address nops
2310 
2311             // 12: 0x66 0x0F 0x1F 0x44 0x00 0x00 0x66 0x0F 0x1F 0x44 0x00 0x00
2312             // 13: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00 0x66 0x0F 0x1F 0x44 0x00 0x00
2313             // 14: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
2314             // 15: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
2315             // 16: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
2316             // Size prefixes (0x66) are added for larger sizes
2317 
2318             while (i >= 22) {
2319                 i -= 11;
2320                 emitByte(0x66); // size prefix
2321                 emitByte(0x66); // size prefix
2322                 emitByte(0x66); // size prefix
2323                 addrNop8();
2324             }
            // Generate the first nop for sizes between 12 and 21
2326             switch (i) {
2327                 case 21:
2328                     i -= 11;
2329                     emitByte(0x66); // size prefix
2330                     emitByte(0x66); // size prefix
2331                     emitByte(0x66); // size prefix
2332                     addrNop8();
2333                     break;
2334                 case 20:
2335                 case 19:
2336                     i -= 10;
2337                     emitByte(0x66); // size prefix
2338                     emitByte(0x66); // size prefix
2339                     addrNop8();
2340                     break;
2341                 case 18:
2342                 case 17:
2343                     i -= 9;
2344                     emitByte(0x66); // size prefix
2345                     addrNop8();
2346                     break;
2347                 case 16:
2348                 case 15:
2349                     i -= 8;
2350                     addrNop8();
2351                     break;
2352                 case 14:
2353                 case 13:
2354                     i -= 7;
2355                     addrNop7();
2356                     break;
2357                 case 12:
2358                     i -= 6;
2359                     emitByte(0x66); // size prefix
2360                     addrNop5();
2361                     break;
2362                 default:
2363                     assert i < 12;
2364             }
2365 
            // Generate the second nop for sizes between 1 and 11
2367             switch (i) {
2368                 case 11:
2369                     emitByte(0x66); // size prefix
2370                     emitByte(0x66); // size prefix
2371                     emitByte(0x66); // size prefix
2372                     addrNop8();
2373                     break;
2374                 case 10:
2375                     emitByte(0x66); // size prefix
2376                     emitByte(0x66); // size prefix
2377                     addrNop8();
2378                     break;
2379                 case 9:
2380                     emitByte(0x66); // size prefix
2381                     addrNop8();
2382                     break;
2383                 case 8:
2384                     addrNop8();
2385                     break;
2386                 case 7:
2387                     addrNop7();
2388                     break;
2389                 case 6:
2390                     emitByte(0x66); // size prefix
2391                     addrNop5();
2392                     break;
2393                 case 5:
2394                     addrNop5();
2395                     break;
2396                 case 4:
2397                     addrNop4();
2398                     break;
2399                 case 3:
2400                     // Don't use "0x0F 0x1F 0x00" - need patching safe padding
2401                     emitByte(0x66); // size prefix
2402                     emitByte(0x66); // size prefix
2403                     emitByte(0x90); // nop
2404                     break;
2405                 case 2:
2406                     emitByte(0x66); // size prefix
2407                     emitByte(0x90); // nop
2408                     break;
2409                 case 1:
2410                     emitByte(0x90); // nop
2411                     break;
2412                 default:
2413                     assert i == 0;
2414             }
2415             return;
2416         }
2417 
2418         // Using nops with size prefixes "0x66 0x90".
        // From the AMD Optimization Guide:
2420         // 1: 0x90
2421         // 2: 0x66 0x90
2422         // 3: 0x66 0x66 0x90
2423         // 4: 0x66 0x66 0x66 0x90
2424         // 5: 0x66 0x66 0x90 0x66 0x90
2425         // 6: 0x66 0x66 0x90 0x66 0x66 0x90
2426         // 7: 0x66 0x66 0x66 0x90 0x66 0x66 0x90
2427         // 8: 0x66 0x66 0x66 0x90 0x66 0x66 0x66 0x90
2428         // 9: 0x66 0x66 0x90 0x66 0x66 0x90 0x66 0x66 0x90
2429         // 10: 0x66 0x66 0x66 0x90 0x66 0x66 0x90 0x66 0x66 0x90
2430         //
2431         while (i > 12) {
2432             i -= 4;
2433             emitByte(0x66); // size prefix
2434             emitByte(0x66);
2435             emitByte(0x66);
2436             emitByte(0x90); // nop
2437         }
2438         // 1 - 12 nops
2439         if (i > 8) {
2440             if (i > 9) {
2441                 i -= 1;
2442                 emitByte(0x66);
2443             }
2444             i -= 3;
2445             emitByte(0x66);
2446             emitByte(0x66);
2447             emitByte(0x90);
2448         }
2449         // 1 - 8 nops
2450         if (i > 4) {
2451             if (i > 6) {
2452                 i -= 1;
2453                 emitByte(0x66);
2454             }
2455             i -= 3;
2456             emitByte(0x66);
2457             emitByte(0x66);
2458             emitByte(0x90);
2459         }
2460         switch (i) {
2461             case 4:
2462                 emitByte(0x66);
2463                 emitByte(0x66);
2464                 emitByte(0x66);
2465                 emitByte(0x90);
2466                 break;
2467             case 3:
2468                 emitByte(0x66);
2469                 emitByte(0x66);
2470                 emitByte(0x90);
2471                 break;
2472             case 2:
2473                 emitByte(0x66);
2474                 emitByte(0x90);
2475                 break;
2476             case 1:
2477                 emitByte(0x90);
2478                 break;
2479             default:
2480                 assert i == 0;
2481         }
2482     }
2483 
2484     public final void orl(Register dst, Register src) {
2485         OR.rmOp.emit(this, DWORD, dst, src);
2486     }
2487 
2488     public final void orl(Register dst, int imm32) {
2489         OR.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
2490     }
2491 
2492     // Insn: VPACKUSWB xmm1, xmm2, xmm3/m128
2493     // -----
2494     // Insn: VPACKUSWB xmm1, xmm1, xmm2
2495 
2496     public final void packuswb(Register dst, Register src) {
2497         assert inRC(XMM, dst) && inRC(XMM, src);
2498         // Code: VEX.NDS.128.66.0F.WIG 67 /r
2499         simdPrefix(dst, dst, src, PD, P_0F, false);
2500         emitByte(0x67);
2501         emitModRM(dst, src);
2502     }
2503 
2504     public final void pop(Register dst) {
2505         prefix(dst);
2506         emitByte(0x58 + encode(dst));
2507     }
2508 
2509     public void popfq() {
2510         emitByte(0x9D);
2511     }
2512 
2513     public final void ptest(Register dst, Register src) {
2514         assert supports(CPUFeature.SSE4_1);
2515         assert inRC(XMM, dst) && inRC(XMM, src);
2516         simdPrefix(dst, Register.None, src, PD, P_0F38, false);
2517         emitByte(0x17);
2518         emitModRM(dst, src);
2519     }
2520 
2521     public final void pcmpeqb(Register dst, Register src) {
2522         assert supports(CPUFeature.SSE2);
2523         assert inRC(XMM, dst) && inRC(XMM, src);
2524         simdPrefix(dst, dst, src, PD, P_0F, false);
2525         emitByte(0x74);
2526         emitModRM(dst, src);
2527     }
2528 
2529     public final void pcmpeqw(Register dst, Register src) {
2530         assert supports(CPUFeature.SSE2);
2531         assert inRC(XMM, dst) && inRC(XMM, src);
2532         simdPrefix(dst, dst, src, PD, P_0F, false);
2533         emitByte(0x75);
2534         emitModRM(dst, src);
2535     }
2536 
2537     public final void pcmpeqd(Register dst, Register src) {
2538         assert supports(CPUFeature.SSE2);
2539         assert dst.getRegisterCategory().equals(XMM) && src.getRegisterCategory().equals(XMM);
2540         simdPrefix(dst, dst, src, PD, P_0F, false);
2541         emitByte(0x76);
2542         emitModRM(dst, src);
2543     }
2544 
2545     public final void pcmpestri(Register dst, AMD64Address src, int imm8) {
2546         assert supports(CPUFeature.SSE4_2);
2547         assert inRC(XMM, dst);
2548         simdPrefix(dst, Register.None, src, PD, P_0F3A, false);
2549         emitByte(0x61);
2550         emitOperandHelper(dst, src, 0);
2551         emitByte(imm8);
2552     }
2553 
2554     public final void pcmpestri(Register dst, Register src, int imm8) {
2555         assert supports(CPUFeature.SSE4_2);
2556         assert inRC(XMM, dst) && inRC(XMM, src);
2557         simdPrefix(dst, Register.None, src, PD, P_0F3A, false);
2558         emitByte(0x61);
2559         emitModRM(dst, src);
2560         emitByte(imm8);
2561     }
2562 
2563     public final void pmovmskb(Register dst, Register src) {
2564         assert supports(CPUFeature.SSE2);
2565         assert inRC(CPU, dst) && inRC(XMM, src);
2566         simdPrefix(dst, Register.None, src, PD, P_0F, false);
2567         emitByte(0xD7);
2568         emitModRM(dst, src);
2569     }
2570 
2571     private void pmovSZx(Register dst, AMD64Address src, int op) {
2572         assert supports(CPUFeature.SSE4_1);
2573         assert inRC(XMM, dst);
2574         simdPrefix(dst, Register.None, src, PD, P_0F38, false);
2575         emitByte(op);
2576         emitOperandHelper(dst, src, 0);
2577     }
2578 
2579     public final void pmovsxbw(Register dst, AMD64Address src) {
2580         pmovSZx(dst, src, 0x20);
2581     }
2582 
2583     public final void pmovsxbd(Register dst, AMD64Address src) {
2584         pmovSZx(dst, src, 0x21);
2585     }
2586 
2587     public final void pmovsxbq(Register dst, AMD64Address src) {
2588         pmovSZx(dst, src, 0x22);
2589     }
2590 
2591     public final void pmovsxwd(Register dst, AMD64Address src) {
2592         pmovSZx(dst, src, 0x23);
2593     }
2594 
2595     public final void pmovsxwq(Register dst, AMD64Address src) {
2596         pmovSZx(dst, src, 0x24);
2597     }
2598 
2599     public final void pmovsxdq(Register dst, AMD64Address src) {
2600         pmovSZx(dst, src, 0x25);
2601     }
2602 
2603     // Insn: VPMOVZXBW xmm1, xmm2/m64
2604     public final void pmovzxbw(Register dst, AMD64Address src) {
2605         pmovSZx(dst, src, 0x30);
2606     }
2607 
2608     public final void pmovzxbd(Register dst, AMD64Address src) {
2609         pmovSZx(dst, src, 0x31);
2610     }
2611 
2612     public final void pmovzxbq(Register dst, AMD64Address src) {
2613         pmovSZx(dst, src, 0x32);
2614     }
2615 
2616     public final void pmovzxwd(Register dst, AMD64Address src) {
2617         pmovSZx(dst, src, 0x33);
2618     }
2619 
2620     public final void pmovzxwq(Register dst, AMD64Address src) {
2621         pmovSZx(dst, src, 0x34);
2622     }
2623 
2624     public final void pmovzxdq(Register dst, AMD64Address src) {
2625         pmovSZx(dst, src, 0x35);
2626     }
2627 
2628     public final void pmovzxbw(Register dst, Register src) {
2629         assert supports(CPUFeature.SSE4_1);
2630         assert inRC(XMM, dst) && inRC(XMM, src);
2631         simdPrefix(dst, Register.None, src, PD, P_0F38, false);
2632         emitByte(0x30);
2633         emitModRM(dst, src);
2634     }
2635 
2636     public final void push(Register src) {
2637         prefix(src);
2638         emitByte(0x50 + encode(src));
2639     }
2640 
2641     public void pushfq() {
2642         emitByte(0x9c);
2643     }
2644 
2645     public final void paddd(Register dst, Register src) {
2646         assert inRC(XMM, dst) && inRC(XMM, src);
2647         simdPrefix(dst, dst, src, PD, P_0F, false);
2648         emitByte(0xFE);
2649         emitModRM(dst, src);
2650     }
2651 
2652     public final void paddq(Register dst, Register src) {
2653         assert inRC(XMM, dst) && inRC(XMM, src);
2654         simdPrefix(dst, dst, src, PD, P_0F, false);
2655         emitByte(0xD4);
2656         emitModRM(dst, src);
2657     }
2658 
2659     public final void pextrw(Register dst, Register src, int imm8) {
2660         assert inRC(CPU, dst) && inRC(XMM, src);
2661         simdPrefix(dst, Register.None, src, PD, P_0F, false);
2662         emitByte(0xC5);
2663         emitModRM(dst, src);
2664         emitByte(imm8);
2665     }
2666 
2667     public final void pinsrw(Register dst, Register src, int imm8) {
2668         assert inRC(XMM, dst) && inRC(CPU, src);
2669         simdPrefix(dst, dst, src, PD, P_0F, false);
2670         emitByte(0xC4);
2671         emitModRM(dst, src);
2672         emitByte(imm8);
2673     }
2674 
2675     public final void por(Register dst, Register src) {
2676         assert inRC(XMM, dst) && inRC(XMM, src);
2677         simdPrefix(dst, dst, src, PD, P_0F, false);
2678         emitByte(0xEB);
2679         emitModRM(dst, src);
2680     }
2681 
2682     public final void pand(Register dst, Register src) {
2683         assert inRC(XMM, dst) && inRC(XMM, src);
2684         simdPrefix(dst, dst, src, PD, P_0F, false);
2685         emitByte(0xDB);
2686         emitModRM(dst, src);
2687     }
2688 
2689     public final void pxor(Register dst, Register src) {
2690         assert inRC(XMM, dst) && inRC(XMM, src);
2691         simdPrefix(dst, dst, src, PD, P_0F, false);
2692         emitByte(0xEF);
2693         emitModRM(dst, src);
2694     }
2695 
2696     public final void pslld(Register dst, int imm8) {
2697         assert isUByte(imm8) : "invalid value";
2698         assert inRC(XMM, dst);
2699         // XMM6 is for /6 encoding: 66 0F 72 /6 ib
2700         simdPrefix(AMD64.xmm6, dst, dst, PD, P_0F, false);
2701         emitByte(0x72);
2702         emitModRM(6, dst);
2703         emitByte(imm8 & 0xFF);
2704     }
2705 
2706     public final void psllq(Register dst, Register shift) {
2707         assert inRC(XMM, dst) && inRC(XMM, shift);
2708         simdPrefix(dst, dst, shift, PD, P_0F, false);
2709         emitByte(0xF3);
2710         emitModRM(dst, shift);
2711     }
2712 
2713     public final void psllq(Register dst, int imm8) {
2714         assert isUByte(imm8) : "invalid value";
2715         assert inRC(XMM, dst);
2716         // XMM6 is for /6 encoding: 66 0F 73 /6 ib
2717         simdPrefix(AMD64.xmm6, dst, dst, PD, P_0F, false);
2718         emitByte(0x73);
2719         emitModRM(6, dst);
2720         emitByte(imm8);
2721     }
2722 
2723     public final void psrad(Register dst, int imm8) {
2724         assert isUByte(imm8) : "invalid value";
2725         assert inRC(XMM, dst);
2726         // XMM4 is for /4 encoding: 66 0F 72 /4 ib
2727         simdPrefix(AMD64.xmm4, dst, dst, PD, P_0F, false);
2728         emitByte(0x72);
2729         emitModRM(4, dst);
2730         emitByte(imm8);
2731     }
2732 
2733     public final void psrld(Register dst, int imm8) {
2734         assert isUByte(imm8) : "invalid value";
2735         assert inRC(XMM, dst);
2736         // XMM2 is for /2 encoding: 66 0F 72 /2 ib
2737         simdPrefix(AMD64.xmm2, dst, dst, PD, P_0F, false);
2738         emitByte(0x72);
2739         emitModRM(2, dst);
2740         emitByte(imm8);
2741     }
2742 
2743     public final void psrlq(Register dst, int imm8) {
2744         assert isUByte(imm8) : "invalid value";
2745         assert inRC(XMM, dst);
2746         // XMM2 is for /2 encoding: 66 0F 73 /2 ib
2747         simdPrefix(AMD64.xmm2, dst, dst, PD, P_0F, false);
2748         emitByte(0x73);
2749         emitModRM(2, dst);
2750         emitByte(imm8);
2751     }
2752 
2753     public final void psrldq(Register dst, int imm8) {
2754         assert isUByte(imm8) : "invalid value";
2755         assert inRC(XMM, dst);
2756         simdPrefix(AMD64.xmm3, dst, dst, PD, P_0F, false);
2757         emitByte(0x73);
2758         emitModRM(3, dst);
2759         emitByte(imm8);
2760     }
2761 
2762     public final void pshufb(Register dst, Register src) {
2763         assert supports(CPUFeature.SSSE3);
2764         assert inRC(XMM, dst) && inRC(XMM, src);
2765         simdPrefix(dst, dst, src, PD, P_0F38, false);
2766         emitByte(0x00);
2767         emitModRM(dst, src);
2768     }
2769 
2770     public final void pshuflw(Register dst, Register src, int imm8) {
2771         assert supports(CPUFeature.SSE2);
2772         assert isUByte(imm8) : "invalid value";
2773         assert inRC(XMM, dst) && inRC(XMM, src);
2774         simdPrefix(dst, Register.None, src, SD, P_0F, false);
2775         emitByte(0x70);
2776         emitModRM(dst, src);
2777         emitByte(imm8);
2778     }
2779 
2780     public final void pshufd(Register dst, Register src, int imm8) {
2781         assert isUByte(imm8) : "invalid value";
2782         assert inRC(XMM, dst) && inRC(XMM, src);
2783         simdPrefix(dst, Register.None, src, PD, P_0F, false);
2784         emitByte(0x70);
2785         emitModRM(dst, src);
2786         emitByte(imm8);
2787     }
2788 
2789     public final void psubd(Register dst, Register src) {
2790         assert inRC(XMM, dst) && inRC(XMM, src);
2791         simdPrefix(dst, dst, src, PD, P_0F, false);
2792         emitByte(0xFA);
2793         emitModRM(dst, src);
2794     }
2795 
2796     public final void punpcklbw(Register dst, Register src) {
2797         assert supports(CPUFeature.SSE2);
2798         assert inRC(XMM, dst) && inRC(XMM, src);
2799         simdPrefix(dst, dst, src, PD, P_0F, false);
2800         emitByte(0x60);
2801         emitModRM(dst, src);
2802     }
2803 
2804     public final void rcpps(Register dst, Register src) {
2805         assert inRC(XMM, dst) && inRC(XMM, src);
2806         simdPrefix(dst, Register.None, src, PS, P_0F, false);
2807         emitByte(0x53);
2808         emitModRM(dst, src);
2809     }
2810 
2811     public final void ret(int imm16) {
2812         if (imm16 == 0) {
2813             emitByte(0xC3);
2814         } else {
2815             emitByte(0xC2);
2816             emitShort(imm16);
2817         }
2818     }
2819 
2820     public final void sarl(Register dst, int imm8) {
2821         prefix(dst);
2822         assert isShiftCount(imm8 >> 1) : "illegal shift count";
2823         if (imm8 == 1) {
2824             emitByte(0xD1);
2825             emitModRM(7, dst);
2826         } else {
2827             emitByte(0xC1);
2828             emitModRM(7, dst);
2829             emitByte(imm8);
2830         }
2831     }
2832 
2833     public final void shll(Register dst, int imm8) {
2834         assert isShiftCount(imm8 >> 1) : "illegal shift count";
2835         prefix(dst);
2836         if (imm8 == 1) {
2837             emitByte(0xD1);
2838             emitModRM(4, dst);
2839         } else {
2840             emitByte(0xC1);
2841             emitModRM(4, dst);
2842             emitByte(imm8);
2843         }
2844     }
2845 
2846     public final void shll(Register dst) {
2847         // Multiply dst by 2, CL times.
2848         prefix(dst);
2849         emitByte(0xD3);
2850         emitModRM(4, dst);
2851     }
2852 
2853     // Insn: SHLX r32a, r/m32, r32b
2854 
2855     public final void shlxl(Register dst, Register src1, Register src2) {
2856         VexGeneralPurposeRMVOp.SHLX.emit(this, AVXSize.DWORD, dst, src1, src2);
2857     }
2858 
2859     public final void shrl(Register dst, int imm8) {
2860         assert isShiftCount(imm8 >> 1) : "illegal shift count";
2861         prefix(dst);
2862         emitByte(0xC1);
2863         emitModRM(5, dst);
2864         emitByte(imm8);
2865     }
2866 
2867     public final void shrl(Register dst) {
2868         // Unsigned divide dst by 2, CL times.
2869         prefix(dst);
2870         emitByte(0xD3);
2871         emitModRM(5, dst);
2872     }
2873 
2874     public final void subl(AMD64Address dst, int imm32) {
2875         SUB.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
2876     }
2877 
2878     public final void subl(Register dst, int imm32) {
2879         SUB.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
2880     }
2881 
2882     public final void subl(Register dst, Register src) {
2883         SUB.rmOp.emit(this, DWORD, dst, src);
2884     }
2885 
2886     public final void subpd(Register dst, Register src) {
2887         SSEOp.SUB.emit(this, PD, dst, src);
2888     }
2889 
2890     public final void subsd(Register dst, Register src) {
2891         SSEOp.SUB.emit(this, SD, dst, src);
2892     }
2893 
2894     public final void subsd(Register dst, AMD64Address src) {
2895         SSEOp.SUB.emit(this, SD, dst, src);
2896     }
2897 
2898     public final void testl(Register dst, int imm32) {
        // Not using emitArith because TEST does not support
        // sign-extension of 8-bit operands.
2902         if (dst.encoding == 0) {
2903             emitByte(0xA9);
2904         } else {
2905             prefix(dst);
2906             emitByte(0xF7);
2907             emitModRM(0, dst);
2908         }
2909         emitInt(imm32);
2910     }
2911 
2912     public final void testl(Register dst, Register src) {
2913         prefix(dst, src);
2914         emitByte(0x85);
2915         emitModRM(dst, src);
2916     }
2917 
2918     public final void testl(Register dst, AMD64Address src) {
2919         prefix(src, dst);
2920         emitByte(0x85);
2921         emitOperandHelper(dst, src, 0);
2922     }
2923 
2924     public final void unpckhpd(Register dst, Register src) {
2925         assert inRC(XMM, dst) && inRC(XMM, src);
2926         simdPrefix(dst, dst, src, PD, P_0F, false);
2927         emitByte(0x15);
2928         emitModRM(dst, src);
2929     }
2930 
2931     public final void unpcklpd(Register dst, Register src) {
2932         assert inRC(XMM, dst) && inRC(XMM, src);
2933         simdPrefix(dst, dst, src, PD, P_0F, false);
2934         emitByte(0x14);
2935         emitModRM(dst, src);
2936     }
2937 
2938     public final void xorl(Register dst, Register src) {
2939         XOR.rmOp.emit(this, DWORD, dst, src);
2940     }
2941 
2942     public final void xorq(Register dst, Register src) {
2943         XOR.rmOp.emit(this, QWORD, dst, src);
2944     }
2945 
2946     public final void xorpd(Register dst, Register src) {
2947         SSEOp.XOR.emit(this, PD, dst, src);
2948     }
2949 
2950     public final void xorps(Register dst, Register src) {
2951         SSEOp.XOR.emit(this, PS, dst, src);
2952     }
2953 
2954     protected final void decl(Register dst) {
2955         // Use two-byte form (one-byte form is a REX prefix in 64-bit mode)
2956         prefix(dst);
2957         emitByte(0xFF);
2958         emitModRM(1, dst);
2959     }
2960 
2961     protected final void incl(Register dst) {
        // Use two-byte form (one-byte form is a REX prefix in 64-bit mode)
2963         prefix(dst);
2964         emitByte(0xFF);
2965         emitModRM(0, dst);
2966     }
2967 
2968     public final void addq(Register dst, int imm32) {
2969         ADD.getMIOpcode(QWORD, isByte(imm32)).emit(this, QWORD, dst, imm32);
2970     }
2971 
2972     public final void addq(AMD64Address dst, int imm32) {
2973         ADD.getMIOpcode(QWORD, isByte(imm32)).emit(this, QWORD, dst, imm32);
2974     }
2975 
2976     public final void addq(Register dst, Register src) {
2977         ADD.rmOp.emit(this, QWORD, dst, src);
2978     }
2979 
2980     public final void addq(AMD64Address dst, Register src) {
2981         ADD.mrOp.emit(this, QWORD, dst, src);
2982     }
2983 
2984     public final void andq(Register dst, int imm32) {
2985         AND.getMIOpcode(QWORD, isByte(imm32)).emit(this, QWORD, dst, imm32);
2986     }
2987 
2988     public final void bsrq(Register dst, Register src) {
2989         prefixq(dst, src);
2990         emitByte(0x0F);
2991         emitByte(0xBD);
2992         emitModRM(dst, src);
2993     }
2994 
2995     public final void bswapq(Register reg) {
2996         prefixq(reg);
2997         emitByte(0x0F);
2998         emitByte(0xC8 + encode(reg));
2999     }
3000 
3001     public final void cdqq() {
3002         rexw();
3003         emitByte(0x99);
3004     }
3005 
3006     public final void repStosb() {
3007         emitByte(0xf3);
3008         rexw();
3009         emitByte(0xaa);
3010     }
3011 
3012     public final void repStosq() {
3013         emitByte(0xf3);
3014         rexw();
3015         emitByte(0xab);
3016     }
3017 
3018     public final void cmovq(ConditionFlag cc, Register dst, Register src) {
3019         prefixq(dst, src);
3020         emitByte(0x0F);
3021         emitByte(0x40 | cc.getValue());
3022         emitModRM(dst, src);
3023     }
3024 
3025     public final void setb(ConditionFlag cc, Register dst) {
3026         prefix(dst, true);
3027         emitByte(0x0F);
3028         emitByte(0x90 | cc.getValue());
3029         emitModRM(0, dst);
3030     }
3031 
3032     public final void cmovq(ConditionFlag cc, Register dst, AMD64Address src) {
3033         prefixq(src, dst);
3034         emitByte(0x0F);
3035         emitByte(0x40 | cc.getValue());
3036         emitOperandHelper(dst, src, 0);
3037     }
3038 
3039     public final void cmpq(Register dst, int imm32) {
3040         CMP.getMIOpcode(QWORD, isByte(imm32)).emit(this, QWORD, dst, imm32);
3041     }
3042 
3043     public final void cmpq(Register dst, Register src) {
3044         CMP.rmOp.emit(this, QWORD, dst, src);
3045     }
3046 
3047     public final void cmpq(Register dst, AMD64Address src) {
3048         CMP.rmOp.emit(this, QWORD, dst, src);
3049     }
3050 
3051     public final void cmpxchgq(Register reg, AMD64Address adr) {
3052         prefixq(adr, reg);
3053         emitByte(0x0F);
3054         emitByte(0xB1);
3055         emitOperandHelper(reg, adr, 0);
3056     }
3057 
3058     public final void cvtdq2pd(Register dst, Register src) {
3059         assert inRC(XMM, dst) && inRC(XMM, src);
3060         simdPrefix(dst, Register.None, src, SS, P_0F, false);
3061         emitByte(0xE6);
3062         emitModRM(dst, src);
3063     }
3064 
3065     public final void cvtsi2sdq(Register dst, Register src) {
3066         SSEOp.CVTSI2SD.emit(this, QWORD, dst, src);
3067     }
3068 
3069     public final void cvttsd2siq(Register dst, Register src) {
3070         SSEOp.CVTTSD2SI.emit(this, QWORD, dst, src);
3071     }
3072 
3073     public final void cvttpd2dq(Register dst, Register src) {
3074         assert inRC(XMM, dst) && inRC(XMM, src);
3075         simdPrefix(dst, Register.None, src, PD, P_0F, false);
3076         emitByte(0xE6);
3077         emitModRM(dst, src);
3078     }
3079 
3080     public final void decq(Register dst) {
3081         // Use two-byte form (one-byte form is a REX prefix in 64-bit mode)
3082         prefixq(dst);
3083         emitByte(0xFF);
3084         emitModRM(1, dst);
3085     }
3086 
3087     public final void decq(AMD64Address dst) {
3088         DEC.emit(this, QWORD, dst);
3089     }
3090 
3091     public final void imulq(Register dst, Register src) {
3092         prefixq(dst, src);
3093         emitByte(0x0F);
3094         emitByte(0xAF);
3095         emitModRM(dst, src);
3096     }
3097 
3098     public final void incq(Register dst) {
        // Don't use this directly; use AMD64MacroAssembler.incrementq() instead.
        // Use two-byte form (one-byte form is a REX prefix in 64-bit mode)
3101         prefixq(dst);
3102         emitByte(0xFF);
3103         emitModRM(0, dst);
3104     }
3105 
3106     public final void incq(AMD64Address dst) {
3107         INC.emit(this, QWORD, dst);
3108     }
3109 
3110     public final void movq(Register dst, long imm64) {
3111         movq(dst, imm64, false);
3112     }
3113 
3114     public final void movq(Register dst, long imm64, boolean annotateImm) {
3115         int insnPos = position();
3116         prefixq(dst);
3117         emitByte(0xB8 + encode(dst));
3118         int immPos = position();
3119         emitLong(imm64);
3120         int nextInsnPos = position();
3121         if (annotateImm && codePatchingAnnotationConsumer != null) {
3122             codePatchingAnnotationConsumer.accept(new OperandDataAnnotation(insnPos, immPos, nextInsnPos - immPos, nextInsnPos));
3123         }
3124     }
3125 
3126     public final void movslq(Register dst, int imm32) {
3127         prefixq(dst);
3128         emitByte(0xC7);
3129         emitModRM(0, dst);
3130         emitInt(imm32);
3131     }
3132 
3133     public final void movdq(Register dst, AMD64Address src) {
3134         AMD64RMOp.MOVQ.emit(this, QWORD, dst, src);
3135     }
3136 
3137     public final void movdq(AMD64Address dst, Register src) {
3138         AMD64MROp.MOVQ.emit(this, QWORD, dst, src);
3139     }
3140 
3141     public final void movdq(Register dst, Register src) {
3142         if (inRC(XMM, dst) && inRC(CPU, src)) {
3143             AMD64RMOp.MOVQ.emit(this, QWORD, dst, src);
3144         } else if (inRC(XMM, src) && inRC(CPU, dst)) {
3145             AMD64MROp.MOVQ.emit(this, QWORD, dst, src);
3146         } else {
3147             throw new InternalError("should not reach here");
3148         }
3149     }
3150 
3151     public final void movdl(Register dst, Register src) {
3152         if (inRC(XMM, dst) && inRC(CPU, src)) {
3153             AMD64RMOp.MOVD.emit(this, DWORD, dst, src);
3154         } else if (inRC(XMM, src) && inRC(CPU, dst)) {
3155             AMD64MROp.MOVD.emit(this, DWORD, dst, src);
3156         } else {
3157             throw new InternalError("should not reach here");
3158         }
3159     }
3160 
3161     public final void movdl(Register dst, AMD64Address src) {
3162         AMD64RMOp.MOVD.emit(this, DWORD, dst, src);
3163     }
3164 
3165     public final void movddup(Register dst, Register src) {
3166         assert supports(CPUFeature.SSE3);
3167         assert inRC(XMM, dst) && inRC(XMM, src);
3168         simdPrefix(dst, Register.None, src, SD, P_0F, false);
3169         emitByte(0x12);
3170         emitModRM(dst, src);
3171     }
3172 
3173     public final void movdqu(Register dst, AMD64Address src) {
3174         assert inRC(XMM, dst);
3175         simdPrefix(dst, Register.None, src, SS, P_0F, false);
3176         emitByte(0x6F);
3177         emitOperandHelper(dst, src, 0);
3178     }
3179 
3180     public final void movdqu(Register dst, Register src) {
3181         assert inRC(XMM, dst) && inRC(XMM, src);
3182         simdPrefix(dst, Register.None, src, SS, P_0F, false);
3183         emitByte(0x6F);
3184         emitModRM(dst, src);
3185     }
3186 
3187     // Insn: VMOVDQU xmm2/m128, xmm1
3188 
3189     public final void movdqu(AMD64Address dst, Register src) {
3190         assert inRC(XMM, src);
3191         // Code: VEX.128.F3.0F.WIG 7F /r
3192         simdPrefix(src, Register.None, dst, SS, P_0F, false);
3193         emitByte(0x7F);
3194         emitOperandHelper(src, dst, 0);
3195     }
3196 
3197     public final void movslq(AMD64Address dst, int imm32) {
3198         prefixq(dst);
3199         emitByte(0xC7);
3200         emitOperandHelper(0, dst, 4);
3201         emitInt(imm32);
3202     }
3203 
3204     public final void movslq(Register dst, AMD64Address src) {
3205         prefixq(src, dst);
3206         emitByte(0x63);
3207         emitOperandHelper(dst, src, 0);
3208     }
3209 
3210     public final void movslq(Register dst, Register src) {
3211         prefixq(dst, src);
3212         emitByte(0x63);
3213         emitModRM(dst, src);
3214     }
3215 
3216     public final void negq(Register dst) {
3217         prefixq(dst);
3218         emitByte(0xF7);
3219         emitModRM(3, dst);
3220     }
3221 
3222     public final void orq(Register dst, Register src) {
3223         OR.rmOp.emit(this, QWORD, dst, src);
3224     }
3225 
3226     public final void shlq(Register dst, int imm8) {
3227         assert isShiftCount(imm8 >> 1) : "illegal shift count";
3228         prefixq(dst);
3229         if (imm8 == 1) {
3230             emitByte(0xD1);
3231             emitModRM(4, dst);
3232         } else {
3233             emitByte(0xC1);
3234             emitModRM(4, dst);
3235             emitByte(imm8);
3236         }
3237     }
3238 
3239     public final void shlq(Register dst) {
3240         // Multiply dst by 2, CL times.
3241         prefixq(dst);
3242         emitByte(0xD3);
3243         emitModRM(4, dst);
3244     }
3245 
3246     public final void shrq(Register dst, int imm8) {
3247         assert isShiftCount(imm8 >> 1) : "illegal shift count";
3248         prefixq(dst);
3249         if (imm8 == 1) {
3250             emitByte(0xD1);
3251             emitModRM(5, dst);
3252         } else {
3253             emitByte(0xC1);
3254             emitModRM(5, dst);
3255             emitByte(imm8);
3256         }
3257     }
3258 
3259     public final void shrq(Register dst) {
        // Unsigned divide dst by 2, CL times.
        prefixq(dst);
        emitByte(0xD3);
        emitModRM(5, dst);
3264     }
3265 
3266     public final void sarq(Register dst, int imm8) {
3267         assert isShiftCount(imm8 >> 1) : "illegal shift count";
3268         prefixq(dst);
3269         if (imm8 == 1) {
3270             emitByte(0xD1);
3271             emitModRM(7, dst);
3272         } else {
3273             emitByte(0xC1);
3274             emitModRM(7, dst);
3275             emitByte(imm8);
3276         }
3277     }
3278 
3279     public final void sbbq(Register dst, Register src) {
3280         SBB.rmOp.emit(this, QWORD, dst, src);
3281     }
3282 
3283     public final void subq(Register dst, int imm32) {
3284         SUB.getMIOpcode(QWORD, isByte(imm32)).emit(this, QWORD, dst, imm32);
3285     }
3286 
3287     public final void subq(AMD64Address dst, int imm32) {
3288         SUB.getMIOpcode(QWORD, isByte(imm32)).emit(this, QWORD, dst, imm32);
3289     }
3290 
3291     public final void subqWide(Register dst, int imm32) {
        // Don't use the sign-extending version; force a 32-bit immediate.
3293         SUB.getMIOpcode(QWORD, false).emit(this, QWORD, dst, imm32);
3294     }
3295 
3296     public final void subq(Register dst, Register src) {
3297         SUB.rmOp.emit(this, QWORD, dst, src);
3298     }
3299 
3300     public final void testq(Register dst, Register src) {
3301         prefixq(dst, src);
3302         emitByte(0x85);
3303         emitModRM(dst, src);
3304     }
3305 
3306     public final void btrq(Register src, int imm8) {
3307         prefixq(src);
3308         emitByte(0x0F);
3309         emitByte(0xBA);
3310         emitModRM(6, src);
3311         emitByte(imm8);
3312     }
3313 
3314     public final void xaddb(AMD64Address dst, Register src) {
3315         prefixb(dst, src);
3316         emitByte(0x0F);
3317         emitByte(0xC0);
3318         emitOperandHelper(src, dst, 0);
3319     }
3320 
3321     public final void xaddw(AMD64Address dst, Register src) {
        emitByte(0x66); // Operand-size prefix: 16-bit operands.
3323         prefix(dst, src);
3324         emitByte(0x0F);
3325         emitByte(0xC1);
3326         emitOperandHelper(src, dst, 0);
3327     }
3328 
3329     public final void xaddl(AMD64Address dst, Register src) {
3330         prefix(dst, src);
3331         emitByte(0x0F);
3332         emitByte(0xC1);
3333         emitOperandHelper(src, dst, 0);
3334     }
3335 
3336     public final void xaddq(AMD64Address dst, Register src) {
3337         prefixq(dst, src);
3338         emitByte(0x0F);
3339         emitByte(0xC1);
3340         emitOperandHelper(src, dst, 0);
3341     }
3342 
3343     public final void xchgb(Register dst, AMD64Address src) {
3344         prefixb(src, dst);
3345         emitByte(0x86);
3346         emitOperandHelper(dst, src, 0);
3347     }
3348 
3349     public final void xchgw(Register dst, AMD64Address src) {
3350         emitByte(0x66);
3351         prefix(src, dst);
3352         emitByte(0x87);
3353         emitOperandHelper(dst, src, 0);
3354     }
3355 
3356     public final void xchgl(Register dst, AMD64Address src) {
3357         prefix(src, dst);
3358         emitByte(0x87);
3359         emitOperandHelper(dst, src, 0);
3360     }
3361 
3362     public final void xchgq(Register dst, AMD64Address src) {
3363         prefixq(src, dst);
3364         emitByte(0x87);
3365         emitOperandHelper(dst, src, 0);
3366     }
3367 
3368     public final void membar(int barriers) {
3369         if (target.isMP) {
3370             // We only have to handle StoreLoad
3371             if ((barriers & STORE_LOAD) != 0) {
                // All usable chips support "locked" instructions, which suffice
                // as barriers and are much faster than the alternative of using
                // the cpuid instruction. We emit a locked add of 0 to [rsp],
                // which is conveniently otherwise a no-op except that it
                // clobbers the flags.
                // Any change to this code may require revisiting other places
                // where this idiom is used, in particular the orderAccess code.
3380                 lock();
3381                 addl(new AMD64Address(AMD64.rsp, 0), 0); // Assert the lock# signal here
3382             }
3383         }
3384     }
3385 
3386     @Override
3387     protected final void patchJumpTarget(int branch, int branchTarget) {
3388         int op = getByte(branch);
3389         assert op == 0xE8 // call
3390                         || op == 0x00 // jump table entry
3391                         || op == 0xE9 // jmp
3392                         || op == 0xEB // short jmp
3393                         || (op & 0xF0) == 0x70 // short jcc
3394                         || op == 0x0F && (getByte(branch + 1) & 0xF0) == 0x80 // jcc
3395         : "Invalid opcode at patch point branch=" + branch + ", branchTarget=" + branchTarget + ", op=" + op;
3396 
3397         if (op == 0x00) {
3398             int offsetToJumpTableBase = getShort(branch + 1);
3399             int jumpTableBase = branch - offsetToJumpTableBase;
3400             int imm32 = branchTarget - jumpTableBase;
3401             emitInt(imm32, branch);
        } else if (op == 0xEB || (op & 0xF0) == 0x70) {
            // Short-offset operators (jmp and jcc).
            final int imm8 = branchTarget - (branch + 2);
            /*
             * Since a wrongly patched short branch can potentially lead to code that works but
             * behaves very badly, we always fail with an exception here instead of an assert.
             */
            GraalError.guarantee(isByte(imm8), "Displacement too large to be encoded as a byte: %d", imm8);
            emitByte(imm8, branch + 1);
        } else {
            int off = 1;
3416             if (op == 0x0F) {
3417                 off = 2;
3418             }
3419 
3420             int imm32 = branchTarget - (branch + 4 + off);
3421             emitInt(imm32, branch + off);
3422         }
3423     }
3424 
3425     public void nullCheck(AMD64Address address) {
3426         testl(AMD64.rax, address);
3427     }
3428 
3429     @Override
3430     public void align(int modulus) {
3431         if (position() % modulus != 0) {
3432             nop(modulus - (position() % modulus));
3433         }
3434     }
3435 
3436     /**
3437      * Emits a direct call instruction. Note that the actual call target is not specified, because
     * all calls need patching anyway. Therefore, 0 is emitted as the call target, and the user is
     * responsible for adding the call address to the appropriate patching tables.
3440      */
3441     public final void call() {
3442         annotatePatchingImmediate(1, 4);
3443         emitByte(0xE8);
3444         emitInt(0);
3445     }
3446 
3447     public final void call(Register src) {
3448         prefix(src);
3449         emitByte(0xFF);
3450         emitModRM(2, src);
3451     }
3452 
3453     public final void int3() {
3454         emitByte(0xCC);
3455     }
3456 
3457     public final void pause() {
3458         emitByte(0xF3);
3459         emitByte(0x90);
3460     }
3461 
3462     private void emitx87(int b1, int b2, int i) {
3463         assert 0 <= i && i < 8 : "illegal stack offset";
3464         emitByte(b1);
3465         emitByte(b2 + i);
3466     }
3467 
3468     public final void fldd(AMD64Address src) {
3469         emitByte(0xDD);
3470         emitOperandHelper(0, src, 0);
3471     }
3472 
3473     public final void flds(AMD64Address src) {
3474         emitByte(0xD9);
3475         emitOperandHelper(0, src, 0);
3476     }
3477 
3478     public final void fldln2() {
3479         emitByte(0xD9);
3480         emitByte(0xED);
3481     }
3482 
3483     public final void fldlg2() {
3484         emitByte(0xD9);
3485         emitByte(0xEC);
3486     }
3487 
3488     public final void fyl2x() {
3489         emitByte(0xD9);
3490         emitByte(0xF1);
3491     }
3492 
3493     public final void fstps(AMD64Address src) {
3494         emitByte(0xD9);
3495         emitOperandHelper(3, src, 0);
3496     }
3497 
3498     public final void fstpd(AMD64Address src) {
3499         emitByte(0xDD);
3500         emitOperandHelper(3, src, 0);
3501     }
3502 
3503     private void emitFPUArith(int b1, int b2, int i) {
3504         assert 0 <= i && i < 8 : "illegal FPU register: " + i;
3505         emitByte(b1);
3506         emitByte(b2 + i);
3507     }
3508 
3509     public void ffree(int i) {
3510         emitFPUArith(0xDD, 0xC0, i);
3511     }
3512 
3513     public void fincstp() {
3514         emitByte(0xD9);
3515         emitByte(0xF7);
3516     }
3517 
3518     public void fxch(int i) {
3519         emitFPUArith(0xD9, 0xC8, i);
3520     }
3521 
3522     public void fnstswAX() {
3523         emitByte(0xDF);
3524         emitByte(0xE0);
3525     }
3526 
3527     public void fwait() {
3528         emitByte(0x9B);
3529     }
3530 
3531     public void fprem() {
3532         emitByte(0xD9);
3533         emitByte(0xF8);
3534     }
3535 
3536     public final void fsin() {
3537         emitByte(0xD9);
3538         emitByte(0xFE);
3539     }
3540 
3541     public final void fcos() {
3542         emitByte(0xD9);
3543         emitByte(0xFF);
3544     }
3545 
3546     public final void fptan() {
3547         emitByte(0xD9);
3548         emitByte(0xF2);
3549     }
3550 
3551     public final void fstp(int i) {
3552         emitx87(0xDD, 0xD8, i);
3553     }
3554 
3555     @Override
3556     public AMD64Address makeAddress(Register base, int displacement) {
3557         return new AMD64Address(base, displacement);
3558     }
3559 
3560     @Override
3561     public AMD64Address getPlaceholder(int instructionStartPosition) {
3562         return new AMD64Address(AMD64.rip, Register.None, Scale.Times1, 0, instructionStartPosition);
3563     }
3564 
3565     private void prefetchPrefix(AMD64Address src) {
3566         prefix(src);
3567         emitByte(0x0F);
3568     }
3569 
3570     public void prefetchnta(AMD64Address src) {
3571         prefetchPrefix(src);
3572         emitByte(0x18);
3573         emitOperandHelper(0, src, 0);
3574     }
3575 
3576     void prefetchr(AMD64Address src) {
3577         assert supports(CPUFeature.AMD_3DNOW_PREFETCH);
3578         prefetchPrefix(src);
3579         emitByte(0x0D);
3580         emitOperandHelper(0, src, 0);
3581     }
3582 
3583     public void prefetcht0(AMD64Address src) {
3584         assert supports(CPUFeature.SSE);
3585         prefetchPrefix(src);
3586         emitByte(0x18);
3587         emitOperandHelper(1, src, 0);
3588     }
3589 
3590     public void prefetcht1(AMD64Address src) {
3591         assert supports(CPUFeature.SSE);
3592         prefetchPrefix(src);
3593         emitByte(0x18);
3594         emitOperandHelper(2, src, 0);
3595     }
3596 
3597     public void prefetcht2(AMD64Address src) {
3598         assert supports(CPUFeature.SSE);
3599         prefix(src);
3600         emitByte(0x0f);
3601         emitByte(0x18);
3602         emitOperandHelper(3, src, 0);
3603     }
3604 
3605     public void prefetchw(AMD64Address src) {
3606         assert supports(CPUFeature.AMD_3DNOW_PREFETCH);
3607         prefix(src);
3608         emitByte(0x0f);
3609         emitByte(0x0D);
3610         emitOperandHelper(1, src, 0);
3611     }
3612 
3613     public void rdtsc() {
3614         emitByte(0x0F);
3615         emitByte(0x31);
3616     }
3617 
3618     /**
     * Emits an instruction that is considered illegal. This is used when we deliberately want to
     * crash the program (e.g. for debugging).
3621      */
3622     public void illegal() {
3623         emitByte(0x0f);
3624         emitByte(0x0b);
3625     }
3626 
3627     public void lfence() {
3628         emitByte(0x0f);
3629         emitByte(0xae);
3630         emitByte(0xe8);
3631     }
3632 
3633     public final void vptest(Register dst, Register src) {
3634         VexRMOp.VPTEST.emit(this, AVXSize.YMM, dst, src);
3635     }
3636 
3637     public final void vpxor(Register dst, Register nds, Register src) {
3638         VexRVMOp.VPXOR.emit(this, AVXSize.YMM, dst, nds, src);
3639     }
3640 
3641     public final void vpxor(Register dst, Register nds, AMD64Address src) {
3642         VexRVMOp.VPXOR.emit(this, AVXSize.YMM, dst, nds, src);
3643     }
3644 
3645     public final void vmovdqu(Register dst, AMD64Address src) {
3646         VexMoveOp.VMOVDQU.emit(this, AVXSize.YMM, dst, src);
3647     }
3648 
3649     public final void vmovdqu(AMD64Address dst, Register src) {
3650         assert inRC(XMM, src);
3651         VexMoveOp.VMOVDQU.emit(this, AVXSize.YMM, dst, src);
3652     }
3653 
3654     public final void vpmovzxbw(Register dst, AMD64Address src) {
3655         assert supports(CPUFeature.AVX2);
3656         VexRMOp.VPMOVZXBW.emit(this, AVXSize.YMM, dst, src);
3657     }
3658 
3659     public final void vzeroupper() {
3660         emitVEX(L128, P_, M_0F, W0, 0, 0, true);
3661         emitByte(0x77);
3662     }
3663 
3664     // Insn: KORTESTD k1, k2
3665 
    // This instruction sets the ZF and CF flags
3667     public final void kortestd(Register src1, Register src2) {
3668         assert supports(CPUFeature.AVX512BW);
3669         assert inRC(MASK, src1) && inRC(MASK, src2);
3670         // Code: VEX.L0.66.0F.W1 98 /r
3671         vexPrefix(src1, Register.None, src2, AVXSize.XMM, P_66, M_0F, W1, true);
3672         emitByte(0x98);
3673         emitModRM(src1, src2);
3674     }
3675 
3676     // Insn: KORTESTQ k1, k2
3677 
    // This instruction sets the ZF and CF flags
3679     public final void kortestq(Register src1, Register src2) {
3680         assert supports(CPUFeature.AVX512BW);
3681         assert inRC(MASK, src1) && inRC(MASK, src2);
3682         // Code: VEX.L0.0F.W1 98 /r
3683         vexPrefix(src1, Register.None, src2, AVXSize.XMM, P_, M_0F, W1, true);
3684         emitByte(0x98);
3685         emitModRM(src1, src2);
3686     }
3687 
3688     public final void kmovd(Register dst, Register src) {
3689         assert supports(CPUFeature.AVX512BW);
3690         assert inRC(MASK, dst) || inRC(CPU, dst);
3691         assert inRC(MASK, src) || inRC(CPU, src);
3692         assert !(inRC(CPU, dst) && inRC(CPU, src));
3693 
3694         if (inRC(MASK, dst)) {
3695             if (inRC(MASK, src)) {
3696                 // kmovd(KRegister dst, KRegister src):
3697                 // Insn: KMOVD k1, k2/m32
3698                 // Code: VEX.L0.66.0F.W1 90 /r
3699                 vexPrefix(dst, Register.None, src, AVXSize.XMM, P_66, M_0F, W1, true);
3700                 emitByte(0x90);
3701                 emitModRM(dst, src);
3702             } else {
3703                 // kmovd(KRegister dst, Register src)
3704                 // Insn: KMOVD k1, r32
3705                 // Code: VEX.L0.F2.0F.W0 92 /r
3706                 vexPrefix(dst, Register.None, src, AVXSize.XMM, P_F2, M_0F, W0, true);
3707                 emitByte(0x92);
3708                 emitModRM(dst, src);
3709             }
3710         } else {
3711             if (inRC(MASK, src)) {
3712                 // kmovd(Register dst, KRegister src)
3713                 // Insn: KMOVD r32, k1
3714                 // Code: VEX.L0.F2.0F.W0 93 /r
3715                 vexPrefix(dst, Register.None, src, AVXSize.XMM, P_F2, M_0F, W0, true);
3716                 emitByte(0x93);
3717                 emitModRM(dst, src);
3718             } else {
3719                 throw GraalError.shouldNotReachHere();
3720             }
3721         }
3722     }
3723 
3724     public final void kmovq(Register dst, Register src) {
3725         assert supports(CPUFeature.AVX512BW);
3726         assert inRC(MASK, dst) || inRC(CPU, dst);
3727         assert inRC(MASK, src) || inRC(CPU, src);
3728         assert !(inRC(CPU, dst) && inRC(CPU, src));
3729 
3730         if (inRC(MASK, dst)) {
3731             if (inRC(MASK, src)) {
3732                 // kmovq(KRegister dst, KRegister src):
3733                 // Insn: KMOVQ k1, k2/m64
3734                 // Code: VEX.L0.0F.W1 90 /r
3735                 vexPrefix(dst, Register.None, src, AVXSize.XMM, P_, M_0F, W1, true);
3736                 emitByte(0x90);
3737                 emitModRM(dst, src);
3738             } else {
3739                 // kmovq(KRegister dst, Register src)
3740                 // Insn: KMOVQ k1, r64
3741                 // Code: VEX.L0.F2.0F.W1 92 /r
3742                 vexPrefix(dst, Register.None, src, AVXSize.XMM, P_F2, M_0F, W1, true);
3743                 emitByte(0x92);
3744                 emitModRM(dst, src);
3745             }
3746         } else {
3747             if (inRC(MASK, src)) {
3748                 // kmovq(Register dst, KRegister src)
3749                 // Insn: KMOVQ r64, k1
3750                 // Code: VEX.L0.F2.0F.W1 93 /r
3751                 vexPrefix(dst, Register.None, src, AVXSize.XMM, P_F2, M_0F, W1, true);
3752                 emitByte(0x93);
3753                 emitModRM(dst, src);
3754             } else {
3755                 throw GraalError.shouldNotReachHere();
3756             }
3757         }
3758     }
3759 
3760     // Insn: KTESTD k1, k2
3761 
3762     public final void ktestd(Register src1, Register src2) {
3763         assert supports(CPUFeature.AVX512BW);
3764         assert inRC(MASK, src1) && inRC(MASK, src2);
3765         // Code: VEX.L0.66.0F.W1 99 /r
3766         vexPrefix(src1, Register.None, src2, AVXSize.XMM, P_66, M_0F, W1, true);
3767         emitByte(0x99);
3768         emitModRM(src1, src2);
3769     }
3770 
3771     public final void evmovdqu64(Register dst, AMD64Address src) {
3772         assert supports(CPUFeature.AVX512F);
3773         assert inRC(XMM, dst);
3774         evexPrefix(dst, Register.None, Register.None, src, AVXSize.ZMM, P_F3, M_0F, W1, Z0, B0);
3775         emitByte(0x6F);
3776         emitEVEXOperandHelper(dst, src, 0, EVEXTuple.FVM.getDisp8ScalingFactor(AVXSize.ZMM));
3777     }
3778 
3779     // Insn: VPMOVZXBW zmm1, m256
3780 
3781     public final void evpmovzxbw(Register dst, AMD64Address src) {
3782         assert supports(CPUFeature.AVX512BW);
3783         assert inRC(XMM, dst);
3784         // Code: EVEX.512.66.0F38.WIG 30 /r
3785         evexPrefix(dst, Register.None, Register.None, src, AVXSize.ZMM, P_66, M_0F38, WIG, Z0, B0);
3786         emitByte(0x30);
3787         emitEVEXOperandHelper(dst, src, 0, EVEXTuple.HVM.getDisp8ScalingFactor(AVXSize.ZMM));
3788     }
3789 
3790     public final void evpcmpeqb(Register kdst, Register nds, AMD64Address src) {
3791         assert supports(CPUFeature.AVX512BW);
3792         assert inRC(MASK, kdst) && inRC(XMM, nds);
3793         evexPrefix(kdst, Register.None, nds, src, AVXSize.ZMM, P_66, M_0F, WIG, Z0, B0);
3794         emitByte(0x74);
3795         emitEVEXOperandHelper(kdst, src, 0, EVEXTuple.FVM.getDisp8ScalingFactor(AVXSize.ZMM));
3796     }
3797 
3798     // Insn: VMOVDQU16 zmm1 {k1}{z}, zmm2/m512
3799     // -----
3800     // Insn: VMOVDQU16 zmm1, m512
3801 
3802     public final void evmovdqu16(Register dst, AMD64Address src) {
3803         assert supports(CPUFeature.AVX512BW);
3804         assert inRC(XMM, dst);
3805         // Code: EVEX.512.F2.0F.W1 6F /r
3806         evexPrefix(dst, Register.None, Register.None, src, AVXSize.ZMM, P_F2, M_0F, W1, Z0, B0);
3807         emitByte(0x6F);
3808         emitEVEXOperandHelper(dst, src, 0, EVEXTuple.FVM.getDisp8ScalingFactor(AVXSize.ZMM));
3809     }
3810 
3811     // Insn: VMOVDQU16 zmm1, k1:z, m512
3812 
3813     public final void evmovdqu16(Register dst, Register mask, AMD64Address src) {
3814         assert supports(CPUFeature.AVX512BW);
3815         assert inRC(XMM, dst) && inRC(MASK, mask);
3816         // Code: EVEX.512.F2.0F.W1 6F /r
3817         evexPrefix(dst, mask, Register.None, src, AVXSize.ZMM, P_F2, M_0F, W1, Z1, B0);
3818         emitByte(0x6F);
3819         emitEVEXOperandHelper(dst, src, 0, EVEXTuple.FVM.getDisp8ScalingFactor(AVXSize.ZMM));
3820     }
3821 
3822     // Insn: VMOVDQU16 zmm2/m512 {k1}{z}, zmm1
3823     // -----
3824     // Insn: VMOVDQU16 m512, zmm1
3825 
3826     public final void evmovdqu16(AMD64Address dst, Register src) {
3827         assert supports(CPUFeature.AVX512BW);
3828         assert inRC(XMM, src);
3829         // Code: EVEX.512.F2.0F.W1 7F /r
3830         evexPrefix(src, Register.None, Register.None, dst, AVXSize.ZMM, P_F2, M_0F, W1, Z0, B0);
3831         emitByte(0x7F);
3832         emitEVEXOperandHelper(src, dst, 0, EVEXTuple.FVM.getDisp8ScalingFactor(AVXSize.ZMM));
3833     }
3834 
3835     // Insn: VMOVDQU16 m512, k1, zmm1
3836 
3837     public final void evmovdqu16(AMD64Address dst, Register mask, Register src) {
3838         assert supports(CPUFeature.AVX512BW);
3839         assert inRC(MASK, mask) && inRC(XMM, src);
3840         // Code: EVEX.512.F2.0F.W1 7F /r
3841         evexPrefix(src, mask, Register.None, dst, AVXSize.ZMM, P_F2, M_0F, W1, Z0, B0);
3842         emitByte(0x7F);
3843         emitEVEXOperandHelper(src, dst, 0, EVEXTuple.FVM.getDisp8ScalingFactor(AVXSize.ZMM));
3844     }
3845 
3846     // Insn: VPBROADCASTW zmm1 {k1}{z}, reg
3847     // -----
3848     // Insn: VPBROADCASTW zmm1, reg
3849 
3850     public final void evpbroadcastw(Register dst, Register src) {
3851         assert supports(CPUFeature.AVX512BW);
3852         assert inRC(XMM, dst) && inRC(CPU, src);
3853         // Code: EVEX.512.66.0F38.W0 7B /r
3854         evexPrefix(dst, Register.None, Register.None, src, AVXSize.ZMM, P_66, M_0F38, W0, Z0, B0);
3855         emitByte(0x7B);
3856         emitModRM(dst, src);
3857     }
3858 
3859     // Insn: VPCMPUW k1 {k2}, zmm2, zmm3/m512, imm8
3860     // -----
3861     // Insn: VPCMPUW k1, zmm2, zmm3, imm8
3862 
3863     public final void evpcmpuw(Register kdst, Register nds, Register src, int vcc) {
3864         assert supports(CPUFeature.AVX512BW);
3865         assert inRC(MASK, kdst) && inRC(XMM, nds) && inRC(XMM, src);
3866         // Code: EVEX.NDS.512.66.0F3A.W1 3E /r ib
3867         evexPrefix(kdst, Register.None, nds, src, AVXSize.ZMM, P_66, M_0F3A, W1, Z0, B0);
3868         emitByte(0x3E);
3869         emitModRM(kdst, src);
3870         emitByte(vcc);
3871     }
3872 
3873     // Insn: VPCMPUW k1 {k2}, zmm2, zmm3/m512, imm8
3874     // -----
3875     // Insn: VPCMPUW k1, k2, zmm2, zmm3, imm8
3876 
3877     public final void evpcmpuw(Register kdst, Register mask, Register nds, Register src, int vcc) {
3878         assert supports(CPUFeature.AVX512BW);
3879         assert inRC(MASK, kdst) && inRC(MASK, mask);
3880         assert inRC(XMM, nds) && inRC(XMM, src);
3881         // Code: EVEX.NDS.512.66.0F3A.W1 3E /r ib
3882         evexPrefix(kdst, mask, nds, src, AVXSize.ZMM, P_66, M_0F3A, W1, Z0, B0);
3883         emitByte(0x3E);
3884         emitModRM(kdst, src);
3885         emitByte(vcc);
3886     }
3887 
3888     // Insn: VPMOVWB ymm1/m256 {k1}{z}, zmm2
3889     // -----
3890     // Insn: VPMOVWB m256, zmm2
3891 
3892     public final void evpmovwb(AMD64Address dst, Register src) {
3893         assert supports(CPUFeature.AVX512BW);
3894         assert inRC(XMM, src);
3895         // Code: EVEX.512.F3.0F38.W0 30 /r
3896         evexPrefix(src, Register.None, Register.None, dst, AVXSize.ZMM, P_F3, M_0F38, W0, Z0, B0);
3897         emitByte(0x30);
3898         emitEVEXOperandHelper(src, dst, 0, EVEXTuple.HVM.getDisp8ScalingFactor(AVXSize.ZMM));
3899     }
3900 
3901     // Insn: VPMOVWB m256, k1, zmm2
3902 
3903     public final void evpmovwb(AMD64Address dst, Register mask, Register src) {
3904         assert supports(CPUFeature.AVX512BW);
3905         assert inRC(MASK, mask) && inRC(XMM, src);
3906         // Code: EVEX.512.F3.0F38.W0 30 /r
3907         evexPrefix(src, mask, Register.None, dst, AVXSize.ZMM, P_F3, M_0F38, W0, Z0, B0);
3908         emitByte(0x30);
3909         emitEVEXOperandHelper(src, dst, 0, EVEXTuple.HVM.getDisp8ScalingFactor(AVXSize.ZMM));
3910     }
3911 
3912     // Insn: VPMOVZXBW zmm1 {k1}{z}, ymm2/m256
3913     // -----
3914     // Insn: VPMOVZXBW zmm1, k1, m256
3915 
3916     public final void evpmovzxbw(Register dst, Register mask, AMD64Address src) {
3917         assert supports(CPUFeature.AVX512BW);
3918         assert inRC(MASK, mask) && inRC(XMM, dst);
3919         // Code: EVEX.512.66.0F38.WIG 30 /r
3920         evexPrefix(dst, mask, Register.None, src, AVXSize.ZMM, P_66, M_0F38, WIG, Z0, B0);
3921         emitByte(0x30);
3922         emitEVEXOperandHelper(dst, src, 0, EVEXTuple.HVM.getDisp8ScalingFactor(AVXSize.ZMM));
3923     }
3924 
3925 }