1 /*
   2  * Copyright (c) 2009, 2019, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  */
  23 
  24 
  25 package org.graalvm.compiler.asm.amd64;
  26 
  27 import static jdk.vm.ci.amd64.AMD64.CPU;
  28 import static jdk.vm.ci.amd64.AMD64.MASK;
  29 import static jdk.vm.ci.amd64.AMD64.XMM;
  30 import static jdk.vm.ci.code.MemoryBarriers.STORE_LOAD;
  31 import static org.graalvm.compiler.asm.amd64.AMD64AsmOptions.UseAddressNop;
  32 import static org.graalvm.compiler.asm.amd64.AMD64AsmOptions.UseNormalNop;
  33 import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.ADD;
  34 import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.AND;
  35 import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.CMP;
  36 import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.OR;
  37 import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.SBB;
  38 import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.SUB;
  39 import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.XOR;
  40 import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64MOp.DEC;
  41 import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64MOp.INC;
  42 import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64MOp.NEG;
  43 import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64MOp.NOT;
  44 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.EVEXPrefixConfig.B0;
  45 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.EVEXPrefixConfig.Z0;
  46 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.EVEXPrefixConfig.Z1;
  47 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.BYTE;
  48 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.DWORD;
  49 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.PD;
  50 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.PS;
  51 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.QWORD;
  52 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.SD;
  53 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.SS;
  54 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.WORD;
  55 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.L128;
  56 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.L256;
  57 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.LZ;
  58 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.M_0F;
  59 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.M_0F38;
  60 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.M_0F3A;
  61 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.P_;
  62 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.P_66;
  63 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.P_F2;
  64 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.P_F3;
  65 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.W0;
  66 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.W1;
  67 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.WIG;
  68 import static org.graalvm.compiler.core.common.NumUtil.isByte;
  69 import static org.graalvm.compiler.core.common.NumUtil.isInt;
  70 import static org.graalvm.compiler.core.common.NumUtil.isShiftCount;
  71 import static org.graalvm.compiler.core.common.NumUtil.isUByte;
  72 
  73 import java.util.EnumSet;
  74 
  75 import org.graalvm.compiler.asm.Label;
  76 import org.graalvm.compiler.asm.amd64.AMD64Address.Scale;
  77 import org.graalvm.compiler.asm.amd64.AVXKind.AVXSize;
  78 import org.graalvm.compiler.core.common.calc.Condition;
  79 import org.graalvm.compiler.debug.GraalError;
  80 
  81 import jdk.vm.ci.amd64.AMD64;
  82 import jdk.vm.ci.amd64.AMD64.CPUFeature;
  83 import jdk.vm.ci.code.Register;
  84 import jdk.vm.ci.code.Register.RegisterCategory;
  85 import jdk.vm.ci.code.TargetDescription;
  86 
  87 /**
  88  * This class implements an assembler that can encode most X86 instructions.
  89  */
  90 public class AMD64Assembler extends AMD64BaseAssembler {
  91 
  92     /**
  93      * Constructs an assembler for the AMD64 architecture.
  94      */
  95     public AMD64Assembler(TargetDescription target) {
  96         super(target);
  97     }
  98 
  99     /**
 100      * The x86 condition codes used for conditional jumps/moves.
 101      */
 102     public enum ConditionFlag {
 103         Zero(0x4, "|zero|"),
 104         NotZero(0x5, "|nzero|"),
 105         Equal(0x4, "="),
 106         NotEqual(0x5, "!="),
 107         Less(0xc, "<"),
 108         LessEqual(0xe, "<="),
 109         Greater(0xf, ">"),
 110         GreaterEqual(0xd, ">="),
 111         Below(0x2, "|<|"),
 112         BelowEqual(0x6, "|<=|"),
 113         Above(0x7, "|>|"),
 114         AboveEqual(0x3, "|>=|"),
 115         Overflow(0x0, "|of|"),
 116         NoOverflow(0x1, "|nof|"),
 117         CarrySet(0x2, "|carry|"),
 118         CarryClear(0x3, "|ncarry|"),
 119         Negative(0x8, "|neg|"),
 120         Positive(0x9, "|pos|"),
 121         Parity(0xa, "|par|"),
 122         NoParity(0xb, "|npar|");
 123 
 124         private final int value;
 125         private final String operator;
 126 
 127         ConditionFlag(int value, String operator) {
 128             this.value = value;
 129             this.operator = operator;
 130         }
 131 
 132         public ConditionFlag negate() {
 133             switch (this) {
 134                 case Zero:
 135                     return NotZero;
 136                 case NotZero:
 137                     return Zero;
 138                 case Equal:
 139                     return NotEqual;
 140                 case NotEqual:
 141                     return Equal;
 142                 case Less:
 143                     return GreaterEqual;
 144                 case LessEqual:
 145                     return Greater;
 146                 case Greater:
 147                     return LessEqual;
 148                 case GreaterEqual:
 149                     return Less;
 150                 case Below:
 151                     return AboveEqual;
 152                 case BelowEqual:
 153                     return Above;
 154                 case Above:
 155                     return BelowEqual;
 156                 case AboveEqual:
 157                     return Below;
 158                 case Overflow:
 159                     return NoOverflow;
 160                 case NoOverflow:
 161                     return Overflow;
 162                 case CarrySet:
 163                     return CarryClear;
 164                 case CarryClear:
 165                     return CarrySet;
 166                 case Negative:
 167                     return Positive;
 168                 case Positive:
 169                     return Negative;
 170                 case Parity:
 171                     return NoParity;
 172                 case NoParity:
 173                     return Parity;
 174             }
 175             throw new IllegalArgumentException();
 176         }
 177 
 178         public int getValue() {
 179             return value;
 180         }
 181 
 182         @Override
 183         public String toString() {
 184             return operator;
 185         }
 186     }
 187 
 188     /**
 189      * Operand size and register type constraints.
 190      */
 191     private enum OpAssertion {
 192         ByteAssertion(CPU, CPU, BYTE),
 193         ByteOrLargerAssertion(CPU, CPU, BYTE, WORD, DWORD, QWORD),
 194         WordOrLargerAssertion(CPU, CPU, WORD, DWORD, QWORD),
 195         DwordOrLargerAssertion(CPU, CPU, DWORD, QWORD),
 196         WordOrDwordAssertion(CPU, CPU, WORD, QWORD),
 197         QwordAssertion(CPU, CPU, QWORD),
 198         FloatAssertion(XMM, XMM, SS, SD, PS, PD),
 199         PackedFloatAssertion(XMM, XMM, PS, PD),
 200         SingleAssertion(XMM, XMM, SS),
 201         DoubleAssertion(XMM, XMM, SD),
 202         PackedDoubleAssertion(XMM, XMM, PD),
 203         IntToFloatAssertion(XMM, CPU, DWORD, QWORD),
 204         FloatToIntAssertion(CPU, XMM, DWORD, QWORD);
 205 
 206         private final RegisterCategory resultCategory;
 207         private final RegisterCategory inputCategory;
 208         private final OperandSize[] allowedSizes;
 209 
 210         OpAssertion(RegisterCategory resultCategory, RegisterCategory inputCategory, OperandSize... allowedSizes) {
 211             this.resultCategory = resultCategory;
 212             this.inputCategory = inputCategory;
 213             this.allowedSizes = allowedSizes;
 214         }
 215 
 216         protected boolean checkOperands(AMD64Op op, OperandSize size, Register resultReg, Register inputReg) {
 217             assert resultReg == null || resultCategory.equals(resultReg.getRegisterCategory()) : "invalid result register " + resultReg + " used in " + op;
 218             assert inputReg == null || inputCategory.equals(inputReg.getRegisterCategory()) : "invalid input register " + inputReg + " used in " + op;
 219 
 220             for (OperandSize s : allowedSizes) {
 221                 if (size == s) {
 222                     return true;
 223                 }
 224             }
 225 
 226             assert false : "invalid operand size " + size + " used in " + op;
 227             return false;
 228         }
 229 
 230     }
 231 
 232     protected static final int P_0F = 0x0F;
 233     protected static final int P_0F38 = 0x380F;
 234     protected static final int P_0F3A = 0x3A0F;
 235 
 236     /**
 237      * Base class for AMD64 opcodes.
 238      */
 239     public static class AMD64Op {
 240 
 241         private final String opcode;
 242 
 243         protected final int prefix1;
 244         protected final int prefix2;
 245         protected final int op;
 246 
 247         private final boolean dstIsByte;
 248         private final boolean srcIsByte;
 249 
 250         private final OpAssertion assertion;
 251         private final CPUFeature feature;
 252 
 253         protected AMD64Op(String opcode, int prefix1, int prefix2, int op, OpAssertion assertion, CPUFeature feature) {
 254             this(opcode, prefix1, prefix2, op, assertion == OpAssertion.ByteAssertion, assertion == OpAssertion.ByteAssertion, assertion, feature);
 255         }
 256 
 257         protected AMD64Op(String opcode, int prefix1, int prefix2, int op, boolean dstIsByte, boolean srcIsByte, OpAssertion assertion, CPUFeature feature) {
 258             this.opcode = opcode;
 259             this.prefix1 = prefix1;
 260             this.prefix2 = prefix2;
 261             this.op = op;
 262 
 263             this.dstIsByte = dstIsByte;
 264             this.srcIsByte = srcIsByte;
 265 
 266             this.assertion = assertion;
 267             this.feature = feature;
 268         }
 269 
 270         protected final void emitOpcode(AMD64Assembler asm, OperandSize size, int rxb, int dstEnc, int srcEnc) {
 271             if (prefix1 != 0) {
 272                 asm.emitByte(prefix1);
 273             }
 274             if (size.getSizePrefix() != 0) {
 275                 asm.emitByte(size.getSizePrefix());
 276             }
 277             int rexPrefix = 0x40 | rxb;
 278             if (size == QWORD) {
 279                 rexPrefix |= 0x08;
 280             }
 281             if (rexPrefix != 0x40 || (dstIsByte && dstEnc >= 4) || (srcIsByte && srcEnc >= 4)) {
 282                 asm.emitByte(rexPrefix);
 283             }
 284             if (prefix2 > 0xFF) {
 285                 asm.emitShort(prefix2);
 286             } else if (prefix2 > 0) {
 287                 asm.emitByte(prefix2);
 288             }
 289             asm.emitByte(op);
 290         }
 291 
 292         protected final boolean verify(AMD64Assembler asm, OperandSize size, Register resultReg, Register inputReg) {
 293             assert feature == null || asm.supports(feature) : String.format("unsupported feature %s required for %s", feature, opcode);
 294             assert assertion.checkOperands(this, size, resultReg, inputReg);
 295             return true;
 296         }
 297 
 298         public OperandSize[] getAllowedSizes() {
 299             return assertion.allowedSizes;
 300         }
 301 
 302         protected final boolean isSSEInstruction() {
 303             if (feature == null) {
 304                 return false;
 305             }
 306             switch (feature) {
 307                 case SSE:
 308                 case SSE2:
 309                 case SSE3:
 310                 case SSSE3:
 311                 case SSE4A:
 312                 case SSE4_1:
 313                 case SSE4_2:
 314                     return true;
 315                 default:
 316                     return false;
 317             }
 318         }
 319 
 320         public final OpAssertion getAssertion() {
 321             return assertion;
 322         }
 323 
 324         @Override
 325         public String toString() {
 326             return opcode;
 327         }
 328     }
 329 
 330     /**
 331      * Base class for AMD64 opcodes with immediate operands.
 332      */
 333     public static class AMD64ImmOp extends AMD64Op {
 334 
 335         private final boolean immIsByte;
 336 
 337         protected AMD64ImmOp(String opcode, boolean immIsByte, int prefix, int op, OpAssertion assertion) {
 338             this(opcode, immIsByte, prefix, op, assertion, null);
 339         }
 340 
 341         protected AMD64ImmOp(String opcode, boolean immIsByte, int prefix, int op, OpAssertion assertion, CPUFeature feature) {
 342             super(opcode, 0, prefix, op, assertion, feature);
 343             this.immIsByte = immIsByte;
 344         }
 345 
 346         protected final void emitImmediate(AMD64Assembler asm, OperandSize size, int imm) {
 347             if (immIsByte) {
 348                 assert imm == (byte) imm;
 349                 asm.emitByte(imm);
 350             } else {
 351                 size.emitImmediate(asm, imm);
 352             }
 353         }
 354 
 355         protected final int immediateSize(OperandSize size) {
 356             if (immIsByte) {
 357                 return 1;
 358             } else {
 359                 return size.getBytes();
 360             }
 361         }
 362     }
 363 
 364     /**
 365      * Opcode with operand order of either RM or MR for 2 address forms.
 366      */
 367     public abstract static class AMD64RROp extends AMD64Op {
 368 
 369         protected AMD64RROp(String opcode, int prefix1, int prefix2, int op, OpAssertion assertion, CPUFeature feature) {
 370             super(opcode, prefix1, prefix2, op, assertion, feature);
 371         }
 372 
 373         protected AMD64RROp(String opcode, int prefix1, int prefix2, int op, boolean dstIsByte, boolean srcIsByte, OpAssertion assertion, CPUFeature feature) {
 374             super(opcode, prefix1, prefix2, op, dstIsByte, srcIsByte, assertion, feature);
 375         }
 376 
 377         public abstract void emit(AMD64Assembler asm, OperandSize size, Register dst, Register src);
 378     }
 379 
 380     /**
 381      * Opcode with operand order of RM.
 382      */
 383     public static class AMD64RMOp extends AMD64RROp {
 384         // @formatter:off
 385         public static final AMD64RMOp IMUL   = new AMD64RMOp("IMUL",         P_0F, 0xAF, OpAssertion.ByteOrLargerAssertion);
 386         public static final AMD64RMOp BSF    = new AMD64RMOp("BSF",          P_0F, 0xBC);
 387         public static final AMD64RMOp BSR    = new AMD64RMOp("BSR",          P_0F, 0xBD);
 388         // POPCNT, TZCNT, and LZCNT support word operation. However, the legacy size prefix should
 389         // be emitted before the mandatory prefix 0xF3. Since we are not emitting bit count for
 390         // 16-bit operands, here we simply use DwordOrLargerAssertion.
 391         public static final AMD64RMOp POPCNT = new AMD64RMOp("POPCNT", 0xF3, P_0F, 0xB8, OpAssertion.DwordOrLargerAssertion, CPUFeature.POPCNT);
 392         public static final AMD64RMOp TZCNT  = new AMD64RMOp("TZCNT",  0xF3, P_0F, 0xBC, OpAssertion.DwordOrLargerAssertion, CPUFeature.BMI1);
 393         public static final AMD64RMOp LZCNT  = new AMD64RMOp("LZCNT",  0xF3, P_0F, 0xBD, OpAssertion.DwordOrLargerAssertion, CPUFeature.LZCNT);
 394         public static final AMD64RMOp MOVZXB = new AMD64RMOp("MOVZXB",       P_0F, 0xB6, false, true, OpAssertion.WordOrLargerAssertion);
 395         public static final AMD64RMOp MOVZX  = new AMD64RMOp("MOVZX",        P_0F, 0xB7, OpAssertion.DwordOrLargerAssertion);
 396         public static final AMD64RMOp MOVSXB = new AMD64RMOp("MOVSXB",       P_0F, 0xBE, false, true, OpAssertion.WordOrLargerAssertion);
 397         public static final AMD64RMOp MOVSX  = new AMD64RMOp("MOVSX",        P_0F, 0xBF, OpAssertion.DwordOrLargerAssertion);
 398         public static final AMD64RMOp MOVSXD = new AMD64RMOp("MOVSXD",             0x63, OpAssertion.QwordAssertion);
 399         public static final AMD64RMOp MOVB   = new AMD64RMOp("MOVB",               0x8A, OpAssertion.ByteAssertion);
 400         public static final AMD64RMOp MOV    = new AMD64RMOp("MOV",                0x8B);
 401         public static final AMD64RMOp CMP    = new AMD64RMOp("CMP",                0x3B);
 402 
 403         // MOVD/MOVQ and MOVSS/MOVSD are the same opcode, just with different operand size prefix
 404         public static final AMD64RMOp MOVD   = new AMD64RMOp("MOVD",   0x66, P_0F, 0x6E, OpAssertion.IntToFloatAssertion, CPUFeature.SSE2);
 405         public static final AMD64RMOp MOVQ   = new AMD64RMOp("MOVQ",   0x66, P_0F, 0x6E, OpAssertion.IntToFloatAssertion, CPUFeature.SSE2);
 406         public static final AMD64RMOp MOVSS  = new AMD64RMOp("MOVSS",        P_0F, 0x10, OpAssertion.FloatAssertion, CPUFeature.SSE);
 407         public static final AMD64RMOp MOVSD  = new AMD64RMOp("MOVSD",        P_0F, 0x10, OpAssertion.FloatAssertion, CPUFeature.SSE);
 408 
 409         // TEST is documented as MR operation, but it's symmetric, and using it as RM operation is more convenient.
 410         public static final AMD64RMOp TESTB  = new AMD64RMOp("TEST",               0x84, OpAssertion.ByteAssertion);
 411         public static final AMD64RMOp TEST   = new AMD64RMOp("TEST",               0x85);
 412         // @formatter:on
 413 
 414         protected AMD64RMOp(String opcode, int op) {
 415             this(opcode, 0, op);
 416         }
 417 
 418         protected AMD64RMOp(String opcode, int op, OpAssertion assertion) {
 419             this(opcode, 0, op, assertion);
 420         }
 421 
 422         protected AMD64RMOp(String opcode, int prefix, int op) {
 423             this(opcode, 0, prefix, op, null);
 424         }
 425 
 426         protected AMD64RMOp(String opcode, int prefix, int op, OpAssertion assertion) {
 427             this(opcode, 0, prefix, op, assertion, null);
 428         }
 429 
 430         protected AMD64RMOp(String opcode, int prefix, int op, OpAssertion assertion, CPUFeature feature) {
 431             this(opcode, 0, prefix, op, assertion, feature);
 432         }
 433 
 434         protected AMD64RMOp(String opcode, int prefix, int op, boolean dstIsByte, boolean srcIsByte, OpAssertion assertion) {
 435             super(opcode, 0, prefix, op, dstIsByte, srcIsByte, assertion, null);
 436         }
 437 
 438         protected AMD64RMOp(String opcode, int prefix1, int prefix2, int op, CPUFeature feature) {
 439             this(opcode, prefix1, prefix2, op, OpAssertion.WordOrLargerAssertion, feature);
 440         }
 441 
 442         protected AMD64RMOp(String opcode, int prefix1, int prefix2, int op, OpAssertion assertion, CPUFeature feature) {
 443             super(opcode, prefix1, prefix2, op, assertion, feature);
 444         }
 445 
 446         @Override
 447         public final void emit(AMD64Assembler asm, OperandSize size, Register dst, Register src) {
 448             assert verify(asm, size, dst, src);
 449             if (isSSEInstruction()) {
 450                 Register nds = Register.None;
 451                 switch (op) {
 452                     case 0x10:
 453                     case 0x51:
 454                         if ((size == SS) || (size == SD)) {
 455                             nds = dst;
 456                         }
 457                         break;
 458                     case 0x2A:
 459                     case 0x54:
 460                     case 0x55:
 461                     case 0x56:
 462                     case 0x57:
 463                     case 0x58:
 464                     case 0x59:
 465                     case 0x5A:
 466                     case 0x5C:
 467                     case 0x5D:
 468                     case 0x5E:
 469                     case 0x5F:
 470                         nds = dst;
 471                         break;
 472                     default:
 473                         break;
 474                 }
 475                 asm.simdPrefix(dst, nds, src, size, prefix1, prefix2, size == QWORD);
 476                 asm.emitByte(op);
 477                 asm.emitModRM(dst, src);
 478             } else {
 479                 emitOpcode(asm, size, getRXB(dst, src), dst.encoding, src.encoding);
 480                 asm.emitModRM(dst, src);
 481             }
 482         }
 483 
 484         public final void emit(AMD64Assembler asm, OperandSize size, Register dst, AMD64Address src) {
 485             assert verify(asm, size, dst, null);
 486             if (isSSEInstruction()) {
 487                 Register nds = Register.None;
 488                 switch (op) {
 489                     case 0x51:
 490                         if ((size == SS) || (size == SD)) {
 491                             nds = dst;
 492                         }
 493                         break;
 494                     case 0x2A:
 495                     case 0x54:
 496                     case 0x55:
 497                     case 0x56:
 498                     case 0x57:
 499                     case 0x58:
 500                     case 0x59:
 501                     case 0x5A:
 502                     case 0x5C:
 503                     case 0x5D:
 504                     case 0x5E:
 505                     case 0x5F:
 506                         nds = dst;
 507                         break;
 508                     default:
 509                         break;
 510                 }
 511                 asm.simdPrefix(dst, nds, src, size, prefix1, prefix2, size == QWORD);
 512                 asm.emitByte(op);
 513                 asm.emitOperandHelper(dst, src, 0);
 514             } else {
 515                 emitOpcode(asm, size, getRXB(dst, src), dst.encoding, 0);
 516                 asm.emitOperandHelper(dst, src, 0);
 517             }
 518         }
 519     }
 520 
 521     /**
 522      * Opcode with operand order of MR.
 523      */
 524     public static class AMD64MROp extends AMD64RROp {
 525         // @formatter:off
 526         public static final AMD64MROp MOVB   = new AMD64MROp("MOVB",               0x88, OpAssertion.ByteAssertion);
 527         public static final AMD64MROp MOV    = new AMD64MROp("MOV",                0x89);
 528 
 529         // MOVD and MOVQ are the same opcode, just with different operand size prefix
 530         // Note that as MR opcodes, they have reverse operand order, so the IntToFloatingAssertion must be used.
 531         public static final AMD64MROp MOVD   = new AMD64MROp("MOVD",   0x66, P_0F, 0x7E, OpAssertion.IntToFloatAssertion, CPUFeature.SSE2);
 532         public static final AMD64MROp MOVQ   = new AMD64MROp("MOVQ",   0x66, P_0F, 0x7E, OpAssertion.IntToFloatAssertion, CPUFeature.SSE2);
 533 
 534         // MOVSS and MOVSD are the same opcode, just with different operand size prefix
 535         public static final AMD64MROp MOVSS  = new AMD64MROp("MOVSS",        P_0F, 0x11, OpAssertion.FloatAssertion, CPUFeature.SSE);
 536         public static final AMD64MROp MOVSD  = new AMD64MROp("MOVSD",        P_0F, 0x11, OpAssertion.FloatAssertion, CPUFeature.SSE);
 537         // @formatter:on
 538 
 539         protected AMD64MROp(String opcode, int op) {
 540             this(opcode, 0, op);
 541         }
 542 
 543         protected AMD64MROp(String opcode, int op, OpAssertion assertion) {
 544             this(opcode, 0, op, assertion);
 545         }
 546 
 547         protected AMD64MROp(String opcode, int prefix, int op) {
 548             this(opcode, prefix, op, OpAssertion.WordOrLargerAssertion);
 549         }
 550 
 551         protected AMD64MROp(String opcode, int prefix, int op, OpAssertion assertion) {
 552             this(opcode, prefix, op, assertion, null);
 553         }
 554 
 555         protected AMD64MROp(String opcode, int prefix, int op, OpAssertion assertion, CPUFeature feature) {
 556             this(opcode, 0, prefix, op, assertion, feature);
 557         }
 558 
 559         protected AMD64MROp(String opcode, int prefix1, int prefix2, int op, OpAssertion assertion, CPUFeature feature) {
 560             super(opcode, prefix1, prefix2, op, assertion, feature);
 561         }
 562 
 563         @Override
 564         public final void emit(AMD64Assembler asm, OperandSize size, Register dst, Register src) {
 565             assert verify(asm, size, src, dst);
 566             if (isSSEInstruction()) {
 567                 Register nds = Register.None;
 568                 switch (op) {
 569                     case 0x11:
 570                         if ((size == SS) || (size == SD)) {
 571                             nds = src;
 572                         }
 573                         break;
 574                     default:
 575                         break;
 576                 }
 577                 asm.simdPrefix(src, nds, dst, size, prefix1, prefix2, size == QWORD);
 578                 asm.emitByte(op);
 579                 asm.emitModRM(src, dst);
 580             } else {
 581                 emitOpcode(asm, size, getRXB(src, dst), src.encoding, dst.encoding);
 582                 asm.emitModRM(src, dst);
 583             }
 584         }
 585 
 586         public final void emit(AMD64Assembler asm, OperandSize size, AMD64Address dst, Register src) {
 587             assert verify(asm, size, src, null);
 588             if (isSSEInstruction()) {
 589                 asm.simdPrefix(src, Register.None, dst, size, prefix1, prefix2, size == QWORD);
 590                 asm.emitByte(op);
 591             } else {
 592                 emitOpcode(asm, size, getRXB(src, dst), src.encoding, 0);
 593             }
 594             asm.emitOperandHelper(src, dst, 0);
 595         }
 596     }
 597 
 598     /**
 599      * Opcodes with operand order of M.
 600      */
 601     public static class AMD64MOp extends AMD64Op {
 602         // @formatter:off
 603         public static final AMD64MOp NOT  = new AMD64MOp("NOT",  0xF7, 2);
 604         public static final AMD64MOp NEG  = new AMD64MOp("NEG",  0xF7, 3);
 605         public static final AMD64MOp MUL  = new AMD64MOp("MUL",  0xF7, 4);
 606         public static final AMD64MOp IMUL = new AMD64MOp("IMUL", 0xF7, 5);
 607         public static final AMD64MOp DIV  = new AMD64MOp("DIV",  0xF7, 6);
 608         public static final AMD64MOp IDIV = new AMD64MOp("IDIV", 0xF7, 7);
 609         public static final AMD64MOp INC  = new AMD64MOp("INC",  0xFF, 0);
 610         public static final AMD64MOp DEC  = new AMD64MOp("DEC",  0xFF, 1);
 611         public static final AMD64MOp PUSH = new AMD64MOp("PUSH", 0xFF, 6);
 612         public static final AMD64MOp POP  = new AMD64MOp("POP",  0x8F, 0, OpAssertion.WordOrDwordAssertion);
 613         // @formatter:on
 614 
 615         private final int ext;
 616 
 617         protected AMD64MOp(String opcode, int op, int ext) {
 618             this(opcode, 0, op, ext);
 619         }
 620 
 621         protected AMD64MOp(String opcode, int prefix, int op, int ext) {
 622             this(opcode, prefix, op, ext, OpAssertion.WordOrLargerAssertion);
 623         }
 624 
 625         protected AMD64MOp(String opcode, int op, int ext, OpAssertion assertion) {
 626             this(opcode, 0, op, ext, assertion);
 627         }
 628 
 629         protected AMD64MOp(String opcode, int prefix, int op, int ext, OpAssertion assertion) {
 630             super(opcode, 0, prefix, op, assertion, null);
 631             this.ext = ext;
 632         }
 633 
 634         public final void emit(AMD64Assembler asm, OperandSize size, Register dst) {
 635             assert verify(asm, size, dst, null);
 636             emitOpcode(asm, size, getRXB(null, dst), 0, dst.encoding);
 637             asm.emitModRM(ext, dst);
 638         }
 639 
 640         public final void emit(AMD64Assembler asm, OperandSize size, AMD64Address dst) {
 641             assert verify(asm, size, null, null);
 642             emitOpcode(asm, size, getRXB(null, dst), 0, 0);
 643             asm.emitOperandHelper(ext, dst, 0);
 644         }
 645     }
 646 
 647     /**
 648      * Opcodes with operand order of MI.
 649      */
 650     public static class AMD64MIOp extends AMD64ImmOp {
 651         // @formatter:off
 652         public static final AMD64MIOp MOVB = new AMD64MIOp("MOVB", true,  0xC6, 0, OpAssertion.ByteAssertion);
 653         public static final AMD64MIOp MOV  = new AMD64MIOp("MOV",  false, 0xC7, 0);
 654         public static final AMD64MIOp TEST = new AMD64MIOp("TEST", false, 0xF7, 0);
 655         // @formatter:on
 656 
 657         private final int ext;
 658 
 659         protected AMD64MIOp(String opcode, boolean immIsByte, int op, int ext) {
 660             this(opcode, immIsByte, op, ext, OpAssertion.WordOrLargerAssertion);
 661         }
 662 
 663         protected AMD64MIOp(String opcode, boolean immIsByte, int op, int ext, OpAssertion assertion) {
 664             this(opcode, immIsByte, 0, op, ext, assertion);
 665         }
 666 
 667         protected AMD64MIOp(String opcode, boolean immIsByte, int prefix, int op, int ext, OpAssertion assertion) {
 668             super(opcode, immIsByte, prefix, op, assertion);
 669             this.ext = ext;
 670         }
 671 
 672         public final void emit(AMD64Assembler asm, OperandSize size, Register dst, int imm) {
 673             emit(asm, size, dst, imm, false);
 674         }
 675 
 676         public final void emit(AMD64Assembler asm, OperandSize size, Register dst, int imm, boolean annotateImm) {
 677             assert verify(asm, size, dst, null);
 678             int insnPos = asm.position();
 679             emitOpcode(asm, size, getRXB(null, dst), 0, dst.encoding);
 680             asm.emitModRM(ext, dst);
 681             int immPos = asm.position();
 682             emitImmediate(asm, size, imm);
 683             int nextInsnPos = asm.position();
 684             if (annotateImm && asm.codePatchingAnnotationConsumer != null) {
 685                 asm.codePatchingAnnotationConsumer.accept(new OperandDataAnnotation(insnPos, immPos, nextInsnPos - immPos, nextInsnPos));
 686             }
 687         }
 688 
 689         public final void emit(AMD64Assembler asm, OperandSize size, AMD64Address dst, int imm) {
 690             emit(asm, size, dst, imm, false);
 691         }
 692 
 693         public final void emit(AMD64Assembler asm, OperandSize size, AMD64Address dst, int imm, boolean annotateImm) {
 694             assert verify(asm, size, null, null);
 695             int insnPos = asm.position();
 696             emitOpcode(asm, size, getRXB(null, dst), 0, 0);
 697             asm.emitOperandHelper(ext, dst, immediateSize(size));
 698             int immPos = asm.position();
 699             emitImmediate(asm, size, imm);
 700             int nextInsnPos = asm.position();
 701             if (annotateImm && asm.codePatchingAnnotationConsumer != null) {
 702                 asm.codePatchingAnnotationConsumer.accept(new OperandDataAnnotation(insnPos, immPos, nextInsnPos - immPos, nextInsnPos));
 703             }
 704         }
 705     }
 706 
 707     /**
 708      * Opcodes with operand order of RMI.
 709      *
 710      * We only have one form of round as the operation is always treated with single variant input,
 711      * making its extension to 3 address forms redundant.
 712      */
 713     public static class AMD64RMIOp extends AMD64ImmOp {
 714         // @formatter:off
 715         public static final AMD64RMIOp IMUL    = new AMD64RMIOp("IMUL", false, 0x69);
 716         public static final AMD64RMIOp IMUL_SX = new AMD64RMIOp("IMUL", true,  0x6B);
 717         public static final AMD64RMIOp ROUNDSS = new AMD64RMIOp("ROUNDSS", true, P_0F3A, 0x0A, OpAssertion.PackedDoubleAssertion, CPUFeature.SSE4_1);
 718         public static final AMD64RMIOp ROUNDSD = new AMD64RMIOp("ROUNDSD", true, P_0F3A, 0x0B, OpAssertion.PackedDoubleAssertion, CPUFeature.SSE4_1);
 719         // @formatter:on
 720 
 721         protected AMD64RMIOp(String opcode, boolean immIsByte, int op) {
 722             this(opcode, immIsByte, 0, op, OpAssertion.WordOrLargerAssertion, null);
 723         }
 724 
 725         protected AMD64RMIOp(String opcode, boolean immIsByte, int prefix, int op, OpAssertion assertion, CPUFeature feature) {
 726             super(opcode, immIsByte, prefix, op, assertion, feature);
 727         }
 728 
 729         public final void emit(AMD64Assembler asm, OperandSize size, Register dst, Register src, int imm) {
 730             assert verify(asm, size, dst, src);
 731             if (isSSEInstruction()) {
 732                 Register nds = Register.None;
 733                 switch (op) {
 734                     case 0x0A:
 735                     case 0x0B:
 736                         nds = dst;
 737                         break;
 738                     default:
 739                         break;
 740                 }
 741                 asm.simdPrefix(dst, nds, src, size, prefix1, prefix2, false);
 742                 asm.emitByte(op);
 743                 asm.emitModRM(dst, src);
 744             } else {
 745                 emitOpcode(asm, size, getRXB(dst, src), dst.encoding, src.encoding);
 746                 asm.emitModRM(dst, src);
 747             }
 748             emitImmediate(asm, size, imm);
 749         }
 750 
 751         public final void emit(AMD64Assembler asm, OperandSize size, Register dst, AMD64Address src, int imm) {
 752             assert verify(asm, size, dst, null);
 753             if (isSSEInstruction()) {
 754                 Register nds = Register.None;
 755                 switch (op) {
 756                     case 0x0A:
 757                     case 0x0B:
 758                         nds = dst;
 759                         break;
 760                     default:
 761                         break;
 762                 }
 763                 asm.simdPrefix(dst, nds, src, size, prefix1, prefix2, false);
 764                 asm.emitByte(op);
 765             } else {
 766                 emitOpcode(asm, size, getRXB(dst, src), dst.encoding, 0);
 767             }
 768             asm.emitOperandHelper(dst, src, immediateSize(size));
 769             emitImmediate(asm, size, imm);
 770         }
 771     }
 772 
 773     public static class SSEOp extends AMD64RMOp {
 774         // @formatter:off
 775         public static final SSEOp CVTSI2SS  = new SSEOp("CVTSI2SS",  0xF3, P_0F, 0x2A, OpAssertion.IntToFloatAssertion);
 776         public static final SSEOp CVTSI2SD  = new SSEOp("CVTSI2SD",  0xF2, P_0F, 0x2A, OpAssertion.IntToFloatAssertion);
 777         public static final SSEOp CVTTSS2SI = new SSEOp("CVTTSS2SI", 0xF3, P_0F, 0x2C, OpAssertion.FloatToIntAssertion);
 778         public static final SSEOp CVTTSD2SI = new SSEOp("CVTTSD2SI", 0xF2, P_0F, 0x2C, OpAssertion.FloatToIntAssertion);
 779         public static final SSEOp UCOMIS    = new SSEOp("UCOMIS",          P_0F, 0x2E, OpAssertion.PackedFloatAssertion);
 780         public static final SSEOp SQRT      = new SSEOp("SQRT",            P_0F, 0x51);
 781         public static final SSEOp AND       = new SSEOp("AND",             P_0F, 0x54, OpAssertion.PackedFloatAssertion);
 782         public static final SSEOp ANDN      = new SSEOp("ANDN",            P_0F, 0x55, OpAssertion.PackedFloatAssertion);
 783         public static final SSEOp OR        = new SSEOp("OR",              P_0F, 0x56, OpAssertion.PackedFloatAssertion);
 784         public static final SSEOp XOR       = new SSEOp("XOR",             P_0F, 0x57, OpAssertion.PackedFloatAssertion);
 785         public static final SSEOp ADD       = new SSEOp("ADD",             P_0F, 0x58);
 786         public static final SSEOp MUL       = new SSEOp("MUL",             P_0F, 0x59);
 787         public static final SSEOp CVTSS2SD  = new SSEOp("CVTSS2SD",        P_0F, 0x5A, OpAssertion.SingleAssertion);
 788         public static final SSEOp CVTSD2SS  = new SSEOp("CVTSD2SS",        P_0F, 0x5A, OpAssertion.DoubleAssertion);
 789         public static final SSEOp SUB       = new SSEOp("SUB",             P_0F, 0x5C);
 790         public static final SSEOp MIN       = new SSEOp("MIN",             P_0F, 0x5D);
 791         public static final SSEOp DIV       = new SSEOp("DIV",             P_0F, 0x5E);
 792         public static final SSEOp MAX       = new SSEOp("MAX",             P_0F, 0x5F);
 793         // @formatter:on
 794 
 795         protected SSEOp(String opcode, int prefix, int op) {
 796             this(opcode, prefix, op, OpAssertion.FloatAssertion);
 797         }
 798 
 799         protected SSEOp(String opcode, int prefix, int op, OpAssertion assertion) {
 800             this(opcode, 0, prefix, op, assertion);
 801         }
 802 
 803         protected SSEOp(String opcode, int mandatoryPrefix, int prefix, int op, OpAssertion assertion) {
 804             super(opcode, mandatoryPrefix, prefix, op, assertion, CPUFeature.SSE2);
 805         }
 806     }
 807 
 808     /**
 809      * Arithmetic operation with operand order of RM, MR or MI.
 810      */
 811     public static final class AMD64BinaryArithmetic {
 812         // @formatter:off
 813         public static final AMD64BinaryArithmetic ADD = new AMD64BinaryArithmetic("ADD", 0);
 814         public static final AMD64BinaryArithmetic OR  = new AMD64BinaryArithmetic("OR",  1);
 815         public static final AMD64BinaryArithmetic ADC = new AMD64BinaryArithmetic("ADC", 2);
 816         public static final AMD64BinaryArithmetic SBB = new AMD64BinaryArithmetic("SBB", 3);
 817         public static final AMD64BinaryArithmetic AND = new AMD64BinaryArithmetic("AND", 4);
 818         public static final AMD64BinaryArithmetic SUB = new AMD64BinaryArithmetic("SUB", 5);
 819         public static final AMD64BinaryArithmetic XOR = new AMD64BinaryArithmetic("XOR", 6);
 820         public static final AMD64BinaryArithmetic CMP = new AMD64BinaryArithmetic("CMP", 7);
 821         // @formatter:on
 822 
 823         private final AMD64MIOp byteImmOp;
 824         private final AMD64MROp byteMrOp;
 825         private final AMD64RMOp byteRmOp;
 826 
 827         private final AMD64MIOp immOp;
 828         private final AMD64MIOp immSxOp;
 829         private final AMD64MROp mrOp;
 830         private final AMD64RMOp rmOp;
 831 
 832         private AMD64BinaryArithmetic(String opcode, int code) {
 833             int baseOp = code << 3;
 834 
 835             byteImmOp = new AMD64MIOp(opcode, true, 0, 0x80, code, OpAssertion.ByteAssertion);
 836             byteMrOp = new AMD64MROp(opcode, 0, baseOp, OpAssertion.ByteAssertion);
 837             byteRmOp = new AMD64RMOp(opcode, 0, baseOp | 0x02, OpAssertion.ByteAssertion);
 838 
 839             immOp = new AMD64MIOp(opcode, false, 0, 0x81, code, OpAssertion.WordOrLargerAssertion);
 840             immSxOp = new AMD64MIOp(opcode, true, 0, 0x83, code, OpAssertion.WordOrLargerAssertion);
 841             mrOp = new AMD64MROp(opcode, 0, baseOp | 0x01, OpAssertion.WordOrLargerAssertion);
 842             rmOp = new AMD64RMOp(opcode, 0, baseOp | 0x03, OpAssertion.WordOrLargerAssertion);
 843         }
 844 
 845         public AMD64MIOp getMIOpcode(OperandSize size, boolean sx) {
 846             if (size == BYTE) {
 847                 return byteImmOp;
 848             } else if (sx) {
 849                 return immSxOp;
 850             } else {
 851                 return immOp;
 852             }
 853         }
 854 
 855         public AMD64MROp getMROpcode(OperandSize size) {
 856             if (size == BYTE) {
 857                 return byteMrOp;
 858             } else {
 859                 return mrOp;
 860             }
 861         }
 862 
 863         public AMD64RMOp getRMOpcode(OperandSize size) {
 864             if (size == BYTE) {
 865                 return byteRmOp;
 866             } else {
 867                 return rmOp;
 868             }
 869         }
 870     }
 871 
 872     /**
 873      * Shift operation with operand order of M1, MC or MI.
 874      */
 875     public static final class AMD64Shift {
 876         // @formatter:off
 877         public static final AMD64Shift ROL = new AMD64Shift("ROL", 0);
 878         public static final AMD64Shift ROR = new AMD64Shift("ROR", 1);
 879         public static final AMD64Shift RCL = new AMD64Shift("RCL", 2);
 880         public static final AMD64Shift RCR = new AMD64Shift("RCR", 3);
 881         public static final AMD64Shift SHL = new AMD64Shift("SHL", 4);
 882         public static final AMD64Shift SHR = new AMD64Shift("SHR", 5);
 883         public static final AMD64Shift SAR = new AMD64Shift("SAR", 7);
 884         // @formatter:on
 885 
 886         public final AMD64MOp m1Op;
 887         public final AMD64MOp mcOp;
 888         public final AMD64MIOp miOp;
 889 
 890         private AMD64Shift(String opcode, int code) {
 891             m1Op = new AMD64MOp(opcode, 0, 0xD1, code, OpAssertion.WordOrLargerAssertion);
 892             mcOp = new AMD64MOp(opcode, 0, 0xD3, code, OpAssertion.WordOrLargerAssertion);
 893             miOp = new AMD64MIOp(opcode, true, 0, 0xC1, code, OpAssertion.WordOrLargerAssertion);
 894         }
 895     }
 896 
 897     private enum VEXOpAssertion {
 898         AVX1(CPUFeature.AVX, CPUFeature.AVX),
 899         AVX1_2(CPUFeature.AVX, CPUFeature.AVX2),
 900         AVX2(CPUFeature.AVX2, CPUFeature.AVX2),
 901         AVX1_128ONLY(CPUFeature.AVX, null),
 902         AVX1_256ONLY(null, CPUFeature.AVX),
 903         AVX2_256ONLY(null, CPUFeature.AVX2),
 904         XMM_CPU(CPUFeature.AVX, null, XMM, null, CPU, null),
 905         XMM_XMM_CPU(CPUFeature.AVX, null, XMM, XMM, CPU, null),
 906         CPU_XMM(CPUFeature.AVX, null, CPU, null, XMM, null),
 907         AVX1_2_CPU_XMM(CPUFeature.AVX, CPUFeature.AVX2, CPU, null, XMM, null),
 908         BMI1(CPUFeature.BMI1, null, CPU, CPU, CPU, null),
 909         BMI2(CPUFeature.BMI2, null, CPU, CPU, CPU, null);
 910 
 911         private final CPUFeature l128feature;
 912         private final CPUFeature l256feature;
 913 
 914         private final RegisterCategory rCategory;
 915         private final RegisterCategory vCategory;
 916         private final RegisterCategory mCategory;
 917         private final RegisterCategory imm8Category;
 918 
 919         VEXOpAssertion(CPUFeature l128feature, CPUFeature l256feature) {
 920             this(l128feature, l256feature, XMM, XMM, XMM, XMM);
 921         }
 922 
 923         VEXOpAssertion(CPUFeature l128feature, CPUFeature l256feature, RegisterCategory rCategory, RegisterCategory vCategory, RegisterCategory mCategory, RegisterCategory imm8Category) {
 924             this.l128feature = l128feature;
 925             this.l256feature = l256feature;
 926             this.rCategory = rCategory;
 927             this.vCategory = vCategory;
 928             this.mCategory = mCategory;
 929             this.imm8Category = imm8Category;
 930         }
 931 
 932         public boolean check(AMD64 arch, AVXSize size, Register r, Register v, Register m) {
 933             return check(arch, getLFlag(size), r, v, m, null);
 934         }
 935 
 936         public boolean check(AMD64 arch, AVXSize size, Register r, Register v, Register m, Register imm8) {
 937             return check(arch, getLFlag(size), r, v, m, imm8);
 938         }
 939 
 940         public boolean check(AMD64 arch, int l, Register r, Register v, Register m, Register imm8) {
 941             switch (l) {
 942                 case L128:
 943                     assert l128feature != null && arch.getFeatures().contains(l128feature) : "emitting illegal 128 bit instruction";
 944                     break;
 945                 case L256:
 946                     assert l256feature != null && arch.getFeatures().contains(l256feature) : "emitting illegal 256 bit instruction";
 947                     break;
 948             }
 949             if (r != null) {
 950                 assert r.getRegisterCategory().equals(rCategory);
 951             }
 952             if (v != null) {
 953                 assert v.getRegisterCategory().equals(vCategory);
 954             }
 955             if (m != null) {
 956                 assert m.getRegisterCategory().equals(mCategory);
 957             }
 958             if (imm8 != null) {
 959                 assert imm8.getRegisterCategory().equals(imm8Category);
 960             }
 961             return true;
 962         }
 963 
 964         public boolean supports(EnumSet<CPUFeature> features, AVXSize avxSize) {
 965             switch (avxSize) {
 966                 case XMM:
 967                     return l128feature != null && features.contains(l128feature);
 968                 case YMM:
 969                     return l256feature != null && features.contains(l256feature);
 970                 default:
 971                     throw GraalError.shouldNotReachHere();
 972             }
 973         }
 974     }
 975 
 976     /**
 977      * Base class for VEX-encoded instructions.
 978      */
 979     public static class VexOp {
 980         protected final int pp;
 981         protected final int mmmmm;
 982         protected final int w;
 983         protected final int op;
 984 
 985         private final String opcode;
 986         protected final VEXOpAssertion assertion;
 987 
 988         protected VexOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
 989             this.pp = pp;
 990             this.mmmmm = mmmmm;
 991             this.w = w;
 992             this.op = op;
 993             this.opcode = opcode;
 994             this.assertion = assertion;
 995         }
 996 
 997         public final boolean isSupported(AMD64Assembler vasm, AVXSize size) {
 998             return assertion.supports(((AMD64) vasm.target.arch).getFeatures(), size);
 999         }
1000 
1001         @Override
1002         public String toString() {
1003             return opcode;
1004         }
1005     }
1006 
1007     /**
1008      * VEX-encoded instructions with an operand order of RM, but the M operand must be a register.
1009      */
1010     public static class VexRROp extends VexOp {
1011         // @formatter:off
1012         public static final VexRROp VMASKMOVDQU = new VexRROp("VMASKMOVDQU", P_66, M_0F, WIG, 0xF7, VEXOpAssertion.AVX1_128ONLY);
1013         // @formatter:on
1014 
1015         protected VexRROp(String opcode, int pp, int mmmmm, int w, int op) {
1016             this(opcode, pp, mmmmm, w, op, VEXOpAssertion.AVX1);
1017         }
1018 
1019         protected VexRROp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
1020             super(opcode, pp, mmmmm, w, op, assertion);
1021         }
1022 
1023         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src) {
1024             assert assertion.check((AMD64) asm.target.arch, size, dst, null, src);
1025             assert op != 0x1A || op != 0x5A;
1026             asm.vexPrefix(dst, Register.None, src, size, pp, mmmmm, w, false);
1027             asm.emitByte(op);
1028             asm.emitModRM(dst, src);
1029         }
1030     }
1031 
1032     /**
1033      * VEX-encoded instructions with an operand order of RM.
1034      */
1035     public static class VexRMOp extends VexRROp {
1036         // @formatter:off
1037         public static final VexRMOp VCVTTSS2SI      = new VexRMOp("VCVTTSS2SI",      P_F3, M_0F,   W0,  0x2C, VEXOpAssertion.CPU_XMM);
1038         public static final VexRMOp VCVTTSS2SQ      = new VexRMOp("VCVTTSS2SQ",      P_F3, M_0F,   W1,  0x2C, VEXOpAssertion.CPU_XMM);
1039         public static final VexRMOp VCVTTSD2SI      = new VexRMOp("VCVTTSD2SI",      P_F2, M_0F,   W0,  0x2C, VEXOpAssertion.CPU_XMM);
1040         public static final VexRMOp VCVTTSD2SQ      = new VexRMOp("VCVTTSD2SQ",      P_F2, M_0F,   W1,  0x2C, VEXOpAssertion.CPU_XMM);
1041         public static final VexRMOp VCVTPS2PD       = new VexRMOp("VCVTPS2PD",       P_,   M_0F,   WIG, 0x5A);
1042         public static final VexRMOp VCVTPD2PS       = new VexRMOp("VCVTPD2PS",       P_66, M_0F,   WIG, 0x5A);
1043         public static final VexRMOp VCVTDQ2PS       = new VexRMOp("VCVTDQ2PS",       P_,   M_0F,   WIG, 0x5B);
1044         public static final VexRMOp VCVTTPS2DQ      = new VexRMOp("VCVTTPS2DQ",      P_F3, M_0F,   WIG, 0x5B);
1045         public static final VexRMOp VCVTTPD2DQ      = new VexRMOp("VCVTTPD2DQ",      P_66, M_0F,   WIG, 0xE6);
1046         public static final VexRMOp VCVTDQ2PD       = new VexRMOp("VCVTDQ2PD",       P_F3, M_0F,   WIG, 0xE6);
1047         public static final VexRMOp VBROADCASTSS    = new VexRMOp("VBROADCASTSS",    P_66, M_0F38, W0,  0x18);
1048         public static final VexRMOp VBROADCASTSD    = new VexRMOp("VBROADCASTSD",    P_66, M_0F38, W0,  0x19, VEXOpAssertion.AVX1_256ONLY);
1049         public static final VexRMOp VBROADCASTF128  = new VexRMOp("VBROADCASTF128",  P_66, M_0F38, W0,  0x1A, VEXOpAssertion.AVX1_256ONLY);
1050         public static final VexRMOp VPBROADCASTI128 = new VexRMOp("VPBROADCASTI128", P_66, M_0F38, W0,  0x5A, VEXOpAssertion.AVX2_256ONLY);
1051         public static final VexRMOp VPBROADCASTB    = new VexRMOp("VPBROADCASTB",    P_66, M_0F38, W0,  0x78, VEXOpAssertion.AVX2);
1052         public static final VexRMOp VPBROADCASTW    = new VexRMOp("VPBROADCASTW",    P_66, M_0F38, W0,  0x79, VEXOpAssertion.AVX2);
1053         public static final VexRMOp VPBROADCASTD    = new VexRMOp("VPBROADCASTD",    P_66, M_0F38, W0,  0x58, VEXOpAssertion.AVX2);
1054         public static final VexRMOp VPBROADCASTQ    = new VexRMOp("VPBROADCASTQ",    P_66, M_0F38, W0,  0x59, VEXOpAssertion.AVX2);
1055         public static final VexRMOp VPMOVMSKB       = new VexRMOp("VPMOVMSKB",       P_66, M_0F,   WIG, 0xD7, VEXOpAssertion.AVX1_2_CPU_XMM);
1056         public static final VexRMOp VPMOVSXBW       = new VexRMOp("VPMOVSXBW",       P_66, M_0F38, WIG, 0x20);
1057         public static final VexRMOp VPMOVSXBD       = new VexRMOp("VPMOVSXBD",       P_66, M_0F38, WIG, 0x21);
1058         public static final VexRMOp VPMOVSXBQ       = new VexRMOp("VPMOVSXBQ",       P_66, M_0F38, WIG, 0x22);
1059         public static final VexRMOp VPMOVSXWD       = new VexRMOp("VPMOVSXWD",       P_66, M_0F38, WIG, 0x23);
1060         public static final VexRMOp VPMOVSXWQ       = new VexRMOp("VPMOVSXWQ",       P_66, M_0F38, WIG, 0x24);
1061         public static final VexRMOp VPMOVSXDQ       = new VexRMOp("VPMOVSXDQ",       P_66, M_0F38, WIG, 0x25);
1062         public static final VexRMOp VPMOVZXBW       = new VexRMOp("VPMOVZXBW",       P_66, M_0F38, WIG, 0x30);
1063         public static final VexRMOp VPMOVZXBD       = new VexRMOp("VPMOVZXBD",       P_66, M_0F38, WIG, 0x31);
1064         public static final VexRMOp VPMOVZXBQ       = new VexRMOp("VPMOVZXBQ",       P_66, M_0F38, WIG, 0x32);
1065         public static final VexRMOp VPMOVZXWD       = new VexRMOp("VPMOVZXWD",       P_66, M_0F38, WIG, 0x33);
1066         public static final VexRMOp VPMOVZXWQ       = new VexRMOp("VPMOVZXWQ",       P_66, M_0F38, WIG, 0x34);
1067         public static final VexRMOp VPMOVZXDQ       = new VexRMOp("VPMOVZXDQ",       P_66, M_0F38, WIG, 0x35);
1068         public static final VexRMOp VPTEST          = new VexRMOp("VPTEST",          P_66, M_0F38, WIG, 0x17);
1069         public static final VexRMOp VSQRTPD         = new VexRMOp("VSQRTPD",         P_66, M_0F,   WIG, 0x51);
1070         public static final VexRMOp VSQRTPS         = new VexRMOp("VSQRTPS",         P_,   M_0F,   WIG, 0x51);
1071         public static final VexRMOp VSQRTSD         = new VexRMOp("VSQRTSD",         P_F2, M_0F,   WIG, 0x51);
1072         public static final VexRMOp VSQRTSS         = new VexRMOp("VSQRTSS",         P_F3, M_0F,   WIG, 0x51);
1073         public static final VexRMOp VUCOMISS        = new VexRMOp("VUCOMISS",        P_,   M_0F,   WIG, 0x2E);
1074         public static final VexRMOp VUCOMISD        = new VexRMOp("VUCOMISD",        P_66, M_0F,   WIG, 0x2E);
1075         // @formatter:on
1076 
1077         protected VexRMOp(String opcode, int pp, int mmmmm, int w, int op) {
1078             this(opcode, pp, mmmmm, w, op, VEXOpAssertion.AVX1);
1079         }
1080 
1081         protected VexRMOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
1082             super(opcode, pp, mmmmm, w, op, assertion);
1083         }
1084 
1085         public void emit(AMD64Assembler asm, AVXSize size, Register dst, AMD64Address src) {
1086             assert assertion.check((AMD64) asm.target.arch, size, dst, null, null);
1087             asm.vexPrefix(dst, Register.None, src, size, pp, mmmmm, w, false);
1088             asm.emitByte(op);
1089             asm.emitOperandHelper(dst, src, 0);
1090         }
1091     }
1092 
1093     /**
1094      * VEX-encoded move instructions.
1095      * <p>
1096      * These instructions have two opcodes: op is the forward move instruction with an operand order
1097      * of RM, and opReverse is the reverse move instruction with an operand order of MR.
1098      */
1099     public static final class VexMoveOp extends VexRMOp {
1100         // @formatter:off
1101         public static final VexMoveOp VMOVDQA = new VexMoveOp("VMOVDQA", P_66, M_0F, WIG, 0x6F, 0x7F);
1102         public static final VexMoveOp VMOVDQU = new VexMoveOp("VMOVDQU", P_F3, M_0F, WIG, 0x6F, 0x7F);
1103         public static final VexMoveOp VMOVAPS = new VexMoveOp("VMOVAPS", P_,   M_0F, WIG, 0x28, 0x29);
1104         public static final VexMoveOp VMOVAPD = new VexMoveOp("VMOVAPD", P_66, M_0F, WIG, 0x28, 0x29);
1105         public static final VexMoveOp VMOVUPS = new VexMoveOp("VMOVUPS", P_,   M_0F, WIG, 0x10, 0x11);
1106         public static final VexMoveOp VMOVUPD = new VexMoveOp("VMOVUPD", P_66, M_0F, WIG, 0x10, 0x11);
1107         public static final VexMoveOp VMOVSS  = new VexMoveOp("VMOVSS",  P_F3, M_0F, WIG, 0x10, 0x11);
1108         public static final VexMoveOp VMOVSD  = new VexMoveOp("VMOVSD",  P_F2, M_0F, WIG, 0x10, 0x11);
1109         public static final VexMoveOp VMOVD   = new VexMoveOp("VMOVD",   P_66, M_0F, W0,  0x6E, 0x7E, VEXOpAssertion.XMM_CPU);
1110         public static final VexMoveOp VMOVQ   = new VexMoveOp("VMOVQ",   P_66, M_0F, W1,  0x6E, 0x7E, VEXOpAssertion.XMM_CPU);
1111         // @formatter:on
1112 
1113         private final int opReverse;
1114 
1115         private VexMoveOp(String opcode, int pp, int mmmmm, int w, int op, int opReverse) {
1116             this(opcode, pp, mmmmm, w, op, opReverse, VEXOpAssertion.AVX1);
1117         }
1118 
1119         private VexMoveOp(String opcode, int pp, int mmmmm, int w, int op, int opReverse, VEXOpAssertion assertion) {
1120             super(opcode, pp, mmmmm, w, op, assertion);
1121             this.opReverse = opReverse;
1122         }
1123 
1124         public void emit(AMD64Assembler asm, AVXSize size, AMD64Address dst, Register src) {
1125             assert assertion.check((AMD64) asm.target.arch, size, src, null, null);
1126             asm.vexPrefix(src, Register.None, dst, size, pp, mmmmm, w, false);
1127             asm.emitByte(opReverse);
1128             asm.emitOperandHelper(src, dst, 0);
1129         }
1130 
1131         public void emitReverse(AMD64Assembler asm, AVXSize size, Register dst, Register src) {
1132             assert assertion.check((AMD64) asm.target.arch, size, src, null, dst);
1133             asm.vexPrefix(src, Register.None, dst, size, pp, mmmmm, w, false);
1134             asm.emitByte(opReverse);
1135             asm.emitModRM(src, dst);
1136         }
1137     }
1138 
1139     public interface VexRRIOp {
1140         void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src, int imm8);
1141     }
1142 
1143     /**
1144      * VEX-encoded instructions with an operand order of RMI.
1145      */
1146     public static final class VexRMIOp extends VexOp implements VexRRIOp {
1147         // @formatter:off
1148         public static final VexRMIOp VPERMQ   = new VexRMIOp("VPERMQ",   P_66, M_0F3A, W1,  0x00, VEXOpAssertion.AVX2_256ONLY);
1149         public static final VexRMIOp VPSHUFLW = new VexRMIOp("VPSHUFLW", P_F2, M_0F,   WIG, 0x70, VEXOpAssertion.AVX1_2);
1150         public static final VexRMIOp VPSHUFHW = new VexRMIOp("VPSHUFHW", P_F3, M_0F,   WIG, 0x70, VEXOpAssertion.AVX1_2);
1151         public static final VexRMIOp VPSHUFD  = new VexRMIOp("VPSHUFD",  P_66, M_0F,   WIG, 0x70, VEXOpAssertion.AVX1_2);
1152         // @formatter:on
1153 
1154         private VexRMIOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
1155             super(opcode, pp, mmmmm, w, op, assertion);
1156         }
1157 
1158         @Override
1159         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src, int imm8) {
1160             assert assertion.check((AMD64) asm.target.arch, size, dst, null, src);
1161             asm.vexPrefix(dst, Register.None, src, size, pp, mmmmm, w, false);
1162             asm.emitByte(op);
1163             asm.emitModRM(dst, src);
1164             asm.emitByte(imm8);
1165         }
1166 
1167         public void emit(AMD64Assembler asm, AVXSize size, Register dst, AMD64Address src, int imm8) {
1168             assert assertion.check((AMD64) asm.target.arch, size, dst, null, null);
1169             asm.vexPrefix(dst, Register.None, src, size, pp, mmmmm, w, false);
1170             asm.emitByte(op);
1171             asm.emitOperandHelper(dst, src, 1);
1172             asm.emitByte(imm8);
1173         }
1174     }
1175 
1176     /**
1177      * VEX-encoded instructions with an operand order of MRI.
1178      */
1179     public static final class VexMRIOp extends VexOp implements VexRRIOp {
1180         // @formatter:off
1181         public static final VexMRIOp VEXTRACTF128 = new VexMRIOp("VEXTRACTF128", P_66, M_0F3A, W0, 0x19, VEXOpAssertion.AVX1_256ONLY);
1182         public static final VexMRIOp VEXTRACTI128 = new VexMRIOp("VEXTRACTI128", P_66, M_0F3A, W0, 0x39, VEXOpAssertion.AVX2_256ONLY);
1183         public static final VexMRIOp VPEXTRB      = new VexMRIOp("VPEXTRB",      P_66, M_0F3A, W0, 0x14, VEXOpAssertion.XMM_CPU);
1184         public static final VexMRIOp VPEXTRW      = new VexMRIOp("VPEXTRW",      P_66, M_0F3A, W0, 0x15, VEXOpAssertion.XMM_CPU);
1185         public static final VexMRIOp VPEXTRD      = new VexMRIOp("VPEXTRD",      P_66, M_0F3A, W0, 0x16, VEXOpAssertion.XMM_CPU);
1186         public static final VexMRIOp VPEXTRQ      = new VexMRIOp("VPEXTRQ",      P_66, M_0F3A, W1, 0x16, VEXOpAssertion.XMM_CPU);
1187         // @formatter:on
1188 
1189         private VexMRIOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
1190             super(opcode, pp, mmmmm, w, op, assertion);
1191         }
1192 
1193         @Override
1194         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src, int imm8) {
1195             assert assertion.check((AMD64) asm.target.arch, size, src, null, dst);
1196             asm.vexPrefix(src, Register.None, dst, size, pp, mmmmm, w, false);
1197             asm.emitByte(op);
1198             asm.emitModRM(src, dst);
1199             asm.emitByte(imm8);
1200         }
1201 
1202         public void emit(AMD64Assembler asm, AVXSize size, AMD64Address dst, Register src, int imm8) {
1203             assert assertion.check((AMD64) asm.target.arch, size, src, null, null);
1204             asm.vexPrefix(src, Register.None, dst, size, pp, mmmmm, w, false);
1205             asm.emitByte(op);
1206             asm.emitOperandHelper(src, dst, 1);
1207             asm.emitByte(imm8);
1208         }
1209     }
1210 
1211     /**
1212      * VEX-encoded instructions with an operand order of RVMR.
1213      */
1214     public static class VexRVMROp extends VexOp {
1215         // @formatter:off
1216         public static final VexRVMROp VPBLENDVB  = new VexRVMROp("VPBLENDVB",  P_66, M_0F3A, W0, 0x4C, VEXOpAssertion.AVX1_2);
1217         public static final VexRVMROp VPBLENDVPS = new VexRVMROp("VPBLENDVPS", P_66, M_0F3A, W0, 0x4A, VEXOpAssertion.AVX1);
1218         public static final VexRVMROp VPBLENDVPD = new VexRVMROp("VPBLENDVPD", P_66, M_0F3A, W0, 0x4B, VEXOpAssertion.AVX1);
1219         // @formatter:on
1220 
1221         protected VexRVMROp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
1222             super(opcode, pp, mmmmm, w, op, assertion);
1223         }
1224 
1225         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register mask, Register src1, Register src2) {
1226             assert assertion.check((AMD64) asm.target.arch, size, dst, mask, src1, src2);
1227             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w, false);
1228             asm.emitByte(op);
1229             asm.emitModRM(dst, src2);
1230             asm.emitByte(mask.encoding() << 4);
1231         }
1232 
1233         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register mask, Register src1, AMD64Address src2) {
1234             assert assertion.check((AMD64) asm.target.arch, size, dst, mask, src1, null);
1235             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w, false);
1236             asm.emitByte(op);
1237             asm.emitOperandHelper(dst, src2, 0);
1238             asm.emitByte(mask.encoding() << 4);
1239         }
1240     }
1241 
1242     /**
1243      * VEX-encoded instructions with an operand order of RVM.
1244      */
1245     public static class VexRVMOp extends VexOp {
1246         // @formatter:off
1247         public static final VexRVMOp VANDPS    = new VexRVMOp("VANDPS",    P_,   M_0F,   WIG, 0x54);
1248         public static final VexRVMOp VANDPD    = new VexRVMOp("VANDPD",    P_66, M_0F,   WIG, 0x54);
1249         public static final VexRVMOp VANDNPS   = new VexRVMOp("VANDNPS",   P_,   M_0F,   WIG, 0x55);
1250         public static final VexRVMOp VANDNPD   = new VexRVMOp("VANDNPD",   P_66, M_0F,   WIG, 0x55);
1251         public static final VexRVMOp VORPS     = new VexRVMOp("VORPS",     P_,   M_0F,   WIG, 0x56);
1252         public static final VexRVMOp VORPD     = new VexRVMOp("VORPD",     P_66, M_0F,   WIG, 0x56);
1253         public static final VexRVMOp VXORPS    = new VexRVMOp("VXORPS",    P_,   M_0F,   WIG, 0x57);
1254         public static final VexRVMOp VXORPD    = new VexRVMOp("VXORPD",    P_66, M_0F,   WIG, 0x57);
1255         public static final VexRVMOp VADDPS    = new VexRVMOp("VADDPS",    P_,   M_0F,   WIG, 0x58);
1256         public static final VexRVMOp VADDPD    = new VexRVMOp("VADDPD",    P_66, M_0F,   WIG, 0x58);
1257         public static final VexRVMOp VADDSS    = new VexRVMOp("VADDSS",    P_F3, M_0F,   WIG, 0x58);
1258         public static final VexRVMOp VADDSD    = new VexRVMOp("VADDSD",    P_F2, M_0F,   WIG, 0x58);
1259         public static final VexRVMOp VMULPS    = new VexRVMOp("VMULPS",    P_,   M_0F,   WIG, 0x59);
1260         public static final VexRVMOp VMULPD    = new VexRVMOp("VMULPD",    P_66, M_0F,   WIG, 0x59);
1261         public static final VexRVMOp VMULSS    = new VexRVMOp("VMULSS",    P_F3, M_0F,   WIG, 0x59);
1262         public static final VexRVMOp VMULSD    = new VexRVMOp("VMULSD",    P_F2, M_0F,   WIG, 0x59);
1263         public static final VexRVMOp VSUBPS    = new VexRVMOp("VSUBPS",    P_,   M_0F,   WIG, 0x5C);
1264         public static final VexRVMOp VSUBPD    = new VexRVMOp("VSUBPD",    P_66, M_0F,   WIG, 0x5C);
1265         public static final VexRVMOp VSUBSS    = new VexRVMOp("VSUBSS",    P_F3, M_0F,   WIG, 0x5C);
1266         public static final VexRVMOp VSUBSD    = new VexRVMOp("VSUBSD",    P_F2, M_0F,   WIG, 0x5C);
1267         public static final VexRVMOp VMINPS    = new VexRVMOp("VMINPS",    P_,   M_0F,   WIG, 0x5D);
1268         public static final VexRVMOp VMINPD    = new VexRVMOp("VMINPD",    P_66, M_0F,   WIG, 0x5D);
1269         public static final VexRVMOp VMINSS    = new VexRVMOp("VMINSS",    P_F3, M_0F,   WIG, 0x5D);
1270         public static final VexRVMOp VMINSD    = new VexRVMOp("VMINSD",    P_F2, M_0F,   WIG, 0x5D);
1271         public static final VexRVMOp VDIVPS    = new VexRVMOp("VDIVPS",    P_,   M_0F,   WIG, 0x5E);
1272         public static final VexRVMOp VDIVPD    = new VexRVMOp("VDIVPD",    P_66, M_0F,   WIG, 0x5E);
1273         public static final VexRVMOp VDIVSS    = new VexRVMOp("VDIVPS",    P_F3, M_0F,   WIG, 0x5E);
1274         public static final VexRVMOp VDIVSD    = new VexRVMOp("VDIVPD",    P_F2, M_0F,   WIG, 0x5E);
1275         public static final VexRVMOp VMAXPS    = new VexRVMOp("VMAXPS",    P_,   M_0F,   WIG, 0x5F);
1276         public static final VexRVMOp VMAXPD    = new VexRVMOp("VMAXPD",    P_66, M_0F,   WIG, 0x5F);
1277         public static final VexRVMOp VMAXSS    = new VexRVMOp("VMAXSS",    P_F3, M_0F,   WIG, 0x5F);
1278         public static final VexRVMOp VMAXSD    = new VexRVMOp("VMAXSD",    P_F2, M_0F,   WIG, 0x5F);
1279         public static final VexRVMOp VADDSUBPS = new VexRVMOp("VADDSUBPS", P_F2, M_0F,   WIG, 0xD0);
1280         public static final VexRVMOp VADDSUBPD = new VexRVMOp("VADDSUBPD", P_66, M_0F,   WIG, 0xD0);
1281         public static final VexRVMOp VPAND     = new VexRVMOp("VPAND",     P_66, M_0F,   WIG, 0xDB, VEXOpAssertion.AVX1_2);
1282         public static final VexRVMOp VPOR      = new VexRVMOp("VPOR",      P_66, M_0F,   WIG, 0xEB, VEXOpAssertion.AVX1_2);
1283         public static final VexRVMOp VPXOR     = new VexRVMOp("VPXOR",     P_66, M_0F,   WIG, 0xEF, VEXOpAssertion.AVX1_2);
1284         public static final VexRVMOp VPADDB    = new VexRVMOp("VPADDB",    P_66, M_0F,   WIG, 0xFC, VEXOpAssertion.AVX1_2);
1285         public static final VexRVMOp VPADDW    = new VexRVMOp("VPADDW",    P_66, M_0F,   WIG, 0xFD, VEXOpAssertion.AVX1_2);
1286         public static final VexRVMOp VPADDD    = new VexRVMOp("VPADDD",    P_66, M_0F,   WIG, 0xFE, VEXOpAssertion.AVX1_2);
1287         public static final VexRVMOp VPADDQ    = new VexRVMOp("VPADDQ",    P_66, M_0F,   WIG, 0xD4, VEXOpAssertion.AVX1_2);
1288         public static final VexRVMOp VPMULHUW  = new VexRVMOp("VPMULHUW",  P_66, M_0F,   WIG, 0xE4, VEXOpAssertion.AVX1_2);
1289         public static final VexRVMOp VPMULHW   = new VexRVMOp("VPMULHW",   P_66, M_0F,   WIG, 0xE5, VEXOpAssertion.AVX1_2);
1290         public static final VexRVMOp VPMULLW   = new VexRVMOp("VPMULLW",   P_66, M_0F,   WIG, 0xD5, VEXOpAssertion.AVX1_2);
1291         public static final VexRVMOp VPMULLD   = new VexRVMOp("VPMULLD",   P_66, M_0F38, WIG, 0x40, VEXOpAssertion.AVX1_2);
1292         public static final VexRVMOp VPSUBB    = new VexRVMOp("VPSUBB",    P_66, M_0F,   WIG, 0xF8, VEXOpAssertion.AVX1_2);
1293         public static final VexRVMOp VPSUBW    = new VexRVMOp("VPSUBW",    P_66, M_0F,   WIG, 0xF9, VEXOpAssertion.AVX1_2);
1294         public static final VexRVMOp VPSUBD    = new VexRVMOp("VPSUBD",    P_66, M_0F,   WIG, 0xFA, VEXOpAssertion.AVX1_2);
1295         public static final VexRVMOp VPSUBQ    = new VexRVMOp("VPSUBQ",    P_66, M_0F,   WIG, 0xFB, VEXOpAssertion.AVX1_2);
1296         public static final VexRVMOp VPSHUFB   = new VexRVMOp("VPSHUFB",   P_66, M_0F38, WIG, 0x00, VEXOpAssertion.AVX1_2);
1297         public static final VexRVMOp VCVTSD2SS = new VexRVMOp("VCVTSD2SS", P_F2, M_0F,   WIG, 0x5A);
1298         public static final VexRVMOp VCVTSS2SD = new VexRVMOp("VCVTSS2SD", P_F3, M_0F,   WIG, 0x5A);
1299         public static final VexRVMOp VCVTSI2SD = new VexRVMOp("VCVTSI2SD", P_F2, M_0F,   W0,  0x2A, VEXOpAssertion.XMM_XMM_CPU);
1300         public static final VexRVMOp VCVTSQ2SD = new VexRVMOp("VCVTSQ2SD", P_F2, M_0F,   W1,  0x2A, VEXOpAssertion.XMM_XMM_CPU);
1301         public static final VexRVMOp VCVTSI2SS = new VexRVMOp("VCVTSI2SS", P_F3, M_0F,   W0,  0x2A, VEXOpAssertion.XMM_XMM_CPU);
1302         public static final VexRVMOp VCVTSQ2SS = new VexRVMOp("VCVTSQ2SS", P_F3, M_0F,   W1,  0x2A, VEXOpAssertion.XMM_XMM_CPU);
1303         public static final VexRVMOp VPCMPEQB  = new VexRVMOp("VPCMPEQB",  P_66, M_0F,   WIG, 0x74, VEXOpAssertion.AVX1_2);
1304         public static final VexRVMOp VPCMPEQW  = new VexRVMOp("VPCMPEQW",  P_66, M_0F,   WIG, 0x75, VEXOpAssertion.AVX1_2);
1305         public static final VexRVMOp VPCMPEQD  = new VexRVMOp("VPCMPEQD",  P_66, M_0F,   WIG, 0x76, VEXOpAssertion.AVX1_2);
1306         public static final VexRVMOp VPCMPEQQ  = new VexRVMOp("VPCMPEQQ",  P_66, M_0F38, WIG, 0x29, VEXOpAssertion.AVX1_2);
1307         public static final VexRVMOp VPCMPGTB  = new VexRVMOp("VPCMPGTB",  P_66, M_0F,   WIG, 0x64, VEXOpAssertion.AVX1_2);
1308         public static final VexRVMOp VPCMPGTW  = new VexRVMOp("VPCMPGTW",  P_66, M_0F,   WIG, 0x65, VEXOpAssertion.AVX1_2);
1309         public static final VexRVMOp VPCMPGTD  = new VexRVMOp("VPCMPGTD",  P_66, M_0F,   WIG, 0x66, VEXOpAssertion.AVX1_2);
1310         public static final VexRVMOp VPCMPGTQ  = new VexRVMOp("VPCMPGTQ",  P_66, M_0F38, WIG, 0x37, VEXOpAssertion.AVX1_2);
1311         // @formatter:on
1312 
1313         private VexRVMOp(String opcode, int pp, int mmmmm, int w, int op) {
1314             this(opcode, pp, mmmmm, w, op, VEXOpAssertion.AVX1);
1315         }
1316 
1317         protected VexRVMOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
1318             super(opcode, pp, mmmmm, w, op, assertion);
1319         }
1320 
1321         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, Register src2) {
1322             assert assertion.check((AMD64) asm.target.arch, size, dst, src1, src2);
1323             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w, false);
1324             asm.emitByte(op);
1325             asm.emitModRM(dst, src2);
1326         }
1327 
1328         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, AMD64Address src2) {
1329             assert assertion.check((AMD64) asm.target.arch, size, dst, src1, null);
1330             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w, false);
1331             asm.emitByte(op);
1332             asm.emitOperandHelper(dst, src2, 0);
1333         }
1334     }
1335 
1336     public static final class VexGeneralPurposeRVMOp extends VexRVMOp {
1337         // @formatter:off
1338         public static final VexGeneralPurposeRVMOp ANDN   = new VexGeneralPurposeRVMOp("ANDN",   P_,   M_0F38, WIG, 0xF2, VEXOpAssertion.BMI1);
1339         public static final VexGeneralPurposeRVMOp MULX   = new VexGeneralPurposeRVMOp("MULX",   P_F2, M_0F38, WIG, 0xF6, VEXOpAssertion.BMI2);
1340         public static final VexGeneralPurposeRVMOp PDEP   = new VexGeneralPurposeRVMOp("PDEP",   P_F2, M_0F38, WIG, 0xF5, VEXOpAssertion.BMI2);
1341         public static final VexGeneralPurposeRVMOp PEXT   = new VexGeneralPurposeRVMOp("PEXT",   P_F3, M_0F38, WIG, 0xF5, VEXOpAssertion.BMI2);
1342         // @formatter:on
1343 
1344         private VexGeneralPurposeRVMOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
1345             super(opcode, pp, mmmmm, w, op, assertion);
1346         }
1347 
1348         @Override
1349         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, Register src2) {
1350             assert assertion.check((AMD64) asm.target.arch, LZ, dst, src1, src2, null);
1351             assert size == AVXSize.DWORD || size == AVXSize.QWORD;
1352             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, size == AVXSize.DWORD ? W0 : W1, false);
1353             asm.emitByte(op);
1354             asm.emitModRM(dst, src2);
1355         }
1356 
1357         @Override
1358         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, AMD64Address src2) {
1359             assert assertion.check((AMD64) asm.target.arch, LZ, dst, src1, null, null);
1360             assert size == AVXSize.DWORD || size == AVXSize.QWORD;
1361             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, size == AVXSize.DWORD ? W0 : W1, false);
1362             asm.emitByte(op);
1363             asm.emitOperandHelper(dst, src2, 0);
1364         }
1365     }
1366 
1367     public static final class VexGeneralPurposeRMVOp extends VexOp {
1368         // @formatter:off
1369         public static final VexGeneralPurposeRMVOp BEXTR  = new VexGeneralPurposeRMVOp("BEXTR",  P_,   M_0F38, WIG, 0xF7, VEXOpAssertion.BMI1);
1370         public static final VexGeneralPurposeRMVOp BZHI   = new VexGeneralPurposeRMVOp("BZHI",   P_,   M_0F38, WIG, 0xF5, VEXOpAssertion.BMI2);
1371         public static final VexGeneralPurposeRMVOp SARX   = new VexGeneralPurposeRMVOp("SARX",   P_F3, M_0F38, WIG, 0xF7, VEXOpAssertion.BMI2);
1372         public static final VexGeneralPurposeRMVOp SHRX   = new VexGeneralPurposeRMVOp("SHRX",   P_F2, M_0F38, WIG, 0xF7, VEXOpAssertion.BMI2);
1373         public static final VexGeneralPurposeRMVOp SHLX   = new VexGeneralPurposeRMVOp("SHLX",   P_66, M_0F38, WIG, 0xF7, VEXOpAssertion.BMI2);
1374         // @formatter:on
1375 
1376         private VexGeneralPurposeRMVOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
1377             super(opcode, pp, mmmmm, w, op, assertion);
1378         }
1379 
1380         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, Register src2) {
1381             assert assertion.check((AMD64) asm.target.arch, LZ, dst, src2, src1, null);
1382             assert size == AVXSize.DWORD || size == AVXSize.QWORD;
1383             asm.vexPrefix(dst, src2, src1, size, pp, mmmmm, size == AVXSize.DWORD ? W0 : W1, false);
1384             asm.emitByte(op);
1385             asm.emitModRM(dst, src1);
1386         }
1387 
1388         public void emit(AMD64Assembler asm, AVXSize size, Register dst, AMD64Address src1, Register src2) {
1389             assert assertion.check((AMD64) asm.target.arch, LZ, dst, src2, null, null);
1390             assert size == AVXSize.DWORD || size == AVXSize.QWORD;
1391             asm.vexPrefix(dst, src2, src1, size, pp, mmmmm, size == AVXSize.DWORD ? W0 : W1, false);
1392             asm.emitByte(op);
1393             asm.emitOperandHelper(dst, src1, 0);
1394         }
1395     }
1396 
1397     public static final class VexGeneralPurposeRMOp extends VexRMOp {
1398         // @formatter:off
1399         public static final VexGeneralPurposeRMOp BLSI    = new VexGeneralPurposeRMOp("BLSI",   P_,    M_0F38, WIG, 0xF3, 3, VEXOpAssertion.BMI1);
1400         public static final VexGeneralPurposeRMOp BLSMSK  = new VexGeneralPurposeRMOp("BLSMSK", P_,    M_0F38, WIG, 0xF3, 2, VEXOpAssertion.BMI1);
1401         public static final VexGeneralPurposeRMOp BLSR    = new VexGeneralPurposeRMOp("BLSR",   P_,    M_0F38, WIG, 0xF3, 1, VEXOpAssertion.BMI1);
1402         // @formatter:on
1403         private final int ext;
1404 
1405         private VexGeneralPurposeRMOp(String opcode, int pp, int mmmmm, int w, int op, int ext, VEXOpAssertion assertion) {
1406             super(opcode, pp, mmmmm, w, op, assertion);
1407             this.ext = ext;
1408         }
1409 
1410         @Override
1411         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src) {
1412             assert assertion.check((AMD64) asm.target.arch, size, dst, null, null);
1413             asm.vexPrefix(AMD64.cpuRegisters[ext], dst, src, size, pp, mmmmm, size == AVXSize.DWORD ? W0 : W1, false);
1414             asm.emitByte(op);
1415             asm.emitModRM(ext, src);
1416         }
1417 
1418         @Override
1419         public void emit(AMD64Assembler asm, AVXSize size, Register dst, AMD64Address src) {
1420             assert assertion.check((AMD64) asm.target.arch, size, dst, null, null);
1421             asm.vexPrefix(AMD64.cpuRegisters[ext], dst, src, size, pp, mmmmm, size == AVXSize.DWORD ? W0 : W1, false);
1422             asm.emitByte(op);
1423             asm.emitOperandHelper(ext, src, 0);
1424         }
1425     }
1426 
1427     /**
1428      * VEX-encoded shift instructions with an operand order of either RVM or VMI.
1429      */
1430     public static final class VexShiftOp extends VexRVMOp implements VexRRIOp {
1431         // @formatter:off
1432         public static final VexShiftOp VPSRLW = new VexShiftOp("VPSRLW", P_66, M_0F, WIG, 0xD1, 0x71, 2);
1433         public static final VexShiftOp VPSRLD = new VexShiftOp("VPSRLD", P_66, M_0F, WIG, 0xD2, 0x72, 2);
1434         public static final VexShiftOp VPSRLQ = new VexShiftOp("VPSRLQ", P_66, M_0F, WIG, 0xD3, 0x73, 2);
1435         public static final VexShiftOp VPSRAW = new VexShiftOp("VPSRAW", P_66, M_0F, WIG, 0xE1, 0x71, 4);
1436         public static final VexShiftOp VPSRAD = new VexShiftOp("VPSRAD", P_66, M_0F, WIG, 0xE2, 0x72, 4);
1437         public static final VexShiftOp VPSLLW = new VexShiftOp("VPSLLW", P_66, M_0F, WIG, 0xF1, 0x71, 6);
1438         public static final VexShiftOp VPSLLD = new VexShiftOp("VPSLLD", P_66, M_0F, WIG, 0xF2, 0x72, 6);
1439         public static final VexShiftOp VPSLLQ = new VexShiftOp("VPSLLQ", P_66, M_0F, WIG, 0xF3, 0x73, 6);
1440         // @formatter:on
1441 
1442         private final int immOp;
1443         private final int r;
1444 
1445         private VexShiftOp(String opcode, int pp, int mmmmm, int w, int op, int immOp, int r) {
1446             super(opcode, pp, mmmmm, w, op, VEXOpAssertion.AVX1_2);
1447             this.immOp = immOp;
1448             this.r = r;
1449         }
1450 
1451         @Override
1452         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src, int imm8) {
1453             assert assertion.check((AMD64) asm.target.arch, size, null, dst, src);
1454             asm.vexPrefix(null, dst, src, size, pp, mmmmm, w, false);
1455             asm.emitByte(immOp);
1456             asm.emitModRM(r, src);
1457             asm.emitByte(imm8);
1458         }
1459     }
1460 
1461     public static final class VexMaskMoveOp extends VexOp {
1462         // @formatter:off
1463         public static final VexMaskMoveOp VMASKMOVPS = new VexMaskMoveOp("VMASKMOVPS", P_66, M_0F38, W0, 0x2C, 0x2E);
1464         public static final VexMaskMoveOp VMASKMOVPD = new VexMaskMoveOp("VMASKMOVPD", P_66, M_0F38, W0, 0x2D, 0x2F);
1465         public static final VexMaskMoveOp VPMASKMOVD = new VexMaskMoveOp("VPMASKMOVD", P_66, M_0F38, W0, 0x8C, 0x8E, VEXOpAssertion.AVX2);
1466         public static final VexMaskMoveOp VPMASKMOVQ = new VexMaskMoveOp("VPMASKMOVQ", P_66, M_0F38, W1, 0x8C, 0x8E, VEXOpAssertion.AVX2);
1467         // @formatter:on
1468 
1469         private final int opReverse;
1470 
1471         private VexMaskMoveOp(String opcode, int pp, int mmmmm, int w, int op, int opReverse) {
1472             this(opcode, pp, mmmmm, w, op, opReverse, VEXOpAssertion.AVX1);
1473         }
1474 
1475         private VexMaskMoveOp(String opcode, int pp, int mmmmm, int w, int op, int opReverse, VEXOpAssertion assertion) {
1476             super(opcode, pp, mmmmm, w, op, assertion);
1477             this.opReverse = opReverse;
1478         }
1479 
1480         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register mask, AMD64Address src) {
1481             assert assertion.check((AMD64) asm.target.arch, size, dst, mask, null);
1482             asm.vexPrefix(dst, mask, src, size, pp, mmmmm, w, false);
1483             asm.emitByte(op);
1484             asm.emitOperandHelper(dst, src, 0);
1485         }
1486 
1487         public void emit(AMD64Assembler asm, AVXSize size, AMD64Address dst, Register mask, Register src) {
1488             assert assertion.check((AMD64) asm.target.arch, size, src, mask, null);
1489             asm.vexPrefix(src, mask, dst, size, pp, mmmmm, w, false);
1490             asm.emitByte(opReverse);
1491             asm.emitOperandHelper(src, dst, 0);
1492         }
1493     }
1494 
1495     /**
1496      * VEX-encoded instructions with an operand order of RVMI.
1497      */
1498     public static final class VexRVMIOp extends VexOp {
1499         // @formatter:off
1500         public static final VexRVMIOp VSHUFPS     = new VexRVMIOp("VSHUFPS",     P_,   M_0F,   WIG, 0xC6);
1501         public static final VexRVMIOp VSHUFPD     = new VexRVMIOp("VSHUFPD",     P_66, M_0F,   WIG, 0xC6);
1502         public static final VexRVMIOp VINSERTF128 = new VexRVMIOp("VINSERTF128", P_66, M_0F3A, W0,  0x18, VEXOpAssertion.AVX1_256ONLY);
1503         public static final VexRVMIOp VINSERTI128 = new VexRVMIOp("VINSERTI128", P_66, M_0F3A, W0,  0x38, VEXOpAssertion.AVX2_256ONLY);
1504         // @formatter:on
1505 
1506         private VexRVMIOp(String opcode, int pp, int mmmmm, int w, int op) {
1507             this(opcode, pp, mmmmm, w, op, VEXOpAssertion.AVX1);
1508         }
1509 
1510         private VexRVMIOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
1511             super(opcode, pp, mmmmm, w, op, assertion);
1512         }
1513 
1514         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, Register src2, int imm8) {
1515             assert assertion.check((AMD64) asm.target.arch, size, dst, src1, src2);
1516             assert (imm8 & 0xFF) == imm8;
1517             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w, false);
1518             asm.emitByte(op);
1519             asm.emitModRM(dst, src2);
1520             asm.emitByte(imm8);
1521         }
1522 
1523         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, AMD64Address src2, int imm8) {
1524             assert assertion.check((AMD64) asm.target.arch, size, dst, src1, null);
1525             assert (imm8 & 0xFF) == imm8;
1526             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w, false);
1527             asm.emitByte(op);
1528             asm.emitOperandHelper(dst, src2, 1);
1529             asm.emitByte(imm8);
1530         }
1531     }
1532 
1533     /**
1534      * VEX-encoded comparison operation with an operand order of RVMI. The immediate operand is a
1535      * comparison operator.
1536      */
1537     public static final class VexFloatCompareOp extends VexOp {
1538         // @formatter:off
1539         public static final VexFloatCompareOp VCMPPS = new VexFloatCompareOp("VCMPPS", P_,   M_0F, WIG, 0xC2);
1540         public static final VexFloatCompareOp VCMPPD = new VexFloatCompareOp("VCMPPD", P_66, M_0F, WIG, 0xC2);
1541         public static final VexFloatCompareOp VCMPSS = new VexFloatCompareOp("VCMPSS", P_F2, M_0F, WIG, 0xC2);
1542         public static final VexFloatCompareOp VCMPSD = new VexFloatCompareOp("VCMPSD", P_F2, M_0F, WIG, 0xC2);
1543         // @formatter:on
1544 
1545         public enum Predicate {
1546             EQ_OQ(0x00),
1547             LT_OS(0x01),
1548             LE_OS(0x02),
1549             UNORD_Q(0x03),
1550             NEQ_UQ(0x04),
1551             NLT_US(0x05),
1552             NLE_US(0x06),
1553             ORD_Q(0x07),
1554             EQ_UQ(0x08),
1555             NGE_US(0x09),
1556             NGT_US(0x0a),
1557             FALSE_OQ(0x0b),
1558             NEQ_OQ(0x0c),
1559             GE_OS(0x0d),
1560             GT_OS(0x0e),
1561             TRUE_UQ(0x0f),
1562             EQ_OS(0x10),
1563             LT_OQ(0x11),
1564             LE_OQ(0x12),
1565             UNORD_S(0x13),
1566             NEQ_US(0x14),
1567             NLT_UQ(0x15),
1568             NLE_UQ(0x16),
1569             ORD_S(0x17),
1570             EQ_US(0x18),
1571             NGE_UQ(0x19),
1572             NGT_UQ(0x1a),
1573             FALSE_OS(0x1b),
1574             NEQ_OS(0x1c),
1575             GE_OQ(0x1d),
1576             GT_OQ(0x1e),
1577             TRUE_US(0x1f);
1578 
1579             private int imm8;
1580 
1581             Predicate(int imm8) {
1582                 this.imm8 = imm8;
1583             }
1584 
1585             public static Predicate getPredicate(Condition condition, boolean unorderedIsTrue) {
1586                 if (unorderedIsTrue) {
1587                     switch (condition) {
1588                         case EQ:
1589                             return EQ_UQ;
1590                         case NE:
1591                             return NEQ_UQ;
1592                         case LT:
1593                             return NGE_UQ;
1594                         case LE:
1595                             return NGT_UQ;
1596                         case GT:
1597                             return NLE_UQ;
1598                         case GE:
1599                             return NLT_UQ;
1600                         default:
1601                             throw GraalError.shouldNotReachHere();
1602                     }
1603                 } else {
1604                     switch (condition) {
1605                         case EQ:
1606                             return EQ_OQ;
1607                         case NE:
1608                             return NEQ_OQ;
1609                         case LT:
1610                             return LT_OQ;
1611                         case LE:
1612                             return LE_OQ;
1613                         case GT:
1614                             return GT_OQ;
1615                         case GE:
1616                             return GE_OQ;
1617                         default:
1618                             throw GraalError.shouldNotReachHere();
1619                     }
1620                 }
1621             }
1622         }
1623 
1624         private VexFloatCompareOp(String opcode, int pp, int mmmmm, int w, int op) {
1625             super(opcode, pp, mmmmm, w, op, VEXOpAssertion.AVX1);
1626         }
1627 
1628         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, Register src2, Predicate p) {
1629             assert assertion.check((AMD64) asm.target.arch, size, dst, src1, src2);
1630             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w, false);
1631             asm.emitByte(op);
1632             asm.emitModRM(dst, src2);
1633             asm.emitByte(p.imm8);
1634         }
1635 
1636         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, AMD64Address src2, Predicate p) {
1637             assert assertion.check((AMD64) asm.target.arch, size, dst, src1, null);
1638             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w, false);
1639             asm.emitByte(op);
1640             asm.emitOperandHelper(dst, src2, 1);
1641             asm.emitByte(p.imm8);
1642         }
1643     }
1644 
1645     public final void addl(AMD64Address dst, int imm32) {
1646         ADD.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
1647     }
1648 
1649     public final void addl(Register dst, int imm32) {
1650         ADD.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
1651     }
1652 
1653     public final void addl(Register dst, Register src) {
1654         ADD.rmOp.emit(this, DWORD, dst, src);
1655     }
1656 
1657     public final void addpd(Register dst, Register src) {
1658         SSEOp.ADD.emit(this, PD, dst, src);
1659     }
1660 
1661     public final void addpd(Register dst, AMD64Address src) {
1662         SSEOp.ADD.emit(this, PD, dst, src);
1663     }
1664 
1665     public final void addsd(Register dst, Register src) {
1666         SSEOp.ADD.emit(this, SD, dst, src);
1667     }
1668 
1669     public final void addsd(Register dst, AMD64Address src) {
1670         SSEOp.ADD.emit(this, SD, dst, src);
1671     }
1672 
1673     private void addrNop4() {
1674         // 4 bytes: NOP DWORD PTR [EAX+0]
1675         emitByte(0x0F);
1676         emitByte(0x1F);
1677         emitByte(0x40); // emitRm(cbuf, 0x1, EAXEnc, EAXEnc);
1678         emitByte(0); // 8-bits offset (1 byte)
1679     }
1680 
1681     private void addrNop5() {
1682         // 5 bytes: NOP DWORD PTR [EAX+EAX*0+0] 8-bits offset
1683         emitByte(0x0F);
1684         emitByte(0x1F);
1685         emitByte(0x44); // emitRm(cbuf, 0x1, EAXEnc, 0x4);
1686         emitByte(0x00); // emitRm(cbuf, 0x0, EAXEnc, EAXEnc);
1687         emitByte(0); // 8-bits offset (1 byte)
1688     }
1689 
1690     private void addrNop7() {
1691         // 7 bytes: NOP DWORD PTR [EAX+0] 32-bits offset
1692         emitByte(0x0F);
1693         emitByte(0x1F);
1694         emitByte(0x80); // emitRm(cbuf, 0x2, EAXEnc, EAXEnc);
1695         emitInt(0); // 32-bits offset (4 bytes)
1696     }
1697 
1698     private void addrNop8() {
1699         // 8 bytes: NOP DWORD PTR [EAX+EAX*0+0] 32-bits offset
1700         emitByte(0x0F);
1701         emitByte(0x1F);
1702         emitByte(0x84); // emitRm(cbuf, 0x2, EAXEnc, 0x4);
1703         emitByte(0x00); // emitRm(cbuf, 0x0, EAXEnc, EAXEnc);
1704         emitInt(0); // 32-bits offset (4 bytes)
1705     }
1706 
1707     public final void andl(Register dst, int imm32) {
1708         AND.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
1709     }
1710 
1711     public final void andl(Register dst, Register src) {
1712         AND.rmOp.emit(this, DWORD, dst, src);
1713     }
1714 
1715     public final void andpd(Register dst, Register src) {
1716         SSEOp.AND.emit(this, PD, dst, src);
1717     }
1718 
1719     public final void andpd(Register dst, AMD64Address src) {
1720         SSEOp.AND.emit(this, PD, dst, src);
1721     }
1722 
1723     public final void bsfq(Register dst, Register src) {
1724         prefixq(dst, src);
1725         emitByte(0x0F);
1726         emitByte(0xBC);
1727         emitModRM(dst, src);
1728     }
1729 
1730     public final void bsrl(Register dst, Register src) {
1731         prefix(dst, src);
1732         emitByte(0x0F);
1733         emitByte(0xBD);
1734         emitModRM(dst, src);
1735     }
1736 
1737     public final void bswapl(Register reg) {
1738         prefix(reg);
1739         emitByte(0x0F);
1740         emitModRM(1, reg);
1741     }
1742 
1743     public final void cdql() {
1744         emitByte(0x99);
1745     }
1746 
1747     public final void cmovl(ConditionFlag cc, Register dst, Register src) {
1748         prefix(dst, src);
1749         emitByte(0x0F);
1750         emitByte(0x40 | cc.getValue());
1751         emitModRM(dst, src);
1752     }
1753 
1754     public final void cmovl(ConditionFlag cc, Register dst, AMD64Address src) {
1755         prefix(src, dst);
1756         emitByte(0x0F);
1757         emitByte(0x40 | cc.getValue());
1758         emitOperandHelper(dst, src, 0);
1759     }
1760 
1761     public final void cmpb(Register dst, Register src) {
1762         CMP.byteRmOp.emit(this, BYTE, dst, src);
1763     }
1764 
1765     public final void cmpw(Register dst, Register src) {
1766         CMP.rmOp.emit(this, WORD, dst, src);
1767     }
1768 
1769     public final void cmpl(Register dst, int imm32) {
1770         CMP.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
1771     }
1772 
1773     public final void cmpl(Register dst, Register src) {
1774         CMP.rmOp.emit(this, DWORD, dst, src);
1775     }
1776 
1777     public final void cmpl(Register dst, AMD64Address src) {
1778         CMP.rmOp.emit(this, DWORD, dst, src);
1779     }
1780 
1781     public final void cmpl(AMD64Address dst, int imm32) {
1782         CMP.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
1783     }
1784 
1785     /**
1786      * The 8-bit cmpxchg compares the value at adr with the contents of X86.rax, and stores reg into
1787      * adr if so; otherwise, the value at adr is loaded into X86.rax,. The ZF is set if the compared
1788      * values were equal, and cleared otherwise.
1789      */
1790     public final void cmpxchgb(Register reg, AMD64Address adr) { // cmpxchg
1791         prefixb(adr, reg);
1792         emitByte(0x0F);
1793         emitByte(0xB0);
1794         emitOperandHelper(reg, adr, 0);
1795     }
1796 
1797     /**
1798      * The 16-bit cmpxchg compares the value at adr with the contents of X86.rax, and stores reg
1799      * into adr if so; otherwise, the value at adr is loaded into X86.rax,. The ZF is set if the
1800      * compared values were equal, and cleared otherwise.
1801      */
1802     public final void cmpxchgw(Register reg, AMD64Address adr) { // cmpxchg
1803         emitByte(0x66); // Switch to 16-bit mode.
1804         prefix(adr, reg);
1805         emitByte(0x0F);
1806         emitByte(0xB1);
1807         emitOperandHelper(reg, adr, 0);
1808     }
1809 
1810     /**
1811      * The 32-bit cmpxchg compares the value at adr with the contents of X86.rax, and stores reg
1812      * into adr if so; otherwise, the value at adr is loaded into X86.rax,. The ZF is set if the
1813      * compared values were equal, and cleared otherwise.
1814      */
1815     public final void cmpxchgl(Register reg, AMD64Address adr) { // cmpxchg
1816         prefix(adr, reg);
1817         emitByte(0x0F);
1818         emitByte(0xB1);
1819         emitOperandHelper(reg, adr, 0);
1820     }
1821 
1822     public final void cvtsi2sdl(Register dst, Register src) {
1823         SSEOp.CVTSI2SD.emit(this, DWORD, dst, src);
1824     }
1825 
1826     public final void cvttsd2sil(Register dst, Register src) {
1827         SSEOp.CVTTSD2SI.emit(this, DWORD, dst, src);
1828     }
1829 
1830     public final void decl(AMD64Address dst) {
1831         prefix(dst);
1832         emitByte(0xFF);
1833         emitOperandHelper(1, dst, 0);
1834     }
1835 
1836     public final void divsd(Register dst, Register src) {
1837         SSEOp.DIV.emit(this, SD, dst, src);
1838     }
1839 
1840     public final void hlt() {
1841         emitByte(0xF4);
1842     }
1843 
1844     public final void imull(Register dst, Register src, int value) {
1845         if (isByte(value)) {
1846             AMD64RMIOp.IMUL_SX.emit(this, DWORD, dst, src, value);
1847         } else {
1848             AMD64RMIOp.IMUL.emit(this, DWORD, dst, src, value);
1849         }
1850     }
1851 
1852     public final void incl(AMD64Address dst) {
1853         prefix(dst);
1854         emitByte(0xFF);
1855         emitOperandHelper(0, dst, 0);
1856     }
1857 
1858     public void jcc(ConditionFlag cc, int jumpTarget, boolean forceDisp32) {
1859         int shortSize = 2;
1860         int longSize = 6;
1861         long disp = jumpTarget - position();
1862         if (!forceDisp32 && isByte(disp - shortSize)) {
1863             // 0111 tttn #8-bit disp
1864             emitByte(0x70 | cc.getValue());
1865             emitByte((int) ((disp - shortSize) & 0xFF));
1866         } else {
1867             // 0000 1111 1000 tttn #32-bit disp
1868             assert isInt(disp - longSize) : "must be 32bit offset (call4)";
1869             emitByte(0x0F);
1870             emitByte(0x80 | cc.getValue());
1871             emitInt((int) (disp - longSize));
1872         }
1873     }
1874 
1875     public final void jcc(ConditionFlag cc, Label l) {
1876         assert (0 <= cc.getValue()) && (cc.getValue() < 16) : "illegal cc";
1877         if (l.isBound()) {
1878             jcc(cc, l.position(), false);
1879         } else {
1880             // Note: could eliminate cond. jumps to this jump if condition
1881             // is the same however, seems to be rather unlikely case.
1882             // Note: use jccb() if label to be bound is very close to get
1883             // an 8-bit displacement
1884             l.addPatchAt(position(), this);
1885             emitByte(0x0F);
1886             emitByte(0x80 | cc.getValue());
1887             emitInt(0);
1888         }
1889 
1890     }
1891 
1892     public final void jccb(ConditionFlag cc, Label l) {
1893         if (l.isBound()) {
1894             int shortSize = 2;
1895             int entry = l.position();
1896             assert isByte(entry - (position() + shortSize)) : "Dispacement too large for a short jmp";
1897             long disp = entry - position();
1898             // 0111 tttn #8-bit disp
1899             emitByte(0x70 | cc.getValue());
1900             emitByte((int) ((disp - shortSize) & 0xFF));
1901         } else {
1902             l.addPatchAt(position(), this);
1903             emitByte(0x70 | cc.getValue());
1904             emitByte(0);
1905         }
1906     }
1907 
1908     public final void jmp(int jumpTarget, boolean forceDisp32) {
1909         int shortSize = 2;
1910         int longSize = 5;
1911         long disp = jumpTarget - position();
1912         if (!forceDisp32 && isByte(disp - shortSize)) {
1913             emitByte(0xEB);
1914             emitByte((int) ((disp - shortSize) & 0xFF));
1915         } else {
1916             emitByte(0xE9);
1917             emitInt((int) (disp - longSize));
1918         }
1919     }
1920 
1921     @Override
1922     public final void jmp(Label l) {
1923         if (l.isBound()) {
1924             jmp(l.position(), false);
1925         } else {
1926             // By default, forward jumps are always 32-bit displacements, since
1927             // we can't yet know where the label will be bound. If you're sure that
1928             // the forward jump will not run beyond 256 bytes, use jmpb to
1929             // force an 8-bit displacement.
1930 
1931             l.addPatchAt(position(), this);
1932             emitByte(0xE9);
1933             emitInt(0);
1934         }
1935     }
1936 
1937     public final void jmp(Register entry) {
1938         prefix(entry);
1939         emitByte(0xFF);
1940         emitModRM(4, entry);
1941     }
1942 
1943     public final void jmp(AMD64Address adr) {
1944         prefix(adr);
1945         emitByte(0xFF);
1946         emitOperandHelper(AMD64.rsp, adr, 0);
1947     }
1948 
1949     public final void jmpb(Label l) {
1950         if (l.isBound()) {
1951             int shortSize = 2;
1952             // Displacement is relative to byte just after jmpb instruction
1953             int displacement = l.position() - position() - shortSize;
1954             GraalError.guarantee(isByte(displacement), "Displacement too large to be encoded as a byte: %d", displacement);
1955             emitByte(0xEB);
1956             emitByte(displacement & 0xFF);
1957         } else {
1958             l.addPatchAt(position(), this);
1959             emitByte(0xEB);
1960             emitByte(0);
1961         }
1962     }
1963 
1964     public final void lead(Register dst, AMD64Address src) {
1965         prefix(src, dst);
1966         emitByte(0x8D);
1967         emitOperandHelper(dst, src, 0);
1968     }
1969 
1970     public final void leaq(Register dst, AMD64Address src) {
1971         prefixq(src, dst);
1972         emitByte(0x8D);
1973         emitOperandHelper(dst, src, 0);
1974     }
1975 
1976     public final void leave() {
1977         emitByte(0xC9);
1978     }
1979 
1980     public final void lock() {
1981         emitByte(0xF0);
1982     }
1983 
1984     public final void movapd(Register dst, Register src) {
1985         assert inRC(XMM, dst) && inRC(XMM, src);
1986         simdPrefix(dst, Register.None, src, PD, P_0F, false);
1987         emitByte(0x28);
1988         emitModRM(dst, src);
1989     }
1990 
1991     public final void movaps(Register dst, Register src) {
1992         assert inRC(XMM, dst) && inRC(XMM, src);
1993         simdPrefix(dst, Register.None, src, PS, P_0F, false);
1994         emitByte(0x28);
1995         emitModRM(dst, src);
1996     }
1997 
1998     public final void movb(AMD64Address dst, int imm8) {
1999         prefix(dst);
2000         emitByte(0xC6);
2001         emitOperandHelper(0, dst, 1);
2002         emitByte(imm8);
2003     }
2004 
2005     public final void movb(AMD64Address dst, Register src) {
2006         assert inRC(CPU, src) : "must have byte register";
2007         prefixb(dst, src);
2008         emitByte(0x88);
2009         emitOperandHelper(src, dst, 0);
2010     }
2011 
2012     public final void movl(Register dst, int imm32) {
2013         movl(dst, imm32, false);
2014     }
2015 
2016     public final void movl(Register dst, int imm32, boolean annotateImm) {
2017         int insnPos = position();
2018         prefix(dst);
2019         emitByte(0xB8 + encode(dst));
2020         int immPos = position();
2021         emitInt(imm32);
2022         int nextInsnPos = position();
2023         if (annotateImm && codePatchingAnnotationConsumer != null) {
2024             codePatchingAnnotationConsumer.accept(new OperandDataAnnotation(insnPos, immPos, nextInsnPos - immPos, nextInsnPos));
2025         }
2026     }
2027 
2028     public final void movl(Register dst, Register src) {
2029         prefix(dst, src);
2030         emitByte(0x8B);
2031         emitModRM(dst, src);
2032     }
2033 
2034     public final void movl(Register dst, AMD64Address src) {
2035         prefix(src, dst);
2036         emitByte(0x8B);
2037         emitOperandHelper(dst, src, 0);
2038     }
2039 
2040     /**
2041      * @param wide use 4 byte encoding for displacements that would normally fit in a byte
2042      */
2043     public final void movl(Register dst, AMD64Address src, boolean wide) {
2044         prefix(src, dst);
2045         emitByte(0x8B);
2046         emitOperandHelper(dst, src, wide, 0);
2047     }
2048 
2049     public final void movl(AMD64Address dst, int imm32) {
2050         prefix(dst);
2051         emitByte(0xC7);
2052         emitOperandHelper(0, dst, 4);
2053         emitInt(imm32);
2054     }
2055 
2056     public final void movl(AMD64Address dst, Register src) {
2057         prefix(dst, src);
2058         emitByte(0x89);
2059         emitOperandHelper(src, dst, 0);
2060     }
2061 
2062     /**
2063      * New CPUs require use of movsd and movss to avoid partial register stall when loading from
2064      * memory. But for old Opteron use movlpd instead of movsd. The selection is done in
2065      * {@link AMD64MacroAssembler#movdbl(Register, AMD64Address)} and
2066      * {@link AMD64MacroAssembler#movflt(Register, Register)}.
2067      */
2068     public final void movlpd(Register dst, AMD64Address src) {
2069         assert inRC(XMM, dst);
2070         simdPrefix(dst, dst, src, PD, P_0F, false);
2071         emitByte(0x12);
2072         emitOperandHelper(dst, src, 0);
2073     }
2074 
2075     public final void movlhps(Register dst, Register src) {
2076         assert inRC(XMM, dst) && inRC(XMM, src);
2077         simdPrefix(dst, src, src, PS, P_0F, false);
2078         emitByte(0x16);
2079         emitModRM(dst, src);
2080     }
2081 
2082     public final void movq(Register dst, AMD64Address src) {
2083         movq(dst, src, false);
2084     }
2085 
2086     public final void movq(Register dst, AMD64Address src, boolean force4BytesDisplacement) {
2087         if (inRC(XMM, dst)) {
2088             // Insn: MOVQ xmm, r/m64
2089             // Code: F3 0F 7E /r
2090             // An alternative instruction would be 66 REX.W 0F 6E /r. We prefer the REX.W free
2091             // format, because it would allow us to emit 2-bytes-prefixed vex-encoding instruction
2092             // when applicable.
2093             simdPrefix(dst, Register.None, src, SS, P_0F, false);
2094             emitByte(0x7E);
2095             emitOperandHelper(dst, src, force4BytesDisplacement, 0);
2096         } else {
2097             // gpr version of movq
2098             prefixq(src, dst);
2099             emitByte(0x8B);
2100             emitOperandHelper(dst, src, force4BytesDisplacement, 0);
2101         }
2102     }
2103 
2104     public final void movq(Register dst, Register src) {
2105         assert inRC(CPU, dst) && inRC(CPU, src);
2106         prefixq(dst, src);
2107         emitByte(0x8B);
2108         emitModRM(dst, src);
2109     }
2110 
2111     public final void movq(AMD64Address dst, Register src) {
2112         if (inRC(XMM, src)) {
2113             // Insn: MOVQ r/m64, xmm
2114             // Code: 66 0F D6 /r
2115             // An alternative instruction would be 66 REX.W 0F 7E /r. We prefer the REX.W free
2116             // format, because it would allow us to emit 2-bytes-prefixed vex-encoding instruction
2117             // when applicable.
2118             simdPrefix(src, Register.None, dst, PD, P_0F, false);
2119             emitByte(0xD6);
2120             emitOperandHelper(src, dst, 0);
2121         } else {
2122             // gpr version of movq
2123             prefixq(dst, src);
2124             emitByte(0x89);
2125             emitOperandHelper(src, dst, 0);
2126         }
2127     }
2128 
2129     public final void movsbl(Register dst, AMD64Address src) {
2130         prefix(src, dst);
2131         emitByte(0x0F);
2132         emitByte(0xBE);
2133         emitOperandHelper(dst, src, 0);
2134     }
2135 
2136     public final void movsbl(Register dst, Register src) {
2137         prefix(dst, false, src, true);
2138         emitByte(0x0F);
2139         emitByte(0xBE);
2140         emitModRM(dst, src);
2141     }
2142 
2143     public final void movsbq(Register dst, AMD64Address src) {
2144         prefixq(src, dst);
2145         emitByte(0x0F);
2146         emitByte(0xBE);
2147         emitOperandHelper(dst, src, 0);
2148     }
2149 
2150     public final void movsbq(Register dst, Register src) {
2151         prefixq(dst, src);
2152         emitByte(0x0F);
2153         emitByte(0xBE);
2154         emitModRM(dst, src);
2155     }
2156 
2157     public final void movsd(Register dst, Register src) {
2158         AMD64RMOp.MOVSD.emit(this, SD, dst, src);
2159     }
2160 
2161     public final void movsd(Register dst, AMD64Address src) {
2162         AMD64RMOp.MOVSD.emit(this, SD, dst, src);
2163     }
2164 
2165     public final void movsd(AMD64Address dst, Register src) {
2166         AMD64MROp.MOVSD.emit(this, SD, dst, src);
2167     }
2168 
2169     public final void movss(Register dst, Register src) {
2170         AMD64RMOp.MOVSS.emit(this, SS, dst, src);
2171     }
2172 
2173     public final void movss(Register dst, AMD64Address src) {
2174         AMD64RMOp.MOVSS.emit(this, SS, dst, src);
2175     }
2176 
2177     public final void movss(AMD64Address dst, Register src) {
2178         AMD64MROp.MOVSS.emit(this, SS, dst, src);
2179     }
2180 
2181     public final void mulpd(Register dst, Register src) {
2182         SSEOp.MUL.emit(this, PD, dst, src);
2183     }
2184 
2185     public final void mulpd(Register dst, AMD64Address src) {
2186         SSEOp.MUL.emit(this, PD, dst, src);
2187     }
2188 
2189     public final void mulsd(Register dst, Register src) {
2190         SSEOp.MUL.emit(this, SD, dst, src);
2191     }
2192 
2193     public final void mulsd(Register dst, AMD64Address src) {
2194         SSEOp.MUL.emit(this, SD, dst, src);
2195     }
2196 
2197     public final void mulss(Register dst, Register src) {
2198         SSEOp.MUL.emit(this, SS, dst, src);
2199     }
2200 
2201     public final void movswl(Register dst, AMD64Address src) {
2202         AMD64RMOp.MOVSX.emit(this, DWORD, dst, src);
2203     }
2204 
2205     public final void movswq(Register dst, AMD64Address src) {
2206         AMD64RMOp.MOVSX.emit(this, QWORD, dst, src);
2207     }
2208 
2209     public final void movw(AMD64Address dst, int imm16) {
2210         emitByte(0x66); // switch to 16-bit mode
2211         prefix(dst);
2212         emitByte(0xC7);
2213         emitOperandHelper(0, dst, 2);
2214         emitShort(imm16);
2215     }
2216 
2217     public final void movw(AMD64Address dst, Register src) {
2218         emitByte(0x66);
2219         prefix(dst, src);
2220         emitByte(0x89);
2221         emitOperandHelper(src, dst, 0);
2222     }
2223 
2224     public final void movw(Register dst, AMD64Address src) {
2225         emitByte(0x66);
2226         prefix(src, dst);
2227         emitByte(0x8B);
2228         emitOperandHelper(dst, src, 0);
2229     }
2230 
2231     public final void movzbl(Register dst, AMD64Address src) {
2232         prefix(src, dst);
2233         emitByte(0x0F);
2234         emitByte(0xB6);
2235         emitOperandHelper(dst, src, 0);
2236     }
2237 
2238     public final void movzbl(Register dst, Register src) {
2239         AMD64RMOp.MOVZXB.emit(this, DWORD, dst, src);
2240     }
2241 
2242     public final void movzbq(Register dst, Register src) {
2243         AMD64RMOp.MOVZXB.emit(this, QWORD, dst, src);
2244     }
2245 
2246     public final void movzbq(Register dst, AMD64Address src) {
2247         AMD64RMOp.MOVZXB.emit(this, QWORD, dst, src);
2248     }
2249 
2250     public final void movzwl(Register dst, AMD64Address src) {
2251         AMD64RMOp.MOVZX.emit(this, DWORD, dst, src);
2252     }
2253 
2254     public final void movzwq(Register dst, AMD64Address src) {
2255         AMD64RMOp.MOVZX.emit(this, QWORD, dst, src);
2256     }
2257 
2258     public final void negl(Register dst) {
2259         NEG.emit(this, DWORD, dst);
2260     }
2261 
2262     public final void notl(Register dst) {
2263         NOT.emit(this, DWORD, dst);
2264     }
2265 
2266     public final void notq(Register dst) {
2267         NOT.emit(this, QWORD, dst);
2268     }
2269 
2270     @Override
2271     public final void ensureUniquePC() {
2272         nop();
2273     }
2274 
2275     public final void nop() {
2276         nop(1);
2277     }
2278 
2279     public void nop(int count) {
2280         int i = count;
2281         if (UseNormalNop) {
2282             assert i > 0 : " ";
2283             // The fancy nops aren't currently recognized by debuggers making it a
2284             // pain to disassemble code while debugging. If assert are on clearly
2285             // speed is not an issue so simply use the single byte traditional nop
2286             // to do alignment.
2287 
2288             for (; i > 0; i--) {
2289                 emitByte(0x90);
2290             }
2291             return;
2292         }
2293 
2294         if (UseAddressNop) {
2295             //
2296             // Using multi-bytes nops "0x0F 0x1F [Address]" for AMD.
2297             // 1: 0x90
2298             // 2: 0x66 0x90
2299             // 3: 0x66 0x66 0x90 (don't use "0x0F 0x1F 0x00" - need patching safe padding)
2300             // 4: 0x0F 0x1F 0x40 0x00
2301             // 5: 0x0F 0x1F 0x44 0x00 0x00
2302             // 6: 0x66 0x0F 0x1F 0x44 0x00 0x00
2303             // 7: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
2304             // 8: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
2305             // 9: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
2306             // 10: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
2307             // 11: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
2308 
2309             // The rest coding is AMD specific - use consecutive Address nops
2310 
2311             // 12: 0x66 0x0F 0x1F 0x44 0x00 0x00 0x66 0x0F 0x1F 0x44 0x00 0x00
2312             // 13: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00 0x66 0x0F 0x1F 0x44 0x00 0x00
2313             // 14: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
2314             // 15: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
2315             // 16: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
2316             // Size prefixes (0x66) are added for larger sizes
2317 
2318             while (i >= 22) {
2319                 i -= 11;
2320                 emitByte(0x66); // size prefix
2321                 emitByte(0x66); // size prefix
2322                 emitByte(0x66); // size prefix
2323                 addrNop8();
2324             }
2325             // Generate first nop for size between 21-12
2326             switch (i) {
2327                 case 21:
2328                     i -= 11;
2329                     emitByte(0x66); // size prefix
2330                     emitByte(0x66); // size prefix
2331                     emitByte(0x66); // size prefix
2332                     addrNop8();
2333                     break;
2334                 case 20:
2335                 case 19:
2336                     i -= 10;
2337                     emitByte(0x66); // size prefix
2338                     emitByte(0x66); // size prefix
2339                     addrNop8();
2340                     break;
2341                 case 18:
2342                 case 17:
2343                     i -= 9;
2344                     emitByte(0x66); // size prefix
2345                     addrNop8();
2346                     break;
2347                 case 16:
2348                 case 15:
2349                     i -= 8;
2350                     addrNop8();
2351                     break;
2352                 case 14:
2353                 case 13:
2354                     i -= 7;
2355                     addrNop7();
2356                     break;
2357                 case 12:
2358                     i -= 6;
2359                     emitByte(0x66); // size prefix
2360                     addrNop5();
2361                     break;
2362                 default:
2363                     assert i < 12;
2364             }
2365 
2366             // Generate second nop for size between 11-1
2367             switch (i) {
2368                 case 11:
2369                     emitByte(0x66); // size prefix
2370                     emitByte(0x66); // size prefix
2371                     emitByte(0x66); // size prefix
2372                     addrNop8();
2373                     break;
2374                 case 10:
2375                     emitByte(0x66); // size prefix
2376                     emitByte(0x66); // size prefix
2377                     addrNop8();
2378                     break;
2379                 case 9:
2380                     emitByte(0x66); // size prefix
2381                     addrNop8();
2382                     break;
2383                 case 8:
2384                     addrNop8();
2385                     break;
2386                 case 7:
2387                     addrNop7();
2388                     break;
2389                 case 6:
2390                     emitByte(0x66); // size prefix
2391                     addrNop5();
2392                     break;
2393                 case 5:
2394                     addrNop5();
2395                     break;
2396                 case 4:
2397                     addrNop4();
2398                     break;
2399                 case 3:
2400                     // Don't use "0x0F 0x1F 0x00" - need patching safe padding
2401                     emitByte(0x66); // size prefix
2402                     emitByte(0x66); // size prefix
2403                     emitByte(0x90); // nop
2404                     break;
2405                 case 2:
2406                     emitByte(0x66); // size prefix
2407                     emitByte(0x90); // nop
2408                     break;
2409                 case 1:
2410                     emitByte(0x90); // nop
2411                     break;
2412                 default:
2413                     assert i == 0;
2414             }
2415             return;
2416         }
2417 
2418         // Using nops with size prefixes "0x66 0x90".
2419         // From AMD Optimization Guide:
2420         // 1: 0x90
2421         // 2: 0x66 0x90
2422         // 3: 0x66 0x66 0x90
2423         // 4: 0x66 0x66 0x66 0x90
2424         // 5: 0x66 0x66 0x90 0x66 0x90
2425         // 6: 0x66 0x66 0x90 0x66 0x66 0x90
2426         // 7: 0x66 0x66 0x66 0x90 0x66 0x66 0x90
2427         // 8: 0x66 0x66 0x66 0x90 0x66 0x66 0x66 0x90
2428         // 9: 0x66 0x66 0x90 0x66 0x66 0x90 0x66 0x66 0x90
2429         // 10: 0x66 0x66 0x66 0x90 0x66 0x66 0x90 0x66 0x66 0x90
2430         //
2431         while (i > 12) {
2432             i -= 4;
2433             emitByte(0x66); // size prefix
2434             emitByte(0x66);
2435             emitByte(0x66);
2436             emitByte(0x90); // nop
2437         }
2438         // 1 - 12 nops
2439         if (i > 8) {
2440             if (i > 9) {
2441                 i -= 1;
2442                 emitByte(0x66);
2443             }
2444             i -= 3;
2445             emitByte(0x66);
2446             emitByte(0x66);
2447             emitByte(0x90);
2448         }
2449         // 1 - 8 nops
2450         if (i > 4) {
2451             if (i > 6) {
2452                 i -= 1;
2453                 emitByte(0x66);
2454             }
2455             i -= 3;
2456             emitByte(0x66);
2457             emitByte(0x66);
2458             emitByte(0x90);
2459         }
2460         switch (i) {
2461             case 4:
2462                 emitByte(0x66);
2463                 emitByte(0x66);
2464                 emitByte(0x66);
2465                 emitByte(0x90);
2466                 break;
2467             case 3:
2468                 emitByte(0x66);
2469                 emitByte(0x66);
2470                 emitByte(0x90);
2471                 break;
2472             case 2:
2473                 emitByte(0x66);
2474                 emitByte(0x90);
2475                 break;
2476             case 1:
2477                 emitByte(0x90);
2478                 break;
2479             default:
2480                 assert i == 0;
2481         }
2482     }
2483 
2484     public final void orl(Register dst, Register src) {
2485         OR.rmOp.emit(this, DWORD, dst, src);
2486     }
2487 
2488     public final void orl(Register dst, int imm32) {
2489         OR.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
2490     }
2491 
2492     // Insn: VPACKUSWB xmm1, xmm2, xmm3/m128
2493     // -----
2494     // Insn: VPACKUSWB xmm1, xmm1, xmm2
2495 
2496     public final void packuswb(Register dst, Register src) {
2497         assert inRC(XMM, dst) && inRC(XMM, src);
2498         // Code: VEX.NDS.128.66.0F.WIG 67 /r
2499         simdPrefix(dst, dst, src, PD, P_0F, false);
2500         emitByte(0x67);
2501         emitModRM(dst, src);
2502     }
2503 
2504     public final void pop(Register dst) {
2505         prefix(dst);
2506         emitByte(0x58 + encode(dst));
2507     }
2508 
2509     public void popfq() {
2510         emitByte(0x9D);
2511     }
2512 
2513     public final void ptest(Register dst, Register src) {
2514         assert supports(CPUFeature.SSE4_1);
2515         assert inRC(XMM, dst) && inRC(XMM, src);
2516         simdPrefix(dst, Register.None, src, PD, P_0F38, false);
2517         emitByte(0x17);
2518         emitModRM(dst, src);
2519     }
2520 
2521     public final void pcmpeqb(Register dst, Register src) {
2522         assert supports(CPUFeature.SSE2);
2523         assert inRC(XMM, dst) && inRC(XMM, src);
2524         simdPrefix(dst, dst, src, PD, P_0F, false);
2525         emitByte(0x74);
2526         emitModRM(dst, src);
2527     }
2528 
2529     public final void pcmpeqw(Register dst, Register src) {
2530         assert supports(CPUFeature.SSE2);
2531         assert inRC(XMM, dst) && inRC(XMM, src);
2532         simdPrefix(dst, dst, src, PD, P_0F, false);
2533         emitByte(0x75);
2534         emitModRM(dst, src);
2535     }
2536 
2537     public final void pcmpeqd(Register dst, Register src) {
2538         assert supports(CPUFeature.SSE2);
2539         assert dst.getRegisterCategory().equals(XMM) && src.getRegisterCategory().equals(XMM);
2540         simdPrefix(dst, dst, src, PD, P_0F, false);
2541         emitByte(0x76);
2542         emitModRM(dst, src);
2543     }
2544 
2545     public final void pcmpestri(Register dst, AMD64Address src, int imm8) {
2546         assert supports(CPUFeature.SSE4_2);
2547         assert inRC(XMM, dst);
2548         simdPrefix(dst, Register.None, src, PD, P_0F3A, false);
2549         emitByte(0x61);
2550         emitOperandHelper(dst, src, 0);
2551         emitByte(imm8);
2552     }
2553 
2554     public final void pcmpestri(Register dst, Register src, int imm8) {
2555         assert supports(CPUFeature.SSE4_2);
2556         assert inRC(XMM, dst) && inRC(XMM, src);
2557         simdPrefix(dst, Register.None, src, PD, P_0F3A, false);
2558         emitByte(0x61);
2559         emitModRM(dst, src);
2560         emitByte(imm8);
2561     }
2562 
2563     public final void pmovmskb(Register dst, Register src) {
2564         assert supports(CPUFeature.SSE2);
2565         assert inRC(CPU, dst) && inRC(XMM, src);
2566         simdPrefix(dst, Register.None, src, PD, P_0F, false);
2567         emitByte(0xD7);
2568         emitModRM(dst, src);
2569     }
2570 
2571     private void pmovSZx(Register dst, AMD64Address src, int op) {
2572         assert supports(CPUFeature.SSE4_1);
2573         assert inRC(XMM, dst);
2574         simdPrefix(dst, Register.None, src, PD, P_0F38, false);
2575         emitByte(op);
2576         emitOperandHelper(dst, src, 0);
2577     }
2578 
2579     public final void pmovsxbw(Register dst, AMD64Address src) {
2580         pmovSZx(dst, src, 0x20);
2581     }
2582 
2583     public final void pmovsxbd(Register dst, AMD64Address src) {
2584         pmovSZx(dst, src, 0x21);
2585     }
2586 
2587     public final void pmovsxbq(Register dst, AMD64Address src) {
2588         pmovSZx(dst, src, 0x22);
2589     }
2590 
2591     public final void pmovsxwd(Register dst, AMD64Address src) {
2592         pmovSZx(dst, src, 0x23);
2593     }
2594 
2595     public final void pmovsxwq(Register dst, AMD64Address src) {
2596         pmovSZx(dst, src, 0x24);
2597     }
2598 
2599     public final void pmovsxdq(Register dst, AMD64Address src) {
2600         pmovSZx(dst, src, 0x25);
2601     }
2602 
2603     // Insn: VPMOVZXBW xmm1, xmm2/m64
2604     public final void pmovzxbw(Register dst, AMD64Address src) {
2605         pmovSZx(dst, src, 0x30);
2606     }
2607 
2608     public final void pmovzxbd(Register dst, AMD64Address src) {
2609         pmovSZx(dst, src, 0x31);
2610     }
2611 
2612     public final void pmovzxbq(Register dst, AMD64Address src) {
2613         pmovSZx(dst, src, 0x32);
2614     }
2615 
2616     public final void pmovzxwd(Register dst, AMD64Address src) {
2617         pmovSZx(dst, src, 0x33);
2618     }
2619 
2620     public final void pmovzxwq(Register dst, AMD64Address src) {
2621         pmovSZx(dst, src, 0x34);
2622     }
2623 
2624     public final void pmovzxdq(Register dst, AMD64Address src) {
2625         pmovSZx(dst, src, 0x35);
2626     }
2627 
2628     public final void pmovzxbw(Register dst, Register src) {
2629         assert supports(CPUFeature.SSE4_1);
2630         assert inRC(XMM, dst) && inRC(XMM, src);
2631         simdPrefix(dst, Register.None, src, PD, P_0F38, false);
2632         emitByte(0x30);
2633         emitModRM(dst, src);
2634     }
2635 
2636     public final void push(Register src) {
2637         prefix(src);
2638         emitByte(0x50 + encode(src));
2639     }
2640 
2641     public void pushfq() {
2642         emitByte(0x9c);
2643     }
2644 
2645     public final void paddd(Register dst, Register src) {
2646         assert inRC(XMM, dst) && inRC(XMM, src);
2647         simdPrefix(dst, dst, src, PD, P_0F, false);
2648         emitByte(0xFE);
2649         emitModRM(dst, src);
2650     }
2651 
2652     public final void paddq(Register dst, Register src) {
2653         assert inRC(XMM, dst) && inRC(XMM, src);
2654         simdPrefix(dst, dst, src, PD, P_0F, false);
2655         emitByte(0xD4);
2656         emitModRM(dst, src);
2657     }
2658 
2659     public final void pextrw(Register dst, Register src, int imm8) {
2660         assert inRC(CPU, dst) && inRC(XMM, src);
2661         simdPrefix(dst, Register.None, src, PD, P_0F, false);
2662         emitByte(0xC5);
2663         emitModRM(dst, src);
2664         emitByte(imm8);
2665     }
2666 
2667     public final void pinsrw(Register dst, Register src, int imm8) {
2668         assert inRC(XMM, dst) && inRC(CPU, src);
2669         simdPrefix(dst, dst, src, PD, P_0F, false);
2670         emitByte(0xC4);
2671         emitModRM(dst, src);
2672         emitByte(imm8);
2673     }
2674 
2675     public final void por(Register dst, Register src) {
2676         assert inRC(XMM, dst) && inRC(XMM, src);
2677         simdPrefix(dst, dst, src, PD, P_0F, false);
2678         emitByte(0xEB);
2679         emitModRM(dst, src);
2680     }
2681 
2682     public final void pand(Register dst, Register src) {
2683         assert inRC(XMM, dst) && inRC(XMM, src);
2684         simdPrefix(dst, dst, src, PD, P_0F, false);
2685         emitByte(0xDB);
2686         emitModRM(dst, src);
2687     }
2688 
2689     public final void pxor(Register dst, Register src) {
2690         assert inRC(XMM, dst) && inRC(XMM, src);
2691         simdPrefix(dst, dst, src, PD, P_0F, false);
2692         emitByte(0xEF);
2693         emitModRM(dst, src);
2694     }
2695 
2696     public final void pslld(Register dst, int imm8) {
2697         assert isUByte(imm8) : "invalid value";
2698         assert inRC(XMM, dst);
2699         // XMM6 is for /6 encoding: 66 0F 72 /6 ib
2700         simdPrefix(AMD64.xmm6, dst, dst, PD, P_0F, false);
2701         emitByte(0x72);
2702         emitModRM(6, dst);
2703         emitByte(imm8 & 0xFF);
2704     }
2705 
2706     public final void psllq(Register dst, Register shift) {
2707         assert inRC(XMM, dst) && inRC(XMM, shift);
2708         simdPrefix(dst, dst, shift, PD, P_0F, false);
2709         emitByte(0xF3);
2710         emitModRM(dst, shift);
2711     }
2712 
2713     public final void psllq(Register dst, int imm8) {
2714         assert isUByte(imm8) : "invalid value";
2715         assert inRC(XMM, dst);
2716         // XMM6 is for /6 encoding: 66 0F 73 /6 ib
2717         simdPrefix(AMD64.xmm6, dst, dst, PD, P_0F, false);
2718         emitByte(0x73);
2719         emitModRM(6, dst);
2720         emitByte(imm8);
2721     }
2722 
2723     public final void psrad(Register dst, int imm8) {
2724         assert isUByte(imm8) : "invalid value";
2725         assert inRC(XMM, dst);
2726         // XMM4 is for /4 encoding: 66 0F 72 /4 ib
2727         simdPrefix(AMD64.xmm4, dst, dst, PD, P_0F, false);
2728         emitByte(0x72);
2729         emitModRM(4, dst);
2730         emitByte(imm8);
2731     }
2732 
2733     public final void psrld(Register dst, int imm8) {
2734         assert isUByte(imm8) : "invalid value";
2735         assert inRC(XMM, dst);
2736         // XMM2 is for /2 encoding: 66 0F 72 /2 ib
2737         simdPrefix(AMD64.xmm2, dst, dst, PD, P_0F, false);
2738         emitByte(0x72);
2739         emitModRM(2, dst);
2740         emitByte(imm8);
2741     }
2742 
2743     public final void psrlq(Register dst, int imm8) {
2744         assert isUByte(imm8) : "invalid value";
2745         assert inRC(XMM, dst);
2746         // XMM2 is for /2 encoding: 66 0F 73 /2 ib
2747         simdPrefix(AMD64.xmm2, dst, dst, PD, P_0F, false);
2748         emitByte(0x73);
2749         emitModRM(2, dst);
2750         emitByte(imm8);
2751     }
2752 
2753     public final void psrldq(Register dst, int imm8) {
2754         assert isUByte(imm8) : "invalid value";
2755         assert inRC(XMM, dst);
2756         simdPrefix(AMD64.xmm3, dst, dst, PD, P_0F, false);
2757         emitByte(0x73);
2758         emitModRM(3, dst);
2759         emitByte(imm8);
2760     }
2761 
2762     public final void pshufb(Register dst, Register src) {
2763         assert supports(CPUFeature.SSSE3);
2764         assert inRC(XMM, dst) && inRC(XMM, src);
2765         simdPrefix(dst, dst, src, PD, P_0F38, false);
2766         emitByte(0x00);
2767         emitModRM(dst, src);
2768     }
2769 
2770     public final void pshuflw(Register dst, Register src, int imm8) {
2771         assert supports(CPUFeature.SSE2);
2772         assert isUByte(imm8) : "invalid value";
2773         assert inRC(XMM, dst) && inRC(XMM, src);
2774         simdPrefix(dst, Register.None, src, SD, P_0F, false);
2775         emitByte(0x70);
2776         emitModRM(dst, src);
2777         emitByte(imm8);
2778     }
2779 
2780     public final void pshufd(Register dst, Register src, int imm8) {
2781         assert isUByte(imm8) : "invalid value";
2782         assert inRC(XMM, dst) && inRC(XMM, src);
2783         simdPrefix(dst, Register.None, src, PD, P_0F, false);
2784         emitByte(0x70);
2785         emitModRM(dst, src);
2786         emitByte(imm8);
2787     }
2788 
2789     public final void psubd(Register dst, Register src) {
2790         assert inRC(XMM, dst) && inRC(XMM, src);
2791         simdPrefix(dst, dst, src, PD, P_0F, false);
2792         emitByte(0xFA);
2793         emitModRM(dst, src);
2794     }
2795 
2796     public final void punpcklbw(Register dst, Register src) {
2797         assert supports(CPUFeature.SSE2);
2798         assert inRC(XMM, dst) && inRC(XMM, src);
2799         simdPrefix(dst, dst, src, PD, P_0F, false);
2800         emitByte(0x60);
2801         emitModRM(dst, src);
2802     }
2803 
2804     public final void rcpps(Register dst, Register src) {
2805         assert inRC(XMM, dst) && inRC(XMM, src);
2806         simdPrefix(dst, Register.None, src, PS, P_0F, false);
2807         emitByte(0x53);
2808         emitModRM(dst, src);
2809     }
2810 
2811     public final void ret(int imm16) {
2812         if (imm16 == 0) {
2813             emitByte(0xC3);
2814         } else {
2815             emitByte(0xC2);
2816             emitShort(imm16);
2817         }
2818     }
2819 
2820     public final void sarl(Register dst, int imm8) {
2821         prefix(dst);
2822         assert isShiftCount(imm8 >> 1) : "illegal shift count";
2823         if (imm8 == 1) {
2824             emitByte(0xD1);
2825             emitModRM(7, dst);
2826         } else {
2827             emitByte(0xC1);
2828             emitModRM(7, dst);
2829             emitByte(imm8);
2830         }
2831     }
2832 
2833     public final void shll(Register dst, int imm8) {
2834         assert isShiftCount(imm8 >> 1) : "illegal shift count";
2835         prefix(dst);
2836         if (imm8 == 1) {
2837             emitByte(0xD1);
2838             emitModRM(4, dst);
2839         } else {
2840             emitByte(0xC1);
2841             emitModRM(4, dst);
2842             emitByte(imm8);
2843         }
2844     }
2845 
2846     public final void shll(Register dst) {
2847         // Multiply dst by 2, CL times.
2848         prefix(dst);
2849         emitByte(0xD3);
2850         emitModRM(4, dst);
2851     }
2852 
2853     // Insn: SHLX r32a, r/m32, r32b
2854 
2855     public final void shlxl(Register dst, Register src1, Register src2) {
2856         VexGeneralPurposeRMVOp.SHLX.emit(this, AVXSize.DWORD, dst, src1, src2);
2857     }
2858 
2859     public final void shrl(Register dst, int imm8) {
2860         assert isShiftCount(imm8 >> 1) : "illegal shift count";
2861         prefix(dst);
2862         emitByte(0xC1);
2863         emitModRM(5, dst);
2864         emitByte(imm8);
2865     }
2866 
2867     public final void shrl(Register dst) {
2868         // Unsigned divide dst by 2, CL times.
2869         prefix(dst);
2870         emitByte(0xD3);
2871         emitModRM(5, dst);
2872     }
2873 
2874     public final void subl(AMD64Address dst, int imm32) {
2875         SUB.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
2876     }
2877 
2878     public final void subl(Register dst, int imm32) {
2879         SUB.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
2880     }
2881 
2882     public final void subl(Register dst, Register src) {
2883         SUB.rmOp.emit(this, DWORD, dst, src);
2884     }
2885 
2886     public final void subpd(Register dst, Register src) {
2887         SSEOp.SUB.emit(this, PD, dst, src);
2888     }
2889 
2890     public final void subsd(Register dst, Register src) {
2891         SSEOp.SUB.emit(this, SD, dst, src);
2892     }
2893 
2894     public final void subsd(Register dst, AMD64Address src) {
2895         SSEOp.SUB.emit(this, SD, dst, src);
2896     }
2897 
2898     public final void testl(Register dst, int imm32) {
2899         // not using emitArith because test
2900         // doesn't support sign-extension of
2901         // 8bit operands
2902         if (dst.encoding == 0) {
2903             emitByte(0xA9);
2904         } else {
2905             prefix(dst);
2906             emitByte(0xF7);
2907             emitModRM(0, dst);
2908         }
2909         emitInt(imm32);
2910     }
2911 
2912     public final void testl(Register dst, Register src) {
2913         prefix(dst, src);
2914         emitByte(0x85);
2915         emitModRM(dst, src);
2916     }
2917 
2918     public final void testl(Register dst, AMD64Address src) {
2919         prefix(src, dst);
2920         emitByte(0x85);
2921         emitOperandHelper(dst, src, 0);
2922     }
2923 
2924     public final void unpckhpd(Register dst, Register src) {
2925         assert inRC(XMM, dst) && inRC(XMM, src);
2926         simdPrefix(dst, dst, src, PD, P_0F, false);
2927         emitByte(0x15);
2928         emitModRM(dst, src);
2929     }
2930 
2931     public final void unpcklpd(Register dst, Register src) {
2932         assert inRC(XMM, dst) && inRC(XMM, src);
2933         simdPrefix(dst, dst, src, PD, P_0F, false);
2934         emitByte(0x14);
2935         emitModRM(dst, src);
2936     }
2937 
2938     public final void xorl(Register dst, Register src) {
2939         XOR.rmOp.emit(this, DWORD, dst, src);
2940     }
2941 
2942     public final void xorq(Register dst, Register src) {
2943         XOR.rmOp.emit(this, QWORD, dst, src);
2944     }
2945 
2946     public final void xorpd(Register dst, Register src) {
2947         SSEOp.XOR.emit(this, PD, dst, src);
2948     }
2949 
2950     public final void xorps(Register dst, Register src) {
2951         SSEOp.XOR.emit(this, PS, dst, src);
2952     }
2953 
2954     protected final void decl(Register dst) {
2955         // Use two-byte form (one-byte form is a REX prefix in 64-bit mode)
2956         prefix(dst);
2957         emitByte(0xFF);
2958         emitModRM(1, dst);
2959     }
2960 
2961     protected final void incl(Register dst) {
2962         // Use two-byte form (one-byte from is a REX prefix in 64-bit mode)
2963         prefix(dst);
2964         emitByte(0xFF);
2965         emitModRM(0, dst);
2966     }
2967 
2968     public final void addq(Register dst, int imm32) {
2969         ADD.getMIOpcode(QWORD, isByte(imm32)).emit(this, QWORD, dst, imm32);
2970     }
2971 
2972     public final void addq(AMD64Address dst, int imm32) {
2973         ADD.getMIOpcode(QWORD, isByte(imm32)).emit(this, QWORD, dst, imm32);
2974     }
2975 
2976     public final void addq(Register dst, Register src) {
2977         ADD.rmOp.emit(this, QWORD, dst, src);
2978     }
2979 
2980     public final void addq(AMD64Address dst, Register src) {
2981         ADD.mrOp.emit(this, QWORD, dst, src);
2982     }
2983 
2984     public final void andq(Register dst, int imm32) {
2985         AND.getMIOpcode(QWORD, isByte(imm32)).emit(this, QWORD, dst, imm32);
2986     }
2987 
2988     public final void bsrq(Register dst, Register src) {
2989         prefixq(dst, src);
2990         emitByte(0x0F);
2991         emitByte(0xBD);
2992         emitModRM(dst, src);
2993     }
2994 
2995     public final void bswapq(Register reg) {
2996         prefixq(reg);
2997         emitByte(0x0F);
2998         emitByte(0xC8 + encode(reg));
2999     }
3000 
3001     public final void cdqq() {
3002         rexw();
3003         emitByte(0x99);
3004     }
3005 
3006     public final void cmovq(ConditionFlag cc, Register dst, Register src) {
3007         prefixq(dst, src);
3008         emitByte(0x0F);
3009         emitByte(0x40 | cc.getValue());
3010         emitModRM(dst, src);
3011     }
3012 
3013     public final void setb(ConditionFlag cc, Register dst) {
3014         prefix(dst, true);
3015         emitByte(0x0F);
3016         emitByte(0x90 | cc.getValue());
3017         emitModRM(0, dst);
3018     }
3019 
3020     public final void cmovq(ConditionFlag cc, Register dst, AMD64Address src) {
3021         prefixq(src, dst);
3022         emitByte(0x0F);
3023         emitByte(0x40 | cc.getValue());
3024         emitOperandHelper(dst, src, 0);
3025     }
3026 
3027     public final void cmpq(Register dst, int imm32) {
3028         CMP.getMIOpcode(QWORD, isByte(imm32)).emit(this, QWORD, dst, imm32);
3029     }
3030 
3031     public final void cmpq(Register dst, Register src) {
3032         CMP.rmOp.emit(this, QWORD, dst, src);
3033     }
3034 
3035     public final void cmpq(Register dst, AMD64Address src) {
3036         CMP.rmOp.emit(this, QWORD, dst, src);
3037     }
3038 
3039     public final void cmpxchgq(Register reg, AMD64Address adr) {
3040         prefixq(adr, reg);
3041         emitByte(0x0F);
3042         emitByte(0xB1);
3043         emitOperandHelper(reg, adr, 0);
3044     }
3045 
3046     public final void cvtdq2pd(Register dst, Register src) {
3047         assert inRC(XMM, dst) && inRC(XMM, src);
3048         simdPrefix(dst, Register.None, src, SS, P_0F, false);
3049         emitByte(0xE6);
3050         emitModRM(dst, src);
3051     }
3052 
3053     public final void cvtsi2sdq(Register dst, Register src) {
3054         SSEOp.CVTSI2SD.emit(this, QWORD, dst, src);
3055     }
3056 
3057     public final void cvttsd2siq(Register dst, Register src) {
3058         SSEOp.CVTTSD2SI.emit(this, QWORD, dst, src);
3059     }
3060 
3061     public final void cvttpd2dq(Register dst, Register src) {
3062         assert inRC(XMM, dst) && inRC(XMM, src);
3063         simdPrefix(dst, Register.None, src, PD, P_0F, false);
3064         emitByte(0xE6);
3065         emitModRM(dst, src);
3066     }
3067 
3068     public final void decq(Register dst) {
3069         // Use two-byte form (one-byte form is a REX prefix in 64-bit mode)
3070         prefixq(dst);
3071         emitByte(0xFF);
3072         emitModRM(1, dst);
3073     }
3074 
3075     public final void decq(AMD64Address dst) {
3076         DEC.emit(this, QWORD, dst);
3077     }
3078 
3079     public final void imulq(Register dst, Register src) {
3080         prefixq(dst, src);
3081         emitByte(0x0F);
3082         emitByte(0xAF);
3083         emitModRM(dst, src);
3084     }
3085 
3086     public final void incq(Register dst) {
3087         // Don't use it directly. Use Macroincrementq() instead.
3088         // Use two-byte form (one-byte from is a REX prefix in 64-bit mode)
3089         prefixq(dst);
3090         emitByte(0xFF);
3091         emitModRM(0, dst);
3092     }
3093 
3094     public final void incq(AMD64Address dst) {
3095         INC.emit(this, QWORD, dst);
3096     }
3097 
3098     public final void movq(Register dst, long imm64) {
3099         movq(dst, imm64, false);
3100     }
3101 
3102     public final void movq(Register dst, long imm64, boolean annotateImm) {
3103         int insnPos = position();
3104         prefixq(dst);
3105         emitByte(0xB8 + encode(dst));
3106         int immPos = position();
3107         emitLong(imm64);
3108         int nextInsnPos = position();
3109         if (annotateImm && codePatchingAnnotationConsumer != null) {
3110             codePatchingAnnotationConsumer.accept(new OperandDataAnnotation(insnPos, immPos, nextInsnPos - immPos, nextInsnPos));
3111         }
3112     }
3113 
3114     public final void movslq(Register dst, int imm32) {
3115         prefixq(dst);
3116         emitByte(0xC7);
3117         emitModRM(0, dst);
3118         emitInt(imm32);
3119     }
3120 
3121     public final void movdq(Register dst, AMD64Address src) {
3122         AMD64RMOp.MOVQ.emit(this, QWORD, dst, src);
3123     }
3124 
3125     public final void movdq(AMD64Address dst, Register src) {
3126         AMD64MROp.MOVQ.emit(this, QWORD, dst, src);
3127     }
3128 
3129     public final void movdq(Register dst, Register src) {
3130         if (inRC(XMM, dst) && inRC(CPU, src)) {
3131             AMD64RMOp.MOVQ.emit(this, QWORD, dst, src);
3132         } else if (inRC(XMM, src) && inRC(CPU, dst)) {
3133             AMD64MROp.MOVQ.emit(this, QWORD, dst, src);
3134         } else {
3135             throw new InternalError("should not reach here");
3136         }
3137     }
3138 
3139     public final void movdl(Register dst, Register src) {
3140         if (inRC(XMM, dst) && inRC(CPU, src)) {
3141             AMD64RMOp.MOVD.emit(this, DWORD, dst, src);
3142         } else if (inRC(XMM, src) && inRC(CPU, dst)) {
3143             AMD64MROp.MOVD.emit(this, DWORD, dst, src);
3144         } else {
3145             throw new InternalError("should not reach here");
3146         }
3147     }
3148 
3149     public final void movdl(Register dst, AMD64Address src) {
3150         AMD64RMOp.MOVD.emit(this, DWORD, dst, src);
3151     }
3152 
3153     public final void movddup(Register dst, Register src) {
3154         assert supports(CPUFeature.SSE3);
3155         assert inRC(XMM, dst) && inRC(XMM, src);
3156         simdPrefix(dst, Register.None, src, SD, P_0F, false);
3157         emitByte(0x12);
3158         emitModRM(dst, src);
3159     }
3160 
3161     public final void movdqu(Register dst, AMD64Address src) {
3162         assert inRC(XMM, dst);
3163         simdPrefix(dst, Register.None, src, SS, P_0F, false);
3164         emitByte(0x6F);
3165         emitOperandHelper(dst, src, 0);
3166     }
3167 
3168     public final void movdqu(Register dst, Register src) {
3169         assert inRC(XMM, dst) && inRC(XMM, src);
3170         simdPrefix(dst, Register.None, src, SS, P_0F, false);
3171         emitByte(0x6F);
3172         emitModRM(dst, src);
3173     }
3174 
3175     // Insn: VMOVDQU xmm2/m128, xmm1
3176 
3177     public final void movdqu(AMD64Address dst, Register src) {
3178         assert inRC(XMM, src);
3179         // Code: VEX.128.F3.0F.WIG 7F /r
3180         simdPrefix(src, Register.None, dst, SS, P_0F, false);
3181         emitByte(0x7F);
3182         emitOperandHelper(src, dst, 0);
3183     }
3184 
3185     public final void movslq(AMD64Address dst, int imm32) {
3186         prefixq(dst);
3187         emitByte(0xC7);
3188         emitOperandHelper(0, dst, 4);
3189         emitInt(imm32);
3190     }
3191 
3192     public final void movslq(Register dst, AMD64Address src) {
3193         prefixq(src, dst);
3194         emitByte(0x63);
3195         emitOperandHelper(dst, src, 0);
3196     }
3197 
3198     public final void movslq(Register dst, Register src) {
3199         prefixq(dst, src);
3200         emitByte(0x63);
3201         emitModRM(dst, src);
3202     }
3203 
3204     public final void negq(Register dst) {
3205         prefixq(dst);
3206         emitByte(0xF7);
3207         emitModRM(3, dst);
3208     }
3209 
3210     public final void orq(Register dst, Register src) {
3211         OR.rmOp.emit(this, QWORD, dst, src);
3212     }
3213 
3214     public final void shlq(Register dst, int imm8) {
3215         assert isShiftCount(imm8 >> 1) : "illegal shift count";
3216         prefixq(dst);
3217         if (imm8 == 1) {
3218             emitByte(0xD1);
3219             emitModRM(4, dst);
3220         } else {
3221             emitByte(0xC1);
3222             emitModRM(4, dst);
3223             emitByte(imm8);
3224         }
3225     }
3226 
3227     public final void shlq(Register dst) {
3228         // Multiply dst by 2, CL times.
3229         prefixq(dst);
3230         emitByte(0xD3);
3231         emitModRM(4, dst);
3232     }
3233 
3234     public final void shrq(Register dst, int imm8) {
3235         assert isShiftCount(imm8 >> 1) : "illegal shift count";
3236         prefixq(dst);
3237         if (imm8 == 1) {
3238             emitByte(0xD1);
3239             emitModRM(5, dst);
3240         } else {
3241             emitByte(0xC1);
3242             emitModRM(5, dst);
3243             emitByte(imm8);
3244         }
3245     }
3246 
3247     public final void shrq(Register dst) {
3248         prefixq(dst);
3249         emitByte(0xD3);
3250         // Unsigned divide dst by 2, CL times.
3251         emitModRM(5, dst);
3252     }
3253 
3254     public final void sarq(Register dst, int imm8) {
3255         assert isShiftCount(imm8 >> 1) : "illegal shift count";
3256         prefixq(dst);
3257         if (imm8 == 1) {
3258             emitByte(0xD1);
3259             emitModRM(7, dst);
3260         } else {
3261             emitByte(0xC1);
3262             emitModRM(7, dst);
3263             emitByte(imm8);
3264         }
3265     }
3266 
3267     public final void sbbq(Register dst, Register src) {
3268         SBB.rmOp.emit(this, QWORD, dst, src);
3269     }
3270 
3271     public final void subq(Register dst, int imm32) {
3272         SUB.getMIOpcode(QWORD, isByte(imm32)).emit(this, QWORD, dst, imm32);
3273     }
3274 
3275     public final void subq(AMD64Address dst, int imm32) {
3276         SUB.getMIOpcode(QWORD, isByte(imm32)).emit(this, QWORD, dst, imm32);
3277     }
3278 
3279     public final void subqWide(Register dst, int imm32) {
3280         // don't use the sign-extending version, forcing a 32-bit immediate
3281         SUB.getMIOpcode(QWORD, false).emit(this, QWORD, dst, imm32);
3282     }
3283 
3284     public final void subq(Register dst, Register src) {
3285         SUB.rmOp.emit(this, QWORD, dst, src);
3286     }
3287 
3288     public final void testq(Register dst, Register src) {
3289         prefixq(dst, src);
3290         emitByte(0x85);
3291         emitModRM(dst, src);
3292     }
3293 
3294     public final void btrq(Register src, int imm8) {
3295         prefixq(src);
3296         emitByte(0x0F);
3297         emitByte(0xBA);
3298         emitModRM(6, src);
3299         emitByte(imm8);
3300     }
3301 
3302     public final void xaddb(AMD64Address dst, Register src) {
3303         prefixb(dst, src);
3304         emitByte(0x0F);
3305         emitByte(0xC0);
3306         emitOperandHelper(src, dst, 0);
3307     }
3308 
3309     public final void xaddw(AMD64Address dst, Register src) {
3310         emitByte(0x66); // Switch to 16-bit mode.
3311         prefix(dst, src);
3312         emitByte(0x0F);
3313         emitByte(0xC1);
3314         emitOperandHelper(src, dst, 0);
3315     }
3316 
3317     public final void xaddl(AMD64Address dst, Register src) {
3318         prefix(dst, src);
3319         emitByte(0x0F);
3320         emitByte(0xC1);
3321         emitOperandHelper(src, dst, 0);
3322     }
3323 
3324     public final void xaddq(AMD64Address dst, Register src) {
3325         prefixq(dst, src);
3326         emitByte(0x0F);
3327         emitByte(0xC1);
3328         emitOperandHelper(src, dst, 0);
3329     }
3330 
3331     public final void xchgb(Register dst, AMD64Address src) {
3332         prefixb(src, dst);
3333         emitByte(0x86);
3334         emitOperandHelper(dst, src, 0);
3335     }
3336 
3337     public final void xchgw(Register dst, AMD64Address src) {
3338         emitByte(0x66);
3339         prefix(src, dst);
3340         emitByte(0x87);
3341         emitOperandHelper(dst, src, 0);
3342     }
3343 
3344     public final void xchgl(Register dst, AMD64Address src) {
3345         prefix(src, dst);
3346         emitByte(0x87);
3347         emitOperandHelper(dst, src, 0);
3348     }
3349 
3350     public final void xchgq(Register dst, AMD64Address src) {
3351         prefixq(src, dst);
3352         emitByte(0x87);
3353         emitOperandHelper(dst, src, 0);
3354     }
3355 
3356     public final void membar(int barriers) {
3357         if (target.isMP) {
3358             // We only have to handle StoreLoad
3359             if ((barriers & STORE_LOAD) != 0) {
3360                 // All usable chips support "locked" instructions which suffice
3361                 // as barriers, and are much faster than the alternative of
3362                 // using cpuid instruction. We use here a locked add [rsp],0.
3363                 // This is conveniently otherwise a no-op except for blowing
3364                 // flags.
3365                 // Any change to this code may need to revisit other places in
3366                 // the code where this idiom is used, in particular the
3367                 // orderAccess code.
3368                 lock();
3369                 addl(new AMD64Address(AMD64.rsp, 0), 0); // Assert the lock# signal here
3370             }
3371         }
3372     }
3373 
3374     @Override
3375     protected final void patchJumpTarget(int branch, int branchTarget) {
3376         int op = getByte(branch);
3377         assert op == 0xE8 // call
3378                         || op == 0x00 // jump table entry
3379                         || op == 0xE9 // jmp
3380                         || op == 0xEB // short jmp
3381                         || (op & 0xF0) == 0x70 // short jcc
3382                         || op == 0x0F && (getByte(branch + 1) & 0xF0) == 0x80 // jcc
3383         : "Invalid opcode at patch point branch=" + branch + ", branchTarget=" + branchTarget + ", op=" + op;
3384 
3385         if (op == 0x00) {
3386             int offsetToJumpTableBase = getShort(branch + 1);
3387             int jumpTableBase = branch - offsetToJumpTableBase;
3388             int imm32 = branchTarget - jumpTableBase;
3389             emitInt(imm32, branch);
3390         } else if (op == 0xEB || (op & 0xF0) == 0x70) {
3391 
3392             // short offset operators (jmp and jcc)
3393             final int imm8 = branchTarget - (branch + 2);
3394             /*
3395              * Since a wrongly patched short branch can potentially lead to working but really bad
3396              * behaving code we should always fail with an exception instead of having an assert.
3397              */
3398             GraalError.guarantee(isByte(imm8), "Displacement too large to be encoded as a byte: %d", imm8);
3399             emitByte(imm8, branch + 1);
3400 
3401         } else {
3402 
3403             int off = 1;
3404             if (op == 0x0F) {
3405                 off = 2;
3406             }
3407 
3408             int imm32 = branchTarget - (branch + 4 + off);
3409             emitInt(imm32, branch + off);
3410         }
3411     }
3412 
3413     public void nullCheck(AMD64Address address) {
3414         testl(AMD64.rax, address);
3415     }
3416 
3417     @Override
3418     public void align(int modulus) {
3419         if (position() % modulus != 0) {
3420             nop(modulus - (position() % modulus));
3421         }
3422     }
3423 
3424     /**
3425      * Emits a direct call instruction. Note that the actual call target is not specified, because
3426      * all calls need patching anyway. Therefore, 0 is emitted as the call target, and the user is
3427      * responsible to add the call address to the appropriate patching tables.
3428      */
3429     public final void call() {
3430         annotatePatchingImmediate(1, 4);
3431         emitByte(0xE8);
3432         emitInt(0);
3433     }
3434 
3435     public final void call(Register src) {
3436         prefix(src);
3437         emitByte(0xFF);
3438         emitModRM(2, src);
3439     }
3440 
3441     public final void int3() {
3442         emitByte(0xCC);
3443     }
3444 
3445     public final void pause() {
3446         emitByte(0xF3);
3447         emitByte(0x90);
3448     }
3449 
3450     private void emitx87(int b1, int b2, int i) {
3451         assert 0 <= i && i < 8 : "illegal stack offset";
3452         emitByte(b1);
3453         emitByte(b2 + i);
3454     }
3455 
3456     public final void fldd(AMD64Address src) {
3457         emitByte(0xDD);
3458         emitOperandHelper(0, src, 0);
3459     }
3460 
3461     public final void flds(AMD64Address src) {
3462         emitByte(0xD9);
3463         emitOperandHelper(0, src, 0);
3464     }
3465 
3466     public final void fldln2() {
3467         emitByte(0xD9);
3468         emitByte(0xED);
3469     }
3470 
3471     public final void fldlg2() {
3472         emitByte(0xD9);
3473         emitByte(0xEC);
3474     }
3475 
3476     public final void fyl2x() {
3477         emitByte(0xD9);
3478         emitByte(0xF1);
3479     }
3480 
3481     public final void fstps(AMD64Address src) {
3482         emitByte(0xD9);
3483         emitOperandHelper(3, src, 0);
3484     }
3485 
3486     public final void fstpd(AMD64Address src) {
3487         emitByte(0xDD);
3488         emitOperandHelper(3, src, 0);
3489     }
3490 
3491     private void emitFPUArith(int b1, int b2, int i) {
3492         assert 0 <= i && i < 8 : "illegal FPU register: " + i;
3493         emitByte(b1);
3494         emitByte(b2 + i);
3495     }
3496 
3497     public void ffree(int i) {
3498         emitFPUArith(0xDD, 0xC0, i);
3499     }
3500 
3501     public void fincstp() {
3502         emitByte(0xD9);
3503         emitByte(0xF7);
3504     }
3505 
3506     public void fxch(int i) {
3507         emitFPUArith(0xD9, 0xC8, i);
3508     }
3509 
3510     public void fnstswAX() {
3511         emitByte(0xDF);
3512         emitByte(0xE0);
3513     }
3514 
3515     public void fwait() {
3516         emitByte(0x9B);
3517     }
3518 
3519     public void fprem() {
3520         emitByte(0xD9);
3521         emitByte(0xF8);
3522     }
3523 
3524     public final void fsin() {
3525         emitByte(0xD9);
3526         emitByte(0xFE);
3527     }
3528 
3529     public final void fcos() {
3530         emitByte(0xD9);
3531         emitByte(0xFF);
3532     }
3533 
3534     public final void fptan() {
3535         emitByte(0xD9);
3536         emitByte(0xF2);
3537     }
3538 
3539     public final void fstp(int i) {
3540         emitx87(0xDD, 0xD8, i);
3541     }
3542 
3543     @Override
3544     public AMD64Address makeAddress(Register base, int displacement) {
3545         return new AMD64Address(base, displacement);
3546     }
3547 
3548     @Override
3549     public AMD64Address getPlaceholder(int instructionStartPosition) {
3550         return new AMD64Address(AMD64.rip, Register.None, Scale.Times1, 0, instructionStartPosition);
3551     }
3552 
3553     private void prefetchPrefix(AMD64Address src) {
3554         prefix(src);
3555         emitByte(0x0F);
3556     }
3557 
3558     public void prefetchnta(AMD64Address src) {
3559         prefetchPrefix(src);
3560         emitByte(0x18);
3561         emitOperandHelper(0, src, 0);
3562     }
3563 
3564     void prefetchr(AMD64Address src) {
3565         assert supports(CPUFeature.AMD_3DNOW_PREFETCH);
3566         prefetchPrefix(src);
3567         emitByte(0x0D);
3568         emitOperandHelper(0, src, 0);
3569     }
3570 
3571     public void prefetcht0(AMD64Address src) {
3572         assert supports(CPUFeature.SSE);
3573         prefetchPrefix(src);
3574         emitByte(0x18);
3575         emitOperandHelper(1, src, 0);
3576     }
3577 
3578     public void prefetcht1(AMD64Address src) {
3579         assert supports(CPUFeature.SSE);
3580         prefetchPrefix(src);
3581         emitByte(0x18);
3582         emitOperandHelper(2, src, 0);
3583     }
3584 
3585     public void prefetcht2(AMD64Address src) {
3586         assert supports(CPUFeature.SSE);
3587         prefix(src);
3588         emitByte(0x0f);
3589         emitByte(0x18);
3590         emitOperandHelper(3, src, 0);
3591     }
3592 
3593     public void prefetchw(AMD64Address src) {
3594         assert supports(CPUFeature.AMD_3DNOW_PREFETCH);
3595         prefix(src);
3596         emitByte(0x0f);
3597         emitByte(0x0D);
3598         emitOperandHelper(1, src, 0);
3599     }
3600 
3601     public void rdtsc() {
3602         emitByte(0x0F);
3603         emitByte(0x31);
3604     }
3605 
3606     /**
3607      * Emits an instruction which is considered to be illegal. This is used if we deliberately want
3608      * to crash the program (debugging etc.).
3609      */
3610     public void illegal() {
3611         emitByte(0x0f);
3612         emitByte(0x0b);
3613     }
3614 
3615     public void lfence() {
3616         emitByte(0x0f);
3617         emitByte(0xae);
3618         emitByte(0xe8);
3619     }
3620 
3621     public final void vptest(Register dst, Register src) {
3622         VexRMOp.VPTEST.emit(this, AVXSize.YMM, dst, src);
3623     }
3624 
3625     public final void vpxor(Register dst, Register nds, Register src) {
3626         VexRVMOp.VPXOR.emit(this, AVXSize.YMM, dst, nds, src);
3627     }
3628 
3629     public final void vpxor(Register dst, Register nds, AMD64Address src) {
3630         VexRVMOp.VPXOR.emit(this, AVXSize.YMM, dst, nds, src);
3631     }
3632 
3633     public final void vmovdqu(Register dst, AMD64Address src) {
3634         VexMoveOp.VMOVDQU.emit(this, AVXSize.YMM, dst, src);
3635     }
3636 
3637     public final void vmovdqu(AMD64Address dst, Register src) {
3638         assert inRC(XMM, src);
3639         VexMoveOp.VMOVDQU.emit(this, AVXSize.YMM, dst, src);
3640     }
3641 
3642     public final void vpmovzxbw(Register dst, AMD64Address src) {
3643         assert supports(CPUFeature.AVX2);
3644         VexRMOp.VPMOVZXBW.emit(this, AVXSize.YMM, dst, src);
3645     }
3646 
3647     public final void vzeroupper() {
3648         emitVEX(L128, P_, M_0F, W0, 0, 0, true);
3649         emitByte(0x77);
3650     }
3651 
3652     // Insn: KORTESTD k1, k2
3653 
3654     // This instruction produces ZF or CF flags
3655     public final void kortestd(Register src1, Register src2) {
3656         assert supports(CPUFeature.AVX512BW);
3657         assert inRC(MASK, src1) && inRC(MASK, src2);
3658         // Code: VEX.L0.66.0F.W1 98 /r
3659         vexPrefix(src1, Register.None, src2, AVXSize.XMM, P_66, M_0F, W1, true);
3660         emitByte(0x98);
3661         emitModRM(src1, src2);
3662     }
3663 
3664     // Insn: KORTESTQ k1, k2
3665 
3666     // This instruction produces ZF or CF flags
3667     public final void kortestq(Register src1, Register src2) {
3668         assert supports(CPUFeature.AVX512BW);
3669         assert inRC(MASK, src1) && inRC(MASK, src2);
3670         // Code: VEX.L0.0F.W1 98 /r
3671         vexPrefix(src1, Register.None, src2, AVXSize.XMM, P_, M_0F, W1, true);
3672         emitByte(0x98);
3673         emitModRM(src1, src2);
3674     }
3675 
3676     public final void kmovd(Register dst, Register src) {
3677         assert supports(CPUFeature.AVX512BW);
3678         assert inRC(MASK, dst) || inRC(CPU, dst);
3679         assert inRC(MASK, src) || inRC(CPU, src);
3680         assert !(inRC(CPU, dst) && inRC(CPU, src));
3681 
3682         if (inRC(MASK, dst)) {
3683             if (inRC(MASK, src)) {
3684                 // kmovd(KRegister dst, KRegister src):
3685                 // Insn: KMOVD k1, k2/m32
3686                 // Code: VEX.L0.66.0F.W1 90 /r
3687                 vexPrefix(dst, Register.None, src, AVXSize.XMM, P_66, M_0F, W1, true);
3688                 emitByte(0x90);
3689                 emitModRM(dst, src);
3690             } else {
3691                 // kmovd(KRegister dst, Register src)
3692                 // Insn: KMOVD k1, r32
3693                 // Code: VEX.L0.F2.0F.W0 92 /r
3694                 vexPrefix(dst, Register.None, src, AVXSize.XMM, P_F2, M_0F, W0, true);
3695                 emitByte(0x92);
3696                 emitModRM(dst, src);
3697             }
3698         } else {
3699             if (inRC(MASK, src)) {
3700                 // kmovd(Register dst, KRegister src)
3701                 // Insn: KMOVD r32, k1
3702                 // Code: VEX.L0.F2.0F.W0 93 /r
3703                 vexPrefix(dst, Register.None, src, AVXSize.XMM, P_F2, M_0F, W0, true);
3704                 emitByte(0x93);
3705                 emitModRM(dst, src);
3706             } else {
3707                 throw GraalError.shouldNotReachHere();
3708             }
3709         }
3710     }
3711 
3712     public final void kmovq(Register dst, Register src) {
3713         assert supports(CPUFeature.AVX512BW);
3714         assert inRC(MASK, dst) || inRC(CPU, dst);
3715         assert inRC(MASK, src) || inRC(CPU, src);
3716         assert !(inRC(CPU, dst) && inRC(CPU, src));
3717 
3718         if (inRC(MASK, dst)) {
3719             if (inRC(MASK, src)) {
3720                 // kmovq(KRegister dst, KRegister src):
3721                 // Insn: KMOVQ k1, k2/m64
3722                 // Code: VEX.L0.0F.W1 90 /r
3723                 vexPrefix(dst, Register.None, src, AVXSize.XMM, P_, M_0F, W1, true);
3724                 emitByte(0x90);
3725                 emitModRM(dst, src);
3726             } else {
3727                 // kmovq(KRegister dst, Register src)
3728                 // Insn: KMOVQ k1, r64
3729                 // Code: VEX.L0.F2.0F.W1 92 /r
3730                 vexPrefix(dst, Register.None, src, AVXSize.XMM, P_F2, M_0F, W1, true);
3731                 emitByte(0x92);
3732                 emitModRM(dst, src);
3733             }
3734         } else {
3735             if (inRC(MASK, src)) {
3736                 // kmovq(Register dst, KRegister src)
3737                 // Insn: KMOVQ r64, k1
3738                 // Code: VEX.L0.F2.0F.W1 93 /r
3739                 vexPrefix(dst, Register.None, src, AVXSize.XMM, P_F2, M_0F, W1, true);
3740                 emitByte(0x93);
3741                 emitModRM(dst, src);
3742             } else {
3743                 throw GraalError.shouldNotReachHere();
3744             }
3745         }
3746     }
3747 
3748     // Insn: KTESTD k1, k2
3749 
3750     public final void ktestd(Register src1, Register src2) {
3751         assert supports(CPUFeature.AVX512BW);
3752         assert inRC(MASK, src1) && inRC(MASK, src2);
3753         // Code: VEX.L0.66.0F.W1 99 /r
3754         vexPrefix(src1, Register.None, src2, AVXSize.XMM, P_66, M_0F, W1, true);
3755         emitByte(0x99);
3756         emitModRM(src1, src2);
3757     }
3758 
3759     public final void evmovdqu64(Register dst, AMD64Address src) {
3760         assert supports(CPUFeature.AVX512F);
3761         assert inRC(XMM, dst);
3762         evexPrefix(dst, Register.None, Register.None, src, AVXSize.ZMM, P_F3, M_0F, W1, Z0, B0);
3763         emitByte(0x6F);
3764         emitEVEXOperandHelper(dst, src, 0, EVEXTuple.FVM.getDisp8ScalingFactor(AVXSize.ZMM));
3765     }
3766 
3767     // Insn: VPMOVZXBW zmm1, m256
3768 
3769     public final void evpmovzxbw(Register dst, AMD64Address src) {
3770         assert supports(CPUFeature.AVX512BW);
3771         assert inRC(XMM, dst);
3772         // Code: EVEX.512.66.0F38.WIG 30 /r
3773         evexPrefix(dst, Register.None, Register.None, src, AVXSize.ZMM, P_66, M_0F38, WIG, Z0, B0);
3774         emitByte(0x30);
3775         emitEVEXOperandHelper(dst, src, 0, EVEXTuple.HVM.getDisp8ScalingFactor(AVXSize.ZMM));
3776     }
3777 
3778     public final void evpcmpeqb(Register kdst, Register nds, AMD64Address src) {
3779         assert supports(CPUFeature.AVX512BW);
3780         assert inRC(MASK, kdst) && inRC(XMM, nds);
3781         evexPrefix(kdst, Register.None, nds, src, AVXSize.ZMM, P_66, M_0F, WIG, Z0, B0);
3782         emitByte(0x74);
3783         emitEVEXOperandHelper(kdst, src, 0, EVEXTuple.FVM.getDisp8ScalingFactor(AVXSize.ZMM));
3784     }
3785 
3786     // Insn: VMOVDQU16 zmm1 {k1}{z}, zmm2/m512
3787     // -----
3788     // Insn: VMOVDQU16 zmm1, m512
3789 
3790     public final void evmovdqu16(Register dst, AMD64Address src) {
3791         assert supports(CPUFeature.AVX512BW);
3792         assert inRC(XMM, dst);
3793         // Code: EVEX.512.F2.0F.W1 6F /r
3794         evexPrefix(dst, Register.None, Register.None, src, AVXSize.ZMM, P_F2, M_0F, W1, Z0, B0);
3795         emitByte(0x6F);
3796         emitEVEXOperandHelper(dst, src, 0, EVEXTuple.FVM.getDisp8ScalingFactor(AVXSize.ZMM));
3797     }
3798 
3799     // Insn: VMOVDQU16 zmm1, k1:z, m512
3800 
3801     public final void evmovdqu16(Register dst, Register mask, AMD64Address src) {
3802         assert supports(CPUFeature.AVX512BW);
3803         assert inRC(XMM, dst) && inRC(MASK, mask);
3804         // Code: EVEX.512.F2.0F.W1 6F /r
3805         evexPrefix(dst, mask, Register.None, src, AVXSize.ZMM, P_F2, M_0F, W1, Z1, B0);
3806         emitByte(0x6F);
3807         emitEVEXOperandHelper(dst, src, 0, EVEXTuple.FVM.getDisp8ScalingFactor(AVXSize.ZMM));
3808     }
3809 
3810     // Insn: VMOVDQU16 zmm2/m512 {k1}{z}, zmm1
3811     // -----
3812     // Insn: VMOVDQU16 m512, zmm1
3813 
3814     public final void evmovdqu16(AMD64Address dst, Register src) {
3815         assert supports(CPUFeature.AVX512BW);
3816         assert inRC(XMM, src);
3817         // Code: EVEX.512.F2.0F.W1 7F /r
3818         evexPrefix(src, Register.None, Register.None, dst, AVXSize.ZMM, P_F2, M_0F, W1, Z0, B0);
3819         emitByte(0x7F);
3820         emitEVEXOperandHelper(src, dst, 0, EVEXTuple.FVM.getDisp8ScalingFactor(AVXSize.ZMM));
3821     }
3822 
3823     // Insn: VMOVDQU16 m512, k1, zmm1
3824 
3825     public final void evmovdqu16(AMD64Address dst, Register mask, Register src) {
3826         assert supports(CPUFeature.AVX512BW);
3827         assert inRC(MASK, mask) && inRC(XMM, src);
3828         // Code: EVEX.512.F2.0F.W1 7F /r
3829         evexPrefix(src, mask, Register.None, dst, AVXSize.ZMM, P_F2, M_0F, W1, Z0, B0);
3830         emitByte(0x7F);
3831         emitEVEXOperandHelper(src, dst, 0, EVEXTuple.FVM.getDisp8ScalingFactor(AVXSize.ZMM));
3832     }
3833 
3834     // Insn: VPBROADCASTW zmm1 {k1}{z}, reg
3835     // -----
3836     // Insn: VPBROADCASTW zmm1, reg
3837 
3838     public final void evpbroadcastw(Register dst, Register src) {
3839         assert supports(CPUFeature.AVX512BW);
3840         assert inRC(XMM, dst) && inRC(CPU, src);
3841         // Code: EVEX.512.66.0F38.W0 7B /r
3842         evexPrefix(dst, Register.None, Register.None, src, AVXSize.ZMM, P_66, M_0F38, W0, Z0, B0);
3843         emitByte(0x7B);
3844         emitModRM(dst, src);
3845     }
3846 
3847     // Insn: VPCMPUW k1 {k2}, zmm2, zmm3/m512, imm8
3848     // -----
3849     // Insn: VPCMPUW k1, zmm2, zmm3, imm8
3850 
3851     public final void evpcmpuw(Register kdst, Register nds, Register src, int vcc) {
3852         assert supports(CPUFeature.AVX512BW);
3853         assert inRC(MASK, kdst) && inRC(XMM, nds) && inRC(XMM, src);
3854         // Code: EVEX.NDS.512.66.0F3A.W1 3E /r ib
3855         evexPrefix(kdst, Register.None, nds, src, AVXSize.ZMM, P_66, M_0F3A, W1, Z0, B0);
3856         emitByte(0x3E);
3857         emitModRM(kdst, src);
3858         emitByte(vcc);
3859     }
3860 
3861     // Insn: VPCMPUW k1 {k2}, zmm2, zmm3/m512, imm8
3862     // -----
3863     // Insn: VPCMPUW k1, k2, zmm2, zmm3, imm8
3864 
3865     public final void evpcmpuw(Register kdst, Register mask, Register nds, Register src, int vcc) {
3866         assert supports(CPUFeature.AVX512BW);
3867         assert inRC(MASK, kdst) && inRC(MASK, mask);
3868         assert inRC(XMM, nds) && inRC(XMM, src);
3869         // Code: EVEX.NDS.512.66.0F3A.W1 3E /r ib
3870         evexPrefix(kdst, mask, nds, src, AVXSize.ZMM, P_66, M_0F3A, W1, Z0, B0);
3871         emitByte(0x3E);
3872         emitModRM(kdst, src);
3873         emitByte(vcc);
3874     }
3875 
3876     // Insn: VPMOVWB ymm1/m256 {k1}{z}, zmm2
3877     // -----
3878     // Insn: VPMOVWB m256, zmm2
3879 
3880     public final void evpmovwb(AMD64Address dst, Register src) {
3881         assert supports(CPUFeature.AVX512BW);
3882         assert inRC(XMM, src);
3883         // Code: EVEX.512.F3.0F38.W0 30 /r
3884         evexPrefix(src, Register.None, Register.None, dst, AVXSize.ZMM, P_F3, M_0F38, W0, Z0, B0);
3885         emitByte(0x30);
3886         emitEVEXOperandHelper(src, dst, 0, EVEXTuple.HVM.getDisp8ScalingFactor(AVXSize.ZMM));
3887     }
3888 
3889     // Insn: VPMOVWB m256, k1, zmm2
3890 
3891     public final void evpmovwb(AMD64Address dst, Register mask, Register src) {
3892         assert supports(CPUFeature.AVX512BW);
3893         assert inRC(MASK, mask) && inRC(XMM, src);
3894         // Code: EVEX.512.F3.0F38.W0 30 /r
3895         evexPrefix(src, mask, Register.None, dst, AVXSize.ZMM, P_F3, M_0F38, W0, Z0, B0);
3896         emitByte(0x30);
3897         emitEVEXOperandHelper(src, dst, 0, EVEXTuple.HVM.getDisp8ScalingFactor(AVXSize.ZMM));
3898     }
3899 
3900     // Insn: VPMOVZXBW zmm1 {k1}{z}, ymm2/m256
3901     // -----
3902     // Insn: VPMOVZXBW zmm1, k1, m256
3903 
3904     public final void evpmovzxbw(Register dst, Register mask, AMD64Address src) {
3905         assert supports(CPUFeature.AVX512BW);
3906         assert inRC(MASK, mask) && inRC(XMM, dst);
3907         // Code: EVEX.512.66.0F38.WIG 30 /r
3908         evexPrefix(dst, mask, Register.None, src, AVXSize.ZMM, P_66, M_0F38, WIG, Z0, B0);
3909         emitByte(0x30);
3910         emitEVEXOperandHelper(dst, src, 0, EVEXTuple.HVM.getDisp8ScalingFactor(AVXSize.ZMM));
3911     }
3912 
3913 }