1 /*
   2  * Copyright (c) 2009, 2019, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  */
  23 
  24 
  25 package org.graalvm.compiler.asm.amd64;
  26 
  27 import static jdk.vm.ci.amd64.AMD64.CPU;
  28 import static jdk.vm.ci.amd64.AMD64.MASK;
  29 import static jdk.vm.ci.amd64.AMD64.XMM;
  30 import static jdk.vm.ci.code.MemoryBarriers.STORE_LOAD;
  31 import static org.graalvm.compiler.asm.amd64.AMD64AsmOptions.UseAddressNop;
  32 import static org.graalvm.compiler.asm.amd64.AMD64AsmOptions.UseIntelNops;
  33 import static org.graalvm.compiler.asm.amd64.AMD64AsmOptions.UseNormalNop;
  34 import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.ADD;
  35 import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.AND;
  36 import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.CMP;
  37 import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.OR;
  38 import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.SBB;
  39 import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.SUB;
  40 import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.XOR;
  41 import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64MOp.DEC;
  42 import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64MOp.INC;
  43 import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64MOp.NEG;
  44 import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64MOp.NOT;
  45 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.EVEXPrefixConfig.B0;
  46 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.EVEXPrefixConfig.Z0;
  47 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.EVEXPrefixConfig.Z1;
  48 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.BYTE;
  49 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.DWORD;
  50 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.PD;
  51 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.PS;
  52 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.QWORD;
  53 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.SD;
  54 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.SS;
  55 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.WORD;
  56 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.L128;
  57 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.L256;
  58 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.LZ;
  59 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.M_0F;
  60 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.M_0F38;
  61 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.M_0F3A;
  62 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.P_;
  63 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.P_66;
  64 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.P_F2;
  65 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.P_F3;
  66 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.W0;
  67 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.W1;
  68 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.WIG;
  69 import static org.graalvm.compiler.core.common.NumUtil.isByte;
  70 import static org.graalvm.compiler.core.common.NumUtil.isInt;
  71 import static org.graalvm.compiler.core.common.NumUtil.isShiftCount;
  72 import static org.graalvm.compiler.core.common.NumUtil.isUByte;
  73 
  74 import java.util.EnumSet;
  75 
  76 import org.graalvm.compiler.asm.Label;
  77 import org.graalvm.compiler.asm.amd64.AMD64Address.Scale;
  78 import org.graalvm.compiler.asm.amd64.AVXKind.AVXSize;
  79 import org.graalvm.compiler.core.common.calc.Condition;
  80 import org.graalvm.compiler.debug.GraalError;
  81 
  82 import jdk.vm.ci.amd64.AMD64;
  83 import jdk.vm.ci.amd64.AMD64.CPUFeature;
  84 import jdk.vm.ci.code.Register;
  85 import jdk.vm.ci.code.Register.RegisterCategory;
  86 import jdk.vm.ci.code.TargetDescription;
  87 
  88 /**
  89  * This class implements an assembler that can encode most X86 instructions.
  90  */
  91 public class AMD64Assembler extends AMD64BaseAssembler {
  92 
  93     /**
  94      * Constructs an assembler for the AMD64 architecture.
  95      */
  96     public AMD64Assembler(TargetDescription target) {
  97         super(target);
  98     }
  99 
 100     /**
 101      * The x86 condition codes used for conditional jumps/moves.
 102      */
 103     public enum ConditionFlag {
 104         Zero(0x4, "|zero|"),
 105         NotZero(0x5, "|nzero|"),
 106         Equal(0x4, "="),
 107         NotEqual(0x5, "!="),
 108         Less(0xc, "<"),
 109         LessEqual(0xe, "<="),
 110         Greater(0xf, ">"),
 111         GreaterEqual(0xd, ">="),
 112         Below(0x2, "|<|"),
 113         BelowEqual(0x6, "|<=|"),
 114         Above(0x7, "|>|"),
 115         AboveEqual(0x3, "|>=|"),
 116         Overflow(0x0, "|of|"),
 117         NoOverflow(0x1, "|nof|"),
 118         CarrySet(0x2, "|carry|"),
 119         CarryClear(0x3, "|ncarry|"),
 120         Negative(0x8, "|neg|"),
 121         Positive(0x9, "|pos|"),
 122         Parity(0xa, "|par|"),
 123         NoParity(0xb, "|npar|");
 124 
 125         private final int value;
 126         private final String operator;
 127 
 128         ConditionFlag(int value, String operator) {
 129             this.value = value;
 130             this.operator = operator;
 131         }
 132 
 133         public ConditionFlag negate() {
 134             switch (this) {
 135                 case Zero:
 136                     return NotZero;
 137                 case NotZero:
 138                     return Zero;
 139                 case Equal:
 140                     return NotEqual;
 141                 case NotEqual:
 142                     return Equal;
 143                 case Less:
 144                     return GreaterEqual;
 145                 case LessEqual:
 146                     return Greater;
 147                 case Greater:
 148                     return LessEqual;
 149                 case GreaterEqual:
 150                     return Less;
 151                 case Below:
 152                     return AboveEqual;
 153                 case BelowEqual:
 154                     return Above;
 155                 case Above:
 156                     return BelowEqual;
 157                 case AboveEqual:
 158                     return Below;
 159                 case Overflow:
 160                     return NoOverflow;
 161                 case NoOverflow:
 162                     return Overflow;
 163                 case CarrySet:
 164                     return CarryClear;
 165                 case CarryClear:
 166                     return CarrySet;
 167                 case Negative:
 168                     return Positive;
 169                 case Positive:
 170                     return Negative;
 171                 case Parity:
 172                     return NoParity;
 173                 case NoParity:
 174                     return Parity;
 175             }
 176             throw new IllegalArgumentException();
 177         }
 178 
 179         public int getValue() {
 180             return value;
 181         }
 182 
 183         @Override
 184         public String toString() {
 185             return operator;
 186         }
 187     }
 188 
 189     /**
 190      * Operand size and register type constraints.
 191      */
 192     private enum OpAssertion {
 193         ByteAssertion(CPU, CPU, BYTE),
 194         ByteOrLargerAssertion(CPU, CPU, BYTE, WORD, DWORD, QWORD),
 195         WordOrLargerAssertion(CPU, CPU, WORD, DWORD, QWORD),
 196         DwordOrLargerAssertion(CPU, CPU, DWORD, QWORD),
 197         WordOrDwordAssertion(CPU, CPU, WORD, QWORD),
 198         QwordAssertion(CPU, CPU, QWORD),
 199         FloatAssertion(XMM, XMM, SS, SD, PS, PD),
 200         PackedFloatAssertion(XMM, XMM, PS, PD),
 201         SingleAssertion(XMM, XMM, SS),
 202         DoubleAssertion(XMM, XMM, SD),
 203         PackedDoubleAssertion(XMM, XMM, PD),
 204         IntToFloatAssertion(XMM, CPU, DWORD, QWORD),
 205         FloatToIntAssertion(CPU, XMM, DWORD, QWORD);
 206 
 207         private final RegisterCategory resultCategory;
 208         private final RegisterCategory inputCategory;
 209         private final OperandSize[] allowedSizes;
 210 
 211         OpAssertion(RegisterCategory resultCategory, RegisterCategory inputCategory, OperandSize... allowedSizes) {
 212             this.resultCategory = resultCategory;
 213             this.inputCategory = inputCategory;
 214             this.allowedSizes = allowedSizes;
 215         }
 216 
 217         protected boolean checkOperands(AMD64Op op, OperandSize size, Register resultReg, Register inputReg) {
 218             assert resultReg == null || resultCategory.equals(resultReg.getRegisterCategory()) : "invalid result register " + resultReg + " used in " + op;
 219             assert inputReg == null || inputCategory.equals(inputReg.getRegisterCategory()) : "invalid input register " + inputReg + " used in " + op;
 220 
 221             for (OperandSize s : allowedSizes) {
 222                 if (size == s) {
 223                     return true;
 224                 }
 225             }
 226 
 227             assert false : "invalid operand size " + size + " used in " + op;
 228             return false;
 229         }
 230 
 231     }
 232 
 233     protected static final int P_0F = 0x0F;
 234     protected static final int P_0F38 = 0x380F;
 235     protected static final int P_0F3A = 0x3A0F;
 236 
 237     /**
 238      * Base class for AMD64 opcodes.
 239      */
 240     public static class AMD64Op {
 241 
 242         private final String opcode;
 243 
 244         protected final int prefix1;
 245         protected final int prefix2;
 246         protected final int op;
 247 
 248         private final boolean dstIsByte;
 249         private final boolean srcIsByte;
 250 
 251         private final OpAssertion assertion;
 252         private final CPUFeature feature;
 253 
 254         protected AMD64Op(String opcode, int prefix1, int prefix2, int op, OpAssertion assertion, CPUFeature feature) {
 255             this(opcode, prefix1, prefix2, op, assertion == OpAssertion.ByteAssertion, assertion == OpAssertion.ByteAssertion, assertion, feature);
 256         }
 257 
 258         protected AMD64Op(String opcode, int prefix1, int prefix2, int op, boolean dstIsByte, boolean srcIsByte, OpAssertion assertion, CPUFeature feature) {
 259             this.opcode = opcode;
 260             this.prefix1 = prefix1;
 261             this.prefix2 = prefix2;
 262             this.op = op;
 263 
 264             this.dstIsByte = dstIsByte;
 265             this.srcIsByte = srcIsByte;
 266 
 267             this.assertion = assertion;
 268             this.feature = feature;
 269         }
 270 
 271         protected final void emitOpcode(AMD64Assembler asm, OperandSize size, int rxb, int dstEnc, int srcEnc) {
 272             if (prefix1 != 0) {
 273                 asm.emitByte(prefix1);
 274             }
 275             if (size.getSizePrefix() != 0) {
 276                 asm.emitByte(size.getSizePrefix());
 277             }
 278             int rexPrefix = 0x40 | rxb;
 279             if (size == QWORD) {
 280                 rexPrefix |= 0x08;
 281             }
 282             if (rexPrefix != 0x40 || (dstIsByte && dstEnc >= 4) || (srcIsByte && srcEnc >= 4)) {
 283                 asm.emitByte(rexPrefix);
 284             }
 285             if (prefix2 > 0xFF) {
 286                 asm.emitShort(prefix2);
 287             } else if (prefix2 > 0) {
 288                 asm.emitByte(prefix2);
 289             }
 290             asm.emitByte(op);
 291         }
 292 
 293         protected final boolean verify(AMD64Assembler asm, OperandSize size, Register resultReg, Register inputReg) {
 294             assert feature == null || asm.supports(feature) : String.format("unsupported feature %s required for %s", feature, opcode);
 295             assert assertion.checkOperands(this, size, resultReg, inputReg);
 296             return true;
 297         }
 298 
 299         public OperandSize[] getAllowedSizes() {
 300             return assertion.allowedSizes;
 301         }
 302 
 303         protected final boolean isSSEInstruction() {
 304             if (feature == null) {
 305                 return false;
 306             }
 307             switch (feature) {
 308                 case SSE:
 309                 case SSE2:
 310                 case SSE3:
 311                 case SSSE3:
 312                 case SSE4A:
 313                 case SSE4_1:
 314                 case SSE4_2:
 315                     return true;
 316                 default:
 317                     return false;
 318             }
 319         }
 320 
 321         public final OpAssertion getAssertion() {
 322             return assertion;
 323         }
 324 
 325         @Override
 326         public String toString() {
 327             return opcode;
 328         }
 329     }
 330 
 331     /**
 332      * Base class for AMD64 opcodes with immediate operands.
 333      */
 334     public static class AMD64ImmOp extends AMD64Op {
 335 
 336         private final boolean immIsByte;
 337 
 338         protected AMD64ImmOp(String opcode, boolean immIsByte, int prefix, int op, OpAssertion assertion) {
 339             this(opcode, immIsByte, prefix, op, assertion, null);
 340         }
 341 
 342         protected AMD64ImmOp(String opcode, boolean immIsByte, int prefix, int op, OpAssertion assertion, CPUFeature feature) {
 343             super(opcode, 0, prefix, op, assertion, feature);
 344             this.immIsByte = immIsByte;
 345         }
 346 
 347         protected final void emitImmediate(AMD64Assembler asm, OperandSize size, int imm) {
 348             if (immIsByte) {
 349                 assert imm == (byte) imm;
 350                 asm.emitByte(imm);
 351             } else {
 352                 size.emitImmediate(asm, imm);
 353             }
 354         }
 355 
 356         protected final int immediateSize(OperandSize size) {
 357             if (immIsByte) {
 358                 return 1;
 359             } else {
 360                 return size.getBytes();
 361             }
 362         }
 363     }
 364 
 365     /**
 366      * Opcode with operand order of either RM or MR for 2 address forms.
 367      */
 368     public abstract static class AMD64RROp extends AMD64Op {
 369 
 370         protected AMD64RROp(String opcode, int prefix1, int prefix2, int op, OpAssertion assertion, CPUFeature feature) {
 371             super(opcode, prefix1, prefix2, op, assertion, feature);
 372         }
 373 
 374         protected AMD64RROp(String opcode, int prefix1, int prefix2, int op, boolean dstIsByte, boolean srcIsByte, OpAssertion assertion, CPUFeature feature) {
 375             super(opcode, prefix1, prefix2, op, dstIsByte, srcIsByte, assertion, feature);
 376         }
 377 
 378         public abstract void emit(AMD64Assembler asm, OperandSize size, Register dst, Register src);
 379     }
 380 
 381     /**
 382      * Opcode with operand order of RM.
 383      */
 384     public static class AMD64RMOp extends AMD64RROp {
 385         // @formatter:off
 386         public static final AMD64RMOp IMUL   = new AMD64RMOp("IMUL",         P_0F, 0xAF, OpAssertion.ByteOrLargerAssertion);
 387         public static final AMD64RMOp BSF    = new AMD64RMOp("BSF",          P_0F, 0xBC);
 388         public static final AMD64RMOp BSR    = new AMD64RMOp("BSR",          P_0F, 0xBD);
 389         // POPCNT, TZCNT, and LZCNT support word operation. However, the legacy size prefix should
 390         // be emitted before the mandatory prefix 0xF3. Since we are not emitting bit count for
 391         // 16-bit operands, here we simply use DwordOrLargerAssertion.
 392         public static final AMD64RMOp POPCNT = new AMD64RMOp("POPCNT", 0xF3, P_0F, 0xB8, OpAssertion.DwordOrLargerAssertion, CPUFeature.POPCNT);
 393         public static final AMD64RMOp TZCNT  = new AMD64RMOp("TZCNT",  0xF3, P_0F, 0xBC, OpAssertion.DwordOrLargerAssertion, CPUFeature.BMI1);
 394         public static final AMD64RMOp LZCNT  = new AMD64RMOp("LZCNT",  0xF3, P_0F, 0xBD, OpAssertion.DwordOrLargerAssertion, CPUFeature.LZCNT);
 395         public static final AMD64RMOp MOVZXB = new AMD64RMOp("MOVZXB",       P_0F, 0xB6, false, true, OpAssertion.WordOrLargerAssertion);
 396         public static final AMD64RMOp MOVZX  = new AMD64RMOp("MOVZX",        P_0F, 0xB7, OpAssertion.DwordOrLargerAssertion);
 397         public static final AMD64RMOp MOVSXB = new AMD64RMOp("MOVSXB",       P_0F, 0xBE, false, true, OpAssertion.WordOrLargerAssertion);
 398         public static final AMD64RMOp MOVSX  = new AMD64RMOp("MOVSX",        P_0F, 0xBF, OpAssertion.DwordOrLargerAssertion);
 399         public static final AMD64RMOp MOVSXD = new AMD64RMOp("MOVSXD",             0x63, OpAssertion.QwordAssertion);
 400         public static final AMD64RMOp MOVB   = new AMD64RMOp("MOVB",               0x8A, OpAssertion.ByteAssertion);
 401         public static final AMD64RMOp MOV    = new AMD64RMOp("MOV",                0x8B);
 402         public static final AMD64RMOp CMP    = new AMD64RMOp("CMP",                0x3B);
 403 
 404         // MOVD/MOVQ and MOVSS/MOVSD are the same opcode, just with different operand size prefix
 405         public static final AMD64RMOp MOVD   = new AMD64RMOp("MOVD",   0x66, P_0F, 0x6E, OpAssertion.IntToFloatAssertion, CPUFeature.SSE2);
 406         public static final AMD64RMOp MOVQ   = new AMD64RMOp("MOVQ",   0x66, P_0F, 0x6E, OpAssertion.IntToFloatAssertion, CPUFeature.SSE2);
 407         public static final AMD64RMOp MOVSS  = new AMD64RMOp("MOVSS",        P_0F, 0x10, OpAssertion.FloatAssertion, CPUFeature.SSE);
 408         public static final AMD64RMOp MOVSD  = new AMD64RMOp("MOVSD",        P_0F, 0x10, OpAssertion.FloatAssertion, CPUFeature.SSE);
 409 
 410         // TEST is documented as MR operation, but it's symmetric, and using it as RM operation is more convenient.
 411         public static final AMD64RMOp TESTB  = new AMD64RMOp("TEST",               0x84, OpAssertion.ByteAssertion);
 412         public static final AMD64RMOp TEST   = new AMD64RMOp("TEST",               0x85);
 413         // @formatter:on
 414 
 415         protected AMD64RMOp(String opcode, int op) {
 416             this(opcode, 0, op);
 417         }
 418 
 419         protected AMD64RMOp(String opcode, int op, OpAssertion assertion) {
 420             this(opcode, 0, op, assertion);
 421         }
 422 
 423         protected AMD64RMOp(String opcode, int prefix, int op) {
 424             this(opcode, 0, prefix, op, null);
 425         }
 426 
 427         protected AMD64RMOp(String opcode, int prefix, int op, OpAssertion assertion) {
 428             this(opcode, 0, prefix, op, assertion, null);
 429         }
 430 
 431         protected AMD64RMOp(String opcode, int prefix, int op, OpAssertion assertion, CPUFeature feature) {
 432             this(opcode, 0, prefix, op, assertion, feature);
 433         }
 434 
 435         protected AMD64RMOp(String opcode, int prefix, int op, boolean dstIsByte, boolean srcIsByte, OpAssertion assertion) {
 436             super(opcode, 0, prefix, op, dstIsByte, srcIsByte, assertion, null);
 437         }
 438 
 439         protected AMD64RMOp(String opcode, int prefix1, int prefix2, int op, CPUFeature feature) {
 440             this(opcode, prefix1, prefix2, op, OpAssertion.WordOrLargerAssertion, feature);
 441         }
 442 
 443         protected AMD64RMOp(String opcode, int prefix1, int prefix2, int op, OpAssertion assertion, CPUFeature feature) {
 444             super(opcode, prefix1, prefix2, op, assertion, feature);
 445         }
 446 
 447         @Override
 448         public final void emit(AMD64Assembler asm, OperandSize size, Register dst, Register src) {
 449             assert verify(asm, size, dst, src);
 450             if (isSSEInstruction()) {
 451                 Register nds = Register.None;
 452                 switch (op) {
 453                     case 0x10:
 454                     case 0x51:
 455                         if ((size == SS) || (size == SD)) {
 456                             nds = dst;
 457                         }
 458                         break;
 459                     case 0x2A:
 460                     case 0x54:
 461                     case 0x55:
 462                     case 0x56:
 463                     case 0x57:
 464                     case 0x58:
 465                     case 0x59:
 466                     case 0x5A:
 467                     case 0x5C:
 468                     case 0x5D:
 469                     case 0x5E:
 470                     case 0x5F:
 471                         nds = dst;
 472                         break;
 473                     default:
 474                         break;
 475                 }
 476                 asm.simdPrefix(dst, nds, src, size, prefix1, prefix2, size == QWORD);
 477                 asm.emitByte(op);
 478                 asm.emitModRM(dst, src);
 479             } else {
 480                 emitOpcode(asm, size, getRXB(dst, src), dst.encoding, src.encoding);
 481                 asm.emitModRM(dst, src);
 482             }
 483         }
 484 
 485         public final void emit(AMD64Assembler asm, OperandSize size, Register dst, AMD64Address src) {
 486             assert verify(asm, size, dst, null);
 487             if (isSSEInstruction()) {
 488                 Register nds = Register.None;
 489                 switch (op) {
 490                     case 0x51:
 491                         if ((size == SS) || (size == SD)) {
 492                             nds = dst;
 493                         }
 494                         break;
 495                     case 0x2A:
 496                     case 0x54:
 497                     case 0x55:
 498                     case 0x56:
 499                     case 0x57:
 500                     case 0x58:
 501                     case 0x59:
 502                     case 0x5A:
 503                     case 0x5C:
 504                     case 0x5D:
 505                     case 0x5E:
 506                     case 0x5F:
 507                         nds = dst;
 508                         break;
 509                     default:
 510                         break;
 511                 }
 512                 asm.simdPrefix(dst, nds, src, size, prefix1, prefix2, size == QWORD);
 513                 asm.emitByte(op);
 514                 asm.emitOperandHelper(dst, src, 0);
 515             } else {
 516                 emitOpcode(asm, size, getRXB(dst, src), dst.encoding, 0);
 517                 asm.emitOperandHelper(dst, src, 0);
 518             }
 519         }
 520     }
 521 
 522     /**
 523      * Opcode with operand order of MR.
 524      */
 525     public static class AMD64MROp extends AMD64RROp {
 526         // @formatter:off
 527         public static final AMD64MROp MOVB   = new AMD64MROp("MOVB",               0x88, OpAssertion.ByteAssertion);
 528         public static final AMD64MROp MOV    = new AMD64MROp("MOV",                0x89);
 529 
 530         // MOVD and MOVQ are the same opcode, just with different operand size prefix
 531         // Note that as MR opcodes, they have reverse operand order, so the IntToFloatingAssertion must be used.
 532         public static final AMD64MROp MOVD   = new AMD64MROp("MOVD",   0x66, P_0F, 0x7E, OpAssertion.IntToFloatAssertion, CPUFeature.SSE2);
 533         public static final AMD64MROp MOVQ   = new AMD64MROp("MOVQ",   0x66, P_0F, 0x7E, OpAssertion.IntToFloatAssertion, CPUFeature.SSE2);
 534 
 535         // MOVSS and MOVSD are the same opcode, just with different operand size prefix
 536         public static final AMD64MROp MOVSS  = new AMD64MROp("MOVSS",        P_0F, 0x11, OpAssertion.FloatAssertion, CPUFeature.SSE);
 537         public static final AMD64MROp MOVSD  = new AMD64MROp("MOVSD",        P_0F, 0x11, OpAssertion.FloatAssertion, CPUFeature.SSE);
 538         // @formatter:on
 539 
 540         protected AMD64MROp(String opcode, int op) {
 541             this(opcode, 0, op);
 542         }
 543 
 544         protected AMD64MROp(String opcode, int op, OpAssertion assertion) {
 545             this(opcode, 0, op, assertion);
 546         }
 547 
 548         protected AMD64MROp(String opcode, int prefix, int op) {
 549             this(opcode, prefix, op, OpAssertion.WordOrLargerAssertion);
 550         }
 551 
 552         protected AMD64MROp(String opcode, int prefix, int op, OpAssertion assertion) {
 553             this(opcode, prefix, op, assertion, null);
 554         }
 555 
 556         protected AMD64MROp(String opcode, int prefix, int op, OpAssertion assertion, CPUFeature feature) {
 557             this(opcode, 0, prefix, op, assertion, feature);
 558         }
 559 
 560         protected AMD64MROp(String opcode, int prefix1, int prefix2, int op, OpAssertion assertion, CPUFeature feature) {
 561             super(opcode, prefix1, prefix2, op, assertion, feature);
 562         }
 563 
 564         @Override
 565         public final void emit(AMD64Assembler asm, OperandSize size, Register dst, Register src) {
 566             assert verify(asm, size, src, dst);
 567             if (isSSEInstruction()) {
 568                 Register nds = Register.None;
 569                 switch (op) {
 570                     case 0x11:
 571                         if ((size == SS) || (size == SD)) {
 572                             nds = src;
 573                         }
 574                         break;
 575                     default:
 576                         break;
 577                 }
 578                 asm.simdPrefix(src, nds, dst, size, prefix1, prefix2, size == QWORD);
 579                 asm.emitByte(op);
 580                 asm.emitModRM(src, dst);
 581             } else {
 582                 emitOpcode(asm, size, getRXB(src, dst), src.encoding, dst.encoding);
 583                 asm.emitModRM(src, dst);
 584             }
 585         }
 586 
 587         public final void emit(AMD64Assembler asm, OperandSize size, AMD64Address dst, Register src) {
 588             assert verify(asm, size, src, null);
 589             if (isSSEInstruction()) {
 590                 asm.simdPrefix(src, Register.None, dst, size, prefix1, prefix2, size == QWORD);
 591                 asm.emitByte(op);
 592             } else {
 593                 emitOpcode(asm, size, getRXB(src, dst), src.encoding, 0);
 594             }
 595             asm.emitOperandHelper(src, dst, 0);
 596         }
 597     }
 598 
 599     /**
 600      * Opcodes with operand order of M.
 601      */
 602     public static class AMD64MOp extends AMD64Op {
 603         // @formatter:off
 604         public static final AMD64MOp NOT  = new AMD64MOp("NOT",  0xF7, 2);
 605         public static final AMD64MOp NEG  = new AMD64MOp("NEG",  0xF7, 3);
 606         public static final AMD64MOp MUL  = new AMD64MOp("MUL",  0xF7, 4);
 607         public static final AMD64MOp IMUL = new AMD64MOp("IMUL", 0xF7, 5);
 608         public static final AMD64MOp DIV  = new AMD64MOp("DIV",  0xF7, 6);
 609         public static final AMD64MOp IDIV = new AMD64MOp("IDIV", 0xF7, 7);
 610         public static final AMD64MOp INC  = new AMD64MOp("INC",  0xFF, 0);
 611         public static final AMD64MOp DEC  = new AMD64MOp("DEC",  0xFF, 1);
 612         public static final AMD64MOp PUSH = new AMD64MOp("PUSH", 0xFF, 6);
 613         public static final AMD64MOp POP  = new AMD64MOp("POP",  0x8F, 0, OpAssertion.WordOrDwordAssertion);
 614         // @formatter:on
 615 
 616         private final int ext;
 617 
 618         protected AMD64MOp(String opcode, int op, int ext) {
 619             this(opcode, 0, op, ext);
 620         }
 621 
 622         protected AMD64MOp(String opcode, int prefix, int op, int ext) {
 623             this(opcode, prefix, op, ext, OpAssertion.WordOrLargerAssertion);
 624         }
 625 
 626         protected AMD64MOp(String opcode, int op, int ext, OpAssertion assertion) {
 627             this(opcode, 0, op, ext, assertion);
 628         }
 629 
 630         protected AMD64MOp(String opcode, int prefix, int op, int ext, OpAssertion assertion) {
 631             super(opcode, 0, prefix, op, assertion, null);
 632             this.ext = ext;
 633         }
 634 
 635         public final void emit(AMD64Assembler asm, OperandSize size, Register dst) {
 636             assert verify(asm, size, dst, null);
 637             emitOpcode(asm, size, getRXB(null, dst), 0, dst.encoding);
 638             asm.emitModRM(ext, dst);
 639         }
 640 
 641         public final void emit(AMD64Assembler asm, OperandSize size, AMD64Address dst) {
 642             assert verify(asm, size, null, null);
 643             emitOpcode(asm, size, getRXB(null, dst), 0, 0);
 644             asm.emitOperandHelper(ext, dst, 0);
 645         }
 646     }
 647 
 648     /**
 649      * Opcodes with operand order of MI.
 650      */
 651     public static class AMD64MIOp extends AMD64ImmOp {
 652         // @formatter:off
 653         public static final AMD64MIOp MOVB = new AMD64MIOp("MOVB", true,  0xC6, 0, OpAssertion.ByteAssertion);
 654         public static final AMD64MIOp MOV  = new AMD64MIOp("MOV",  false, 0xC7, 0);
 655         public static final AMD64MIOp TEST = new AMD64MIOp("TEST", false, 0xF7, 0);
 656         // @formatter:on
 657 
 658         private final int ext;
 659 
 660         protected AMD64MIOp(String opcode, boolean immIsByte, int op, int ext) {
 661             this(opcode, immIsByte, op, ext, OpAssertion.WordOrLargerAssertion);
 662         }
 663 
 664         protected AMD64MIOp(String opcode, boolean immIsByte, int op, int ext, OpAssertion assertion) {
 665             this(opcode, immIsByte, 0, op, ext, assertion);
 666         }
 667 
 668         protected AMD64MIOp(String opcode, boolean immIsByte, int prefix, int op, int ext, OpAssertion assertion) {
 669             super(opcode, immIsByte, prefix, op, assertion);
 670             this.ext = ext;
 671         }
 672 
 673         public final void emit(AMD64Assembler asm, OperandSize size, Register dst, int imm) {
 674             emit(asm, size, dst, imm, false);
 675         }
 676 
 677         public final void emit(AMD64Assembler asm, OperandSize size, Register dst, int imm, boolean annotateImm) {
 678             assert verify(asm, size, dst, null);
 679             int insnPos = asm.position();
 680             emitOpcode(asm, size, getRXB(null, dst), 0, dst.encoding);
 681             asm.emitModRM(ext, dst);
 682             int immPos = asm.position();
 683             emitImmediate(asm, size, imm);
 684             int nextInsnPos = asm.position();
 685             if (annotateImm && asm.codePatchingAnnotationConsumer != null) {
 686                 asm.codePatchingAnnotationConsumer.accept(new OperandDataAnnotation(insnPos, immPos, nextInsnPos - immPos, nextInsnPos));
 687             }
 688         }
 689 
 690         public final void emit(AMD64Assembler asm, OperandSize size, AMD64Address dst, int imm) {
 691             emit(asm, size, dst, imm, false);
 692         }
 693 
 694         public final void emit(AMD64Assembler asm, OperandSize size, AMD64Address dst, int imm, boolean annotateImm) {
 695             assert verify(asm, size, null, null);
 696             int insnPos = asm.position();
 697             emitOpcode(asm, size, getRXB(null, dst), 0, 0);
 698             asm.emitOperandHelper(ext, dst, immediateSize(size));
 699             int immPos = asm.position();
 700             emitImmediate(asm, size, imm);
 701             int nextInsnPos = asm.position();
 702             if (annotateImm && asm.codePatchingAnnotationConsumer != null) {
 703                 asm.codePatchingAnnotationConsumer.accept(new OperandDataAnnotation(insnPos, immPos, nextInsnPos - immPos, nextInsnPos));
 704             }
 705         }
 706     }
 707 
 708     /**
 709      * Opcodes with operand order of RMI.
 710      *
 711      * We only have one form of round as the operation is always treated with single variant input,
 712      * making its extension to 3 address forms redundant.
 713      */
 714     public static class AMD64RMIOp extends AMD64ImmOp {
 715         // @formatter:off
 716         public static final AMD64RMIOp IMUL    = new AMD64RMIOp("IMUL", false, 0x69);
 717         public static final AMD64RMIOp IMUL_SX = new AMD64RMIOp("IMUL", true,  0x6B);
 718         public static final AMD64RMIOp ROUNDSS = new AMD64RMIOp("ROUNDSS", true, P_0F3A, 0x0A, OpAssertion.PackedDoubleAssertion, CPUFeature.SSE4_1);
 719         public static final AMD64RMIOp ROUNDSD = new AMD64RMIOp("ROUNDSD", true, P_0F3A, 0x0B, OpAssertion.PackedDoubleAssertion, CPUFeature.SSE4_1);
 720         // @formatter:on
 721 
 722         protected AMD64RMIOp(String opcode, boolean immIsByte, int op) {
 723             this(opcode, immIsByte, 0, op, OpAssertion.WordOrLargerAssertion, null);
 724         }
 725 
 726         protected AMD64RMIOp(String opcode, boolean immIsByte, int prefix, int op, OpAssertion assertion, CPUFeature feature) {
 727             super(opcode, immIsByte, prefix, op, assertion, feature);
 728         }
 729 
 730         public final void emit(AMD64Assembler asm, OperandSize size, Register dst, Register src, int imm) {
 731             assert verify(asm, size, dst, src);
 732             if (isSSEInstruction()) {
 733                 Register nds = Register.None;
 734                 switch (op) {
 735                     case 0x0A:
 736                     case 0x0B:
 737                         nds = dst;
 738                         break;
 739                     default:
 740                         break;
 741                 }
 742                 asm.simdPrefix(dst, nds, src, size, prefix1, prefix2, false);
 743                 asm.emitByte(op);
 744                 asm.emitModRM(dst, src);
 745             } else {
 746                 emitOpcode(asm, size, getRXB(dst, src), dst.encoding, src.encoding);
 747                 asm.emitModRM(dst, src);
 748             }
 749             emitImmediate(asm, size, imm);
 750         }
 751 
 752         public final void emit(AMD64Assembler asm, OperandSize size, Register dst, AMD64Address src, int imm) {
 753             assert verify(asm, size, dst, null);
 754             if (isSSEInstruction()) {
 755                 Register nds = Register.None;
 756                 switch (op) {
 757                     case 0x0A:
 758                     case 0x0B:
 759                         nds = dst;
 760                         break;
 761                     default:
 762                         break;
 763                 }
 764                 asm.simdPrefix(dst, nds, src, size, prefix1, prefix2, false);
 765                 asm.emitByte(op);
 766             } else {
 767                 emitOpcode(asm, size, getRXB(dst, src), dst.encoding, 0);
 768             }
 769             asm.emitOperandHelper(dst, src, immediateSize(size));
 770             emitImmediate(asm, size, imm);
 771         }
 772     }
 773 
 774     public static class SSEOp extends AMD64RMOp {
 775         // @formatter:off
 776         public static final SSEOp CVTSI2SS  = new SSEOp("CVTSI2SS",  0xF3, P_0F, 0x2A, OpAssertion.IntToFloatAssertion);
 777         public static final SSEOp CVTSI2SD  = new SSEOp("CVTSI2SD",  0xF2, P_0F, 0x2A, OpAssertion.IntToFloatAssertion);
 778         public static final SSEOp CVTTSS2SI = new SSEOp("CVTTSS2SI", 0xF3, P_0F, 0x2C, OpAssertion.FloatToIntAssertion);
 779         public static final SSEOp CVTTSD2SI = new SSEOp("CVTTSD2SI", 0xF2, P_0F, 0x2C, OpAssertion.FloatToIntAssertion);
 780         public static final SSEOp UCOMIS    = new SSEOp("UCOMIS",          P_0F, 0x2E, OpAssertion.PackedFloatAssertion);
 781         public static final SSEOp SQRT      = new SSEOp("SQRT",            P_0F, 0x51);
 782         public static final SSEOp AND       = new SSEOp("AND",             P_0F, 0x54, OpAssertion.PackedFloatAssertion);
 783         public static final SSEOp ANDN      = new SSEOp("ANDN",            P_0F, 0x55, OpAssertion.PackedFloatAssertion);
 784         public static final SSEOp OR        = new SSEOp("OR",              P_0F, 0x56, OpAssertion.PackedFloatAssertion);
 785         public static final SSEOp XOR       = new SSEOp("XOR",             P_0F, 0x57, OpAssertion.PackedFloatAssertion);
 786         public static final SSEOp ADD       = new SSEOp("ADD",             P_0F, 0x58);
 787         public static final SSEOp MUL       = new SSEOp("MUL",             P_0F, 0x59);
 788         public static final SSEOp CVTSS2SD  = new SSEOp("CVTSS2SD",        P_0F, 0x5A, OpAssertion.SingleAssertion);
 789         public static final SSEOp CVTSD2SS  = new SSEOp("CVTSD2SS",        P_0F, 0x5A, OpAssertion.DoubleAssertion);
 790         public static final SSEOp SUB       = new SSEOp("SUB",             P_0F, 0x5C);
 791         public static final SSEOp MIN       = new SSEOp("MIN",             P_0F, 0x5D);
 792         public static final SSEOp DIV       = new SSEOp("DIV",             P_0F, 0x5E);
 793         public static final SSEOp MAX       = new SSEOp("MAX",             P_0F, 0x5F);
 794         // @formatter:on
 795 
 796         protected SSEOp(String opcode, int prefix, int op) {
 797             this(opcode, prefix, op, OpAssertion.FloatAssertion);
 798         }
 799 
 800         protected SSEOp(String opcode, int prefix, int op, OpAssertion assertion) {
 801             this(opcode, 0, prefix, op, assertion);
 802         }
 803 
 804         protected SSEOp(String opcode, int mandatoryPrefix, int prefix, int op, OpAssertion assertion) {
 805             super(opcode, mandatoryPrefix, prefix, op, assertion, CPUFeature.SSE2);
 806         }
 807     }
 808 
 809     /**
 810      * Arithmetic operation with operand order of RM, MR or MI.
 811      */
 812     public static final class AMD64BinaryArithmetic {
 813         // @formatter:off
 814         public static final AMD64BinaryArithmetic ADD = new AMD64BinaryArithmetic("ADD", 0);
 815         public static final AMD64BinaryArithmetic OR  = new AMD64BinaryArithmetic("OR",  1);
 816         public static final AMD64BinaryArithmetic ADC = new AMD64BinaryArithmetic("ADC", 2);
 817         public static final AMD64BinaryArithmetic SBB = new AMD64BinaryArithmetic("SBB", 3);
 818         public static final AMD64BinaryArithmetic AND = new AMD64BinaryArithmetic("AND", 4);
 819         public static final AMD64BinaryArithmetic SUB = new AMD64BinaryArithmetic("SUB", 5);
 820         public static final AMD64BinaryArithmetic XOR = new AMD64BinaryArithmetic("XOR", 6);
 821         public static final AMD64BinaryArithmetic CMP = new AMD64BinaryArithmetic("CMP", 7);
 822         // @formatter:on
 823 
 824         private final AMD64MIOp byteImmOp;
 825         private final AMD64MROp byteMrOp;
 826         private final AMD64RMOp byteRmOp;
 827 
 828         private final AMD64MIOp immOp;
 829         private final AMD64MIOp immSxOp;
 830         private final AMD64MROp mrOp;
 831         private final AMD64RMOp rmOp;
 832 
 833         private AMD64BinaryArithmetic(String opcode, int code) {
 834             int baseOp = code << 3;
 835 
 836             byteImmOp = new AMD64MIOp(opcode, true, 0, 0x80, code, OpAssertion.ByteAssertion);
 837             byteMrOp = new AMD64MROp(opcode, 0, baseOp, OpAssertion.ByteAssertion);
 838             byteRmOp = new AMD64RMOp(opcode, 0, baseOp | 0x02, OpAssertion.ByteAssertion);
 839 
 840             immOp = new AMD64MIOp(opcode, false, 0, 0x81, code, OpAssertion.WordOrLargerAssertion);
 841             immSxOp = new AMD64MIOp(opcode, true, 0, 0x83, code, OpAssertion.WordOrLargerAssertion);
 842             mrOp = new AMD64MROp(opcode, 0, baseOp | 0x01, OpAssertion.WordOrLargerAssertion);
 843             rmOp = new AMD64RMOp(opcode, 0, baseOp | 0x03, OpAssertion.WordOrLargerAssertion);
 844         }
 845 
 846         public AMD64MIOp getMIOpcode(OperandSize size, boolean sx) {
 847             if (size == BYTE) {
 848                 return byteImmOp;
 849             } else if (sx) {
 850                 return immSxOp;
 851             } else {
 852                 return immOp;
 853             }
 854         }
 855 
 856         public AMD64MROp getMROpcode(OperandSize size) {
 857             if (size == BYTE) {
 858                 return byteMrOp;
 859             } else {
 860                 return mrOp;
 861             }
 862         }
 863 
 864         public AMD64RMOp getRMOpcode(OperandSize size) {
 865             if (size == BYTE) {
 866                 return byteRmOp;
 867             } else {
 868                 return rmOp;
 869             }
 870         }
 871     }
 872 
 873     /**
 874      * Shift operation with operand order of M1, MC or MI.
 875      */
 876     public static final class AMD64Shift {
 877         // @formatter:off
 878         public static final AMD64Shift ROL = new AMD64Shift("ROL", 0);
 879         public static final AMD64Shift ROR = new AMD64Shift("ROR", 1);
 880         public static final AMD64Shift RCL = new AMD64Shift("RCL", 2);
 881         public static final AMD64Shift RCR = new AMD64Shift("RCR", 3);
 882         public static final AMD64Shift SHL = new AMD64Shift("SHL", 4);
 883         public static final AMD64Shift SHR = new AMD64Shift("SHR", 5);
 884         public static final AMD64Shift SAR = new AMD64Shift("SAR", 7);
 885         // @formatter:on
 886 
 887         public final AMD64MOp m1Op;
 888         public final AMD64MOp mcOp;
 889         public final AMD64MIOp miOp;
 890 
 891         private AMD64Shift(String opcode, int code) {
 892             m1Op = new AMD64MOp(opcode, 0, 0xD1, code, OpAssertion.WordOrLargerAssertion);
 893             mcOp = new AMD64MOp(opcode, 0, 0xD3, code, OpAssertion.WordOrLargerAssertion);
 894             miOp = new AMD64MIOp(opcode, true, 0, 0xC1, code, OpAssertion.WordOrLargerAssertion);
 895         }
 896     }
 897 
 898     private enum VEXOpAssertion {
 899         AVX1(CPUFeature.AVX, CPUFeature.AVX),
 900         AVX1_2(CPUFeature.AVX, CPUFeature.AVX2),
 901         AVX2(CPUFeature.AVX2, CPUFeature.AVX2),
 902         AVX1_128ONLY(CPUFeature.AVX, null),
 903         AVX1_256ONLY(null, CPUFeature.AVX),
 904         AVX2_256ONLY(null, CPUFeature.AVX2),
 905         XMM_CPU(CPUFeature.AVX, null, XMM, null, CPU, null),
 906         XMM_XMM_CPU(CPUFeature.AVX, null, XMM, XMM, CPU, null),
 907         CPU_XMM(CPUFeature.AVX, null, CPU, null, XMM, null),
 908         AVX1_2_CPU_XMM(CPUFeature.AVX, CPUFeature.AVX2, CPU, null, XMM, null),
 909         BMI1(CPUFeature.BMI1, null, CPU, CPU, CPU, null),
 910         BMI2(CPUFeature.BMI2, null, CPU, CPU, CPU, null),
 911         FMA(CPUFeature.FMA, null, XMM, XMM, XMM, null);
 912 
 913         private final CPUFeature l128feature;
 914         private final CPUFeature l256feature;
 915 
 916         private final RegisterCategory rCategory;
 917         private final RegisterCategory vCategory;
 918         private final RegisterCategory mCategory;
 919         private final RegisterCategory imm8Category;
 920 
 921         VEXOpAssertion(CPUFeature l128feature, CPUFeature l256feature) {
 922             this(l128feature, l256feature, XMM, XMM, XMM, XMM);
 923         }
 924 
 925         VEXOpAssertion(CPUFeature l128feature, CPUFeature l256feature, RegisterCategory rCategory, RegisterCategory vCategory, RegisterCategory mCategory, RegisterCategory imm8Category) {
 926             this.l128feature = l128feature;
 927             this.l256feature = l256feature;
 928             this.rCategory = rCategory;
 929             this.vCategory = vCategory;
 930             this.mCategory = mCategory;
 931             this.imm8Category = imm8Category;
 932         }
 933 
 934         public boolean check(AMD64 arch, AVXSize size, Register r, Register v, Register m) {
 935             return check(arch, getLFlag(size), r, v, m, null);
 936         }
 937 
 938         public boolean check(AMD64 arch, AVXSize size, Register r, Register v, Register m, Register imm8) {
 939             return check(arch, getLFlag(size), r, v, m, imm8);
 940         }
 941 
 942         public boolean check(AMD64 arch, int l, Register r, Register v, Register m, Register imm8) {
 943             switch (l) {
 944                 case L128:
 945                     assert l128feature != null && arch.getFeatures().contains(l128feature) : "emitting illegal 128 bit instruction";
 946                     break;
 947                 case L256:
 948                     assert l256feature != null && arch.getFeatures().contains(l256feature) : "emitting illegal 256 bit instruction";
 949                     break;
 950             }
 951             if (r != null) {
 952                 assert r.getRegisterCategory().equals(rCategory);
 953             }
 954             if (v != null) {
 955                 assert v.getRegisterCategory().equals(vCategory);
 956             }
 957             if (m != null) {
 958                 assert m.getRegisterCategory().equals(mCategory);
 959             }
 960             if (imm8 != null) {
 961                 assert imm8.getRegisterCategory().equals(imm8Category);
 962             }
 963             return true;
 964         }
 965 
 966         public boolean supports(EnumSet<CPUFeature> features, AVXSize avxSize) {
 967             switch (avxSize) {
 968                 case XMM:
 969                     return l128feature != null && features.contains(l128feature);
 970                 case YMM:
 971                     return l256feature != null && features.contains(l256feature);
 972                 default:
 973                     throw GraalError.shouldNotReachHere();
 974             }
 975         }
 976     }
 977 
 978     /**
 979      * Base class for VEX-encoded instructions.
 980      */
 981     public static class VexOp {
 982         protected final int pp;
 983         protected final int mmmmm;
 984         protected final int w;
 985         protected final int op;
 986 
 987         private final String opcode;
 988         protected final VEXOpAssertion assertion;
 989 
 990         protected VexOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
 991             this.pp = pp;
 992             this.mmmmm = mmmmm;
 993             this.w = w;
 994             this.op = op;
 995             this.opcode = opcode;
 996             this.assertion = assertion;
 997         }
 998 
 999         public final boolean isSupported(AMD64Assembler vasm, AVXSize size) {
1000             return assertion.supports(((AMD64) vasm.target.arch).getFeatures(), size);
1001         }
1002 
1003         @Override
1004         public String toString() {
1005             return opcode;
1006         }
1007     }
1008 
1009     /**
1010      * VEX-encoded instructions with an operand order of RM, but the M operand must be a register.
1011      */
1012     public static class VexRROp extends VexOp {
1013         // @formatter:off
1014         public static final VexRROp VMASKMOVDQU = new VexRROp("VMASKMOVDQU", P_66, M_0F, WIG, 0xF7, VEXOpAssertion.AVX1_128ONLY);
1015         // @formatter:on
1016 
1017         protected VexRROp(String opcode, int pp, int mmmmm, int w, int op) {
1018             this(opcode, pp, mmmmm, w, op, VEXOpAssertion.AVX1);
1019         }
1020 
1021         protected VexRROp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
1022             super(opcode, pp, mmmmm, w, op, assertion);
1023         }
1024 
1025         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src) {
1026             assert assertion.check((AMD64) asm.target.arch, size, dst, null, src);
1027             assert op != 0x1A || op != 0x5A;
1028             asm.vexPrefix(dst, Register.None, src, size, pp, mmmmm, w, false);
1029             asm.emitByte(op);
1030             asm.emitModRM(dst, src);
1031         }
1032     }
1033 
1034     /**
1035      * VEX-encoded instructions with an operand order of RM.
1036      */
1037     public static class VexRMOp extends VexRROp {
1038         // @formatter:off
1039         public static final VexRMOp VCVTTSS2SI      = new VexRMOp("VCVTTSS2SI",      P_F3, M_0F,   W0,  0x2C, VEXOpAssertion.CPU_XMM);
1040         public static final VexRMOp VCVTTSS2SQ      = new VexRMOp("VCVTTSS2SQ",      P_F3, M_0F,   W1,  0x2C, VEXOpAssertion.CPU_XMM);
1041         public static final VexRMOp VCVTTSD2SI      = new VexRMOp("VCVTTSD2SI",      P_F2, M_0F,   W0,  0x2C, VEXOpAssertion.CPU_XMM);
1042         public static final VexRMOp VCVTTSD2SQ      = new VexRMOp("VCVTTSD2SQ",      P_F2, M_0F,   W1,  0x2C, VEXOpAssertion.CPU_XMM);
1043         public static final VexRMOp VCVTPS2PD       = new VexRMOp("VCVTPS2PD",       P_,   M_0F,   WIG, 0x5A);
1044         public static final VexRMOp VCVTPD2PS       = new VexRMOp("VCVTPD2PS",       P_66, M_0F,   WIG, 0x5A);
1045         public static final VexRMOp VCVTDQ2PS       = new VexRMOp("VCVTDQ2PS",       P_,   M_0F,   WIG, 0x5B);
1046         public static final VexRMOp VCVTTPS2DQ      = new VexRMOp("VCVTTPS2DQ",      P_F3, M_0F,   WIG, 0x5B);
1047         public static final VexRMOp VCVTTPD2DQ      = new VexRMOp("VCVTTPD2DQ",      P_66, M_0F,   WIG, 0xE6);
1048         public static final VexRMOp VCVTDQ2PD       = new VexRMOp("VCVTDQ2PD",       P_F3, M_0F,   WIG, 0xE6);
1049         public static final VexRMOp VBROADCASTSS    = new VexRMOp("VBROADCASTSS",    P_66, M_0F38, W0,  0x18);
1050         public static final VexRMOp VBROADCASTSD    = new VexRMOp("VBROADCASTSD",    P_66, M_0F38, W0,  0x19, VEXOpAssertion.AVX1_256ONLY);
1051         public static final VexRMOp VBROADCASTF128  = new VexRMOp("VBROADCASTF128",  P_66, M_0F38, W0,  0x1A, VEXOpAssertion.AVX1_256ONLY);
1052         public static final VexRMOp VPBROADCASTI128 = new VexRMOp("VPBROADCASTI128", P_66, M_0F38, W0,  0x5A, VEXOpAssertion.AVX2_256ONLY);
1053         public static final VexRMOp VPBROADCASTB    = new VexRMOp("VPBROADCASTB",    P_66, M_0F38, W0,  0x78, VEXOpAssertion.AVX2);
1054         public static final VexRMOp VPBROADCASTW    = new VexRMOp("VPBROADCASTW",    P_66, M_0F38, W0,  0x79, VEXOpAssertion.AVX2);
1055         public static final VexRMOp VPBROADCASTD    = new VexRMOp("VPBROADCASTD",    P_66, M_0F38, W0,  0x58, VEXOpAssertion.AVX2);
1056         public static final VexRMOp VPBROADCASTQ    = new VexRMOp("VPBROADCASTQ",    P_66, M_0F38, W0,  0x59, VEXOpAssertion.AVX2);
1057         public static final VexRMOp VPMOVMSKB       = new VexRMOp("VPMOVMSKB",       P_66, M_0F,   WIG, 0xD7, VEXOpAssertion.AVX1_2_CPU_XMM);
1058         public static final VexRMOp VPMOVSXBW       = new VexRMOp("VPMOVSXBW",       P_66, M_0F38, WIG, 0x20);
1059         public static final VexRMOp VPMOVSXBD       = new VexRMOp("VPMOVSXBD",       P_66, M_0F38, WIG, 0x21);
1060         public static final VexRMOp VPMOVSXBQ       = new VexRMOp("VPMOVSXBQ",       P_66, M_0F38, WIG, 0x22);
1061         public static final VexRMOp VPMOVSXWD       = new VexRMOp("VPMOVSXWD",       P_66, M_0F38, WIG, 0x23);
1062         public static final VexRMOp VPMOVSXWQ       = new VexRMOp("VPMOVSXWQ",       P_66, M_0F38, WIG, 0x24);
1063         public static final VexRMOp VPMOVSXDQ       = new VexRMOp("VPMOVSXDQ",       P_66, M_0F38, WIG, 0x25);
1064         public static final VexRMOp VPMOVZXBW       = new VexRMOp("VPMOVZXBW",       P_66, M_0F38, WIG, 0x30);
1065         public static final VexRMOp VPMOVZXBD       = new VexRMOp("VPMOVZXBD",       P_66, M_0F38, WIG, 0x31);
1066         public static final VexRMOp VPMOVZXBQ       = new VexRMOp("VPMOVZXBQ",       P_66, M_0F38, WIG, 0x32);
1067         public static final VexRMOp VPMOVZXWD       = new VexRMOp("VPMOVZXWD",       P_66, M_0F38, WIG, 0x33);
1068         public static final VexRMOp VPMOVZXWQ       = new VexRMOp("VPMOVZXWQ",       P_66, M_0F38, WIG, 0x34);
1069         public static final VexRMOp VPMOVZXDQ       = new VexRMOp("VPMOVZXDQ",       P_66, M_0F38, WIG, 0x35);
1070         public static final VexRMOp VPTEST          = new VexRMOp("VPTEST",          P_66, M_0F38, WIG, 0x17);
1071         public static final VexRMOp VSQRTPD         = new VexRMOp("VSQRTPD",         P_66, M_0F,   WIG, 0x51);
1072         public static final VexRMOp VSQRTPS         = new VexRMOp("VSQRTPS",         P_,   M_0F,   WIG, 0x51);
1073         public static final VexRMOp VSQRTSD         = new VexRMOp("VSQRTSD",         P_F2, M_0F,   WIG, 0x51);
1074         public static final VexRMOp VSQRTSS         = new VexRMOp("VSQRTSS",         P_F3, M_0F,   WIG, 0x51);
1075         public static final VexRMOp VUCOMISS        = new VexRMOp("VUCOMISS",        P_,   M_0F,   WIG, 0x2E);
1076         public static final VexRMOp VUCOMISD        = new VexRMOp("VUCOMISD",        P_66, M_0F,   WIG, 0x2E);
1077         // @formatter:on
1078 
1079         protected VexRMOp(String opcode, int pp, int mmmmm, int w, int op) {
1080             this(opcode, pp, mmmmm, w, op, VEXOpAssertion.AVX1);
1081         }
1082 
1083         protected VexRMOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
1084             super(opcode, pp, mmmmm, w, op, assertion);
1085         }
1086 
1087         public void emit(AMD64Assembler asm, AVXSize size, Register dst, AMD64Address src) {
1088             assert assertion.check((AMD64) asm.target.arch, size, dst, null, null);
1089             asm.vexPrefix(dst, Register.None, src, size, pp, mmmmm, w, false);
1090             asm.emitByte(op);
1091             asm.emitOperandHelper(dst, src, 0);
1092         }
1093     }
1094 
1095     /**
1096      * VEX-encoded move instructions.
1097      * <p>
1098      * These instructions have two opcodes: op is the forward move instruction with an operand order
1099      * of RM, and opReverse is the reverse move instruction with an operand order of MR.
1100      */
1101     public static final class VexMoveOp extends VexRMOp {
1102         // @formatter:off
1103         public static final VexMoveOp VMOVDQA = new VexMoveOp("VMOVDQA", P_66, M_0F, WIG, 0x6F, 0x7F);
1104         public static final VexMoveOp VMOVDQU = new VexMoveOp("VMOVDQU", P_F3, M_0F, WIG, 0x6F, 0x7F);
1105         public static final VexMoveOp VMOVAPS = new VexMoveOp("VMOVAPS", P_,   M_0F, WIG, 0x28, 0x29);
1106         public static final VexMoveOp VMOVAPD = new VexMoveOp("VMOVAPD", P_66, M_0F, WIG, 0x28, 0x29);
1107         public static final VexMoveOp VMOVUPS = new VexMoveOp("VMOVUPS", P_,   M_0F, WIG, 0x10, 0x11);
1108         public static final VexMoveOp VMOVUPD = new VexMoveOp("VMOVUPD", P_66, M_0F, WIG, 0x10, 0x11);
1109         public static final VexMoveOp VMOVSS  = new VexMoveOp("VMOVSS",  P_F3, M_0F, WIG, 0x10, 0x11);
1110         public static final VexMoveOp VMOVSD  = new VexMoveOp("VMOVSD",  P_F2, M_0F, WIG, 0x10, 0x11);
1111         public static final VexMoveOp VMOVD   = new VexMoveOp("VMOVD",   P_66, M_0F, W0,  0x6E, 0x7E, VEXOpAssertion.XMM_CPU);
1112         public static final VexMoveOp VMOVQ   = new VexMoveOp("VMOVQ",   P_66, M_0F, W1,  0x6E, 0x7E, VEXOpAssertion.XMM_CPU);
1113         // @formatter:on
1114 
1115         private final int opReverse;
1116 
1117         private VexMoveOp(String opcode, int pp, int mmmmm, int w, int op, int opReverse) {
1118             this(opcode, pp, mmmmm, w, op, opReverse, VEXOpAssertion.AVX1);
1119         }
1120 
1121         private VexMoveOp(String opcode, int pp, int mmmmm, int w, int op, int opReverse, VEXOpAssertion assertion) {
1122             super(opcode, pp, mmmmm, w, op, assertion);
1123             this.opReverse = opReverse;
1124         }
1125 
1126         public void emit(AMD64Assembler asm, AVXSize size, AMD64Address dst, Register src) {
1127             assert assertion.check((AMD64) asm.target.arch, size, src, null, null);
1128             asm.vexPrefix(src, Register.None, dst, size, pp, mmmmm, w, false);
1129             asm.emitByte(opReverse);
1130             asm.emitOperandHelper(src, dst, 0);
1131         }
1132 
1133         public void emitReverse(AMD64Assembler asm, AVXSize size, Register dst, Register src) {
1134             assert assertion.check((AMD64) asm.target.arch, size, src, null, dst);
1135             asm.vexPrefix(src, Register.None, dst, size, pp, mmmmm, w, false);
1136             asm.emitByte(opReverse);
1137             asm.emitModRM(src, dst);
1138         }
1139     }
1140 
1141     public interface VexRRIOp {
1142         void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src, int imm8);
1143     }
1144 
1145     /**
1146      * VEX-encoded instructions with an operand order of RMI.
1147      */
1148     public static final class VexRMIOp extends VexOp implements VexRRIOp {
1149         // @formatter:off
1150         public static final VexRMIOp VPERMQ   = new VexRMIOp("VPERMQ",   P_66, M_0F3A, W1,  0x00, VEXOpAssertion.AVX2_256ONLY);
1151         public static final VexRMIOp VPSHUFLW = new VexRMIOp("VPSHUFLW", P_F2, M_0F,   WIG, 0x70, VEXOpAssertion.AVX1_2);
1152         public static final VexRMIOp VPSHUFHW = new VexRMIOp("VPSHUFHW", P_F3, M_0F,   WIG, 0x70, VEXOpAssertion.AVX1_2);
1153         public static final VexRMIOp VPSHUFD  = new VexRMIOp("VPSHUFD",  P_66, M_0F,   WIG, 0x70, VEXOpAssertion.AVX1_2);
1154         // @formatter:on
1155 
1156         private VexRMIOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
1157             super(opcode, pp, mmmmm, w, op, assertion);
1158         }
1159 
1160         @Override
1161         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src, int imm8) {
1162             assert assertion.check((AMD64) asm.target.arch, size, dst, null, src);
1163             asm.vexPrefix(dst, Register.None, src, size, pp, mmmmm, w, false);
1164             asm.emitByte(op);
1165             asm.emitModRM(dst, src);
1166             asm.emitByte(imm8);
1167         }
1168 
1169         public void emit(AMD64Assembler asm, AVXSize size, Register dst, AMD64Address src, int imm8) {
1170             assert assertion.check((AMD64) asm.target.arch, size, dst, null, null);
1171             asm.vexPrefix(dst, Register.None, src, size, pp, mmmmm, w, false);
1172             asm.emitByte(op);
1173             asm.emitOperandHelper(dst, src, 1);
1174             asm.emitByte(imm8);
1175         }
1176     }
1177 
1178     /**
1179      * VEX-encoded instructions with an operand order of MRI.
1180      */
1181     public static final class VexMRIOp extends VexOp implements VexRRIOp {
1182         // @formatter:off
1183         public static final VexMRIOp VEXTRACTF128 = new VexMRIOp("VEXTRACTF128", P_66, M_0F3A, W0, 0x19, VEXOpAssertion.AVX1_256ONLY);
1184         public static final VexMRIOp VEXTRACTI128 = new VexMRIOp("VEXTRACTI128", P_66, M_0F3A, W0, 0x39, VEXOpAssertion.AVX2_256ONLY);
1185         public static final VexMRIOp VPEXTRB      = new VexMRIOp("VPEXTRB",      P_66, M_0F3A, W0, 0x14, VEXOpAssertion.XMM_CPU);
1186         public static final VexMRIOp VPEXTRW      = new VexMRIOp("VPEXTRW",      P_66, M_0F3A, W0, 0x15, VEXOpAssertion.XMM_CPU);
1187         public static final VexMRIOp VPEXTRD      = new VexMRIOp("VPEXTRD",      P_66, M_0F3A, W0, 0x16, VEXOpAssertion.XMM_CPU);
1188         public static final VexMRIOp VPEXTRQ      = new VexMRIOp("VPEXTRQ",      P_66, M_0F3A, W1, 0x16, VEXOpAssertion.XMM_CPU);
1189         // @formatter:on
1190 
1191         private VexMRIOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
1192             super(opcode, pp, mmmmm, w, op, assertion);
1193         }
1194 
1195         @Override
1196         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src, int imm8) {
1197             assert assertion.check((AMD64) asm.target.arch, size, src, null, dst);
1198             asm.vexPrefix(src, Register.None, dst, size, pp, mmmmm, w, false);
1199             asm.emitByte(op);
1200             asm.emitModRM(src, dst);
1201             asm.emitByte(imm8);
1202         }
1203 
1204         public void emit(AMD64Assembler asm, AVXSize size, AMD64Address dst, Register src, int imm8) {
1205             assert assertion.check((AMD64) asm.target.arch, size, src, null, null);
1206             asm.vexPrefix(src, Register.None, dst, size, pp, mmmmm, w, false);
1207             asm.emitByte(op);
1208             asm.emitOperandHelper(src, dst, 1);
1209             asm.emitByte(imm8);
1210         }
1211     }
1212 
1213     /**
1214      * VEX-encoded instructions with an operand order of RVMR.
1215      */
1216     public static class VexRVMROp extends VexOp {
1217         // @formatter:off
1218         public static final VexRVMROp VPBLENDVB  = new VexRVMROp("VPBLENDVB",  P_66, M_0F3A, W0, 0x4C, VEXOpAssertion.AVX1_2);
1219         public static final VexRVMROp VPBLENDVPS = new VexRVMROp("VPBLENDVPS", P_66, M_0F3A, W0, 0x4A, VEXOpAssertion.AVX1);
1220         public static final VexRVMROp VPBLENDVPD = new VexRVMROp("VPBLENDVPD", P_66, M_0F3A, W0, 0x4B, VEXOpAssertion.AVX1);
1221         // @formatter:on
1222 
1223         protected VexRVMROp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
1224             super(opcode, pp, mmmmm, w, op, assertion);
1225         }
1226 
1227         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register mask, Register src1, Register src2) {
1228             assert assertion.check((AMD64) asm.target.arch, size, dst, mask, src1, src2);
1229             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w, false);
1230             asm.emitByte(op);
1231             asm.emitModRM(dst, src2);
1232             asm.emitByte(mask.encoding() << 4);
1233         }
1234 
1235         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register mask, Register src1, AMD64Address src2) {
1236             assert assertion.check((AMD64) asm.target.arch, size, dst, mask, src1, null);
1237             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w, false);
1238             asm.emitByte(op);
1239             asm.emitOperandHelper(dst, src2, 0);
1240             asm.emitByte(mask.encoding() << 4);
1241         }
1242     }
1243 
1244     /**
1245      * VEX-encoded instructions with an operand order of RVM.
1246      */
1247     public static class VexRVMOp extends VexOp {
1248         // @formatter:off
1249         public static final VexRVMOp VANDPS    = new VexRVMOp("VANDPS",    P_,   M_0F,   WIG, 0x54);
1250         public static final VexRVMOp VANDPD    = new VexRVMOp("VANDPD",    P_66, M_0F,   WIG, 0x54);
1251         public static final VexRVMOp VANDNPS   = new VexRVMOp("VANDNPS",   P_,   M_0F,   WIG, 0x55);
1252         public static final VexRVMOp VANDNPD   = new VexRVMOp("VANDNPD",   P_66, M_0F,   WIG, 0x55);
1253         public static final VexRVMOp VORPS     = new VexRVMOp("VORPS",     P_,   M_0F,   WIG, 0x56);
1254         public static final VexRVMOp VORPD     = new VexRVMOp("VORPD",     P_66, M_0F,   WIG, 0x56);
1255         public static final VexRVMOp VXORPS    = new VexRVMOp("VXORPS",    P_,   M_0F,   WIG, 0x57);
1256         public static final VexRVMOp VXORPD    = new VexRVMOp("VXORPD",    P_66, M_0F,   WIG, 0x57);
1257         public static final VexRVMOp VADDPS    = new VexRVMOp("VADDPS",    P_,   M_0F,   WIG, 0x58);
1258         public static final VexRVMOp VADDPD    = new VexRVMOp("VADDPD",    P_66, M_0F,   WIG, 0x58);
1259         public static final VexRVMOp VADDSS    = new VexRVMOp("VADDSS",    P_F3, M_0F,   WIG, 0x58);
1260         public static final VexRVMOp VADDSD    = new VexRVMOp("VADDSD",    P_F2, M_0F,   WIG, 0x58);
1261         public static final VexRVMOp VMULPS    = new VexRVMOp("VMULPS",    P_,   M_0F,   WIG, 0x59);
1262         public static final VexRVMOp VMULPD    = new VexRVMOp("VMULPD",    P_66, M_0F,   WIG, 0x59);
1263         public static final VexRVMOp VMULSS    = new VexRVMOp("VMULSS",    P_F3, M_0F,   WIG, 0x59);
1264         public static final VexRVMOp VMULSD    = new VexRVMOp("VMULSD",    P_F2, M_0F,   WIG, 0x59);
1265         public static final VexRVMOp VSUBPS    = new VexRVMOp("VSUBPS",    P_,   M_0F,   WIG, 0x5C);
1266         public static final VexRVMOp VSUBPD    = new VexRVMOp("VSUBPD",    P_66, M_0F,   WIG, 0x5C);
1267         public static final VexRVMOp VSUBSS    = new VexRVMOp("VSUBSS",    P_F3, M_0F,   WIG, 0x5C);
1268         public static final VexRVMOp VSUBSD    = new VexRVMOp("VSUBSD",    P_F2, M_0F,   WIG, 0x5C);
1269         public static final VexRVMOp VMINPS    = new VexRVMOp("VMINPS",    P_,   M_0F,   WIG, 0x5D);
1270         public static final VexRVMOp VMINPD    = new VexRVMOp("VMINPD",    P_66, M_0F,   WIG, 0x5D);
1271         public static final VexRVMOp VMINSS    = new VexRVMOp("VMINSS",    P_F3, M_0F,   WIG, 0x5D);
1272         public static final VexRVMOp VMINSD    = new VexRVMOp("VMINSD",    P_F2, M_0F,   WIG, 0x5D);
1273         public static final VexRVMOp VDIVPS    = new VexRVMOp("VDIVPS",    P_,   M_0F,   WIG, 0x5E);
1274         public static final VexRVMOp VDIVPD    = new VexRVMOp("VDIVPD",    P_66, M_0F,   WIG, 0x5E);
1275         public static final VexRVMOp VDIVSS    = new VexRVMOp("VDIVPS",    P_F3, M_0F,   WIG, 0x5E);
1276         public static final VexRVMOp VDIVSD    = new VexRVMOp("VDIVPD",    P_F2, M_0F,   WIG, 0x5E);
1277         public static final VexRVMOp VMAXPS    = new VexRVMOp("VMAXPS",    P_,   M_0F,   WIG, 0x5F);
1278         public static final VexRVMOp VMAXPD    = new VexRVMOp("VMAXPD",    P_66, M_0F,   WIG, 0x5F);
1279         public static final VexRVMOp VMAXSS    = new VexRVMOp("VMAXSS",    P_F3, M_0F,   WIG, 0x5F);
1280         public static final VexRVMOp VMAXSD    = new VexRVMOp("VMAXSD",    P_F2, M_0F,   WIG, 0x5F);
1281         public static final VexRVMOp VADDSUBPS = new VexRVMOp("VADDSUBPS", P_F2, M_0F,   WIG, 0xD0);
1282         public static final VexRVMOp VADDSUBPD = new VexRVMOp("VADDSUBPD", P_66, M_0F,   WIG, 0xD0);
1283         public static final VexRVMOp VPAND     = new VexRVMOp("VPAND",     P_66, M_0F,   WIG, 0xDB, VEXOpAssertion.AVX1_2);
1284         public static final VexRVMOp VPOR      = new VexRVMOp("VPOR",      P_66, M_0F,   WIG, 0xEB, VEXOpAssertion.AVX1_2);
1285         public static final VexRVMOp VPXOR     = new VexRVMOp("VPXOR",     P_66, M_0F,   WIG, 0xEF, VEXOpAssertion.AVX1_2);
1286         public static final VexRVMOp VPADDB    = new VexRVMOp("VPADDB",    P_66, M_0F,   WIG, 0xFC, VEXOpAssertion.AVX1_2);
1287         public static final VexRVMOp VPADDW    = new VexRVMOp("VPADDW",    P_66, M_0F,   WIG, 0xFD, VEXOpAssertion.AVX1_2);
1288         public static final VexRVMOp VPADDD    = new VexRVMOp("VPADDD",    P_66, M_0F,   WIG, 0xFE, VEXOpAssertion.AVX1_2);
1289         public static final VexRVMOp VPADDQ    = new VexRVMOp("VPADDQ",    P_66, M_0F,   WIG, 0xD4, VEXOpAssertion.AVX1_2);
1290         public static final VexRVMOp VPMULHUW  = new VexRVMOp("VPMULHUW",  P_66, M_0F,   WIG, 0xE4, VEXOpAssertion.AVX1_2);
1291         public static final VexRVMOp VPMULHW   = new VexRVMOp("VPMULHW",   P_66, M_0F,   WIG, 0xE5, VEXOpAssertion.AVX1_2);
1292         public static final VexRVMOp VPMULLW   = new VexRVMOp("VPMULLW",   P_66, M_0F,   WIG, 0xD5, VEXOpAssertion.AVX1_2);
1293         public static final VexRVMOp VPMULLD   = new VexRVMOp("VPMULLD",   P_66, M_0F38, WIG, 0x40, VEXOpAssertion.AVX1_2);
1294         public static final VexRVMOp VPSUBB    = new VexRVMOp("VPSUBB",    P_66, M_0F,   WIG, 0xF8, VEXOpAssertion.AVX1_2);
1295         public static final VexRVMOp VPSUBW    = new VexRVMOp("VPSUBW",    P_66, M_0F,   WIG, 0xF9, VEXOpAssertion.AVX1_2);
1296         public static final VexRVMOp VPSUBD    = new VexRVMOp("VPSUBD",    P_66, M_0F,   WIG, 0xFA, VEXOpAssertion.AVX1_2);
1297         public static final VexRVMOp VPSUBQ    = new VexRVMOp("VPSUBQ",    P_66, M_0F,   WIG, 0xFB, VEXOpAssertion.AVX1_2);
1298         public static final VexRVMOp VPSHUFB   = new VexRVMOp("VPSHUFB",   P_66, M_0F38, WIG, 0x00, VEXOpAssertion.AVX1_2);
1299         public static final VexRVMOp VCVTSD2SS = new VexRVMOp("VCVTSD2SS", P_F2, M_0F,   WIG, 0x5A);
1300         public static final VexRVMOp VCVTSS2SD = new VexRVMOp("VCVTSS2SD", P_F3, M_0F,   WIG, 0x5A);
1301         public static final VexRVMOp VCVTSI2SD = new VexRVMOp("VCVTSI2SD", P_F2, M_0F,   W0,  0x2A, VEXOpAssertion.XMM_XMM_CPU);
1302         public static final VexRVMOp VCVTSQ2SD = new VexRVMOp("VCVTSQ2SD", P_F2, M_0F,   W1,  0x2A, VEXOpAssertion.XMM_XMM_CPU);
1303         public static final VexRVMOp VCVTSI2SS = new VexRVMOp("VCVTSI2SS", P_F3, M_0F,   W0,  0x2A, VEXOpAssertion.XMM_XMM_CPU);
1304         public static final VexRVMOp VCVTSQ2SS = new VexRVMOp("VCVTSQ2SS", P_F3, M_0F,   W1,  0x2A, VEXOpAssertion.XMM_XMM_CPU);
1305         public static final VexRVMOp VPCMPEQB  = new VexRVMOp("VPCMPEQB",  P_66, M_0F,   WIG, 0x74, VEXOpAssertion.AVX1_2);
1306         public static final VexRVMOp VPCMPEQW  = new VexRVMOp("VPCMPEQW",  P_66, M_0F,   WIG, 0x75, VEXOpAssertion.AVX1_2);
1307         public static final VexRVMOp VPCMPEQD  = new VexRVMOp("VPCMPEQD",  P_66, M_0F,   WIG, 0x76, VEXOpAssertion.AVX1_2);
1308         public static final VexRVMOp VPCMPEQQ  = new VexRVMOp("VPCMPEQQ",  P_66, M_0F38, WIG, 0x29, VEXOpAssertion.AVX1_2);
1309         public static final VexRVMOp VPCMPGTB  = new VexRVMOp("VPCMPGTB",  P_66, M_0F,   WIG, 0x64, VEXOpAssertion.AVX1_2);
1310         public static final VexRVMOp VPCMPGTW  = new VexRVMOp("VPCMPGTW",  P_66, M_0F,   WIG, 0x65, VEXOpAssertion.AVX1_2);
1311         public static final VexRVMOp VPCMPGTD  = new VexRVMOp("VPCMPGTD",  P_66, M_0F,   WIG, 0x66, VEXOpAssertion.AVX1_2);
1312         public static final VexRVMOp VPCMPGTQ  = new VexRVMOp("VPCMPGTQ",  P_66, M_0F38, WIG, 0x37, VEXOpAssertion.AVX1_2);
1313         public static final VexRVMOp VFMADD231SS = new VexRVMOp("VFMADD231SS", P_66, M_0F38, W0, 0xB9, VEXOpAssertion.FMA);
1314         public static final VexRVMOp VFMADD231SD = new VexRVMOp("VFMADD231SD", P_66, M_0F38, W1, 0xB9, VEXOpAssertion.FMA);
1315         // @formatter:on
1316 
1317         private VexRVMOp(String opcode, int pp, int mmmmm, int w, int op) {
1318             this(opcode, pp, mmmmm, w, op, VEXOpAssertion.AVX1);
1319         }
1320 
1321         protected VexRVMOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
1322             super(opcode, pp, mmmmm, w, op, assertion);
1323         }
1324 
1325         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, Register src2) {
1326             assert assertion.check((AMD64) asm.target.arch, size, dst, src1, src2);
1327             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w, false);
1328             asm.emitByte(op);
1329             asm.emitModRM(dst, src2);
1330         }
1331 
1332         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, AMD64Address src2) {
1333             assert assertion.check((AMD64) asm.target.arch, size, dst, src1, null);
1334             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w, false);
1335             asm.emitByte(op);
1336             asm.emitOperandHelper(dst, src2, 0);
1337         }
1338     }
1339 
1340     public static final class VexGeneralPurposeRVMOp extends VexRVMOp {
1341         // @formatter:off
1342         public static final VexGeneralPurposeRVMOp ANDN   = new VexGeneralPurposeRVMOp("ANDN",   P_,   M_0F38, WIG, 0xF2, VEXOpAssertion.BMI1);
1343         public static final VexGeneralPurposeRVMOp MULX   = new VexGeneralPurposeRVMOp("MULX",   P_F2, M_0F38, WIG, 0xF6, VEXOpAssertion.BMI2);
1344         public static final VexGeneralPurposeRVMOp PDEP   = new VexGeneralPurposeRVMOp("PDEP",   P_F2, M_0F38, WIG, 0xF5, VEXOpAssertion.BMI2);
1345         public static final VexGeneralPurposeRVMOp PEXT   = new VexGeneralPurposeRVMOp("PEXT",   P_F3, M_0F38, WIG, 0xF5, VEXOpAssertion.BMI2);
1346         // @formatter:on
1347 
1348         private VexGeneralPurposeRVMOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
1349             super(opcode, pp, mmmmm, w, op, assertion);
1350         }
1351 
1352         @Override
1353         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, Register src2) {
1354             assert assertion.check((AMD64) asm.target.arch, LZ, dst, src1, src2, null);
1355             assert size == AVXSize.DWORD || size == AVXSize.QWORD;
1356             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, size == AVXSize.DWORD ? W0 : W1, false);
1357             asm.emitByte(op);
1358             asm.emitModRM(dst, src2);
1359         }
1360 
1361         @Override
1362         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, AMD64Address src2) {
1363             assert assertion.check((AMD64) asm.target.arch, LZ, dst, src1, null, null);
1364             assert size == AVXSize.DWORD || size == AVXSize.QWORD;
1365             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, size == AVXSize.DWORD ? W0 : W1, false);
1366             asm.emitByte(op);
1367             asm.emitOperandHelper(dst, src2, 0);
1368         }
1369     }
1370 
1371     public static final class VexGeneralPurposeRMVOp extends VexOp {
1372         // @formatter:off
1373         public static final VexGeneralPurposeRMVOp BEXTR  = new VexGeneralPurposeRMVOp("BEXTR",  P_,   M_0F38, WIG, 0xF7, VEXOpAssertion.BMI1);
1374         public static final VexGeneralPurposeRMVOp BZHI   = new VexGeneralPurposeRMVOp("BZHI",   P_,   M_0F38, WIG, 0xF5, VEXOpAssertion.BMI2);
1375         public static final VexGeneralPurposeRMVOp SARX   = new VexGeneralPurposeRMVOp("SARX",   P_F3, M_0F38, WIG, 0xF7, VEXOpAssertion.BMI2);
1376         public static final VexGeneralPurposeRMVOp SHRX   = new VexGeneralPurposeRMVOp("SHRX",   P_F2, M_0F38, WIG, 0xF7, VEXOpAssertion.BMI2);
1377         public static final VexGeneralPurposeRMVOp SHLX   = new VexGeneralPurposeRMVOp("SHLX",   P_66, M_0F38, WIG, 0xF7, VEXOpAssertion.BMI2);
1378         // @formatter:on
1379 
1380         private VexGeneralPurposeRMVOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
1381             super(opcode, pp, mmmmm, w, op, assertion);
1382         }
1383 
1384         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, Register src2) {
1385             assert assertion.check((AMD64) asm.target.arch, LZ, dst, src2, src1, null);
1386             assert size == AVXSize.DWORD || size == AVXSize.QWORD;
1387             asm.vexPrefix(dst, src2, src1, size, pp, mmmmm, size == AVXSize.DWORD ? W0 : W1, false);
1388             asm.emitByte(op);
1389             asm.emitModRM(dst, src1);
1390         }
1391 
1392         public void emit(AMD64Assembler asm, AVXSize size, Register dst, AMD64Address src1, Register src2) {
1393             assert assertion.check((AMD64) asm.target.arch, LZ, dst, src2, null, null);
1394             assert size == AVXSize.DWORD || size == AVXSize.QWORD;
1395             asm.vexPrefix(dst, src2, src1, size, pp, mmmmm, size == AVXSize.DWORD ? W0 : W1, false);
1396             asm.emitByte(op);
1397             asm.emitOperandHelper(dst, src1, 0);
1398         }
1399     }
1400 
1401     public static final class VexGeneralPurposeRMOp extends VexRMOp {
1402         // @formatter:off
1403         public static final VexGeneralPurposeRMOp BLSI    = new VexGeneralPurposeRMOp("BLSI",   P_,    M_0F38, WIG, 0xF3, 3, VEXOpAssertion.BMI1);
1404         public static final VexGeneralPurposeRMOp BLSMSK  = new VexGeneralPurposeRMOp("BLSMSK", P_,    M_0F38, WIG, 0xF3, 2, VEXOpAssertion.BMI1);
1405         public static final VexGeneralPurposeRMOp BLSR    = new VexGeneralPurposeRMOp("BLSR",   P_,    M_0F38, WIG, 0xF3, 1, VEXOpAssertion.BMI1);
1406         // @formatter:on
1407         private final int ext;
1408 
1409         private VexGeneralPurposeRMOp(String opcode, int pp, int mmmmm, int w, int op, int ext, VEXOpAssertion assertion) {
1410             super(opcode, pp, mmmmm, w, op, assertion);
1411             this.ext = ext;
1412         }
1413 
1414         @Override
1415         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src) {
1416             assert assertion.check((AMD64) asm.target.arch, size, dst, null, null);
1417             asm.vexPrefix(AMD64.cpuRegisters[ext], dst, src, size, pp, mmmmm, size == AVXSize.DWORD ? W0 : W1, false);
1418             asm.emitByte(op);
1419             asm.emitModRM(ext, src);
1420         }
1421 
1422         @Override
1423         public void emit(AMD64Assembler asm, AVXSize size, Register dst, AMD64Address src) {
1424             assert assertion.check((AMD64) asm.target.arch, size, dst, null, null);
1425             asm.vexPrefix(AMD64.cpuRegisters[ext], dst, src, size, pp, mmmmm, size == AVXSize.DWORD ? W0 : W1, false);
1426             asm.emitByte(op);
1427             asm.emitOperandHelper(ext, src, 0);
1428         }
1429     }
1430 
1431     /**
1432      * VEX-encoded shift instructions with an operand order of either RVM or VMI.
1433      */
1434     public static final class VexShiftOp extends VexRVMOp implements VexRRIOp {
1435         // @formatter:off
1436         public static final VexShiftOp VPSRLW = new VexShiftOp("VPSRLW", P_66, M_0F, WIG, 0xD1, 0x71, 2);
1437         public static final VexShiftOp VPSRLD = new VexShiftOp("VPSRLD", P_66, M_0F, WIG, 0xD2, 0x72, 2);
1438         public static final VexShiftOp VPSRLQ = new VexShiftOp("VPSRLQ", P_66, M_0F, WIG, 0xD3, 0x73, 2);
1439         public static final VexShiftOp VPSRAW = new VexShiftOp("VPSRAW", P_66, M_0F, WIG, 0xE1, 0x71, 4);
1440         public static final VexShiftOp VPSRAD = new VexShiftOp("VPSRAD", P_66, M_0F, WIG, 0xE2, 0x72, 4);
1441         public static final VexShiftOp VPSLLW = new VexShiftOp("VPSLLW", P_66, M_0F, WIG, 0xF1, 0x71, 6);
1442         public static final VexShiftOp VPSLLD = new VexShiftOp("VPSLLD", P_66, M_0F, WIG, 0xF2, 0x72, 6);
1443         public static final VexShiftOp VPSLLQ = new VexShiftOp("VPSLLQ", P_66, M_0F, WIG, 0xF3, 0x73, 6);
1444         // @formatter:on
1445 
1446         private final int immOp;
1447         private final int r;
1448 
1449         private VexShiftOp(String opcode, int pp, int mmmmm, int w, int op, int immOp, int r) {
1450             super(opcode, pp, mmmmm, w, op, VEXOpAssertion.AVX1_2);
1451             this.immOp = immOp;
1452             this.r = r;
1453         }
1454 
1455         @Override
1456         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src, int imm8) {
1457             assert assertion.check((AMD64) asm.target.arch, size, null, dst, src);
1458             asm.vexPrefix(null, dst, src, size, pp, mmmmm, w, false);
1459             asm.emitByte(immOp);
1460             asm.emitModRM(r, src);
1461             asm.emitByte(imm8);
1462         }
1463     }
1464 
1465     public static final class VexMaskMoveOp extends VexOp {
1466         // @formatter:off
1467         public static final VexMaskMoveOp VMASKMOVPS = new VexMaskMoveOp("VMASKMOVPS", P_66, M_0F38, W0, 0x2C, 0x2E);
1468         public static final VexMaskMoveOp VMASKMOVPD = new VexMaskMoveOp("VMASKMOVPD", P_66, M_0F38, W0, 0x2D, 0x2F);
1469         public static final VexMaskMoveOp VPMASKMOVD = new VexMaskMoveOp("VPMASKMOVD", P_66, M_0F38, W0, 0x8C, 0x8E, VEXOpAssertion.AVX2);
1470         public static final VexMaskMoveOp VPMASKMOVQ = new VexMaskMoveOp("VPMASKMOVQ", P_66, M_0F38, W1, 0x8C, 0x8E, VEXOpAssertion.AVX2);
1471         // @formatter:on
1472 
1473         private final int opReverse;
1474 
1475         private VexMaskMoveOp(String opcode, int pp, int mmmmm, int w, int op, int opReverse) {
1476             this(opcode, pp, mmmmm, w, op, opReverse, VEXOpAssertion.AVX1);
1477         }
1478 
1479         private VexMaskMoveOp(String opcode, int pp, int mmmmm, int w, int op, int opReverse, VEXOpAssertion assertion) {
1480             super(opcode, pp, mmmmm, w, op, assertion);
1481             this.opReverse = opReverse;
1482         }
1483 
1484         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register mask, AMD64Address src) {
1485             assert assertion.check((AMD64) asm.target.arch, size, dst, mask, null);
1486             asm.vexPrefix(dst, mask, src, size, pp, mmmmm, w, false);
1487             asm.emitByte(op);
1488             asm.emitOperandHelper(dst, src, 0);
1489         }
1490 
1491         public void emit(AMD64Assembler asm, AVXSize size, AMD64Address dst, Register mask, Register src) {
1492             assert assertion.check((AMD64) asm.target.arch, size, src, mask, null);
1493             asm.vexPrefix(src, mask, dst, size, pp, mmmmm, w, false);
1494             asm.emitByte(opReverse);
1495             asm.emitOperandHelper(src, dst, 0);
1496         }
1497     }
1498 
1499     /**
1500      * VEX-encoded instructions with an operand order of RVMI.
1501      */
1502     public static final class VexRVMIOp extends VexOp {
1503         // @formatter:off
1504         public static final VexRVMIOp VSHUFPS     = new VexRVMIOp("VSHUFPS",     P_,   M_0F,   WIG, 0xC6);
1505         public static final VexRVMIOp VSHUFPD     = new VexRVMIOp("VSHUFPD",     P_66, M_0F,   WIG, 0xC6);
1506         public static final VexRVMIOp VINSERTF128 = new VexRVMIOp("VINSERTF128", P_66, M_0F3A, W0,  0x18, VEXOpAssertion.AVX1_256ONLY);
1507         public static final VexRVMIOp VINSERTI128 = new VexRVMIOp("VINSERTI128", P_66, M_0F3A, W0,  0x38, VEXOpAssertion.AVX2_256ONLY);
1508         // @formatter:on
1509 
1510         private VexRVMIOp(String opcode, int pp, int mmmmm, int w, int op) {
1511             this(opcode, pp, mmmmm, w, op, VEXOpAssertion.AVX1);
1512         }
1513 
1514         private VexRVMIOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
1515             super(opcode, pp, mmmmm, w, op, assertion);
1516         }
1517 
1518         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, Register src2, int imm8) {
1519             assert assertion.check((AMD64) asm.target.arch, size, dst, src1, src2);
1520             assert (imm8 & 0xFF) == imm8;
1521             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w, false);
1522             asm.emitByte(op);
1523             asm.emitModRM(dst, src2);
1524             asm.emitByte(imm8);
1525         }
1526 
1527         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, AMD64Address src2, int imm8) {
1528             assert assertion.check((AMD64) asm.target.arch, size, dst, src1, null);
1529             assert (imm8 & 0xFF) == imm8;
1530             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w, false);
1531             asm.emitByte(op);
1532             asm.emitOperandHelper(dst, src2, 1);
1533             asm.emitByte(imm8);
1534         }
1535     }
1536 
1537     /**
1538      * VEX-encoded comparison operation with an operand order of RVMI. The immediate operand is a
1539      * comparison operator.
1540      */
1541     public static final class VexFloatCompareOp extends VexOp {
1542         // @formatter:off
1543         public static final VexFloatCompareOp VCMPPS = new VexFloatCompareOp("VCMPPS", P_,   M_0F, WIG, 0xC2);
1544         public static final VexFloatCompareOp VCMPPD = new VexFloatCompareOp("VCMPPD", P_66, M_0F, WIG, 0xC2);
1545         public static final VexFloatCompareOp VCMPSS = new VexFloatCompareOp("VCMPSS", P_F2, M_0F, WIG, 0xC2);
1546         public static final VexFloatCompareOp VCMPSD = new VexFloatCompareOp("VCMPSD", P_F2, M_0F, WIG, 0xC2);
1547         // @formatter:on
1548 
1549         public enum Predicate {
1550             EQ_OQ(0x00),
1551             LT_OS(0x01),
1552             LE_OS(0x02),
1553             UNORD_Q(0x03),
1554             NEQ_UQ(0x04),
1555             NLT_US(0x05),
1556             NLE_US(0x06),
1557             ORD_Q(0x07),
1558             EQ_UQ(0x08),
1559             NGE_US(0x09),
1560             NGT_US(0x0a),
1561             FALSE_OQ(0x0b),
1562             NEQ_OQ(0x0c),
1563             GE_OS(0x0d),
1564             GT_OS(0x0e),
1565             TRUE_UQ(0x0f),
1566             EQ_OS(0x10),
1567             LT_OQ(0x11),
1568             LE_OQ(0x12),
1569             UNORD_S(0x13),
1570             NEQ_US(0x14),
1571             NLT_UQ(0x15),
1572             NLE_UQ(0x16),
1573             ORD_S(0x17),
1574             EQ_US(0x18),
1575             NGE_UQ(0x19),
1576             NGT_UQ(0x1a),
1577             FALSE_OS(0x1b),
1578             NEQ_OS(0x1c),
1579             GE_OQ(0x1d),
1580             GT_OQ(0x1e),
1581             TRUE_US(0x1f);
1582 
1583             private int imm8;
1584 
1585             Predicate(int imm8) {
1586                 this.imm8 = imm8;
1587             }
1588 
1589             public static Predicate getPredicate(Condition condition, boolean unorderedIsTrue) {
1590                 if (unorderedIsTrue) {
1591                     switch (condition) {
1592                         case EQ:
1593                             return EQ_UQ;
1594                         case NE:
1595                             return NEQ_UQ;
1596                         case LT:
1597                             return NGE_UQ;
1598                         case LE:
1599                             return NGT_UQ;
1600                         case GT:
1601                             return NLE_UQ;
1602                         case GE:
1603                             return NLT_UQ;
1604                         default:
1605                             throw GraalError.shouldNotReachHere();
1606                     }
1607                 } else {
1608                     switch (condition) {
1609                         case EQ:
1610                             return EQ_OQ;
1611                         case NE:
1612                             return NEQ_OQ;
1613                         case LT:
1614                             return LT_OQ;
1615                         case LE:
1616                             return LE_OQ;
1617                         case GT:
1618                             return GT_OQ;
1619                         case GE:
1620                             return GE_OQ;
1621                         default:
1622                             throw GraalError.shouldNotReachHere();
1623                     }
1624                 }
1625             }
1626         }
1627 
1628         private VexFloatCompareOp(String opcode, int pp, int mmmmm, int w, int op) {
1629             super(opcode, pp, mmmmm, w, op, VEXOpAssertion.AVX1);
1630         }
1631 
1632         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, Register src2, Predicate p) {
1633             assert assertion.check((AMD64) asm.target.arch, size, dst, src1, src2);
1634             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w, false);
1635             asm.emitByte(op);
1636             asm.emitModRM(dst, src2);
1637             asm.emitByte(p.imm8);
1638         }
1639 
1640         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, AMD64Address src2, Predicate p) {
1641             assert assertion.check((AMD64) asm.target.arch, size, dst, src1, null);
1642             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w, false);
1643             asm.emitByte(op);
1644             asm.emitOperandHelper(dst, src2, 1);
1645             asm.emitByte(p.imm8);
1646         }
1647     }
1648 
1649     public final void addl(AMD64Address dst, int imm32) {
1650         ADD.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
1651     }
1652 
1653     public final void addl(Register dst, int imm32) {
1654         ADD.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
1655     }
1656 
1657     public final void addl(Register dst, Register src) {
1658         ADD.rmOp.emit(this, DWORD, dst, src);
1659     }
1660 
1661     public final void addpd(Register dst, Register src) {
1662         SSEOp.ADD.emit(this, PD, dst, src);
1663     }
1664 
1665     public final void addpd(Register dst, AMD64Address src) {
1666         SSEOp.ADD.emit(this, PD, dst, src);
1667     }
1668 
1669     public final void addsd(Register dst, Register src) {
1670         SSEOp.ADD.emit(this, SD, dst, src);
1671     }
1672 
1673     public final void addsd(Register dst, AMD64Address src) {
1674         SSEOp.ADD.emit(this, SD, dst, src);
1675     }
1676 
1677     private void addrNop4() {
1678         // 4 bytes: NOP DWORD PTR [EAX+0]
1679         emitByte(0x0F);
1680         emitByte(0x1F);
1681         emitByte(0x40); // emitRm(cbuf, 0x1, EAXEnc, EAXEnc);
1682         emitByte(0); // 8-bits offset (1 byte)
1683     }
1684 
1685     private void addrNop5() {
1686         // 5 bytes: NOP DWORD PTR [EAX+EAX*0+0] 8-bits offset
1687         emitByte(0x0F);
1688         emitByte(0x1F);
1689         emitByte(0x44); // emitRm(cbuf, 0x1, EAXEnc, 0x4);
1690         emitByte(0x00); // emitRm(cbuf, 0x0, EAXEnc, EAXEnc);
1691         emitByte(0); // 8-bits offset (1 byte)
1692     }
1693 
1694     private void addrNop7() {
1695         // 7 bytes: NOP DWORD PTR [EAX+0] 32-bits offset
1696         emitByte(0x0F);
1697         emitByte(0x1F);
1698         emitByte(0x80); // emitRm(cbuf, 0x2, EAXEnc, EAXEnc);
1699         emitInt(0); // 32-bits offset (4 bytes)
1700     }
1701 
1702     private void addrNop8() {
1703         // 8 bytes: NOP DWORD PTR [EAX+EAX*0+0] 32-bits offset
1704         emitByte(0x0F);
1705         emitByte(0x1F);
1706         emitByte(0x84); // emitRm(cbuf, 0x2, EAXEnc, 0x4);
1707         emitByte(0x00); // emitRm(cbuf, 0x0, EAXEnc, EAXEnc);
1708         emitInt(0); // 32-bits offset (4 bytes)
1709     }
1710 
1711     public final void andl(Register dst, int imm32) {
1712         AND.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
1713     }
1714 
1715     public final void andl(Register dst, Register src) {
1716         AND.rmOp.emit(this, DWORD, dst, src);
1717     }
1718 
1719     public final void andpd(Register dst, Register src) {
1720         SSEOp.AND.emit(this, PD, dst, src);
1721     }
1722 
1723     public final void andpd(Register dst, AMD64Address src) {
1724         SSEOp.AND.emit(this, PD, dst, src);
1725     }
1726 
1727     public final void bsfq(Register dst, Register src) {
1728         prefixq(dst, src);
1729         emitByte(0x0F);
1730         emitByte(0xBC);
1731         emitModRM(dst, src);
1732     }
1733 
1734     public final void bsrl(Register dst, Register src) {
1735         prefix(dst, src);
1736         emitByte(0x0F);
1737         emitByte(0xBD);
1738         emitModRM(dst, src);
1739     }
1740 
1741     public final void bswapl(Register reg) {
1742         prefix(reg);
1743         emitByte(0x0F);
1744         emitModRM(1, reg);
1745     }
1746 
1747     public final void cdql() {
1748         emitByte(0x99);
1749     }
1750 
1751     public final void cmovl(ConditionFlag cc, Register dst, Register src) {
1752         prefix(dst, src);
1753         emitByte(0x0F);
1754         emitByte(0x40 | cc.getValue());
1755         emitModRM(dst, src);
1756     }
1757 
1758     public final void cmovl(ConditionFlag cc, Register dst, AMD64Address src) {
1759         prefix(src, dst);
1760         emitByte(0x0F);
1761         emitByte(0x40 | cc.getValue());
1762         emitOperandHelper(dst, src, 0);
1763     }
1764 
1765     public final void cmpb(Register dst, Register src) {
1766         CMP.byteRmOp.emit(this, BYTE, dst, src);
1767     }
1768 
1769     public final void cmpw(Register dst, Register src) {
1770         CMP.rmOp.emit(this, WORD, dst, src);
1771     }
1772 
1773     public final void cmpl(Register dst, int imm32) {
1774         CMP.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
1775     }
1776 
1777     public final void cmpl(Register dst, Register src) {
1778         CMP.rmOp.emit(this, DWORD, dst, src);
1779     }
1780 
1781     public final void cmpl(Register dst, AMD64Address src) {
1782         CMP.rmOp.emit(this, DWORD, dst, src);
1783     }
1784 
1785     public final void cmpl(AMD64Address dst, int imm32) {
1786         CMP.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
1787     }
1788 
1789     /**
1790      * The 8-bit cmpxchg compares the value at adr with the contents of X86.rax, and stores reg into
1791      * adr if so; otherwise, the value at adr is loaded into X86.rax,. The ZF is set if the compared
1792      * values were equal, and cleared otherwise.
1793      */
1794     public final void cmpxchgb(Register reg, AMD64Address adr) { // cmpxchg
1795         prefixb(adr, reg);
1796         emitByte(0x0F);
1797         emitByte(0xB0);
1798         emitOperandHelper(reg, adr, 0);
1799     }
1800 
1801     /**
1802      * The 16-bit cmpxchg compares the value at adr with the contents of X86.rax, and stores reg
1803      * into adr if so; otherwise, the value at adr is loaded into X86.rax,. The ZF is set if the
1804      * compared values were equal, and cleared otherwise.
1805      */
1806     public final void cmpxchgw(Register reg, AMD64Address adr) { // cmpxchg
1807         emitByte(0x66); // Switch to 16-bit mode.
1808         prefix(adr, reg);
1809         emitByte(0x0F);
1810         emitByte(0xB1);
1811         emitOperandHelper(reg, adr, 0);
1812     }
1813 
1814     /**
1815      * The 32-bit cmpxchg compares the value at adr with the contents of X86.rax, and stores reg
1816      * into adr if so; otherwise, the value at adr is loaded into X86.rax,. The ZF is set if the
1817      * compared values were equal, and cleared otherwise.
1818      */
1819     public final void cmpxchgl(Register reg, AMD64Address adr) { // cmpxchg
1820         prefix(adr, reg);
1821         emitByte(0x0F);
1822         emitByte(0xB1);
1823         emitOperandHelper(reg, adr, 0);
1824     }
1825 
1826     public final void cvtsi2sdl(Register dst, Register src) {
1827         SSEOp.CVTSI2SD.emit(this, DWORD, dst, src);
1828     }
1829 
1830     public final void cvttsd2sil(Register dst, Register src) {
1831         SSEOp.CVTTSD2SI.emit(this, DWORD, dst, src);
1832     }
1833 
1834     public final void decl(AMD64Address dst) {
1835         prefix(dst);
1836         emitByte(0xFF);
1837         emitOperandHelper(1, dst, 0);
1838     }
1839 
1840     public final void divsd(Register dst, Register src) {
1841         SSEOp.DIV.emit(this, SD, dst, src);
1842     }
1843 
1844     public final void hlt() {
1845         emitByte(0xF4);
1846     }
1847 
1848     public final void imull(Register dst, Register src, int value) {
1849         if (isByte(value)) {
1850             AMD64RMIOp.IMUL_SX.emit(this, DWORD, dst, src, value);
1851         } else {
1852             AMD64RMIOp.IMUL.emit(this, DWORD, dst, src, value);
1853         }
1854     }
1855 
1856     public final void incl(AMD64Address dst) {
1857         prefix(dst);
1858         emitByte(0xFF);
1859         emitOperandHelper(0, dst, 0);
1860     }
1861 
1862     public void jcc(ConditionFlag cc, int jumpTarget, boolean forceDisp32) {
1863         int shortSize = 2;
1864         int longSize = 6;
1865         long disp = jumpTarget - position();
1866         if (!forceDisp32 && isByte(disp - shortSize)) {
1867             // 0111 tttn #8-bit disp
1868             emitByte(0x70 | cc.getValue());
1869             emitByte((int) ((disp - shortSize) & 0xFF));
1870         } else {
1871             // 0000 1111 1000 tttn #32-bit disp
1872             assert isInt(disp - longSize) : "must be 32bit offset (call4)";
1873             emitByte(0x0F);
1874             emitByte(0x80 | cc.getValue());
1875             emitInt((int) (disp - longSize));
1876         }
1877     }
1878 
1879     public final void jcc(ConditionFlag cc, Label l) {
1880         assert (0 <= cc.getValue()) && (cc.getValue() < 16) : "illegal cc";
1881         if (l.isBound()) {
1882             jcc(cc, l.position(), false);
1883         } else {
1884             // Note: could eliminate cond. jumps to this jump if condition
1885             // is the same however, seems to be rather unlikely case.
1886             // Note: use jccb() if label to be bound is very close to get
1887             // an 8-bit displacement
1888             l.addPatchAt(position(), this);
1889             emitByte(0x0F);
1890             emitByte(0x80 | cc.getValue());
1891             emitInt(0);
1892         }
1893 
1894     }
1895 
1896     public final void jccb(ConditionFlag cc, Label l) {
1897         if (l.isBound()) {
1898             int shortSize = 2;
1899             int entry = l.position();
1900             assert isByte(entry - (position() + shortSize)) : "Dispacement too large for a short jmp";
1901             long disp = entry - position();
1902             // 0111 tttn #8-bit disp
1903             emitByte(0x70 | cc.getValue());
1904             emitByte((int) ((disp - shortSize) & 0xFF));
1905         } else {
1906             l.addPatchAt(position(), this);
1907             emitByte(0x70 | cc.getValue());
1908             emitByte(0);
1909         }
1910     }
1911 
1912     public final void jmp(int jumpTarget, boolean forceDisp32) {
1913         int shortSize = 2;
1914         int longSize = 5;
1915         long disp = jumpTarget - position();
1916         if (!forceDisp32 && isByte(disp - shortSize)) {
1917             emitByte(0xEB);
1918             emitByte((int) ((disp - shortSize) & 0xFF));
1919         } else {
1920             emitByte(0xE9);
1921             emitInt((int) (disp - longSize));
1922         }
1923     }
1924 
1925     @Override
1926     public final void jmp(Label l) {
1927         if (l.isBound()) {
1928             jmp(l.position(), false);
1929         } else {
1930             // By default, forward jumps are always 32-bit displacements, since
1931             // we can't yet know where the label will be bound. If you're sure that
1932             // the forward jump will not run beyond 256 bytes, use jmpb to
1933             // force an 8-bit displacement.
1934 
1935             l.addPatchAt(position(), this);
1936             emitByte(0xE9);
1937             emitInt(0);
1938         }
1939     }
1940 
1941     public final void jmp(Register entry) {
1942         prefix(entry);
1943         emitByte(0xFF);
1944         emitModRM(4, entry);
1945     }
1946 
1947     public final void jmp(AMD64Address adr) {
1948         prefix(adr);
1949         emitByte(0xFF);
1950         emitOperandHelper(AMD64.rsp, adr, 0);
1951     }
1952 
1953     public final void jmpb(Label l) {
1954         if (l.isBound()) {
1955             int shortSize = 2;
1956             // Displacement is relative to byte just after jmpb instruction
1957             int displacement = l.position() - position() - shortSize;
1958             GraalError.guarantee(isByte(displacement), "Displacement too large to be encoded as a byte: %d", displacement);
1959             emitByte(0xEB);
1960             emitByte(displacement & 0xFF);
1961         } else {
1962             l.addPatchAt(position(), this);
1963             emitByte(0xEB);
1964             emitByte(0);
1965         }
1966     }
1967 
1968     public final void lead(Register dst, AMD64Address src) {
1969         prefix(src, dst);
1970         emitByte(0x8D);
1971         emitOperandHelper(dst, src, 0);
1972     }
1973 
1974     public final void leaq(Register dst, AMD64Address src) {
1975         prefixq(src, dst);
1976         emitByte(0x8D);
1977         emitOperandHelper(dst, src, 0);
1978     }
1979 
1980     public final void leave() {
1981         emitByte(0xC9);
1982     }
1983 
1984     public final void lock() {
1985         emitByte(0xF0);
1986     }
1987 
1988     public final void movapd(Register dst, Register src) {
1989         assert inRC(XMM, dst) && inRC(XMM, src);
1990         simdPrefix(dst, Register.None, src, PD, P_0F, false);
1991         emitByte(0x28);
1992         emitModRM(dst, src);
1993     }
1994 
1995     public final void movaps(Register dst, Register src) {
1996         assert inRC(XMM, dst) && inRC(XMM, src);
1997         simdPrefix(dst, Register.None, src, PS, P_0F, false);
1998         emitByte(0x28);
1999         emitModRM(dst, src);
2000     }
2001 
2002     public final void movb(AMD64Address dst, int imm8) {
2003         prefix(dst);
2004         emitByte(0xC6);
2005         emitOperandHelper(0, dst, 1);
2006         emitByte(imm8);
2007     }
2008 
2009     public final void movb(AMD64Address dst, Register src) {
2010         assert inRC(CPU, src) : "must have byte register";
2011         prefixb(dst, src);
2012         emitByte(0x88);
2013         emitOperandHelper(src, dst, 0);
2014     }
2015 
2016     public final void movl(Register dst, int imm32) {
2017         movl(dst, imm32, false);
2018     }
2019 
2020     public final void movl(Register dst, int imm32, boolean annotateImm) {
2021         int insnPos = position();
2022         prefix(dst);
2023         emitByte(0xB8 + encode(dst));
2024         int immPos = position();
2025         emitInt(imm32);
2026         int nextInsnPos = position();
2027         if (annotateImm && codePatchingAnnotationConsumer != null) {
2028             codePatchingAnnotationConsumer.accept(new OperandDataAnnotation(insnPos, immPos, nextInsnPos - immPos, nextInsnPos));
2029         }
2030     }
2031 
2032     public final void movl(Register dst, Register src) {
2033         prefix(dst, src);
2034         emitByte(0x8B);
2035         emitModRM(dst, src);
2036     }
2037 
2038     public final void movl(Register dst, AMD64Address src) {
2039         prefix(src, dst);
2040         emitByte(0x8B);
2041         emitOperandHelper(dst, src, 0);
2042     }
2043 
2044     /**
2045      * @param wide use 4 byte encoding for displacements that would normally fit in a byte
2046      */
2047     public final void movl(Register dst, AMD64Address src, boolean wide) {
2048         prefix(src, dst);
2049         emitByte(0x8B);
2050         emitOperandHelper(dst, src, wide, 0);
2051     }
2052 
2053     public final void movl(AMD64Address dst, int imm32) {
2054         prefix(dst);
2055         emitByte(0xC7);
2056         emitOperandHelper(0, dst, 4);
2057         emitInt(imm32);
2058     }
2059 
2060     public final void movl(AMD64Address dst, Register src) {
2061         prefix(dst, src);
2062         emitByte(0x89);
2063         emitOperandHelper(src, dst, 0);
2064     }
2065 
2066     /**
2067      * New CPUs require use of movsd and movss to avoid partial register stall when loading from
2068      * memory. But for old Opteron use movlpd instead of movsd. The selection is done in
2069      * {@link AMD64MacroAssembler#movdbl(Register, AMD64Address)} and
2070      * {@link AMD64MacroAssembler#movflt(Register, Register)}.
2071      */
2072     public final void movlpd(Register dst, AMD64Address src) {
2073         assert inRC(XMM, dst);
2074         simdPrefix(dst, dst, src, PD, P_0F, false);
2075         emitByte(0x12);
2076         emitOperandHelper(dst, src, 0);
2077     }
2078 
2079     public final void movlhps(Register dst, Register src) {
2080         assert inRC(XMM, dst) && inRC(XMM, src);
2081         simdPrefix(dst, src, src, PS, P_0F, false);
2082         emitByte(0x16);
2083         emitModRM(dst, src);
2084     }
2085 
2086     public final void movq(Register dst, AMD64Address src) {
2087         movq(dst, src, false);
2088     }
2089 
2090     public final void movq(Register dst, AMD64Address src, boolean force4BytesDisplacement) {
2091         if (inRC(XMM, dst)) {
2092             // Insn: MOVQ xmm, r/m64
2093             // Code: F3 0F 7E /r
2094             // An alternative instruction would be 66 REX.W 0F 6E /r. We prefer the REX.W free
2095             // format, because it would allow us to emit 2-bytes-prefixed vex-encoding instruction
2096             // when applicable.
2097             simdPrefix(dst, Register.None, src, SS, P_0F, false);
2098             emitByte(0x7E);
2099             emitOperandHelper(dst, src, force4BytesDisplacement, 0);
2100         } else {
2101             // gpr version of movq
2102             prefixq(src, dst);
2103             emitByte(0x8B);
2104             emitOperandHelper(dst, src, force4BytesDisplacement, 0);
2105         }
2106     }
2107 
2108     public final void movq(Register dst, Register src) {
2109         assert inRC(CPU, dst) && inRC(CPU, src);
2110         prefixq(dst, src);
2111         emitByte(0x8B);
2112         emitModRM(dst, src);
2113     }
2114 
2115     public final void movq(AMD64Address dst, Register src) {
2116         if (inRC(XMM, src)) {
2117             // Insn: MOVQ r/m64, xmm
2118             // Code: 66 0F D6 /r
2119             // An alternative instruction would be 66 REX.W 0F 7E /r. We prefer the REX.W free
2120             // format, because it would allow us to emit 2-bytes-prefixed vex-encoding instruction
2121             // when applicable.
2122             simdPrefix(src, Register.None, dst, PD, P_0F, false);
2123             emitByte(0xD6);
2124             emitOperandHelper(src, dst, 0);
2125         } else {
2126             // gpr version of movq
2127             prefixq(dst, src);
2128             emitByte(0x89);
2129             emitOperandHelper(src, dst, 0);
2130         }
2131     }
2132 
2133     public final void movsbl(Register dst, AMD64Address src) {
2134         prefix(src, dst);
2135         emitByte(0x0F);
2136         emitByte(0xBE);
2137         emitOperandHelper(dst, src, 0);
2138     }
2139 
2140     public final void movsbl(Register dst, Register src) {
2141         prefix(dst, false, src, true);
2142         emitByte(0x0F);
2143         emitByte(0xBE);
2144         emitModRM(dst, src);
2145     }
2146 
2147     public final void movsbq(Register dst, AMD64Address src) {
2148         prefixq(src, dst);
2149         emitByte(0x0F);
2150         emitByte(0xBE);
2151         emitOperandHelper(dst, src, 0);
2152     }
2153 
2154     public final void movsbq(Register dst, Register src) {
2155         prefixq(dst, src);
2156         emitByte(0x0F);
2157         emitByte(0xBE);
2158         emitModRM(dst, src);
2159     }
2160 
2161     public final void movsd(Register dst, Register src) {
2162         AMD64RMOp.MOVSD.emit(this, SD, dst, src);
2163     }
2164 
2165     public final void movsd(Register dst, AMD64Address src) {
2166         AMD64RMOp.MOVSD.emit(this, SD, dst, src);
2167     }
2168 
2169     public final void movsd(AMD64Address dst, Register src) {
2170         AMD64MROp.MOVSD.emit(this, SD, dst, src);
2171     }
2172 
2173     public final void movss(Register dst, Register src) {
2174         AMD64RMOp.MOVSS.emit(this, SS, dst, src);
2175     }
2176 
2177     public final void movss(Register dst, AMD64Address src) {
2178         AMD64RMOp.MOVSS.emit(this, SS, dst, src);
2179     }
2180 
2181     public final void movss(AMD64Address dst, Register src) {
2182         AMD64MROp.MOVSS.emit(this, SS, dst, src);
2183     }
2184 
2185     public final void mulpd(Register dst, Register src) {
2186         SSEOp.MUL.emit(this, PD, dst, src);
2187     }
2188 
2189     public final void mulpd(Register dst, AMD64Address src) {
2190         SSEOp.MUL.emit(this, PD, dst, src);
2191     }
2192 
2193     public final void mulsd(Register dst, Register src) {
2194         SSEOp.MUL.emit(this, SD, dst, src);
2195     }
2196 
2197     public final void mulsd(Register dst, AMD64Address src) {
2198         SSEOp.MUL.emit(this, SD, dst, src);
2199     }
2200 
2201     public final void mulss(Register dst, Register src) {
2202         SSEOp.MUL.emit(this, SS, dst, src);
2203     }
2204 
2205     public final void movswl(Register dst, AMD64Address src) {
2206         AMD64RMOp.MOVSX.emit(this, DWORD, dst, src);
2207     }
2208 
2209     public final void movswq(Register dst, AMD64Address src) {
2210         AMD64RMOp.MOVSX.emit(this, QWORD, dst, src);
2211     }
2212 
2213     public final void movw(AMD64Address dst, int imm16) {
2214         emitByte(0x66); // switch to 16-bit mode
2215         prefix(dst);
2216         emitByte(0xC7);
2217         emitOperandHelper(0, dst, 2);
2218         emitShort(imm16);
2219     }
2220 
2221     public final void movw(AMD64Address dst, Register src) {
2222         emitByte(0x66);
2223         prefix(dst, src);
2224         emitByte(0x89);
2225         emitOperandHelper(src, dst, 0);
2226     }
2227 
2228     public final void movw(Register dst, AMD64Address src) {
2229         emitByte(0x66);
2230         prefix(src, dst);
2231         emitByte(0x8B);
2232         emitOperandHelper(dst, src, 0);
2233     }
2234 
2235     public final void movzbl(Register dst, AMD64Address src) {
2236         prefix(src, dst);
2237         emitByte(0x0F);
2238         emitByte(0xB6);
2239         emitOperandHelper(dst, src, 0);
2240     }
2241 
2242     public final void movzbl(Register dst, Register src) {
2243         AMD64RMOp.MOVZXB.emit(this, DWORD, dst, src);
2244     }
2245 
2246     public final void movzbq(Register dst, Register src) {
2247         AMD64RMOp.MOVZXB.emit(this, QWORD, dst, src);
2248     }
2249 
2250     public final void movzbq(Register dst, AMD64Address src) {
2251         AMD64RMOp.MOVZXB.emit(this, QWORD, dst, src);
2252     }
2253 
2254     public final void movzwl(Register dst, AMD64Address src) {
2255         AMD64RMOp.MOVZX.emit(this, DWORD, dst, src);
2256     }
2257 
2258     public final void movzwq(Register dst, AMD64Address src) {
2259         AMD64RMOp.MOVZX.emit(this, QWORD, dst, src);
2260     }
2261 
2262     public final void negl(Register dst) {
2263         NEG.emit(this, DWORD, dst);
2264     }
2265 
2266     public final void notl(Register dst) {
2267         NOT.emit(this, DWORD, dst);
2268     }
2269 
2270     public final void notq(Register dst) {
2271         NOT.emit(this, QWORD, dst);
2272     }
2273 
2274     @Override
2275     public final void ensureUniquePC() {
2276         nop();
2277     }
2278 
2279     public final void nop() {
2280         nop(1);
2281     }
2282 
2283     public void nop(int count) {
2284         int i = count;
2285         if (UseNormalNop) {
2286             assert i > 0 : " ";
2287             // The fancy nops aren't currently recognized by debuggers making it a
2288             // pain to disassemble code while debugging. If assert are on clearly
2289             // speed is not an issue so simply use the single byte traditional nop
2290             // to do alignment.
2291 
2292             for (; i > 0; i--) {
2293                 emitByte(0x90);
2294             }
2295             return;
2296         }
2297 
2298         if (UseAddressNop) {
2299             if (UseIntelNops) {
2300                 intelNops(i);
2301             } else {
2302                 amdNops(i);
2303             }
2304             return;
2305         }
2306 
2307         // Using nops with size prefixes "0x66 0x90".
2308         // From AMD Optimization Guide:
2309         // 1: 0x90
2310         // 2: 0x66 0x90
2311         // 3: 0x66 0x66 0x90
2312         // 4: 0x66 0x66 0x66 0x90
2313         // 5: 0x66 0x66 0x90 0x66 0x90
2314         // 6: 0x66 0x66 0x90 0x66 0x66 0x90
2315         // 7: 0x66 0x66 0x66 0x90 0x66 0x66 0x90
2316         // 8: 0x66 0x66 0x66 0x90 0x66 0x66 0x66 0x90
2317         // 9: 0x66 0x66 0x90 0x66 0x66 0x90 0x66 0x66 0x90
2318         // 10: 0x66 0x66 0x66 0x90 0x66 0x66 0x90 0x66 0x66 0x90
2319         //
2320         while (i > 12) {
2321             i -= 4;
2322             emitByte(0x66); // size prefix
2323             emitByte(0x66);
2324             emitByte(0x66);
2325             emitByte(0x90); // nop
2326         }
2327         // 1 - 12 nops
2328         if (i > 8) {
2329             if (i > 9) {
2330                 i -= 1;
2331                 emitByte(0x66);
2332             }
2333             i -= 3;
2334             emitByte(0x66);
2335             emitByte(0x66);
2336             emitByte(0x90);
2337         }
2338         // 1 - 8 nops
2339         if (i > 4) {
2340             if (i > 6) {
2341                 i -= 1;
2342                 emitByte(0x66);
2343             }
2344             i -= 3;
2345             emitByte(0x66);
2346             emitByte(0x66);
2347             emitByte(0x90);
2348         }
2349         switch (i) {
2350             case 4:
2351                 emitByte(0x66);
2352                 emitByte(0x66);
2353                 emitByte(0x66);
2354                 emitByte(0x90);
2355                 break;
2356             case 3:
2357                 emitByte(0x66);
2358                 emitByte(0x66);
2359                 emitByte(0x90);
2360                 break;
2361             case 2:
2362                 emitByte(0x66);
2363                 emitByte(0x90);
2364                 break;
2365             case 1:
2366                 emitByte(0x90);
2367                 break;
2368             default:
2369                 assert i == 0;
2370         }
2371     }
2372 
2373     private void amdNops(int count) {
2374         int i = count;
2375         //
2376         // Using multi-bytes nops "0x0F 0x1F [Address]" for AMD.
2377         // 1: 0x90
2378         // 2: 0x66 0x90
2379         // 3: 0x66 0x66 0x90 (don't use "0x0F 0x1F 0x00" - need patching safe padding)
2380         // 4: 0x0F 0x1F 0x40 0x00
2381         // 5: 0x0F 0x1F 0x44 0x00 0x00
2382         // 6: 0x66 0x0F 0x1F 0x44 0x00 0x00
2383         // 7: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
2384         // 8: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
2385         // 9: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
2386         // 10: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
2387         // 11: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
2388 
2389         // The rest coding is AMD specific - use consecutive Address nops
2390 
2391         // 12: 0x66 0x0F 0x1F 0x44 0x00 0x00 0x66 0x0F 0x1F 0x44 0x00 0x00
2392         // 13: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00 0x66 0x0F 0x1F 0x44 0x00 0x00
2393         // 14: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
2394         // 15: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
2395         // 16: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
2396         // Size prefixes (0x66) are added for larger sizes
2397 
2398         while (i >= 22) {
2399             i -= 11;
2400             emitByte(0x66); // size prefix
2401             emitByte(0x66); // size prefix
2402             emitByte(0x66); // size prefix
2403             addrNop8();
2404         }
2405         // Generate first nop for size between 21-12
2406         switch (i) {
2407             case 21:
2408                 i -= 11;
2409                 emitByte(0x66); // size prefix
2410                 emitByte(0x66); // size prefix
2411                 emitByte(0x66); // size prefix
2412                 addrNop8();
2413                 break;
2414             case 20:
2415             case 19:
2416                 i -= 10;
2417                 emitByte(0x66); // size prefix
2418                 emitByte(0x66); // size prefix
2419                 addrNop8();
2420                 break;
2421             case 18:
2422             case 17:
2423                 i -= 9;
2424                 emitByte(0x66); // size prefix
2425                 addrNop8();
2426                 break;
2427             case 16:
2428             case 15:
2429                 i -= 8;
2430                 addrNop8();
2431                 break;
2432             case 14:
2433             case 13:
2434                 i -= 7;
2435                 addrNop7();
2436                 break;
2437             case 12:
2438                 i -= 6;
2439                 emitByte(0x66); // size prefix
2440                 addrNop5();
2441                 break;
2442             default:
2443                 assert i < 12;
2444         }
2445 
2446         // Generate second nop for size between 11-1
2447         switch (i) {
2448             case 11:
2449                 emitByte(0x66); // size prefix
2450                 emitByte(0x66); // size prefix
2451                 emitByte(0x66); // size prefix
2452                 addrNop8();
2453                 break;
2454             case 10:
2455                 emitByte(0x66); // size prefix
2456                 emitByte(0x66); // size prefix
2457                 addrNop8();
2458                 break;
2459             case 9:
2460                 emitByte(0x66); // size prefix
2461                 addrNop8();
2462                 break;
2463             case 8:
2464                 addrNop8();
2465                 break;
2466             case 7:
2467                 addrNop7();
2468                 break;
2469             case 6:
2470                 emitByte(0x66); // size prefix
2471                 addrNop5();
2472                 break;
2473             case 5:
2474                 addrNop5();
2475                 break;
2476             case 4:
2477                 addrNop4();
2478                 break;
2479             case 3:
2480                 // Don't use "0x0F 0x1F 0x00" - need patching safe padding
2481                 emitByte(0x66); // size prefix
2482                 emitByte(0x66); // size prefix
2483                 emitByte(0x90); // nop
2484                 break;
2485             case 2:
2486                 emitByte(0x66); // size prefix
2487                 emitByte(0x90); // nop
2488                 break;
2489             case 1:
2490                 emitByte(0x90); // nop
2491                 break;
2492             default:
2493                 assert i == 0;
2494         }
2495     }
2496 
2497     @SuppressWarnings("fallthrough")
2498     private void intelNops(int count) {
2499         //
2500         // Using multi-bytes nops "0x0F 0x1F [address]" for Intel
2501         // 1: 0x90
2502         // 2: 0x66 0x90
2503         // 3: 0x66 0x66 0x90 (don't use "0x0F 0x1F 0x00" - need patching safe padding)
2504         // 4: 0x0F 0x1F 0x40 0x00
2505         // 5: 0x0F 0x1F 0x44 0x00 0x00
2506         // 6: 0x66 0x0F 0x1F 0x44 0x00 0x00
2507         // 7: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
2508         // 8: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
2509         // 9: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
2510         // 10: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
2511         // 11: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
2512 
2513         // The rest coding is Intel specific - don't use consecutive address nops
2514 
2515         // 12: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
2516         // 13: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
2517         // 14: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
2518         // 15: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
2519 
2520         int i = count;
2521         while (i >= 15) {
2522             // For Intel don't generate consecutive addess nops (mix with regular nops)
2523             i -= 15;
2524             emitByte(0x66);   // size prefix
2525             emitByte(0x66);   // size prefix
2526             emitByte(0x66);   // size prefix
2527             addrNop8();
2528             emitByte(0x66);   // size prefix
2529             emitByte(0x66);   // size prefix
2530             emitByte(0x66);   // size prefix
2531             emitByte(0x90);
2532             // nop
2533         }
2534         switch (i) {
2535             case 14:
2536                 emitByte(0x66); // size prefix
2537                 // fall through
2538             case 13:
2539                 emitByte(0x66); // size prefix
2540                 // fall through
2541             case 12:
2542                 addrNop8();
2543                 emitByte(0x66); // size prefix
2544                 emitByte(0x66); // size prefix
2545                 emitByte(0x66); // size prefix
2546                 emitByte(0x90);
2547                 // nop
2548                 break;
2549             case 11:
2550                 emitByte(0x66); // size prefix
2551                 // fall through
2552             case 10:
2553                 emitByte(0x66); // size prefix
2554                 // fall through
2555             case 9:
2556                 emitByte(0x66); // size prefix
2557                 // fall through
2558             case 8:
2559                 addrNop8();
2560                 break;
2561             case 7:
2562                 addrNop7();
2563                 break;
2564             case 6:
2565                 emitByte(0x66); // size prefix
2566                 // fall through
2567             case 5:
2568                 addrNop5();
2569                 break;
2570             case 4:
2571                 addrNop4();
2572                 break;
2573             case 3:
2574                 // Don't use "0x0F 0x1F 0x00" - need patching safe padding
2575                 emitByte(0x66); // size prefix
2576                 // fall through
2577             case 2:
2578                 emitByte(0x66); // size prefix
2579                 // fall through
2580             case 1:
2581                 emitByte(0x90);
2582                 // nop
2583                 break;
2584             default:
2585                 assert i == 0;
2586         }
2587     }
2588 
2589     public final void orl(Register dst, Register src) {
2590         OR.rmOp.emit(this, DWORD, dst, src);
2591     }
2592 
2593     public final void orl(Register dst, int imm32) {
2594         OR.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
2595     }
2596 
2597     // Insn: VPACKUSWB xmm1, xmm2, xmm3/m128
2598     // -----
2599     // Insn: VPACKUSWB xmm1, xmm1, xmm2
2600 
2601     public final void packuswb(Register dst, Register src) {
2602         assert inRC(XMM, dst) && inRC(XMM, src);
2603         // Code: VEX.NDS.128.66.0F.WIG 67 /r
2604         simdPrefix(dst, dst, src, PD, P_0F, false);
2605         emitByte(0x67);
2606         emitModRM(dst, src);
2607     }
2608 
2609     public final void pop(Register dst) {
2610         prefix(dst);
2611         emitByte(0x58 + encode(dst));
2612     }
2613 
2614     public void popfq() {
2615         emitByte(0x9D);
2616     }
2617 
2618     public final void ptest(Register dst, Register src) {
2619         assert supports(CPUFeature.SSE4_1);
2620         assert inRC(XMM, dst) && inRC(XMM, src);
2621         simdPrefix(dst, Register.None, src, PD, P_0F38, false);
2622         emitByte(0x17);
2623         emitModRM(dst, src);
2624     }
2625 
2626     public final void pcmpeqb(Register dst, Register src) {
2627         assert supports(CPUFeature.SSE2);
2628         assert inRC(XMM, dst) && inRC(XMM, src);
2629         simdPrefix(dst, dst, src, PD, P_0F, false);
2630         emitByte(0x74);
2631         emitModRM(dst, src);
2632     }
2633 
2634     public final void pcmpeqw(Register dst, Register src) {
2635         assert supports(CPUFeature.SSE2);
2636         assert inRC(XMM, dst) && inRC(XMM, src);
2637         simdPrefix(dst, dst, src, PD, P_0F, false);
2638         emitByte(0x75);
2639         emitModRM(dst, src);
2640     }
2641 
2642     public final void pcmpeqd(Register dst, Register src) {
2643         assert supports(CPUFeature.SSE2);
2644         assert dst.getRegisterCategory().equals(XMM) && src.getRegisterCategory().equals(XMM);
2645         simdPrefix(dst, dst, src, PD, P_0F, false);
2646         emitByte(0x76);
2647         emitModRM(dst, src);
2648     }
2649 
2650     public final void pcmpestri(Register dst, AMD64Address src, int imm8) {
2651         assert supports(CPUFeature.SSE4_2);
2652         assert inRC(XMM, dst);
2653         simdPrefix(dst, Register.None, src, PD, P_0F3A, false);
2654         emitByte(0x61);
2655         emitOperandHelper(dst, src, 0);
2656         emitByte(imm8);
2657     }
2658 
2659     public final void pcmpestri(Register dst, Register src, int imm8) {
2660         assert supports(CPUFeature.SSE4_2);
2661         assert inRC(XMM, dst) && inRC(XMM, src);
2662         simdPrefix(dst, Register.None, src, PD, P_0F3A, false);
2663         emitByte(0x61);
2664         emitModRM(dst, src);
2665         emitByte(imm8);
2666     }
2667 
2668     public final void pmovmskb(Register dst, Register src) {
2669         assert supports(CPUFeature.SSE2);
2670         assert inRC(CPU, dst) && inRC(XMM, src);
2671         simdPrefix(dst, Register.None, src, PD, P_0F, false);
2672         emitByte(0xD7);
2673         emitModRM(dst, src);
2674     }
2675 
2676     private void pmovSZx(Register dst, AMD64Address src, int op) {
2677         assert supports(CPUFeature.SSE4_1);
2678         assert inRC(XMM, dst);
2679         simdPrefix(dst, Register.None, src, PD, P_0F38, false);
2680         emitByte(op);
2681         emitOperandHelper(dst, src, 0);
2682     }
2683 
2684     public final void pmovsxbw(Register dst, AMD64Address src) {
2685         pmovSZx(dst, src, 0x20);
2686     }
2687 
2688     public final void pmovsxbd(Register dst, AMD64Address src) {
2689         pmovSZx(dst, src, 0x21);
2690     }
2691 
2692     public final void pmovsxbq(Register dst, AMD64Address src) {
2693         pmovSZx(dst, src, 0x22);
2694     }
2695 
2696     public final void pmovsxwd(Register dst, AMD64Address src) {
2697         pmovSZx(dst, src, 0x23);
2698     }
2699 
2700     public final void pmovsxwq(Register dst, AMD64Address src) {
2701         pmovSZx(dst, src, 0x24);
2702     }
2703 
2704     public final void pmovsxdq(Register dst, AMD64Address src) {
2705         pmovSZx(dst, src, 0x25);
2706     }
2707 
2708     // Insn: VPMOVZXBW xmm1, xmm2/m64
2709     public final void pmovzxbw(Register dst, AMD64Address src) {
2710         pmovSZx(dst, src, 0x30);
2711     }
2712 
2713     public final void pmovzxbd(Register dst, AMD64Address src) {
2714         pmovSZx(dst, src, 0x31);
2715     }
2716 
2717     public final void pmovzxbq(Register dst, AMD64Address src) {
2718         pmovSZx(dst, src, 0x32);
2719     }
2720 
2721     public final void pmovzxwd(Register dst, AMD64Address src) {
2722         pmovSZx(dst, src, 0x33);
2723     }
2724 
2725     public final void pmovzxwq(Register dst, AMD64Address src) {
2726         pmovSZx(dst, src, 0x34);
2727     }
2728 
2729     public final void pmovzxdq(Register dst, AMD64Address src) {
2730         pmovSZx(dst, src, 0x35);
2731     }
2732 
2733     public final void pmovzxbw(Register dst, Register src) {
2734         assert supports(CPUFeature.SSE4_1);
2735         assert inRC(XMM, dst) && inRC(XMM, src);
2736         simdPrefix(dst, Register.None, src, PD, P_0F38, false);
2737         emitByte(0x30);
2738         emitModRM(dst, src);
2739     }
2740 
2741     public final void push(Register src) {
2742         prefix(src);
2743         emitByte(0x50 + encode(src));
2744     }
2745 
2746     public void pushfq() {
2747         emitByte(0x9c);
2748     }
2749 
2750     public final void paddd(Register dst, Register src) {
2751         assert inRC(XMM, dst) && inRC(XMM, src);
2752         simdPrefix(dst, dst, src, PD, P_0F, false);
2753         emitByte(0xFE);
2754         emitModRM(dst, src);
2755     }
2756 
2757     public final void paddq(Register dst, Register src) {
2758         assert inRC(XMM, dst) && inRC(XMM, src);
2759         simdPrefix(dst, dst, src, PD, P_0F, false);
2760         emitByte(0xD4);
2761         emitModRM(dst, src);
2762     }
2763 
2764     public final void pextrw(Register dst, Register src, int imm8) {
2765         assert inRC(CPU, dst) && inRC(XMM, src);
2766         simdPrefix(dst, Register.None, src, PD, P_0F, false);
2767         emitByte(0xC5);
2768         emitModRM(dst, src);
2769         emitByte(imm8);
2770     }
2771 
2772     public final void pinsrw(Register dst, Register src, int imm8) {
2773         assert inRC(XMM, dst) && inRC(CPU, src);
2774         simdPrefix(dst, dst, src, PD, P_0F, false);
2775         emitByte(0xC4);
2776         emitModRM(dst, src);
2777         emitByte(imm8);
2778     }
2779 
2780     public final void por(Register dst, Register src) {
2781         assert inRC(XMM, dst) && inRC(XMM, src);
2782         simdPrefix(dst, dst, src, PD, P_0F, false);
2783         emitByte(0xEB);
2784         emitModRM(dst, src);
2785     }
2786 
2787     public final void pand(Register dst, Register src) {
2788         assert inRC(XMM, dst) && inRC(XMM, src);
2789         simdPrefix(dst, dst, src, PD, P_0F, false);
2790         emitByte(0xDB);
2791         emitModRM(dst, src);
2792     }
2793 
2794     public final void pxor(Register dst, Register src) {
2795         assert inRC(XMM, dst) && inRC(XMM, src);
2796         simdPrefix(dst, dst, src, PD, P_0F, false);
2797         emitByte(0xEF);
2798         emitModRM(dst, src);
2799     }
2800 
2801     public final void pslld(Register dst, int imm8) {
2802         assert isUByte(imm8) : "invalid value";
2803         assert inRC(XMM, dst);
2804         // XMM6 is for /6 encoding: 66 0F 72 /6 ib
2805         simdPrefix(AMD64.xmm6, dst, dst, PD, P_0F, false);
2806         emitByte(0x72);
2807         emitModRM(6, dst);
2808         emitByte(imm8 & 0xFF);
2809     }
2810 
2811     public final void psllq(Register dst, Register shift) {
2812         assert inRC(XMM, dst) && inRC(XMM, shift);
2813         simdPrefix(dst, dst, shift, PD, P_0F, false);
2814         emitByte(0xF3);
2815         emitModRM(dst, shift);
2816     }
2817 
2818     public final void psllq(Register dst, int imm8) {
2819         assert isUByte(imm8) : "invalid value";
2820         assert inRC(XMM, dst);
2821         // XMM6 is for /6 encoding: 66 0F 73 /6 ib
2822         simdPrefix(AMD64.xmm6, dst, dst, PD, P_0F, false);
2823         emitByte(0x73);
2824         emitModRM(6, dst);
2825         emitByte(imm8);
2826     }
2827 
2828     public final void psrad(Register dst, int imm8) {
2829         assert isUByte(imm8) : "invalid value";
2830         assert inRC(XMM, dst);
2831         // XMM4 is for /4 encoding: 66 0F 72 /4 ib
2832         simdPrefix(AMD64.xmm4, dst, dst, PD, P_0F, false);
2833         emitByte(0x72);
2834         emitModRM(4, dst);
2835         emitByte(imm8);
2836     }
2837 
2838     public final void psrld(Register dst, int imm8) {
2839         assert isUByte(imm8) : "invalid value";
2840         assert inRC(XMM, dst);
2841         // XMM2 is for /2 encoding: 66 0F 72 /2 ib
2842         simdPrefix(AMD64.xmm2, dst, dst, PD, P_0F, false);
2843         emitByte(0x72);
2844         emitModRM(2, dst);
2845         emitByte(imm8);
2846     }
2847 
2848     public final void psrlq(Register dst, int imm8) {
2849         assert isUByte(imm8) : "invalid value";
2850         assert inRC(XMM, dst);
2851         // XMM2 is for /2 encoding: 66 0F 73 /2 ib
2852         simdPrefix(AMD64.xmm2, dst, dst, PD, P_0F, false);
2853         emitByte(0x73);
2854         emitModRM(2, dst);
2855         emitByte(imm8);
2856     }
2857 
2858     public final void psrldq(Register dst, int imm8) {
2859         assert isUByte(imm8) : "invalid value";
2860         assert inRC(XMM, dst);
2861         simdPrefix(AMD64.xmm3, dst, dst, PD, P_0F, false);
2862         emitByte(0x73);
2863         emitModRM(3, dst);
2864         emitByte(imm8);
2865     }
2866 
2867     public final void pshufb(Register dst, Register src) {
2868         assert supports(CPUFeature.SSSE3);
2869         assert inRC(XMM, dst) && inRC(XMM, src);
2870         simdPrefix(dst, dst, src, PD, P_0F38, false);
2871         emitByte(0x00);
2872         emitModRM(dst, src);
2873     }
2874 
2875     public final void pshuflw(Register dst, Register src, int imm8) {
2876         assert supports(CPUFeature.SSE2);
2877         assert isUByte(imm8) : "invalid value";
2878         assert inRC(XMM, dst) && inRC(XMM, src);
2879         simdPrefix(dst, Register.None, src, SD, P_0F, false);
2880         emitByte(0x70);
2881         emitModRM(dst, src);
2882         emitByte(imm8);
2883     }
2884 
2885     public final void pshufd(Register dst, Register src, int imm8) {
2886         assert isUByte(imm8) : "invalid value";
2887         assert inRC(XMM, dst) && inRC(XMM, src);
2888         simdPrefix(dst, Register.None, src, PD, P_0F, false);
2889         emitByte(0x70);
2890         emitModRM(dst, src);
2891         emitByte(imm8);
2892     }
2893 
2894     public final void psubd(Register dst, Register src) {
2895         assert inRC(XMM, dst) && inRC(XMM, src);
2896         simdPrefix(dst, dst, src, PD, P_0F, false);
2897         emitByte(0xFA);
2898         emitModRM(dst, src);
2899     }
2900 
2901     public final void punpcklbw(Register dst, Register src) {
2902         assert supports(CPUFeature.SSE2);
2903         assert inRC(XMM, dst) && inRC(XMM, src);
2904         simdPrefix(dst, dst, src, PD, P_0F, false);
2905         emitByte(0x60);
2906         emitModRM(dst, src);
2907     }
2908 
2909     public final void rcpps(Register dst, Register src) {
2910         assert inRC(XMM, dst) && inRC(XMM, src);
2911         simdPrefix(dst, Register.None, src, PS, P_0F, false);
2912         emitByte(0x53);
2913         emitModRM(dst, src);
2914     }
2915 
2916     public final void ret(int imm16) {
2917         if (imm16 == 0) {
2918             emitByte(0xC3);
2919         } else {
2920             emitByte(0xC2);
2921             emitShort(imm16);
2922         }
2923     }
2924 
2925     public final void sarl(Register dst, int imm8) {
2926         prefix(dst);
2927         assert isShiftCount(imm8 >> 1) : "illegal shift count";
2928         if (imm8 == 1) {
2929             emitByte(0xD1);
2930             emitModRM(7, dst);
2931         } else {
2932             emitByte(0xC1);
2933             emitModRM(7, dst);
2934             emitByte(imm8);
2935         }
2936     }
2937 
2938     public final void shll(Register dst, int imm8) {
2939         assert isShiftCount(imm8 >> 1) : "illegal shift count";
2940         prefix(dst);
2941         if (imm8 == 1) {
2942             emitByte(0xD1);
2943             emitModRM(4, dst);
2944         } else {
2945             emitByte(0xC1);
2946             emitModRM(4, dst);
2947             emitByte(imm8);
2948         }
2949     }
2950 
2951     public final void shll(Register dst) {
2952         // Multiply dst by 2, CL times.
2953         prefix(dst);
2954         emitByte(0xD3);
2955         emitModRM(4, dst);
2956     }
2957 
2958     // Insn: SHLX r32a, r/m32, r32b
2959 
2960     public final void shlxl(Register dst, Register src1, Register src2) {
2961         VexGeneralPurposeRMVOp.SHLX.emit(this, AVXSize.DWORD, dst, src1, src2);
2962     }
2963 
2964     public final void shrl(Register dst, int imm8) {
2965         assert isShiftCount(imm8 >> 1) : "illegal shift count";
2966         prefix(dst);
2967         emitByte(0xC1);
2968         emitModRM(5, dst);
2969         emitByte(imm8);
2970     }
2971 
2972     public final void shrl(Register dst) {
2973         // Unsigned divide dst by 2, CL times.
2974         prefix(dst);
2975         emitByte(0xD3);
2976         emitModRM(5, dst);
2977     }
2978 
2979     public final void subl(AMD64Address dst, int imm32) {
2980         SUB.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
2981     }
2982 
2983     public final void subl(Register dst, int imm32) {
2984         SUB.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
2985     }
2986 
2987     public final void subl(Register dst, Register src) {
2988         SUB.rmOp.emit(this, DWORD, dst, src);
2989     }
2990 
2991     public final void subpd(Register dst, Register src) {
2992         SSEOp.SUB.emit(this, PD, dst, src);
2993     }
2994 
2995     public final void subsd(Register dst, Register src) {
2996         SSEOp.SUB.emit(this, SD, dst, src);
2997     }
2998 
2999     public final void subsd(Register dst, AMD64Address src) {
3000         SSEOp.SUB.emit(this, SD, dst, src);
3001     }
3002 
3003     public final void testl(Register dst, int imm32) {
3004         // not using emitArith because test
3005         // doesn't support sign-extension of
3006         // 8bit operands
3007         if (dst.encoding == 0) {
3008             emitByte(0xA9);
3009         } else {
3010             prefix(dst);
3011             emitByte(0xF7);
3012             emitModRM(0, dst);
3013         }
3014         emitInt(imm32);
3015     }
3016 
3017     public final void testl(Register dst, Register src) {
3018         prefix(dst, src);
3019         emitByte(0x85);
3020         emitModRM(dst, src);
3021     }
3022 
3023     public final void testl(Register dst, AMD64Address src) {
3024         prefix(src, dst);
3025         emitByte(0x85);
3026         emitOperandHelper(dst, src, 0);
3027     }
3028 
3029     public final void unpckhpd(Register dst, Register src) {
3030         assert inRC(XMM, dst) && inRC(XMM, src);
3031         simdPrefix(dst, dst, src, PD, P_0F, false);
3032         emitByte(0x15);
3033         emitModRM(dst, src);
3034     }
3035 
3036     public final void unpcklpd(Register dst, Register src) {
3037         assert inRC(XMM, dst) && inRC(XMM, src);
3038         simdPrefix(dst, dst, src, PD, P_0F, false);
3039         emitByte(0x14);
3040         emitModRM(dst, src);
3041     }
3042 
3043     public final void xorl(Register dst, Register src) {
3044         XOR.rmOp.emit(this, DWORD, dst, src);
3045     }
3046 
3047     public final void xorq(Register dst, Register src) {
3048         XOR.rmOp.emit(this, QWORD, dst, src);
3049     }
3050 
3051     public final void xorpd(Register dst, Register src) {
3052         SSEOp.XOR.emit(this, PD, dst, src);
3053     }
3054 
3055     public final void xorps(Register dst, Register src) {
3056         SSEOp.XOR.emit(this, PS, dst, src);
3057     }
3058 
3059     protected final void decl(Register dst) {
3060         // Use two-byte form (one-byte form is a REX prefix in 64-bit mode)
3061         prefix(dst);
3062         emitByte(0xFF);
3063         emitModRM(1, dst);
3064     }
3065 
3066     protected final void incl(Register dst) {
3067         // Use two-byte form (one-byte from is a REX prefix in 64-bit mode)
3068         prefix(dst);
3069         emitByte(0xFF);
3070         emitModRM(0, dst);
3071     }
3072 
3073     public final void addq(Register dst, int imm32) {
3074         ADD.getMIOpcode(QWORD, isByte(imm32)).emit(this, QWORD, dst, imm32);
3075     }
3076 
3077     public final void addq(AMD64Address dst, int imm32) {
3078         ADD.getMIOpcode(QWORD, isByte(imm32)).emit(this, QWORD, dst, imm32);
3079     }
3080 
3081     public final void addq(Register dst, Register src) {
3082         ADD.rmOp.emit(this, QWORD, dst, src);
3083     }
3084 
3085     public final void addq(AMD64Address dst, Register src) {
3086         ADD.mrOp.emit(this, QWORD, dst, src);
3087     }
3088 
3089     public final void andq(Register dst, int imm32) {
3090         AND.getMIOpcode(QWORD, isByte(imm32)).emit(this, QWORD, dst, imm32);
3091     }
3092 
3093     public final void bsrq(Register dst, Register src) {
3094         prefixq(dst, src);
3095         emitByte(0x0F);
3096         emitByte(0xBD);
3097         emitModRM(dst, src);
3098     }
3099 
3100     public final void bswapq(Register reg) {
3101         prefixq(reg);
3102         emitByte(0x0F);
3103         emitByte(0xC8 + encode(reg));
3104     }
3105 
3106     public final void cdqq() {
3107         rexw();
3108         emitByte(0x99);
3109     }
3110 
3111     public final void repStosb() {
3112         emitByte(0xf3);
3113         rexw();
3114         emitByte(0xaa);
3115     }
3116 
3117     public final void repStosq() {
3118         emitByte(0xf3);
3119         rexw();
3120         emitByte(0xab);
3121     }
3122 
3123     public final void cmovq(ConditionFlag cc, Register dst, Register src) {
3124         prefixq(dst, src);
3125         emitByte(0x0F);
3126         emitByte(0x40 | cc.getValue());
3127         emitModRM(dst, src);
3128     }
3129 
3130     public final void setb(ConditionFlag cc, Register dst) {
3131         prefix(dst, true);
3132         emitByte(0x0F);
3133         emitByte(0x90 | cc.getValue());
3134         emitModRM(0, dst);
3135     }
3136 
3137     public final void cmovq(ConditionFlag cc, Register dst, AMD64Address src) {
3138         prefixq(src, dst);
3139         emitByte(0x0F);
3140         emitByte(0x40 | cc.getValue());
3141         emitOperandHelper(dst, src, 0);
3142     }
3143 
3144     public final void cmpq(Register dst, int imm32) {
3145         CMP.getMIOpcode(QWORD, isByte(imm32)).emit(this, QWORD, dst, imm32);
3146     }
3147 
3148     public final void cmpq(Register dst, Register src) {
3149         CMP.rmOp.emit(this, QWORD, dst, src);
3150     }
3151 
3152     public final void cmpq(Register dst, AMD64Address src) {
3153         CMP.rmOp.emit(this, QWORD, dst, src);
3154     }
3155 
3156     public final void cmpxchgq(Register reg, AMD64Address adr) {
3157         prefixq(adr, reg);
3158         emitByte(0x0F);
3159         emitByte(0xB1);
3160         emitOperandHelper(reg, adr, 0);
3161     }
3162 
3163     public final void cvtdq2pd(Register dst, Register src) {
3164         assert inRC(XMM, dst) && inRC(XMM, src);
3165         simdPrefix(dst, Register.None, src, SS, P_0F, false);
3166         emitByte(0xE6);
3167         emitModRM(dst, src);
3168     }
3169 
3170     public final void cvtsi2sdq(Register dst, Register src) {
3171         SSEOp.CVTSI2SD.emit(this, QWORD, dst, src);
3172     }
3173 
3174     public final void cvttsd2siq(Register dst, Register src) {
3175         SSEOp.CVTTSD2SI.emit(this, QWORD, dst, src);
3176     }
3177 
3178     public final void cvttpd2dq(Register dst, Register src) {
3179         assert inRC(XMM, dst) && inRC(XMM, src);
3180         simdPrefix(dst, Register.None, src, PD, P_0F, false);
3181         emitByte(0xE6);
3182         emitModRM(dst, src);
3183     }
3184 
3185     public final void decq(Register dst) {
3186         // Use two-byte form (one-byte form is a REX prefix in 64-bit mode)
3187         prefixq(dst);
3188         emitByte(0xFF);
3189         emitModRM(1, dst);
3190     }
3191 
3192     public final void decq(AMD64Address dst) {
3193         DEC.emit(this, QWORD, dst);
3194     }
3195 
3196     public final void imulq(Register dst, Register src) {
3197         prefixq(dst, src);
3198         emitByte(0x0F);
3199         emitByte(0xAF);
3200         emitModRM(dst, src);
3201     }
3202 
3203     public final void incq(Register dst) {
3204         // Don't use it directly. Use Macroincrementq() instead.
3205         // Use two-byte form (one-byte from is a REX prefix in 64-bit mode)
3206         prefixq(dst);
3207         emitByte(0xFF);
3208         emitModRM(0, dst);
3209     }
3210 
3211     public final void incq(AMD64Address dst) {
3212         INC.emit(this, QWORD, dst);
3213     }
3214 
3215     public final void movq(Register dst, long imm64) {
3216         movq(dst, imm64, false);
3217     }
3218 
3219     public final void movq(Register dst, long imm64, boolean annotateImm) {
3220         int insnPos = position();
3221         prefixq(dst);
3222         emitByte(0xB8 + encode(dst));
3223         int immPos = position();
3224         emitLong(imm64);
3225         int nextInsnPos = position();
3226         if (annotateImm && codePatchingAnnotationConsumer != null) {
3227             codePatchingAnnotationConsumer.accept(new OperandDataAnnotation(insnPos, immPos, nextInsnPos - immPos, nextInsnPos));
3228         }
3229     }
3230 
3231     public final void movslq(Register dst, int imm32) {
3232         prefixq(dst);
3233         emitByte(0xC7);
3234         emitModRM(0, dst);
3235         emitInt(imm32);
3236     }
3237 
3238     public final void movdq(Register dst, AMD64Address src) {
3239         AMD64RMOp.MOVQ.emit(this, QWORD, dst, src);
3240     }
3241 
3242     public final void movdq(AMD64Address dst, Register src) {
3243         AMD64MROp.MOVQ.emit(this, QWORD, dst, src);
3244     }
3245 
3246     public final void movdq(Register dst, Register src) {
3247         if (inRC(XMM, dst) && inRC(CPU, src)) {
3248             AMD64RMOp.MOVQ.emit(this, QWORD, dst, src);
3249         } else if (inRC(XMM, src) && inRC(CPU, dst)) {
3250             AMD64MROp.MOVQ.emit(this, QWORD, dst, src);
3251         } else {
3252             throw new InternalError("should not reach here");
3253         }
3254     }
3255 
3256     public final void movdl(Register dst, Register src) {
3257         if (inRC(XMM, dst) && inRC(CPU, src)) {
3258             AMD64RMOp.MOVD.emit(this, DWORD, dst, src);
3259         } else if (inRC(XMM, src) && inRC(CPU, dst)) {
3260             AMD64MROp.MOVD.emit(this, DWORD, dst, src);
3261         } else {
3262             throw new InternalError("should not reach here");
3263         }
3264     }
3265 
3266     public final void movdl(Register dst, AMD64Address src) {
3267         AMD64RMOp.MOVD.emit(this, DWORD, dst, src);
3268     }
3269 
3270     public final void movddup(Register dst, Register src) {
3271         assert supports(CPUFeature.SSE3);
3272         assert inRC(XMM, dst) && inRC(XMM, src);
3273         simdPrefix(dst, Register.None, src, SD, P_0F, false);
3274         emitByte(0x12);
3275         emitModRM(dst, src);
3276     }
3277 
3278     public final void movdqu(Register dst, AMD64Address src) {
3279         assert inRC(XMM, dst);
3280         simdPrefix(dst, Register.None, src, SS, P_0F, false);
3281         emitByte(0x6F);
3282         emitOperandHelper(dst, src, 0);
3283     }
3284 
3285     public final void movdqu(Register dst, Register src) {
3286         assert inRC(XMM, dst) && inRC(XMM, src);
3287         simdPrefix(dst, Register.None, src, SS, P_0F, false);
3288         emitByte(0x6F);
3289         emitModRM(dst, src);
3290     }
3291 
3292     // Insn: VMOVDQU xmm2/m128, xmm1
3293 
3294     public final void movdqu(AMD64Address dst, Register src) {
3295         assert inRC(XMM, src);
3296         // Code: VEX.128.F3.0F.WIG 7F /r
3297         simdPrefix(src, Register.None, dst, SS, P_0F, false);
3298         emitByte(0x7F);
3299         emitOperandHelper(src, dst, 0);
3300     }
3301 
3302     public final void movslq(AMD64Address dst, int imm32) {
3303         prefixq(dst);
3304         emitByte(0xC7);
3305         emitOperandHelper(0, dst, 4);
3306         emitInt(imm32);
3307     }
3308 
3309     public final void movslq(Register dst, AMD64Address src) {
3310         prefixq(src, dst);
3311         emitByte(0x63);
3312         emitOperandHelper(dst, src, 0);
3313     }
3314 
3315     public final void movslq(Register dst, Register src) {
3316         prefixq(dst, src);
3317         emitByte(0x63);
3318         emitModRM(dst, src);
3319     }
3320 
3321     public final void negq(Register dst) {
3322         prefixq(dst);
3323         emitByte(0xF7);
3324         emitModRM(3, dst);
3325     }
3326 
3327     public final void orq(Register dst, Register src) {
3328         OR.rmOp.emit(this, QWORD, dst, src);
3329     }
3330 
3331     public final void shlq(Register dst, int imm8) {
3332         assert isShiftCount(imm8 >> 1) : "illegal shift count";
3333         prefixq(dst);
3334         if (imm8 == 1) {
3335             emitByte(0xD1);
3336             emitModRM(4, dst);
3337         } else {
3338             emitByte(0xC1);
3339             emitModRM(4, dst);
3340             emitByte(imm8);
3341         }
3342     }
3343 
3344     public final void shlq(Register dst) {
3345         // Multiply dst by 2, CL times.
3346         prefixq(dst);
3347         emitByte(0xD3);
3348         emitModRM(4, dst);
3349     }
3350 
3351     public final void shrq(Register dst, int imm8) {
3352         assert isShiftCount(imm8 >> 1) : "illegal shift count";
3353         prefixq(dst);
3354         if (imm8 == 1) {
3355             emitByte(0xD1);
3356             emitModRM(5, dst);
3357         } else {
3358             emitByte(0xC1);
3359             emitModRM(5, dst);
3360             emitByte(imm8);
3361         }
3362     }
3363 
3364     public final void shrq(Register dst) {
3365         prefixq(dst);
3366         emitByte(0xD3);
3367         // Unsigned divide dst by 2, CL times.
3368         emitModRM(5, dst);
3369     }
3370 
3371     public final void sarq(Register dst, int imm8) {
3372         assert isShiftCount(imm8 >> 1) : "illegal shift count";
3373         prefixq(dst);
3374         if (imm8 == 1) {
3375             emitByte(0xD1);
3376             emitModRM(7, dst);
3377         } else {
3378             emitByte(0xC1);
3379             emitModRM(7, dst);
3380             emitByte(imm8);
3381         }
3382     }
3383 
3384     public final void sbbq(Register dst, Register src) {
3385         SBB.rmOp.emit(this, QWORD, dst, src);
3386     }
3387 
3388     public final void subq(Register dst, int imm32) {
3389         SUB.getMIOpcode(QWORD, isByte(imm32)).emit(this, QWORD, dst, imm32);
3390     }
3391 
3392     public final void subq(AMD64Address dst, int imm32) {
3393         SUB.getMIOpcode(QWORD, isByte(imm32)).emit(this, QWORD, dst, imm32);
3394     }
3395 
3396     public final void subqWide(Register dst, int imm32) {
3397         // don't use the sign-extending version, forcing a 32-bit immediate
3398         SUB.getMIOpcode(QWORD, false).emit(this, QWORD, dst, imm32);
3399     }
3400 
3401     public final void subq(Register dst, Register src) {
3402         SUB.rmOp.emit(this, QWORD, dst, src);
3403     }
3404 
3405     public final void testq(Register dst, Register src) {
3406         prefixq(dst, src);
3407         emitByte(0x85);
3408         emitModRM(dst, src);
3409     }
3410 
3411     public final void btrq(Register src, int imm8) {
3412         prefixq(src);
3413         emitByte(0x0F);
3414         emitByte(0xBA);
3415         emitModRM(6, src);
3416         emitByte(imm8);
3417     }
3418 
3419     public final void xaddb(AMD64Address dst, Register src) {
3420         prefixb(dst, src);
3421         emitByte(0x0F);
3422         emitByte(0xC0);
3423         emitOperandHelper(src, dst, 0);
3424     }
3425 
3426     public final void xaddw(AMD64Address dst, Register src) {
3427         emitByte(0x66); // Switch to 16-bit mode.
3428         prefix(dst, src);
3429         emitByte(0x0F);
3430         emitByte(0xC1);
3431         emitOperandHelper(src, dst, 0);
3432     }
3433 
3434     public final void xaddl(AMD64Address dst, Register src) {
3435         prefix(dst, src);
3436         emitByte(0x0F);
3437         emitByte(0xC1);
3438         emitOperandHelper(src, dst, 0);
3439     }
3440 
3441     public final void xaddq(AMD64Address dst, Register src) {
3442         prefixq(dst, src);
3443         emitByte(0x0F);
3444         emitByte(0xC1);
3445         emitOperandHelper(src, dst, 0);
3446     }
3447 
3448     public final void xchgb(Register dst, AMD64Address src) {
3449         prefixb(src, dst);
3450         emitByte(0x86);
3451         emitOperandHelper(dst, src, 0);
3452     }
3453 
3454     public final void xchgw(Register dst, AMD64Address src) {
3455         emitByte(0x66);
3456         prefix(src, dst);
3457         emitByte(0x87);
3458         emitOperandHelper(dst, src, 0);
3459     }
3460 
3461     public final void xchgl(Register dst, AMD64Address src) {
3462         prefix(src, dst);
3463         emitByte(0x87);
3464         emitOperandHelper(dst, src, 0);
3465     }
3466 
3467     public final void xchgq(Register dst, AMD64Address src) {
3468         prefixq(src, dst);
3469         emitByte(0x87);
3470         emitOperandHelper(dst, src, 0);
3471     }
3472 
3473     public final void membar(int barriers) {
3474         if (target.isMP) {
3475             // We only have to handle StoreLoad
3476             if ((barriers & STORE_LOAD) != 0) {
3477                 // All usable chips support "locked" instructions which suffice
3478                 // as barriers, and are much faster than the alternative of
3479                 // using cpuid instruction. We use here a locked add [rsp],0.
3480                 // This is conveniently otherwise a no-op except for blowing
3481                 // flags.
3482                 // Any change to this code may need to revisit other places in
3483                 // the code where this idiom is used, in particular the
3484                 // orderAccess code.
3485                 lock();
3486                 addl(new AMD64Address(AMD64.rsp, 0), 0); // Assert the lock# signal here
3487             }
3488         }
3489     }
3490 
3491     @Override
3492     protected final void patchJumpTarget(int branch, int branchTarget) {
3493         int op = getByte(branch);
3494         assert op == 0xE8 // call
3495                         || op == 0x00 // jump table entry
3496                         || op == 0xE9 // jmp
3497                         || op == 0xEB // short jmp
3498                         || (op & 0xF0) == 0x70 // short jcc
3499                         || op == 0x0F && (getByte(branch + 1) & 0xF0) == 0x80 // jcc
3500         : "Invalid opcode at patch point branch=" + branch + ", branchTarget=" + branchTarget + ", op=" + op;
3501 
3502         if (op == 0x00) {
3503             int offsetToJumpTableBase = getShort(branch + 1);
3504             int jumpTableBase = branch - offsetToJumpTableBase;
3505             int imm32 = branchTarget - jumpTableBase;
3506             emitInt(imm32, branch);
3507         } else if (op == 0xEB || (op & 0xF0) == 0x70) {
3508 
3509             // short offset operators (jmp and jcc)
3510             final int imm8 = branchTarget - (branch + 2);
3511             /*
3512              * Since a wrongly patched short branch can potentially lead to working but really bad
3513              * behaving code we should always fail with an exception instead of having an assert.
3514              */
3515             GraalError.guarantee(isByte(imm8), "Displacement too large to be encoded as a byte: %d", imm8);
3516             emitByte(imm8, branch + 1);
3517 
3518         } else {
3519 
3520             int off = 1;
3521             if (op == 0x0F) {
3522                 off = 2;
3523             }
3524 
3525             int imm32 = branchTarget - (branch + 4 + off);
3526             emitInt(imm32, branch + off);
3527         }
3528     }
3529 
3530     public void nullCheck(AMD64Address address) {
3531         testl(AMD64.rax, address);
3532     }
3533 
3534     @Override
3535     public void align(int modulus) {
3536         if (position() % modulus != 0) {
3537             nop(modulus - (position() % modulus));
3538         }
3539     }
3540 
3541     /**
3542      * Emits a direct call instruction. Note that the actual call target is not specified, because
3543      * all calls need patching anyway. Therefore, 0 is emitted as the call target, and the user is
3544      * responsible to add the call address to the appropriate patching tables.
3545      */
3546     public final void call() {
3547         annotatePatchingImmediate(1, 4);
3548         emitByte(0xE8);
3549         emitInt(0);
3550     }
3551 
3552     public final void call(Register src) {
3553         prefix(src);
3554         emitByte(0xFF);
3555         emitModRM(2, src);
3556     }
3557 
3558     public final void int3() {
3559         emitByte(0xCC);
3560     }
3561 
3562     public final void pause() {
3563         emitByte(0xF3);
3564         emitByte(0x90);
3565     }
3566 
3567     private void emitx87(int b1, int b2, int i) {
3568         assert 0 <= i && i < 8 : "illegal stack offset";
3569         emitByte(b1);
3570         emitByte(b2 + i);
3571     }
3572 
3573     public final void fldd(AMD64Address src) {
3574         emitByte(0xDD);
3575         emitOperandHelper(0, src, 0);
3576     }
3577 
3578     public final void flds(AMD64Address src) {
3579         emitByte(0xD9);
3580         emitOperandHelper(0, src, 0);
3581     }
3582 
3583     public final void fldln2() {
3584         emitByte(0xD9);
3585         emitByte(0xED);
3586     }
3587 
3588     public final void fldlg2() {
3589         emitByte(0xD9);
3590         emitByte(0xEC);
3591     }
3592 
3593     public final void fyl2x() {
3594         emitByte(0xD9);
3595         emitByte(0xF1);
3596     }
3597 
3598     public final void fstps(AMD64Address src) {
3599         emitByte(0xD9);
3600         emitOperandHelper(3, src, 0);
3601     }
3602 
3603     public final void fstpd(AMD64Address src) {
3604         emitByte(0xDD);
3605         emitOperandHelper(3, src, 0);
3606     }
3607 
3608     private void emitFPUArith(int b1, int b2, int i) {
3609         assert 0 <= i && i < 8 : "illegal FPU register: " + i;
3610         emitByte(b1);
3611         emitByte(b2 + i);
3612     }
3613 
3614     public void ffree(int i) {
3615         emitFPUArith(0xDD, 0xC0, i);
3616     }
3617 
3618     public void fincstp() {
3619         emitByte(0xD9);
3620         emitByte(0xF7);
3621     }
3622 
3623     public void fxch(int i) {
3624         emitFPUArith(0xD9, 0xC8, i);
3625     }
3626 
3627     public void fnstswAX() {
3628         emitByte(0xDF);
3629         emitByte(0xE0);
3630     }
3631 
3632     public void fwait() {
3633         emitByte(0x9B);
3634     }
3635 
3636     public void fprem() {
3637         emitByte(0xD9);
3638         emitByte(0xF8);
3639     }
3640 
3641     public final void fsin() {
3642         emitByte(0xD9);
3643         emitByte(0xFE);
3644     }
3645 
3646     public final void fcos() {
3647         emitByte(0xD9);
3648         emitByte(0xFF);
3649     }
3650 
3651     public final void fptan() {
3652         emitByte(0xD9);
3653         emitByte(0xF2);
3654     }
3655 
3656     public final void fstp(int i) {
3657         emitx87(0xDD, 0xD8, i);
3658     }
3659 
3660     @Override
3661     public AMD64Address makeAddress(Register base, int displacement) {
3662         return new AMD64Address(base, displacement);
3663     }
3664 
3665     @Override
3666     public AMD64Address getPlaceholder(int instructionStartPosition) {
3667         return new AMD64Address(AMD64.rip, Register.None, Scale.Times1, 0, instructionStartPosition);
3668     }
3669 
3670     private void prefetchPrefix(AMD64Address src) {
3671         prefix(src);
3672         emitByte(0x0F);
3673     }
3674 
3675     public void prefetchnta(AMD64Address src) {
3676         prefetchPrefix(src);
3677         emitByte(0x18);
3678         emitOperandHelper(0, src, 0);
3679     }
3680 
3681     void prefetchr(AMD64Address src) {
3682         assert supports(CPUFeature.AMD_3DNOW_PREFETCH);
3683         prefetchPrefix(src);
3684         emitByte(0x0D);
3685         emitOperandHelper(0, src, 0);
3686     }
3687 
3688     public void prefetcht0(AMD64Address src) {
3689         assert supports(CPUFeature.SSE);
3690         prefetchPrefix(src);
3691         emitByte(0x18);
3692         emitOperandHelper(1, src, 0);
3693     }
3694 
3695     public void prefetcht1(AMD64Address src) {
3696         assert supports(CPUFeature.SSE);
3697         prefetchPrefix(src);
3698         emitByte(0x18);
3699         emitOperandHelper(2, src, 0);
3700     }
3701 
3702     public void prefetcht2(AMD64Address src) {
3703         assert supports(CPUFeature.SSE);
3704         prefix(src);
3705         emitByte(0x0f);
3706         emitByte(0x18);
3707         emitOperandHelper(3, src, 0);
3708     }
3709 
3710     public void prefetchw(AMD64Address src) {
3711         assert supports(CPUFeature.AMD_3DNOW_PREFETCH);
3712         prefix(src);
3713         emitByte(0x0f);
3714         emitByte(0x0D);
3715         emitOperandHelper(1, src, 0);
3716     }
3717 
3718     public void rdtsc() {
3719         emitByte(0x0F);
3720         emitByte(0x31);
3721     }
3722 
3723     /**
3724      * Emits an instruction which is considered to be illegal. This is used if we deliberately want
3725      * to crash the program (debugging etc.).
3726      */
3727     public void illegal() {
3728         emitByte(0x0f);
3729         emitByte(0x0b);
3730     }
3731 
3732     public void lfence() {
3733         emitByte(0x0f);
3734         emitByte(0xae);
3735         emitByte(0xe8);
3736     }
3737 
3738     public final void vptest(Register dst, Register src) {
3739         VexRMOp.VPTEST.emit(this, AVXSize.YMM, dst, src);
3740     }
3741 
3742     public final void vpxor(Register dst, Register nds, Register src) {
3743         VexRVMOp.VPXOR.emit(this, AVXSize.YMM, dst, nds, src);
3744     }
3745 
3746     public final void vpxor(Register dst, Register nds, AMD64Address src) {
3747         VexRVMOp.VPXOR.emit(this, AVXSize.YMM, dst, nds, src);
3748     }
3749 
3750     public final void vmovdqu(Register dst, AMD64Address src) {
3751         VexMoveOp.VMOVDQU.emit(this, AVXSize.YMM, dst, src);
3752     }
3753 
3754     public final void vmovdqu(AMD64Address dst, Register src) {
3755         assert inRC(XMM, src);
3756         VexMoveOp.VMOVDQU.emit(this, AVXSize.YMM, dst, src);
3757     }
3758 
3759     public final void vpmovzxbw(Register dst, AMD64Address src) {
3760         assert supports(CPUFeature.AVX2);
3761         VexRMOp.VPMOVZXBW.emit(this, AVXSize.YMM, dst, src);
3762     }
3763 
3764     public final void vzeroupper() {
3765         emitVEX(L128, P_, M_0F, W0, 0, 0, true);
3766         emitByte(0x77);
3767     }
3768 
3769     // Insn: KORTESTD k1, k2
3770 
3771     // This instruction produces ZF or CF flags
3772     public final void kortestd(Register src1, Register src2) {
3773         assert supports(CPUFeature.AVX512BW);
3774         assert inRC(MASK, src1) && inRC(MASK, src2);
3775         // Code: VEX.L0.66.0F.W1 98 /r
3776         vexPrefix(src1, Register.None, src2, AVXSize.XMM, P_66, M_0F, W1, true);
3777         emitByte(0x98);
3778         emitModRM(src1, src2);
3779     }
3780 
3781     // Insn: KORTESTQ k1, k2
3782 
3783     // This instruction produces ZF or CF flags
3784     public final void kortestq(Register src1, Register src2) {
3785         assert supports(CPUFeature.AVX512BW);
3786         assert inRC(MASK, src1) && inRC(MASK, src2);
3787         // Code: VEX.L0.0F.W1 98 /r
3788         vexPrefix(src1, Register.None, src2, AVXSize.XMM, P_, M_0F, W1, true);
3789         emitByte(0x98);
3790         emitModRM(src1, src2);
3791     }
3792 
3793     public final void kmovd(Register dst, Register src) {
3794         assert supports(CPUFeature.AVX512BW);
3795         assert inRC(MASK, dst) || inRC(CPU, dst);
3796         assert inRC(MASK, src) || inRC(CPU, src);
3797         assert !(inRC(CPU, dst) && inRC(CPU, src));
3798 
3799         if (inRC(MASK, dst)) {
3800             if (inRC(MASK, src)) {
3801                 // kmovd(KRegister dst, KRegister src):
3802                 // Insn: KMOVD k1, k2/m32
3803                 // Code: VEX.L0.66.0F.W1 90 /r
3804                 vexPrefix(dst, Register.None, src, AVXSize.XMM, P_66, M_0F, W1, true);
3805                 emitByte(0x90);
3806                 emitModRM(dst, src);
3807             } else {
3808                 // kmovd(KRegister dst, Register src)
3809                 // Insn: KMOVD k1, r32
3810                 // Code: VEX.L0.F2.0F.W0 92 /r
3811                 vexPrefix(dst, Register.None, src, AVXSize.XMM, P_F2, M_0F, W0, true);
3812                 emitByte(0x92);
3813                 emitModRM(dst, src);
3814             }
3815         } else {
3816             if (inRC(MASK, src)) {
3817                 // kmovd(Register dst, KRegister src)
3818                 // Insn: KMOVD r32, k1
3819                 // Code: VEX.L0.F2.0F.W0 93 /r
3820                 vexPrefix(dst, Register.None, src, AVXSize.XMM, P_F2, M_0F, W0, true);
3821                 emitByte(0x93);
3822                 emitModRM(dst, src);
3823             } else {
3824                 throw GraalError.shouldNotReachHere();
3825             }
3826         }
3827     }
3828 
3829     public final void kmovq(Register dst, Register src) {
3830         assert supports(CPUFeature.AVX512BW);
3831         assert inRC(MASK, dst) || inRC(CPU, dst);
3832         assert inRC(MASK, src) || inRC(CPU, src);
3833         assert !(inRC(CPU, dst) && inRC(CPU, src));
3834 
3835         if (inRC(MASK, dst)) {
3836             if (inRC(MASK, src)) {
3837                 // kmovq(KRegister dst, KRegister src):
3838                 // Insn: KMOVQ k1, k2/m64
3839                 // Code: VEX.L0.0F.W1 90 /r
3840                 vexPrefix(dst, Register.None, src, AVXSize.XMM, P_, M_0F, W1, true);
3841                 emitByte(0x90);
3842                 emitModRM(dst, src);
3843             } else {
3844                 // kmovq(KRegister dst, Register src)
3845                 // Insn: KMOVQ k1, r64
3846                 // Code: VEX.L0.F2.0F.W1 92 /r
3847                 vexPrefix(dst, Register.None, src, AVXSize.XMM, P_F2, M_0F, W1, true);
3848                 emitByte(0x92);
3849                 emitModRM(dst, src);
3850             }
3851         } else {
3852             if (inRC(MASK, src)) {
3853                 // kmovq(Register dst, KRegister src)
3854                 // Insn: KMOVQ r64, k1
3855                 // Code: VEX.L0.F2.0F.W1 93 /r
3856                 vexPrefix(dst, Register.None, src, AVXSize.XMM, P_F2, M_0F, W1, true);
3857                 emitByte(0x93);
3858                 emitModRM(dst, src);
3859             } else {
3860                 throw GraalError.shouldNotReachHere();
3861             }
3862         }
3863     }
3864 
3865     // Insn: KTESTD k1, k2
3866 
3867     public final void ktestd(Register src1, Register src2) {
3868         assert supports(CPUFeature.AVX512BW);
3869         assert inRC(MASK, src1) && inRC(MASK, src2);
3870         // Code: VEX.L0.66.0F.W1 99 /r
3871         vexPrefix(src1, Register.None, src2, AVXSize.XMM, P_66, M_0F, W1, true);
3872         emitByte(0x99);
3873         emitModRM(src1, src2);
3874     }
3875 
3876     public final void evmovdqu64(Register dst, AMD64Address src) {
3877         assert supports(CPUFeature.AVX512F);
3878         assert inRC(XMM, dst);
3879         evexPrefix(dst, Register.None, Register.None, src, AVXSize.ZMM, P_F3, M_0F, W1, Z0, B0);
3880         emitByte(0x6F);
3881         emitEVEXOperandHelper(dst, src, 0, EVEXTuple.FVM.getDisp8ScalingFactor(AVXSize.ZMM));
3882     }
3883 
3884     // Insn: VPMOVZXBW zmm1, m256
3885 
3886     public final void evpmovzxbw(Register dst, AMD64Address src) {
3887         assert supports(CPUFeature.AVX512BW);
3888         assert inRC(XMM, dst);
3889         // Code: EVEX.512.66.0F38.WIG 30 /r
3890         evexPrefix(dst, Register.None, Register.None, src, AVXSize.ZMM, P_66, M_0F38, WIG, Z0, B0);
3891         emitByte(0x30);
3892         emitEVEXOperandHelper(dst, src, 0, EVEXTuple.HVM.getDisp8ScalingFactor(AVXSize.ZMM));
3893     }
3894 
3895     public final void evpcmpeqb(Register kdst, Register nds, AMD64Address src) {
3896         assert supports(CPUFeature.AVX512BW);
3897         assert inRC(MASK, kdst) && inRC(XMM, nds);
3898         evexPrefix(kdst, Register.None, nds, src, AVXSize.ZMM, P_66, M_0F, WIG, Z0, B0);
3899         emitByte(0x74);
3900         emitEVEXOperandHelper(kdst, src, 0, EVEXTuple.FVM.getDisp8ScalingFactor(AVXSize.ZMM));
3901     }
3902 
3903     // Insn: VMOVDQU16 zmm1 {k1}{z}, zmm2/m512
3904     // -----
3905     // Insn: VMOVDQU16 zmm1, m512
3906 
3907     public final void evmovdqu16(Register dst, AMD64Address src) {
3908         assert supports(CPUFeature.AVX512BW);
3909         assert inRC(XMM, dst);
3910         // Code: EVEX.512.F2.0F.W1 6F /r
3911         evexPrefix(dst, Register.None, Register.None, src, AVXSize.ZMM, P_F2, M_0F, W1, Z0, B0);
3912         emitByte(0x6F);
3913         emitEVEXOperandHelper(dst, src, 0, EVEXTuple.FVM.getDisp8ScalingFactor(AVXSize.ZMM));
3914     }
3915 
3916     // Insn: VMOVDQU16 zmm1, k1:z, m512
3917 
3918     public final void evmovdqu16(Register dst, Register mask, AMD64Address src) {
3919         assert supports(CPUFeature.AVX512BW);
3920         assert inRC(XMM, dst) && inRC(MASK, mask);
3921         // Code: EVEX.512.F2.0F.W1 6F /r
3922         evexPrefix(dst, mask, Register.None, src, AVXSize.ZMM, P_F2, M_0F, W1, Z1, B0);
3923         emitByte(0x6F);
3924         emitEVEXOperandHelper(dst, src, 0, EVEXTuple.FVM.getDisp8ScalingFactor(AVXSize.ZMM));
3925     }
3926 
3927     // Insn: VMOVDQU16 zmm2/m512 {k1}{z}, zmm1
3928     // -----
3929     // Insn: VMOVDQU16 m512, zmm1
3930 
3931     public final void evmovdqu16(AMD64Address dst, Register src) {
3932         assert supports(CPUFeature.AVX512BW);
3933         assert inRC(XMM, src);
3934         // Code: EVEX.512.F2.0F.W1 7F /r
3935         evexPrefix(src, Register.None, Register.None, dst, AVXSize.ZMM, P_F2, M_0F, W1, Z0, B0);
3936         emitByte(0x7F);
3937         emitEVEXOperandHelper(src, dst, 0, EVEXTuple.FVM.getDisp8ScalingFactor(AVXSize.ZMM));
3938     }
3939 
3940     // Insn: VMOVDQU16 m512, k1, zmm1
3941 
3942     public final void evmovdqu16(AMD64Address dst, Register mask, Register src) {
3943         assert supports(CPUFeature.AVX512BW);
3944         assert inRC(MASK, mask) && inRC(XMM, src);
3945         // Code: EVEX.512.F2.0F.W1 7F /r
3946         evexPrefix(src, mask, Register.None, dst, AVXSize.ZMM, P_F2, M_0F, W1, Z0, B0);
3947         emitByte(0x7F);
3948         emitEVEXOperandHelper(src, dst, 0, EVEXTuple.FVM.getDisp8ScalingFactor(AVXSize.ZMM));
3949     }
3950 
3951     // Insn: VPBROADCASTW zmm1 {k1}{z}, reg
3952     // -----
3953     // Insn: VPBROADCASTW zmm1, reg
3954 
3955     public final void evpbroadcastw(Register dst, Register src) {
3956         assert supports(CPUFeature.AVX512BW);
3957         assert inRC(XMM, dst) && inRC(CPU, src);
3958         // Code: EVEX.512.66.0F38.W0 7B /r
3959         evexPrefix(dst, Register.None, Register.None, src, AVXSize.ZMM, P_66, M_0F38, W0, Z0, B0);
3960         emitByte(0x7B);
3961         emitModRM(dst, src);
3962     }
3963 
3964     // Insn: VPCMPUW k1 {k2}, zmm2, zmm3/m512, imm8
3965     // -----
3966     // Insn: VPCMPUW k1, zmm2, zmm3, imm8
3967 
3968     public final void evpcmpuw(Register kdst, Register nds, Register src, int vcc) {
3969         assert supports(CPUFeature.AVX512BW);
3970         assert inRC(MASK, kdst) && inRC(XMM, nds) && inRC(XMM, src);
3971         // Code: EVEX.NDS.512.66.0F3A.W1 3E /r ib
3972         evexPrefix(kdst, Register.None, nds, src, AVXSize.ZMM, P_66, M_0F3A, W1, Z0, B0);
3973         emitByte(0x3E);
3974         emitModRM(kdst, src);
3975         emitByte(vcc);
3976     }
3977 
3978     // Insn: VPCMPUW k1 {k2}, zmm2, zmm3/m512, imm8
3979     // -----
3980     // Insn: VPCMPUW k1, k2, zmm2, zmm3, imm8
3981 
3982     public final void evpcmpuw(Register kdst, Register mask, Register nds, Register src, int vcc) {
3983         assert supports(CPUFeature.AVX512BW);
3984         assert inRC(MASK, kdst) && inRC(MASK, mask);
3985         assert inRC(XMM, nds) && inRC(XMM, src);
3986         // Code: EVEX.NDS.512.66.0F3A.W1 3E /r ib
3987         evexPrefix(kdst, mask, nds, src, AVXSize.ZMM, P_66, M_0F3A, W1, Z0, B0);
3988         emitByte(0x3E);
3989         emitModRM(kdst, src);
3990         emitByte(vcc);
3991     }
3992 
3993     // Insn: VPMOVWB ymm1/m256 {k1}{z}, zmm2
3994     // -----
3995     // Insn: VPMOVWB m256, zmm2
3996 
3997     public final void evpmovwb(AMD64Address dst, Register src) {
3998         assert supports(CPUFeature.AVX512BW);
3999         assert inRC(XMM, src);
4000         // Code: EVEX.512.F3.0F38.W0 30 /r
4001         evexPrefix(src, Register.None, Register.None, dst, AVXSize.ZMM, P_F3, M_0F38, W0, Z0, B0);
4002         emitByte(0x30);
4003         emitEVEXOperandHelper(src, dst, 0, EVEXTuple.HVM.getDisp8ScalingFactor(AVXSize.ZMM));
4004     }
4005 
4006     // Insn: VPMOVWB m256, k1, zmm2
4007 
4008     public final void evpmovwb(AMD64Address dst, Register mask, Register src) {
4009         assert supports(CPUFeature.AVX512BW);
4010         assert inRC(MASK, mask) && inRC(XMM, src);
4011         // Code: EVEX.512.F3.0F38.W0 30 /r
4012         evexPrefix(src, mask, Register.None, dst, AVXSize.ZMM, P_F3, M_0F38, W0, Z0, B0);
4013         emitByte(0x30);
4014         emitEVEXOperandHelper(src, dst, 0, EVEXTuple.HVM.getDisp8ScalingFactor(AVXSize.ZMM));
4015     }
4016 
4017     // Insn: VPMOVZXBW zmm1 {k1}{z}, ymm2/m256
4018     // -----
4019     // Insn: VPMOVZXBW zmm1, k1, m256
4020 
4021     public final void evpmovzxbw(Register dst, Register mask, AMD64Address src) {
4022         assert supports(CPUFeature.AVX512BW);
4023         assert inRC(MASK, mask) && inRC(XMM, dst);
4024         // Code: EVEX.512.66.0F38.WIG 30 /r
4025         evexPrefix(dst, mask, Register.None, src, AVXSize.ZMM, P_66, M_0F38, WIG, Z0, B0);
4026         emitByte(0x30);
4027         emitEVEXOperandHelper(dst, src, 0, EVEXTuple.HVM.getDisp8ScalingFactor(AVXSize.ZMM));
4028     }
4029 
4030 }