/*
 * Copyright (c) 2009, 2018, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */


package org.graalvm.compiler.asm.amd64;

import static jdk.vm.ci.amd64.AMD64.CPU;
import static jdk.vm.ci.amd64.AMD64.MASK;
import static jdk.vm.ci.amd64.AMD64.XMM;
import static jdk.vm.ci.code.MemoryBarriers.STORE_LOAD;
import static org.graalvm.compiler.asm.amd64.AMD64AsmOptions.UseAddressNop;
import static org.graalvm.compiler.asm.amd64.AMD64AsmOptions.UseNormalNop;
import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.ADD;
import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.AND;
import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.CMP;
import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.OR;
import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.SBB;
import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.SUB;
import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.XOR;
import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64MOp.DEC;
import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64MOp.INC;
import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64MOp.NEG;
import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64MOp.NOT;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.EVEXPrefixConfig.B0;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.EVEXPrefixConfig.Z0;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.EVEXPrefixConfig.Z1;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.BYTE;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.DWORD;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.PD;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.PS;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.QWORD;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.SD;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.SS;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.WORD;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.L128;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.L256;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.LZ;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.M_0F;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.M_0F38;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.M_0F3A;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.P_;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.P_66;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.P_F2;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.P_F3;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.W0;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.W1;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.WIG;
import static org.graalvm.compiler.core.common.NumUtil.isByte;
import static org.graalvm.compiler.core.common.NumUtil.isInt;
import static org.graalvm.compiler.core.common.NumUtil.isShiftCount;
import static org.graalvm.compiler.core.common.NumUtil.isUByte;

import java.util.EnumSet;

import org.graalvm.compiler.asm.Label;
import org.graalvm.compiler.asm.amd64.AMD64Address.Scale;
import org.graalvm.compiler.asm.amd64.AVXKind.AVXSize;
import org.graalvm.compiler.core.common.NumUtil;
import org.graalvm.compiler.core.common.calc.Condition;
import org.graalvm.compiler.debug.GraalError;

import jdk.vm.ci.amd64.AMD64;
import jdk.vm.ci.amd64.AMD64.CPUFeature;
import jdk.vm.ci.code.Register;
import jdk.vm.ci.code.Register.RegisterCategory;
import jdk.vm.ci.code.TargetDescription;

/**
 * This class implements an assembler that can encode most X86 instructions.
 */
public class AMD64Assembler extends AMD64BaseAssembler {

    /**
     * Constructs an assembler for the AMD64 architecture.
     */
    public AMD64Assembler(TargetDescription target) {
        super(target);
    }
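    // A hedged usage sketch (not part of the API surface shown here): the assembler is
    // driven through instruction helpers such as movl, assumed to be defined further
    // down in this class, with 'target' supplied by the JVMCI backend.
    //
    //   AMD64Assembler asm = new AMD64Assembler(target);
    //   asm.movl(AMD64.rax, 42);          // emit "mov eax, 42"
    //   byte[] code = asm.close(true);    // finished machine code (inherited from Assembler)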

    /**
     * The x86 condition codes used for conditional jumps/moves.
     */
    public enum ConditionFlag {
        Zero(0x4, "|zero|"),
        NotZero(0x5, "|nzero|"),
        Equal(0x4, "="),
        NotEqual(0x5, "!="),
        Less(0xc, "<"),
        LessEqual(0xe, "<="),
        Greater(0xf, ">"),
        GreaterEqual(0xd, ">="),
        Below(0x2, "|<|"),
        BelowEqual(0x6, "|<=|"),
        Above(0x7, "|>|"),
        AboveEqual(0x3, "|>=|"),
        Overflow(0x0, "|of|"),
        NoOverflow(0x1, "|nof|"),
        CarrySet(0x2, "|carry|"),
        CarryClear(0x3, "|ncarry|"),
        Negative(0x8, "|neg|"),
        Positive(0x9, "|pos|"),
        Parity(0xa, "|par|"),
        NoParity(0xb, "|npar|");

        private final int value;
        private final String operator;

        ConditionFlag(int value, String operator) {
            this.value = value;
            this.operator = operator;
        }

        public ConditionFlag negate() {
            switch (this) {
                case Zero:
                    return NotZero;
                case NotZero:
                    return Zero;
                case Equal:
                    return NotEqual;
                case NotEqual:
                    return Equal;
                case Less:
                    return GreaterEqual;
                case LessEqual:
                    return Greater;
                case Greater:
                    return LessEqual;
                case GreaterEqual:
                    return Less;
                case Below:
                    return AboveEqual;
                case BelowEqual:
                    return Above;
                case Above:
                    return BelowEqual;
                case AboveEqual:
                    return Below;
                case Overflow:
                    return NoOverflow;
                case NoOverflow:
                    return Overflow;
                case CarrySet:
                    return CarryClear;
                case CarryClear:
                    return CarrySet;
                case Negative:
                    return Positive;
                case Positive:
                    return Negative;
                case Parity:
                    return NoParity;
                case NoParity:
                    return Parity;
            }
            throw new IllegalArgumentException();
        }

        public int getValue() {
            return value;
        }

        @Override
        public String toString() {
            return operator;
        }
    }
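    // For illustration: negate() maps each flag to its complementary x86 encoding.
    // As the value table above shows, complementary flags always differ exactly in
    // the lowest encoding bit:
    //   ConditionFlag cf = ConditionFlag.Less;                    // 0xc
    //   assert cf.negate() == ConditionFlag.GreaterEqual;         // 0xd
    //   assert (cf.getValue() ^ 1) == cf.negate().getValue();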

    /**
     * Operand size and register type constraints.
     */
    private enum OpAssertion {
        ByteAssertion(CPU, CPU, BYTE),
        ByteOrLargerAssertion(CPU, CPU, BYTE, WORD, DWORD, QWORD),
        WordOrLargerAssertion(CPU, CPU, WORD, DWORD, QWORD),
        DwordOrLargerAssertion(CPU, CPU, DWORD, QWORD),
        WordOrDwordAssertion(CPU, CPU, WORD, QWORD),
        QwordAssertion(CPU, CPU, QWORD),
        FloatAssertion(XMM, XMM, SS, SD, PS, PD),
        PackedFloatAssertion(XMM, XMM, PS, PD),
        SingleAssertion(XMM, XMM, SS),
        DoubleAssertion(XMM, XMM, SD),
        PackedDoubleAssertion(XMM, XMM, PD),
        IntToFloatAssertion(XMM, CPU, DWORD, QWORD),
        FloatToIntAssertion(CPU, XMM, DWORD, QWORD);

        private final RegisterCategory resultCategory;
        private final RegisterCategory inputCategory;
        private final OperandSize[] allowedSizes;

        OpAssertion(RegisterCategory resultCategory, RegisterCategory inputCategory, OperandSize... allowedSizes) {
            this.resultCategory = resultCategory;
            this.inputCategory = inputCategory;
            this.allowedSizes = allowedSizes;
        }

        protected boolean checkOperands(AMD64Op op, OperandSize size, Register resultReg, Register inputReg) {
            assert resultReg == null || resultCategory.equals(resultReg.getRegisterCategory()) : "invalid result register " + resultReg + " used in " + op;
            assert inputReg == null || inputCategory.equals(inputReg.getRegisterCategory()) : "invalid input register " + inputReg + " used in " + op;

            for (OperandSize s : allowedSizes) {
                if (size == s) {
                    return true;
                }
            }

            assert false : "invalid operand size " + size + " used in " + op;
            return false;
        }

    }

    protected static final int P_0F = 0x0F;
    protected static final int P_0F38 = 0x380F;
    protected static final int P_0F3A = 0x3A0F;
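    // The two-byte escape constants are stored byte-swapped (e.g. 0x380F for the
    // 0F 38 escape) because emitOpcode() writes them with emitShort(), which emits
    // little-endian, so the bytes reach the code stream in the correct 0F 38 order.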

    /**
     * Base class for AMD64 opcodes.
     */
    public static class AMD64Op {

        private final String opcode;

        protected final int prefix1;
        protected final int prefix2;
        protected final int op;

        private final boolean dstIsByte;
        private final boolean srcIsByte;

        private final OpAssertion assertion;
        private final CPUFeature feature;

        protected AMD64Op(String opcode, int prefix1, int prefix2, int op, OpAssertion assertion, CPUFeature feature) {
            this(opcode, prefix1, prefix2, op, assertion == OpAssertion.ByteAssertion, assertion == OpAssertion.ByteAssertion, assertion, feature);
        }

        protected AMD64Op(String opcode, int prefix1, int prefix2, int op, boolean dstIsByte, boolean srcIsByte, OpAssertion assertion, CPUFeature feature) {
            this.opcode = opcode;
            this.prefix1 = prefix1;
            this.prefix2 = prefix2;
            this.op = op;

            this.dstIsByte = dstIsByte;
            this.srcIsByte = srcIsByte;

            this.assertion = assertion;
            this.feature = feature;
        }

        protected final void emitOpcode(AMD64Assembler asm, OperandSize size, int rxb, int dstEnc, int srcEnc) {
            if (prefix1 != 0) {
                asm.emitByte(prefix1);
            }
            if (size.getSizePrefix() != 0) {
                asm.emitByte(size.getSizePrefix());
            }
            int rexPrefix = 0x40 | rxb;
            if (size == QWORD) {
                rexPrefix |= 0x08;
            }
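            // Byte operations on encodings 4..7 also force a REX prefix: without it,
            // the ModRM encoding would select AH/CH/DH/BH instead of SPL/BPL/SIL/DIL.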
            if (rexPrefix != 0x40 || (dstIsByte && dstEnc >= 4) || (srcIsByte && srcEnc >= 4)) {
                asm.emitByte(rexPrefix);
            }
            if (prefix2 > 0xFF) {
                asm.emitShort(prefix2);
            } else if (prefix2 > 0) {
                asm.emitByte(prefix2);
            }
            asm.emitByte(op);
        }
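        // Worked example (illustrative): a QWORD operation whose ModRM r/m operand is
        // r8 (encoding 8) has REX.B set in rxb, so the emitted prefix is
        // 0x40 | 0x08 (REX.W) | 0x01 (REX.B) = 0x49, followed by the opcode byte(s).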

        protected final boolean verify(AMD64Assembler asm, OperandSize size, Register resultReg, Register inputReg) {
            assert feature == null || asm.supports(feature) : String.format("unsupported feature %s required for %s", feature, opcode);
            assert assertion.checkOperands(this, size, resultReg, inputReg);
            return true;
        }

        public OperandSize[] getAllowedSizes() {
            return assertion.allowedSizes;
        }

        protected final boolean isSSEInstruction() {
            if (feature == null) {
                return false;
            }
            switch (feature) {
                case SSE:
                case SSE2:
                case SSE3:
                case SSSE3:
                case SSE4A:
                case SSE4_1:
                case SSE4_2:
                    return true;
                default:
                    return false;
            }
        }

        public final OpAssertion getAssertion() {
            return assertion;
        }

        @Override
        public String toString() {
            return opcode;
        }
    }

    /**
     * Base class for AMD64 opcodes with immediate operands.
     */
    public static class AMD64ImmOp extends AMD64Op {

        private final boolean immIsByte;

        protected AMD64ImmOp(String opcode, boolean immIsByte, int prefix, int op, OpAssertion assertion) {
            this(opcode, immIsByte, prefix, op, assertion, null);
        }

        protected AMD64ImmOp(String opcode, boolean immIsByte, int prefix, int op, OpAssertion assertion, CPUFeature feature) {
            super(opcode, 0, prefix, op, assertion, feature);
            this.immIsByte = immIsByte;
        }

        protected final void emitImmediate(AMD64Assembler asm, OperandSize size, int imm) {
            if (immIsByte) {
                assert imm == (byte) imm;
                asm.emitByte(imm);
            } else {
                size.emitImmediate(asm, imm);
            }
        }

        protected final int immediateSize(OperandSize size) {
            if (immIsByte) {
                return 1;
            } else {
                return size.getBytes();
            }
        }
    }

    /**
     * Opcode with operand order of either RM or MR for 2 address forms.
     */
    public abstract static class AMD64RROp extends AMD64Op {

        protected AMD64RROp(String opcode, int prefix1, int prefix2, int op, OpAssertion assertion, CPUFeature feature) {
            super(opcode, prefix1, prefix2, op, assertion, feature);
        }

        protected AMD64RROp(String opcode, int prefix1, int prefix2, int op, boolean dstIsByte, boolean srcIsByte, OpAssertion assertion, CPUFeature feature) {
            super(opcode, prefix1, prefix2, op, dstIsByte, srcIsByte, assertion, feature);
        }

        public abstract void emit(AMD64Assembler asm, OperandSize size, Register dst, Register src);
    }

    /**
     * Opcode with operand order of RM.
     */
    public static class AMD64RMOp extends AMD64RROp {
        // @formatter:off
        public static final AMD64RMOp IMUL   = new AMD64RMOp("IMUL",         P_0F, 0xAF, OpAssertion.ByteOrLargerAssertion);
        public static final AMD64RMOp BSF    = new AMD64RMOp("BSF",          P_0F, 0xBC);
        public static final AMD64RMOp BSR    = new AMD64RMOp("BSR",          P_0F, 0xBD);
        // POPCNT, TZCNT, and LZCNT support word operation. However, the legacy size prefix should
        // be emitted before the mandatory prefix 0xF3. Since we are not emitting bit count for
        // 16-bit operands, here we simply use DwordOrLargerAssertion.
        public static final AMD64RMOp POPCNT = new AMD64RMOp("POPCNT", 0xF3, P_0F, 0xB8, OpAssertion.DwordOrLargerAssertion, CPUFeature.POPCNT);
        public static final AMD64RMOp TZCNT  = new AMD64RMOp("TZCNT",  0xF3, P_0F, 0xBC, OpAssertion.DwordOrLargerAssertion, CPUFeature.BMI1);
        public static final AMD64RMOp LZCNT  = new AMD64RMOp("LZCNT",  0xF3, P_0F, 0xBD, OpAssertion.DwordOrLargerAssertion, CPUFeature.LZCNT);
        public static final AMD64RMOp MOVZXB = new AMD64RMOp("MOVZXB",       P_0F, 0xB6, false, true, OpAssertion.WordOrLargerAssertion);
        public static final AMD64RMOp MOVZX  = new AMD64RMOp("MOVZX",        P_0F, 0xB7, OpAssertion.DwordOrLargerAssertion);
        public static final AMD64RMOp MOVSXB = new AMD64RMOp("MOVSXB",       P_0F, 0xBE, false, true, OpAssertion.WordOrLargerAssertion);
        public static final AMD64RMOp MOVSX  = new AMD64RMOp("MOVSX",        P_0F, 0xBF, OpAssertion.DwordOrLargerAssertion);
        public static final AMD64RMOp MOVSXD = new AMD64RMOp("MOVSXD",             0x63, OpAssertion.QwordAssertion);
        public static final AMD64RMOp MOVB   = new AMD64RMOp("MOVB",               0x8A, OpAssertion.ByteAssertion);
        public static final AMD64RMOp MOV    = new AMD64RMOp("MOV",                0x8B);
        public static final AMD64RMOp CMP    = new AMD64RMOp("CMP",                0x3B);

        // MOVD/MOVQ and MOVSS/MOVSD are the same opcode, just with different operand size prefix
        public static final AMD64RMOp MOVD   = new AMD64RMOp("MOVD",   0x66, P_0F, 0x6E, OpAssertion.IntToFloatAssertion, CPUFeature.SSE2);
        public static final AMD64RMOp MOVQ   = new AMD64RMOp("MOVQ",   0x66, P_0F, 0x6E, OpAssertion.IntToFloatAssertion, CPUFeature.SSE2);
        public static final AMD64RMOp MOVSS  = new AMD64RMOp("MOVSS",        P_0F, 0x10, OpAssertion.FloatAssertion, CPUFeature.SSE);
        public static final AMD64RMOp MOVSD  = new AMD64RMOp("MOVSD",        P_0F, 0x10, OpAssertion.FloatAssertion, CPUFeature.SSE);
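        // (Illustration: for the shared 0x6E opcode, emitting with size == QWORD sets the
        // REX/VEX W bit, which is the only difference between MOVD and MOVQ; likewise
        // MOVSS and MOVSD differ only in the prefix implied by the SS/SD operand size.)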

        // TEST is documented as MR operation, but it's symmetric, and using it as RM operation is more convenient.
        public static final AMD64RMOp TESTB  = new AMD64RMOp("TEST",               0x84, OpAssertion.ByteAssertion);
        public static final AMD64RMOp TEST   = new AMD64RMOp("TEST",               0x85);
        // @formatter:on

        protected AMD64RMOp(String opcode, int op) {
            this(opcode, 0, op);
        }

        protected AMD64RMOp(String opcode, int op, OpAssertion assertion) {
            this(opcode, 0, op, assertion);
        }

        protected AMD64RMOp(String opcode, int prefix, int op) {
            this(opcode, 0, prefix, op, null);
        }

        protected AMD64RMOp(String opcode, int prefix, int op, OpAssertion assertion) {
            this(opcode, 0, prefix, op, assertion, null);
        }

        protected AMD64RMOp(String opcode, int prefix, int op, OpAssertion assertion, CPUFeature feature) {
            this(opcode, 0, prefix, op, assertion, feature);
        }

        protected AMD64RMOp(String opcode, int prefix, int op, boolean dstIsByte, boolean srcIsByte, OpAssertion assertion) {
            super(opcode, 0, prefix, op, dstIsByte, srcIsByte, assertion, null);
        }

        protected AMD64RMOp(String opcode, int prefix1, int prefix2, int op, CPUFeature feature) {
            this(opcode, prefix1, prefix2, op, OpAssertion.WordOrLargerAssertion, feature);
        }

        protected AMD64RMOp(String opcode, int prefix1, int prefix2, int op, OpAssertion assertion, CPUFeature feature) {
            super(opcode, prefix1, prefix2, op, assertion, feature);
        }

        @Override
        public final void emit(AMD64Assembler asm, OperandSize size, Register dst, Register src) {
            assert verify(asm, size, dst, src);
            if (isSSEInstruction()) {
                Register nds = Register.None;
                switch (op) {
                    case 0x10:
                    case 0x51:
                        if ((size == SS) || (size == SD)) {
                            nds = dst;
                        }
                        break;
                    case 0x2A:
                    case 0x54:
                    case 0x55:
                    case 0x56:
                    case 0x57:
                    case 0x58:
                    case 0x59:
                    case 0x5A:
                    case 0x5C:
                    case 0x5D:
                    case 0x5E:
                    case 0x5F:
                        nds = dst;
                        break;
                    default:
                        break;
                }
                asm.simdPrefix(dst, nds, src, size, prefix1, prefix2, size == QWORD);
                asm.emitByte(op);
                asm.emitModRM(dst, src);
            } else {
                emitOpcode(asm, size, getRXB(dst, src), dst.encoding, src.encoding);
                asm.emitModRM(dst, src);
            }
        }

        public final void emit(AMD64Assembler asm, OperandSize size, Register dst, AMD64Address src) {
            assert verify(asm, size, dst, null);
            if (isSSEInstruction()) {
                Register nds = Register.None;
                switch (op) {
                    case 0x51:
                        if ((size == SS) || (size == SD)) {
                            nds = dst;
                        }
                        break;
                    case 0x2A:
                    case 0x54:
                    case 0x55:
                    case 0x56:
                    case 0x57:
                    case 0x58:
                    case 0x59:
                    case 0x5A:
                    case 0x5C:
                    case 0x5D:
                    case 0x5E:
                    case 0x5F:
                        nds = dst;
                        break;
                    default:
                        break;
                }
                asm.simdPrefix(dst, nds, src, size, prefix1, prefix2, size == QWORD);
                asm.emitByte(op);
                asm.emitOperandHelper(dst, src, 0);
            } else {
                emitOpcode(asm, size, getRXB(dst, src), dst.encoding, 0);
                asm.emitOperandHelper(dst, src, 0);
            }
        }
    }
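    // Illustrative encodings (registers chosen arbitrarily):
    //   AMD64RMOp.MOV.emit(asm, DWORD, rax, rcx)  ->  8B C1     ("mov eax, ecx")
    //   AMD64RMOp.MOV.emit(asm, QWORD, rax, rcx)  ->  48 8B C1  ("mov rax, rcx")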

    /**
     * Opcode with operand order of MR.
     */
    public static class AMD64MROp extends AMD64RROp {
        // @formatter:off
        public static final AMD64MROp MOVB   = new AMD64MROp("MOVB",               0x88, OpAssertion.ByteAssertion);
        public static final AMD64MROp MOV    = new AMD64MROp("MOV",                0x89);

        // MOVD and MOVQ are the same opcode, just with different operand size prefix
        // Note that as MR opcodes, they have reverse operand order, so IntToFloatAssertion must still be used.
        public static final AMD64MROp MOVD   = new AMD64MROp("MOVD",   0x66, P_0F, 0x7E, OpAssertion.IntToFloatAssertion, CPUFeature.SSE2);
        public static final AMD64MROp MOVQ   = new AMD64MROp("MOVQ",   0x66, P_0F, 0x7E, OpAssertion.IntToFloatAssertion, CPUFeature.SSE2);

        // MOVSS and MOVSD are the same opcode, just with different operand size prefix
        public static final AMD64MROp MOVSS  = new AMD64MROp("MOVSS",        P_0F, 0x11, OpAssertion.FloatAssertion, CPUFeature.SSE);
        public static final AMD64MROp MOVSD  = new AMD64MROp("MOVSD",        P_0F, 0x11, OpAssertion.FloatAssertion, CPUFeature.SSE);
        // @formatter:on

        protected AMD64MROp(String opcode, int op) {
            this(opcode, 0, op);
        }

        protected AMD64MROp(String opcode, int op, OpAssertion assertion) {
            this(opcode, 0, op, assertion);
        }

        protected AMD64MROp(String opcode, int prefix, int op) {
            this(opcode, prefix, op, OpAssertion.WordOrLargerAssertion);
        }

        protected AMD64MROp(String opcode, int prefix, int op, OpAssertion assertion) {
            this(opcode, prefix, op, assertion, null);
        }

        protected AMD64MROp(String opcode, int prefix, int op, OpAssertion assertion, CPUFeature feature) {
            this(opcode, 0, prefix, op, assertion, feature);
        }

        protected AMD64MROp(String opcode, int prefix1, int prefix2, int op, OpAssertion assertion, CPUFeature feature) {
            super(opcode, prefix1, prefix2, op, assertion, feature);
        }

        @Override
        public final void emit(AMD64Assembler asm, OperandSize size, Register dst, Register src) {
            assert verify(asm, size, src, dst);
            if (isSSEInstruction()) {
                Register nds = Register.None;
                switch (op) {
                    case 0x11:
                        if ((size == SS) || (size == SD)) {
                            nds = src;
                        }
                        break;
                    default:
                        break;
                }
                asm.simdPrefix(src, nds, dst, size, prefix1, prefix2, size == QWORD);
                asm.emitByte(op);
                asm.emitModRM(src, dst);
            } else {
                emitOpcode(asm, size, getRXB(src, dst), src.encoding, dst.encoding);
                asm.emitModRM(src, dst);
            }
        }

        public final void emit(AMD64Assembler asm, OperandSize size, AMD64Address dst, Register src) {
            assert verify(asm, size, src, null);
            if (isSSEInstruction()) {
                asm.simdPrefix(src, Register.None, dst, size, prefix1, prefix2, size == QWORD);
                asm.emitByte(op);
            } else {
                emitOpcode(asm, size, getRXB(src, dst), src.encoding, 0);
            }
            asm.emitOperandHelper(src, dst, 0);
        }
    }

    /**
     * Opcodes with operand order of M.
     */
    public static class AMD64MOp extends AMD64Op {
        // @formatter:off
        public static final AMD64MOp NOT  = new AMD64MOp("NOT",  0xF7, 2);
        public static final AMD64MOp NEG  = new AMD64MOp("NEG",  0xF7, 3);
        public static final AMD64MOp MUL  = new AMD64MOp("MUL",  0xF7, 4);
        public static final AMD64MOp IMUL = new AMD64MOp("IMUL", 0xF7, 5);
        public static final AMD64MOp DIV  = new AMD64MOp("DIV",  0xF7, 6);
        public static final AMD64MOp IDIV = new AMD64MOp("IDIV", 0xF7, 7);
        public static final AMD64MOp INC  = new AMD64MOp("INC",  0xFF, 0);
        public static final AMD64MOp DEC  = new AMD64MOp("DEC",  0xFF, 1);
        public static final AMD64MOp PUSH = new AMD64MOp("PUSH", 0xFF, 6);
        public static final AMD64MOp POP  = new AMD64MOp("POP",  0x8F, 0, OpAssertion.WordOrDwordAssertion);
        // @formatter:on

        private final int ext;

        protected AMD64MOp(String opcode, int op, int ext) {
            this(opcode, 0, op, ext);
        }

        protected AMD64MOp(String opcode, int prefix, int op, int ext) {
            this(opcode, prefix, op, ext, OpAssertion.WordOrLargerAssertion);
        }

        protected AMD64MOp(String opcode, int op, int ext, OpAssertion assertion) {
            this(opcode, 0, op, ext, assertion);
        }

        protected AMD64MOp(String opcode, int prefix, int op, int ext, OpAssertion assertion) {
            super(opcode, 0, prefix, op, assertion, null);
            this.ext = ext;
        }

        public final void emit(AMD64Assembler asm, OperandSize size, Register dst) {
            assert verify(asm, size, dst, null);
            emitOpcode(asm, size, getRXB(null, dst), 0, dst.encoding);
            asm.emitModRM(ext, dst);
        }

        public final void emit(AMD64Assembler asm, OperandSize size, AMD64Address dst) {
            assert verify(asm, size, null, null);
            emitOpcode(asm, size, getRXB(null, dst), 0, 0);
            asm.emitOperandHelper(ext, dst, 0);
        }
    }
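    // Illustrative encoding: AMD64MOp.NEG.emit(asm, DWORD, rax) -> F7 D8 ("neg eax");
    // the extension field (here 3) occupies the reg bits of the ModRM byte.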

    /**
     * Opcodes with operand order of MI.
     */
    public static class AMD64MIOp extends AMD64ImmOp {
        // @formatter:off
        public static final AMD64MIOp MOVB = new AMD64MIOp("MOVB", true,  0xC6, 0, OpAssertion.ByteAssertion);
        public static final AMD64MIOp MOV  = new AMD64MIOp("MOV",  false, 0xC7, 0);
        public static final AMD64MIOp TEST = new AMD64MIOp("TEST", false, 0xF7, 0);
        // @formatter:on

        private final int ext;

        protected AMD64MIOp(String opcode, boolean immIsByte, int op, int ext) {
            this(opcode, immIsByte, op, ext, OpAssertion.WordOrLargerAssertion);
        }

        protected AMD64MIOp(String opcode, boolean immIsByte, int op, int ext, OpAssertion assertion) {
            this(opcode, immIsByte, 0, op, ext, assertion);
        }

        protected AMD64MIOp(String opcode, boolean immIsByte, int prefix, int op, int ext, OpAssertion assertion) {
            super(opcode, immIsByte, prefix, op, assertion);
            this.ext = ext;
        }

        public final void emit(AMD64Assembler asm, OperandSize size, Register dst, int imm) {
            emit(asm, size, dst, imm, false);
        }

        public final void emit(AMD64Assembler asm, OperandSize size, Register dst, int imm, boolean annotateImm) {
            assert verify(asm, size, dst, null);
            int insnPos = asm.position();
            emitOpcode(asm, size, getRXB(null, dst), 0, dst.encoding);
            asm.emitModRM(ext, dst);
            int immPos = asm.position();
            emitImmediate(asm, size, imm);
            int nextInsnPos = asm.position();
            if (annotateImm && asm.codePatchingAnnotationConsumer != null) {
                asm.codePatchingAnnotationConsumer.accept(new ImmediateOperandAnnotation(insnPos, immPos, nextInsnPos - immPos, nextInsnPos));
            }
        }

        public final void emit(AMD64Assembler asm, OperandSize size, AMD64Address dst, int imm) {
            emit(asm, size, dst, imm, false);
        }

        public final void emit(AMD64Assembler asm, OperandSize size, AMD64Address dst, int imm, boolean annotateImm) {
            assert verify(asm, size, null, null);
            int insnPos = asm.position();
            emitOpcode(asm, size, getRXB(null, dst), 0, 0);
            asm.emitOperandHelper(ext, dst, immediateSize(size));
            int immPos = asm.position();
            emitImmediate(asm, size, imm);
            int nextInsnPos = asm.position();
            if (annotateImm && asm.codePatchingAnnotationConsumer != null) {
                asm.codePatchingAnnotationConsumer.accept(new ImmediateOperandAnnotation(insnPos, immPos, nextInsnPos - immPos, nextInsnPos));
            }
        }
    }
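    // Illustrative encoding: AMD64MIOp.MOV.emit(asm, DWORD, rax, 0x1234) ->
    // C7 C0 34 12 00 00 ("mov eax, 0x1234"), with the immediate in little-endian order.
    // For memory destinations, immediateSize(size) is passed to emitOperandHelper so a
    // RIP-relative displacement can account for the immediate bytes that still follow.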

    /**
     * Opcodes with operand order of RMI.
     *
     * We provide only one form of ROUND: the operation is always treated as having a
     * single-variant input, so extending it to 3-address forms would be redundant.
     */
    public static class AMD64RMIOp extends AMD64ImmOp {
        // @formatter:off
        public static final AMD64RMIOp IMUL    = new AMD64RMIOp("IMUL", false, 0x69);
        public static final AMD64RMIOp IMUL_SX = new AMD64RMIOp("IMUL", true,  0x6B);
        public static final AMD64RMIOp ROUNDSS = new AMD64RMIOp("ROUNDSS", true, P_0F3A, 0x0A, OpAssertion.PackedDoubleAssertion, CPUFeature.SSE4_1);
        public static final AMD64RMIOp ROUNDSD = new AMD64RMIOp("ROUNDSD", true, P_0F3A, 0x0B, OpAssertion.PackedDoubleAssertion, CPUFeature.SSE4_1);
        // @formatter:on

        protected AMD64RMIOp(String opcode, boolean immIsByte, int op) {
            this(opcode, immIsByte, 0, op, OpAssertion.WordOrLargerAssertion, null);
        }

        protected AMD64RMIOp(String opcode, boolean immIsByte, int prefix, int op, OpAssertion assertion, CPUFeature feature) {
            super(opcode, immIsByte, prefix, op, assertion, feature);
        }

        public final void emit(AMD64Assembler asm, OperandSize size, Register dst, Register src, int imm) {
            assert verify(asm, size, dst, src);
            if (isSSEInstruction()) {
                Register nds = Register.None;
                switch (op) {
                    case 0x0A:
                    case 0x0B:
                        nds = dst;
                        break;
                    default:
                        break;
                }
                asm.simdPrefix(dst, nds, src, size, prefix1, prefix2, false);
                asm.emitByte(op);
                asm.emitModRM(dst, src);
            } else {
                emitOpcode(asm, size, getRXB(dst, src), dst.encoding, src.encoding);
                asm.emitModRM(dst, src);
            }
            emitImmediate(asm, size, imm);
        }

        public final void emit(AMD64Assembler asm, OperandSize size, Register dst, AMD64Address src, int imm) {
            assert verify(asm, size, dst, null);
            if (isSSEInstruction()) {
                Register nds = Register.None;
                switch (op) {
                    case 0x0A:
                    case 0x0B:
                        nds = dst;
                        break;
                    default:
                        break;
                }
                asm.simdPrefix(dst, nds, src, size, prefix1, prefix2, false);
                asm.emitByte(op);
            } else {
                emitOpcode(asm, size, getRXB(dst, src), dst.encoding, 0);
            }
            asm.emitOperandHelper(dst, src, immediateSize(size));
            emitImmediate(asm, size, imm);
        }
    }
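    // Illustrative encoding: AMD64RMIOp.IMUL_SX.emit(asm, DWORD, rax, rcx, 8) ->
    // 6B C1 08 ("imul eax, ecx, 8"), the sign-extended byte-immediate form of IMUL.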

    public static class SSEOp extends AMD64RMOp {
        // @formatter:off
        public static final SSEOp CVTSI2SS  = new SSEOp("CVTSI2SS",  0xF3, P_0F, 0x2A, OpAssertion.IntToFloatAssertion);
        public static final SSEOp CVTSI2SD  = new SSEOp("CVTSI2SD",  0xF2, P_0F, 0x2A, OpAssertion.IntToFloatAssertion);
        public static final SSEOp CVTTSS2SI = new SSEOp("CVTTSS2SI", 0xF3, P_0F, 0x2C, OpAssertion.FloatToIntAssertion);
        public static final SSEOp CVTTSD2SI = new SSEOp("CVTTSD2SI", 0xF2, P_0F, 0x2C, OpAssertion.FloatToIntAssertion);
        public static final SSEOp UCOMIS    = new SSEOp("UCOMIS",          P_0F, 0x2E, OpAssertion.PackedFloatAssertion);
        public static final SSEOp SQRT      = new SSEOp("SQRT",            P_0F, 0x51);
        public static final SSEOp AND       = new SSEOp("AND",             P_0F, 0x54, OpAssertion.PackedFloatAssertion);
        public static final SSEOp ANDN      = new SSEOp("ANDN",            P_0F, 0x55, OpAssertion.PackedFloatAssertion);
        public static final SSEOp OR        = new SSEOp("OR",              P_0F, 0x56, OpAssertion.PackedFloatAssertion);
        public static final SSEOp XOR       = new SSEOp("XOR",             P_0F, 0x57, OpAssertion.PackedFloatAssertion);
        public static final SSEOp ADD       = new SSEOp("ADD",             P_0F, 0x58);
        public static final SSEOp MUL       = new SSEOp("MUL",             P_0F, 0x59);
        public static final SSEOp CVTSS2SD  = new SSEOp("CVTSS2SD",        P_0F, 0x5A, OpAssertion.SingleAssertion);
        public static final SSEOp CVTSD2SS  = new SSEOp("CVTSD2SS",        P_0F, 0x5A, OpAssertion.DoubleAssertion);
        public static final SSEOp SUB       = new SSEOp("SUB",             P_0F, 0x5C);
        public static final SSEOp MIN       = new SSEOp("MIN",             P_0F, 0x5D);
        public static final SSEOp DIV       = new SSEOp("DIV",             P_0F, 0x5E);
        public static final SSEOp MAX       = new SSEOp("MAX",             P_0F, 0x5F);
        // @formatter:on

        protected SSEOp(String opcode, int prefix, int op) {
            this(opcode, prefix, op, OpAssertion.FloatAssertion);
        }

        protected SSEOp(String opcode, int prefix, int op, OpAssertion assertion) {
            this(opcode, 0, prefix, op, assertion);
        }

        protected SSEOp(String opcode, int mandatoryPrefix, int prefix, int op, OpAssertion assertion) {
            super(opcode, mandatoryPrefix, prefix, op, assertion, CPUFeature.SSE2);
        }
    }
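    // Illustrative encoding (mechanism hedged, not verified against the prefix helper):
    // SSEOp.SQRT.emit(asm, SD, xmm0, xmm1) encodes sqrtsd (F2 0F 51 C1); since SQRT
    // declares no mandatory prefix of its own, the F2 is implied by the SD operand size.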

    /**
     * Arithmetic operation with operand order of RM, MR or MI.
     */
    public static final class AMD64BinaryArithmetic {
        // @formatter:off
        public static final AMD64BinaryArithmetic ADD = new AMD64BinaryArithmetic("ADD", 0);
        public static final AMD64BinaryArithmetic OR  = new AMD64BinaryArithmetic("OR",  1);
        public static final AMD64BinaryArithmetic ADC = new AMD64BinaryArithmetic("ADC", 2);
        public static final AMD64BinaryArithmetic SBB = new AMD64BinaryArithmetic("SBB", 3);
        public static final AMD64BinaryArithmetic AND = new AMD64BinaryArithmetic("AND", 4);
        public static final AMD64BinaryArithmetic SUB = new AMD64BinaryArithmetic("SUB", 5);
        public static final AMD64BinaryArithmetic XOR = new AMD64BinaryArithmetic("XOR", 6);
        public static final AMD64BinaryArithmetic CMP = new AMD64BinaryArithmetic("CMP", 7);
        // @formatter:on

        private final AMD64MIOp byteImmOp;
        private final AMD64MROp byteMrOp;
        private final AMD64RMOp byteRmOp;

        private final AMD64MIOp immOp;
        private final AMD64MIOp immSxOp;
        private final AMD64MROp mrOp;
        private final AMD64RMOp rmOp;

        private AMD64BinaryArithmetic(String opcode, int code) {
            int baseOp = code << 3;

            byteImmOp = new AMD64MIOp(opcode, true, 0, 0x80, code, OpAssertion.ByteAssertion);
            byteMrOp = new AMD64MROp(opcode, 0, baseOp, OpAssertion.ByteAssertion);
            byteRmOp = new AMD64RMOp(opcode, 0, baseOp | 0x02, OpAssertion.ByteAssertion);

            immOp = new AMD64MIOp(opcode, false, 0, 0x81, code, OpAssertion.WordOrLargerAssertion);
            immSxOp = new AMD64MIOp(opcode, true, 0, 0x83, code, OpAssertion.WordOrLargerAssertion);
            mrOp = new AMD64MROp(opcode, 0, baseOp | 0x01, OpAssertion.WordOrLargerAssertion);
            rmOp = new AMD64RMOp(opcode, 0, baseOp | 0x03, OpAssertion.WordOrLargerAssertion);
        }

        public AMD64MIOp getMIOpcode(OperandSize size, boolean sx) {
            if (size == BYTE) {
                return byteImmOp;
            } else if (sx) {
                return immSxOp;
            } else {
                return immOp;
            }
        }

        public AMD64MROp getMROpcode(OperandSize size) {
            if (size == BYTE) {
                return byteMrOp;
            } else {
                return mrOp;
            }
        }

        public AMD64RMOp getRMOpcode(OperandSize size) {
            if (size == BYTE) {
                return byteRmOp;
            } else {
                return rmOp;
            }
        }
    }
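    // Illustration of the encoding scheme: baseOp = code << 3 selects the classic ALU
    // opcode block, e.g. ADD (code 0) -> 00/01/02/03 and CMP (code 7) -> 38/39/3A/3B,
    // which is why AMD64RMOp.CMP above is defined with opcode 0x3B.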

    /**
     * Shift operation with operand order of M1, MC or MI.
     */
    public static final class AMD64Shift {
        // @formatter:off
        public static final AMD64Shift ROL = new AMD64Shift("ROL", 0);
        public static final AMD64Shift ROR = new AMD64Shift("ROR", 1);
        public static final AMD64Shift RCL = new AMD64Shift("RCL", 2);
        public static final AMD64Shift RCR = new AMD64Shift("RCR", 3);
        public static final AMD64Shift SHL = new AMD64Shift("SHL", 4);
        public static final AMD64Shift SHR = new AMD64Shift("SHR", 5);
        public static final AMD64Shift SAR = new AMD64Shift("SAR", 7);
        // @formatter:on

        public final AMD64MOp m1Op;
        public final AMD64MOp mcOp;
        public final AMD64MIOp miOp;

        private AMD64Shift(String opcode, int code) {
            m1Op = new AMD64MOp(opcode, 0, 0xD1, code, OpAssertion.WordOrLargerAssertion);
            mcOp = new AMD64MOp(opcode, 0, 0xD3, code, OpAssertion.WordOrLargerAssertion);
            miOp = new AMD64MIOp(opcode, true, 0, 0xC1, code, OpAssertion.WordOrLargerAssertion);
        }
    }
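    // Illustrative encoding: AMD64Shift.SHL.miOp.emit(asm, DWORD, rax, 3) ->
    // C1 E0 03 ("shl eax, 3"); m1Op (0xD1) shifts by one and mcOp (0xD3) shifts by CL.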

    private enum VEXOpAssertion {
        AVX1(CPUFeature.AVX, CPUFeature.AVX),
        AVX1_2(CPUFeature.AVX, CPUFeature.AVX2),
        AVX2(CPUFeature.AVX2, CPUFeature.AVX2),
        AVX1_128ONLY(CPUFeature.AVX, null),
        AVX1_256ONLY(null, CPUFeature.AVX),
        AVX2_256ONLY(null, CPUFeature.AVX2),
        XMM_CPU(CPUFeature.AVX, null, XMM, null, CPU, null),
        XMM_XMM_CPU(CPUFeature.AVX, null, XMM, XMM, CPU, null),
        CPU_XMM(CPUFeature.AVX, null, CPU, null, XMM, null),
        AVX1_2_CPU_XMM(CPUFeature.AVX, CPUFeature.AVX2, CPU, null, XMM, null),
        BMI1(CPUFeature.BMI1, null, CPU, CPU, CPU, null),
        BMI2(CPUFeature.BMI2, null, CPU, CPU, CPU, null);

        private final CPUFeature l128feature;
        private final CPUFeature l256feature;

        private final RegisterCategory rCategory;
        private final RegisterCategory vCategory;
        private final RegisterCategory mCategory;
        private final RegisterCategory imm8Category;

        VEXOpAssertion(CPUFeature l128feature, CPUFeature l256feature) {
            this(l128feature, l256feature, XMM, XMM, XMM, XMM);
        }

        VEXOpAssertion(CPUFeature l128feature, CPUFeature l256feature, RegisterCategory rCategory, RegisterCategory vCategory, RegisterCategory mCategory, RegisterCategory imm8Category) {
            this.l128feature = l128feature;
            this.l256feature = l256feature;
            this.rCategory = rCategory;
            this.vCategory = vCategory;
            this.mCategory = mCategory;
            this.imm8Category = imm8Category;
        }

        public boolean check(AMD64 arch, AVXSize size, Register r, Register v, Register m) {
            return check(arch, getLFlag(size), r, v, m, null);
        }

        public boolean check(AMD64 arch, AVXSize size, Register r, Register v, Register m, Register imm8) {
            return check(arch, getLFlag(size), r, v, m, imm8);
        }

        public boolean check(AMD64 arch, int l, Register r, Register v, Register m, Register imm8) {
            switch (l) {
                case L128:
                    assert l128feature != null && arch.getFeatures().contains(l128feature) : "emitting illegal 128 bit instruction";
                    break;
                case L256:
                    assert l256feature != null && arch.getFeatures().contains(l256feature) : "emitting illegal 256 bit instruction";
                    break;
            }
            if (r != null) {
                assert r.getRegisterCategory().equals(rCategory);
            }
            if (v != null) {
                assert v.getRegisterCategory().equals(vCategory);
            }
            if (m != null) {
                assert m.getRegisterCategory().equals(mCategory);
            }
            if (imm8 != null) {
                assert imm8.getRegisterCategory().equals(imm8Category);
            }
            return true;
        }

        public boolean supports(EnumSet<CPUFeature> features, AVXSize avxSize) {
            switch (avxSize) {
                case XMM:
                    return l128feature != null && features.contains(l128feature);
                case YMM:
                    return l256feature != null && features.contains(l256feature);
                default:
                    throw GraalError.shouldNotReachHere();
            }
        }
    }

    /**
     * Base class for VEX-encoded instructions.
     */
    public static class VexOp {
        protected final int pp;
        protected final int mmmmm;
        protected final int w;
        protected final int op;

        private final String opcode;
        protected final VEXOpAssertion assertion;

        protected VexOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
            this.pp = pp;
            this.mmmmm = mmmmm;
            this.w = w;
            this.op = op;
            this.opcode = opcode;
            this.assertion = assertion;
        }

        public final boolean isSupported(AMD64Assembler vasm, AVXSize size) {
            return assertion.supports(((AMD64) vasm.target.arch).getFeatures(), size);
        }

        @Override
        public String toString() {
            return opcode;
        }
    }

    /**
     * VEX-encoded instructions with an operand order of RM, but the M operand must be a register.
     */
    public static class VexRROp extends VexOp {
        // @formatter:off
        public static final VexRROp VMASKMOVDQU = new VexRROp("VMASKMOVDQU", P_66, M_0F, WIG, 0xF7, VEXOpAssertion.AVX1_128ONLY);
        // @formatter:on

        protected VexRROp(String opcode, int pp, int mmmmm, int w, int op) {
            this(opcode, pp, mmmmm, w, op, VEXOpAssertion.AVX1);
        }

        protected VexRROp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
            super(opcode, pp, mmmmm, w, op, assertion);
        }

        public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src) {
            assert assertion.check((AMD64) asm.target.arch, size, dst, null, src);
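            // VBROADCASTF128 (0x1A) and VPBROADCASTI128 (0x5A) only exist with a memory
            // source, so they must not be emitted in this register-register form.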
            assert op != 0x1A && op != 0x5A;
            asm.vexPrefix(dst, Register.None, src, size, pp, mmmmm, w, false);
            asm.emitByte(op);
            asm.emitModRM(dst, src);
        }
    }

    /**
     * VEX-encoded instructions with an operand order of RM.
     */
    public static class VexRMOp extends VexRROp {
        // @formatter:off
        public static final VexRMOp VCVTTSS2SI      = new VexRMOp("VCVTTSS2SI",      P_F3, M_0F,   W0,  0x2C, VEXOpAssertion.CPU_XMM);
        public static final VexRMOp VCVTTSS2SQ      = new VexRMOp("VCVTTSS2SQ",      P_F3, M_0F,   W1,  0x2C, VEXOpAssertion.CPU_XMM);
        public static final VexRMOp VCVTTSD2SI      = new VexRMOp("VCVTTSD2SI",      P_F2, M_0F,   W0,  0x2C, VEXOpAssertion.CPU_XMM);
        public static final VexRMOp VCVTTSD2SQ      = new VexRMOp("VCVTTSD2SQ",      P_F2, M_0F,   W1,  0x2C, VEXOpAssertion.CPU_XMM);
        public static final VexRMOp VCVTPS2PD       = new VexRMOp("VCVTPS2PD",       P_,   M_0F,   WIG, 0x5A);
        public static final VexRMOp VCVTPD2PS       = new VexRMOp("VCVTPD2PS",       P_66, M_0F,   WIG, 0x5A);
        public static final VexRMOp VCVTDQ2PS       = new VexRMOp("VCVTDQ2PS",       P_,   M_0F,   WIG, 0x5B);
        public static final VexRMOp VCVTTPS2DQ      = new VexRMOp("VCVTTPS2DQ",      P_F3, M_0F,   WIG, 0x5B);
        public static final VexRMOp VCVTTPD2DQ      = new VexRMOp("VCVTTPD2DQ",      P_66, M_0F,   WIG, 0xE6);
        public static final VexRMOp VCVTDQ2PD       = new VexRMOp("VCVTDQ2PD",       P_F3, M_0F,   WIG, 0xE6);
        public static final VexRMOp VBROADCASTSS    = new VexRMOp("VBROADCASTSS",    P_66, M_0F38, W0,  0x18);
        public static final VexRMOp VBROADCASTSD    = new VexRMOp("VBROADCASTSD",    P_66, M_0F38, W0,  0x19, VEXOpAssertion.AVX1_256ONLY);
        public static final VexRMOp VBROADCASTF128  = new VexRMOp("VBROADCASTF128",  P_66, M_0F38, W0,  0x1A, VEXOpAssertion.AVX1_256ONLY);
        public static final VexRMOp VPBROADCASTI128 = new VexRMOp("VPBROADCASTI128", P_66, M_0F38, W0,  0x5A, VEXOpAssertion.AVX2_256ONLY);
        public static final VexRMOp VPBROADCASTB    = new VexRMOp("VPBROADCASTB",    P_66, M_0F38, W0,  0x78, VEXOpAssertion.AVX2);
        public static final VexRMOp VPBROADCASTW    = new VexRMOp("VPBROADCASTW",    P_66, M_0F38, W0,  0x79, VEXOpAssertion.AVX2);
        public static final VexRMOp VPBROADCASTD    = new VexRMOp("VPBROADCASTD",    P_66, M_0F38, W0,  0x58, VEXOpAssertion.AVX2);
        public static final VexRMOp VPBROADCASTQ    = new VexRMOp("VPBROADCASTQ",    P_66, M_0F38, W0,  0x59, VEXOpAssertion.AVX2);
        public static final VexRMOp VPMOVMSKB       = new VexRMOp("VPMOVMSKB",       P_66, M_0F,   WIG, 0xD7, VEXOpAssertion.AVX1_2_CPU_XMM);
        public static final VexRMOp VPMOVSXBW       = new VexRMOp("VPMOVSXBW",       P_66, M_0F38, WIG, 0x20);
        public static final VexRMOp VPMOVSXBD       = new VexRMOp("VPMOVSXBD",       P_66, M_0F38, WIG, 0x21);
        public static final VexRMOp VPMOVSXBQ       = new VexRMOp("VPMOVSXBQ",       P_66, M_0F38, WIG, 0x22);
        public static final VexRMOp VPMOVSXWD       = new VexRMOp("VPMOVSXWD",       P_66, M_0F38, WIG, 0x23);
        public static final VexRMOp VPMOVSXWQ       = new VexRMOp("VPMOVSXWQ",       P_66, M_0F38, WIG, 0x24);
        public static final VexRMOp VPMOVSXDQ       = new VexRMOp("VPMOVSXDQ",       P_66, M_0F38, WIG, 0x25);
        public static final VexRMOp VPMOVZXBW       = new VexRMOp("VPMOVZXBW",       P_66, M_0F38, WIG, 0x30);
        public static final VexRMOp VPMOVZXBD       = new VexRMOp("VPMOVZXBD",       P_66, M_0F38, WIG, 0x31);
        public static final VexRMOp VPMOVZXBQ       = new VexRMOp("VPMOVZXBQ",       P_66, M_0F38, WIG, 0x32);
        public static final VexRMOp VPMOVZXWD       = new VexRMOp("VPMOVZXWD",       P_66, M_0F38, WIG, 0x33);
        public static final VexRMOp VPMOVZXWQ       = new VexRMOp("VPMOVZXWQ",       P_66, M_0F38, WIG, 0x34);
        public static final VexRMOp VPMOVZXDQ       = new VexRMOp("VPMOVZXDQ",       P_66, M_0F38, WIG, 0x35);
        public static final VexRMOp VPTEST          = new VexRMOp("VPTEST",          P_66, M_0F38, WIG, 0x17);
        public static final VexRMOp VSQRTPD         = new VexRMOp("VSQRTPD",         P_66, M_0F,   WIG, 0x51);
        public static final VexRMOp VSQRTPS         = new VexRMOp("VSQRTPS",         P_,   M_0F,   WIG, 0x51);
        public static final VexRMOp VSQRTSD         = new VexRMOp("VSQRTSD",         P_F2, M_0F,   WIG, 0x51);
        public static final VexRMOp VSQRTSS         = new VexRMOp("VSQRTSS",         P_F3, M_0F,   WIG, 0x51);
        public static final VexRMOp VUCOMISS        = new VexRMOp("VUCOMISS",        P_,   M_0F,   WIG, 0x2E);
        public static final VexRMOp VUCOMISD        = new VexRMOp("VUCOMISD",        P_66, M_0F,   WIG, 0x2E);
        // @formatter:on

        protected VexRMOp(String opcode, int pp, int mmmmm, int w, int op) {
            this(opcode, pp, mmmmm, w, op, VEXOpAssertion.AVX1);
        }

        protected VexRMOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
            super(opcode, pp, mmmmm, w, op, assertion);
        }

        public void emit(AMD64Assembler asm, AVXSize size, Register dst, AMD64Address src) {
            assert assertion.check((AMD64) asm.target.arch, size, dst, null, null);
            asm.vexPrefix(dst, Register.None, src, size, pp, mmmmm, w, false);
            asm.emitByte(op);
            asm.emitOperandHelper(dst, src, 0);
        }
    }

    /**
     * VEX-encoded move instructions.
     * <p>
     * These instructions have two opcodes: op is the forward move instruction with an operand order
     * of RM, and opReverse is the reverse move instruction with an operand order of MR.
     */
    public static final class VexMoveOp extends VexRMOp {
        // @formatter:off
        public static final VexMoveOp VMOVDQA = new VexMoveOp("VMOVDQA", P_66, M_0F, WIG, 0x6F, 0x7F);
        public static final VexMoveOp VMOVDQU = new VexMoveOp("VMOVDQU", P_F3, M_0F, WIG, 0x6F, 0x7F);
        public static final VexMoveOp VMOVAPS = new VexMoveOp("VMOVAPS", P_,   M_0F, WIG, 0x28, 0x29);
        public static final VexMoveOp VMOVAPD = new VexMoveOp("VMOVAPD", P_66, M_0F, WIG, 0x28, 0x29);
        public static final VexMoveOp VMOVUPS = new VexMoveOp("VMOVUPS", P_,   M_0F, WIG, 0x10, 0x11);
        public static final VexMoveOp VMOVUPD = new VexMoveOp("VMOVUPD", P_66, M_0F, WIG, 0x10, 0x11);
        public static final VexMoveOp VMOVSS  = new VexMoveOp("VMOVSS",  P_F3, M_0F, WIG, 0x10, 0x11);
        public static final VexMoveOp VMOVSD  = new VexMoveOp("VMOVSD",  P_F2, M_0F, WIG, 0x10, 0x11);
        public static final VexMoveOp VMOVD   = new VexMoveOp("VMOVD",   P_66, M_0F, W0,  0x6E, 0x7E, VEXOpAssertion.XMM_CPU);
        public static final VexMoveOp VMOVQ   = new VexMoveOp("VMOVQ",   P_66, M_0F, W1,  0x6E, 0x7E, VEXOpAssertion.XMM_CPU);
        // @formatter:on

        private final int opReverse;

        private VexMoveOp(String opcode, int pp, int mmmmm, int w, int op, int opReverse) {
            this(opcode, pp, mmmmm, w, op, opReverse, VEXOpAssertion.AVX1);
        }

        private VexMoveOp(String opcode, int pp, int mmmmm, int w, int op, int opReverse, VEXOpAssertion assertion) {
            super(opcode, pp, mmmmm, w, op, assertion);
            this.opReverse = opReverse;
        }

        public void emit(AMD64Assembler asm, AVXSize size, AMD64Address dst, Register src) {
            assert assertion.check((AMD64) asm.target.arch, size, src, null, null);
            asm.vexPrefix(src, Register.None, dst, size, pp, mmmmm, w, false);
            asm.emitByte(opReverse);
            asm.emitOperandHelper(src, dst, 0);
        }

        public void emitReverse(AMD64Assembler asm, AVXSize size, Register dst, Register src) {
            assert assertion.check((AMD64) asm.target.arch, size, src, null, dst);
            asm.vexPrefix(src, Register.None, dst, size, pp, mmmmm, w, false);
            asm.emitByte(opReverse);
            asm.emitModRM(src, dst);
        }
    }
1139 
1140     public interface VexRRIOp {
1141         void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src, int imm8);
1142     }
1143 
1144     /**
1145      * VEX-encoded instructions with an operand order of RMI.
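     * <p>
     * A sketch, assuming {@code asm} and XMM registers {@code dst} and {@code src} are in scope:
     * replicating the lowest 32-bit element of {@code src} into all four elements of {@code dst}
     * could be emitted as
     *
     * <pre>
     * VexRMIOp.VPSHUFD.emit(asm, AVXSize.XMM, dst, src, 0x00); // imm8 selects element 0 four times
     * </pre>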
1146      */
1147     public static final class VexRMIOp extends VexOp implements VexRRIOp {
1148         // @formatter:off
1149         public static final VexRMIOp VPERMQ   = new VexRMIOp("VPERMQ",   P_66, M_0F3A, W1,  0x00, VEXOpAssertion.AVX2_256ONLY);
1150         public static final VexRMIOp VPSHUFLW = new VexRMIOp("VPSHUFLW", P_F2, M_0F,   WIG, 0x70, VEXOpAssertion.AVX1_2);
1151         public static final VexRMIOp VPSHUFHW = new VexRMIOp("VPSHUFHW", P_F3, M_0F,   WIG, 0x70, VEXOpAssertion.AVX1_2);
1152         public static final VexRMIOp VPSHUFD  = new VexRMIOp("VPSHUFD",  P_66, M_0F,   WIG, 0x70, VEXOpAssertion.AVX1_2);
1153         // @formatter:on
1154 
1155         private VexRMIOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
1156             super(opcode, pp, mmmmm, w, op, assertion);
1157         }
1158 
1159         @Override
1160         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src, int imm8) {
1161             assert assertion.check((AMD64) asm.target.arch, size, dst, null, src);
1162             asm.vexPrefix(dst, Register.None, src, size, pp, mmmmm, w, false);
1163             asm.emitByte(op);
1164             asm.emitModRM(dst, src);
1165             asm.emitByte(imm8);
1166         }
1167 
1168         public void emit(AMD64Assembler asm, AVXSize size, Register dst, AMD64Address src, int imm8) {
1169             assert assertion.check((AMD64) asm.target.arch, size, dst, null, null);
1170             asm.vexPrefix(dst, Register.None, src, size, pp, mmmmm, w, false);
1171             asm.emitByte(op);
1172             asm.emitOperandHelper(dst, src, 1);
1173             asm.emitByte(imm8);
1174         }
1175     }
1176 
1177     /**
1178      * VEX-encoded instructions with an operand order of MRI.
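     * <p>
     * A sketch, assuming {@code asm}, an XMM register {@code xmmDst} and a YMM register
     * {@code ymmSrc} are in scope: extracting the upper 128-bit lane of {@code ymmSrc} could be
     * emitted as
     *
     * <pre>
     * VexMRIOp.VEXTRACTF128.emit(asm, AVXSize.YMM, xmmDst, ymmSrc, 1); // imm8 = 1: high lane
     * </pre>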
1179      */
1180     public static final class VexMRIOp extends VexOp implements VexRRIOp {
1181         // @formatter:off
1182         public static final VexMRIOp VEXTRACTF128 = new VexMRIOp("VEXTRACTF128", P_66, M_0F3A, W0, 0x19, VEXOpAssertion.AVX1_256ONLY);
1183         public static final VexMRIOp VEXTRACTI128 = new VexMRIOp("VEXTRACTI128", P_66, M_0F3A, W0, 0x39, VEXOpAssertion.AVX2_256ONLY);
1184         public static final VexMRIOp VPEXTRB      = new VexMRIOp("VPEXTRB",      P_66, M_0F3A, W0, 0x14, VEXOpAssertion.XMM_CPU);
1185         public static final VexMRIOp VPEXTRW      = new VexMRIOp("VPEXTRW",      P_66, M_0F3A, W0, 0x15, VEXOpAssertion.XMM_CPU);
1186         public static final VexMRIOp VPEXTRD      = new VexMRIOp("VPEXTRD",      P_66, M_0F3A, W0, 0x16, VEXOpAssertion.XMM_CPU);
1187         public static final VexMRIOp VPEXTRQ      = new VexMRIOp("VPEXTRQ",      P_66, M_0F3A, W1, 0x16, VEXOpAssertion.XMM_CPU);
1188         // @formatter:on
1189 
1190         private VexMRIOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
1191             super(opcode, pp, mmmmm, w, op, assertion);
1192         }
1193 
1194         @Override
1195         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src, int imm8) {
1196             assert assertion.check((AMD64) asm.target.arch, size, src, null, dst);
1197             asm.vexPrefix(src, Register.None, dst, size, pp, mmmmm, w, false);
1198             asm.emitByte(op);
1199             asm.emitModRM(src, dst);
1200             asm.emitByte(imm8);
1201         }
1202 
1203         public void emit(AMD64Assembler asm, AVXSize size, AMD64Address dst, Register src, int imm8) {
1204             assert assertion.check((AMD64) asm.target.arch, size, src, null, null);
1205             asm.vexPrefix(src, Register.None, dst, size, pp, mmmmm, w, false);
1206             asm.emitByte(op);
1207             asm.emitOperandHelper(src, dst, 1);
1208             asm.emitByte(imm8);
1209         }
1210     }
1211 
1212     /**
1213      * VEX-encoded instructions with an operand order of RVMR.
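     * <p>
     * The extra register operand is encoded in the high nibble of a trailing immediate byte. A
     * sketch, assuming {@code asm} and XMM registers {@code dst}, {@code mask}, {@code src1} and
     * {@code src2} are in scope:
     *
     * <pre>
     * // dst.byte[i] = (mask.byte[i] has its msb set) ? src2.byte[i] : src1.byte[i]
     * VexRVMROp.VPBLENDVB.emit(asm, AVXSize.XMM, dst, mask, src1, src2);
     * </pre>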
1214      */
1215     public static class VexRVMROp extends VexOp {
1216         // @formatter:off
1217         public static final VexRVMROp VPBLENDVB  = new VexRVMROp("VPBLENDVB",  P_66, M_0F3A, W0, 0x4C, VEXOpAssertion.AVX1_2);
        public static final VexRVMROp VBLENDVPS  = new VexRVMROp("VBLENDVPS",  P_66, M_0F3A, W0, 0x4A, VEXOpAssertion.AVX1);
        public static final VexRVMROp VBLENDVPD  = new VexRVMROp("VBLENDVPD",  P_66, M_0F3A, W0, 0x4B, VEXOpAssertion.AVX1);
1220         // @formatter:on
1221 
1222         protected VexRVMROp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
1223             super(opcode, pp, mmmmm, w, op, assertion);
1224         }
1225 
1226         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register mask, Register src1, Register src2) {
1227             assert assertion.check((AMD64) asm.target.arch, size, dst, mask, src1, src2);
1228             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w, false);
1229             asm.emitByte(op);
1230             asm.emitModRM(dst, src2);
1231             asm.emitByte(mask.encoding() << 4);
1232         }
1233 
1234         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register mask, Register src1, AMD64Address src2) {
1235             assert assertion.check((AMD64) asm.target.arch, size, dst, mask, src1, null);
1236             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w, false);
1237             asm.emitByte(op);
1238             asm.emitOperandHelper(dst, src2, 0);
1239             asm.emitByte(mask.encoding() << 4);
1240         }
1241     }
1242 
1243     /**
1244      * VEX-encoded instructions with an operand order of RVM.
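     * <p>
     * These are the non-destructive three-operand forms, e.g. {@code dst = src1 + src2} for
     * VADDPS. A sketch, assuming {@code asm} and YMM registers {@code dst}, {@code src1} and
     * {@code src2} are in scope:
     *
     * <pre>
     * VexRVMOp.VADDPS.emit(asm, AVXSize.YMM, dst, src1, src2); // leaves both sources unmodified
     * </pre>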
1245      */
1246     public static class VexRVMOp extends VexOp {
1247         // @formatter:off
1248         public static final VexRVMOp VANDPS    = new VexRVMOp("VANDPS",    P_,   M_0F,   WIG, 0x54);
1249         public static final VexRVMOp VANDPD    = new VexRVMOp("VANDPD",    P_66, M_0F,   WIG, 0x54);
1250         public static final VexRVMOp VANDNPS   = new VexRVMOp("VANDNPS",   P_,   M_0F,   WIG, 0x55);
1251         public static final VexRVMOp VANDNPD   = new VexRVMOp("VANDNPD",   P_66, M_0F,   WIG, 0x55);
1252         public static final VexRVMOp VORPS     = new VexRVMOp("VORPS",     P_,   M_0F,   WIG, 0x56);
1253         public static final VexRVMOp VORPD     = new VexRVMOp("VORPD",     P_66, M_0F,   WIG, 0x56);
1254         public static final VexRVMOp VXORPS    = new VexRVMOp("VXORPS",    P_,   M_0F,   WIG, 0x57);
1255         public static final VexRVMOp VXORPD    = new VexRVMOp("VXORPD",    P_66, M_0F,   WIG, 0x57);
1256         public static final VexRVMOp VADDPS    = new VexRVMOp("VADDPS",    P_,   M_0F,   WIG, 0x58);
1257         public static final VexRVMOp VADDPD    = new VexRVMOp("VADDPD",    P_66, M_0F,   WIG, 0x58);
1258         public static final VexRVMOp VADDSS    = new VexRVMOp("VADDSS",    P_F3, M_0F,   WIG, 0x58);
1259         public static final VexRVMOp VADDSD    = new VexRVMOp("VADDSD",    P_F2, M_0F,   WIG, 0x58);
1260         public static final VexRVMOp VMULPS    = new VexRVMOp("VMULPS",    P_,   M_0F,   WIG, 0x59);
1261         public static final VexRVMOp VMULPD    = new VexRVMOp("VMULPD",    P_66, M_0F,   WIG, 0x59);
1262         public static final VexRVMOp VMULSS    = new VexRVMOp("VMULSS",    P_F3, M_0F,   WIG, 0x59);
1263         public static final VexRVMOp VMULSD    = new VexRVMOp("VMULSD",    P_F2, M_0F,   WIG, 0x59);
1264         public static final VexRVMOp VSUBPS    = new VexRVMOp("VSUBPS",    P_,   M_0F,   WIG, 0x5C);
1265         public static final VexRVMOp VSUBPD    = new VexRVMOp("VSUBPD",    P_66, M_0F,   WIG, 0x5C);
1266         public static final VexRVMOp VSUBSS    = new VexRVMOp("VSUBSS",    P_F3, M_0F,   WIG, 0x5C);
1267         public static final VexRVMOp VSUBSD    = new VexRVMOp("VSUBSD",    P_F2, M_0F,   WIG, 0x5C);
1268         public static final VexRVMOp VMINPS    = new VexRVMOp("VMINPS",    P_,   M_0F,   WIG, 0x5D);
1269         public static final VexRVMOp VMINPD    = new VexRVMOp("VMINPD",    P_66, M_0F,   WIG, 0x5D);
1270         public static final VexRVMOp VMINSS    = new VexRVMOp("VMINSS",    P_F3, M_0F,   WIG, 0x5D);
1271         public static final VexRVMOp VMINSD    = new VexRVMOp("VMINSD",    P_F2, M_0F,   WIG, 0x5D);
1272         public static final VexRVMOp VDIVPS    = new VexRVMOp("VDIVPS",    P_,   M_0F,   WIG, 0x5E);
1273         public static final VexRVMOp VDIVPD    = new VexRVMOp("VDIVPD",    P_66, M_0F,   WIG, 0x5E);
        public static final VexRVMOp VDIVSS    = new VexRVMOp("VDIVSS",    P_F3, M_0F,   WIG, 0x5E);
        public static final VexRVMOp VDIVSD    = new VexRVMOp("VDIVSD",    P_F2, M_0F,   WIG, 0x5E);
1276         public static final VexRVMOp VMAXPS    = new VexRVMOp("VMAXPS",    P_,   M_0F,   WIG, 0x5F);
1277         public static final VexRVMOp VMAXPD    = new VexRVMOp("VMAXPD",    P_66, M_0F,   WIG, 0x5F);
1278         public static final VexRVMOp VMAXSS    = new VexRVMOp("VMAXSS",    P_F3, M_0F,   WIG, 0x5F);
1279         public static final VexRVMOp VMAXSD    = new VexRVMOp("VMAXSD",    P_F2, M_0F,   WIG, 0x5F);
1280         public static final VexRVMOp VADDSUBPS = new VexRVMOp("VADDSUBPS", P_F2, M_0F,   WIG, 0xD0);
1281         public static final VexRVMOp VADDSUBPD = new VexRVMOp("VADDSUBPD", P_66, M_0F,   WIG, 0xD0);
1282         public static final VexRVMOp VPAND     = new VexRVMOp("VPAND",     P_66, M_0F,   WIG, 0xDB, VEXOpAssertion.AVX1_2);
1283         public static final VexRVMOp VPOR      = new VexRVMOp("VPOR",      P_66, M_0F,   WIG, 0xEB, VEXOpAssertion.AVX1_2);
1284         public static final VexRVMOp VPXOR     = new VexRVMOp("VPXOR",     P_66, M_0F,   WIG, 0xEF, VEXOpAssertion.AVX1_2);
1285         public static final VexRVMOp VPADDB    = new VexRVMOp("VPADDB",    P_66, M_0F,   WIG, 0xFC, VEXOpAssertion.AVX1_2);
1286         public static final VexRVMOp VPADDW    = new VexRVMOp("VPADDW",    P_66, M_0F,   WIG, 0xFD, VEXOpAssertion.AVX1_2);
1287         public static final VexRVMOp VPADDD    = new VexRVMOp("VPADDD",    P_66, M_0F,   WIG, 0xFE, VEXOpAssertion.AVX1_2);
1288         public static final VexRVMOp VPADDQ    = new VexRVMOp("VPADDQ",    P_66, M_0F,   WIG, 0xD4, VEXOpAssertion.AVX1_2);
1289         public static final VexRVMOp VPMULHUW  = new VexRVMOp("VPMULHUW",  P_66, M_0F,   WIG, 0xE4, VEXOpAssertion.AVX1_2);
1290         public static final VexRVMOp VPMULHW   = new VexRVMOp("VPMULHW",   P_66, M_0F,   WIG, 0xE5, VEXOpAssertion.AVX1_2);
1291         public static final VexRVMOp VPMULLW   = new VexRVMOp("VPMULLW",   P_66, M_0F,   WIG, 0xD5, VEXOpAssertion.AVX1_2);
1292         public static final VexRVMOp VPMULLD   = new VexRVMOp("VPMULLD",   P_66, M_0F38, WIG, 0x40, VEXOpAssertion.AVX1_2);
1293         public static final VexRVMOp VPSUBB    = new VexRVMOp("VPSUBB",    P_66, M_0F,   WIG, 0xF8, VEXOpAssertion.AVX1_2);
1294         public static final VexRVMOp VPSUBW    = new VexRVMOp("VPSUBW",    P_66, M_0F,   WIG, 0xF9, VEXOpAssertion.AVX1_2);
1295         public static final VexRVMOp VPSUBD    = new VexRVMOp("VPSUBD",    P_66, M_0F,   WIG, 0xFA, VEXOpAssertion.AVX1_2);
1296         public static final VexRVMOp VPSUBQ    = new VexRVMOp("VPSUBQ",    P_66, M_0F,   WIG, 0xFB, VEXOpAssertion.AVX1_2);
1297         public static final VexRVMOp VPSHUFB   = new VexRVMOp("VPSHUFB",   P_66, M_0F38, WIG, 0x00, VEXOpAssertion.AVX1_2);
1298         public static final VexRVMOp VCVTSD2SS = new VexRVMOp("VCVTSD2SS", P_F2, M_0F,   WIG, 0x5A);
1299         public static final VexRVMOp VCVTSS2SD = new VexRVMOp("VCVTSS2SD", P_F3, M_0F,   WIG, 0x5A);
1300         public static final VexRVMOp VCVTSI2SD = new VexRVMOp("VCVTSI2SD", P_F2, M_0F,   W0,  0x2A, VEXOpAssertion.XMM_XMM_CPU);
1301         public static final VexRVMOp VCVTSQ2SD = new VexRVMOp("VCVTSQ2SD", P_F2, M_0F,   W1,  0x2A, VEXOpAssertion.XMM_XMM_CPU);
1302         public static final VexRVMOp VCVTSI2SS = new VexRVMOp("VCVTSI2SS", P_F3, M_0F,   W0,  0x2A, VEXOpAssertion.XMM_XMM_CPU);
1303         public static final VexRVMOp VCVTSQ2SS = new VexRVMOp("VCVTSQ2SS", P_F3, M_0F,   W1,  0x2A, VEXOpAssertion.XMM_XMM_CPU);
1304         public static final VexRVMOp VPCMPEQB  = new VexRVMOp("VPCMPEQB",  P_66, M_0F,   WIG, 0x74, VEXOpAssertion.AVX1_2);
1305         public static final VexRVMOp VPCMPEQW  = new VexRVMOp("VPCMPEQW",  P_66, M_0F,   WIG, 0x75, VEXOpAssertion.AVX1_2);
1306         public static final VexRVMOp VPCMPEQD  = new VexRVMOp("VPCMPEQD",  P_66, M_0F,   WIG, 0x76, VEXOpAssertion.AVX1_2);
1307         public static final VexRVMOp VPCMPEQQ  = new VexRVMOp("VPCMPEQQ",  P_66, M_0F38, WIG, 0x29, VEXOpAssertion.AVX1_2);
1308         public static final VexRVMOp VPCMPGTB  = new VexRVMOp("VPCMPGTB",  P_66, M_0F,   WIG, 0x64, VEXOpAssertion.AVX1_2);
1309         public static final VexRVMOp VPCMPGTW  = new VexRVMOp("VPCMPGTW",  P_66, M_0F,   WIG, 0x65, VEXOpAssertion.AVX1_2);
1310         public static final VexRVMOp VPCMPGTD  = new VexRVMOp("VPCMPGTD",  P_66, M_0F,   WIG, 0x66, VEXOpAssertion.AVX1_2);
1311         public static final VexRVMOp VPCMPGTQ  = new VexRVMOp("VPCMPGTQ",  P_66, M_0F38, WIG, 0x37, VEXOpAssertion.AVX1_2);
1312         // @formatter:on
1313 
1314         private VexRVMOp(String opcode, int pp, int mmmmm, int w, int op) {
1315             this(opcode, pp, mmmmm, w, op, VEXOpAssertion.AVX1);
1316         }
1317 
1318         protected VexRVMOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
1319             super(opcode, pp, mmmmm, w, op, assertion);
1320         }
1321 
1322         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, Register src2) {
1323             assert assertion.check((AMD64) asm.target.arch, size, dst, src1, src2);
1324             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w, false);
1325             asm.emitByte(op);
1326             asm.emitModRM(dst, src2);
1327         }
1328 
1329         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, AMD64Address src2) {
1330             assert assertion.check((AMD64) asm.target.arch, size, dst, src1, null);
1331             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w, false);
1332             asm.emitByte(op);
1333             asm.emitOperandHelper(dst, src2, 0);
1334         }
1335     }
1336 
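    /**
     * VEX-encoded general purpose instructions with an operand order of RVM, e.g. the BMI
     * instructions ANDN, MULX, PDEP and PEXT. The operand size (DWORD or QWORD) is selected via
     * the VEX.W bit rather than via the {@code w} field of the opcode definition.
     */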
1337     public static final class VexGeneralPurposeRVMOp extends VexRVMOp {
1338         // @formatter:off
1339         public static final VexGeneralPurposeRVMOp ANDN   = new VexGeneralPurposeRVMOp("ANDN",   P_,   M_0F38, WIG, 0xF2, VEXOpAssertion.BMI1);
1340         public static final VexGeneralPurposeRVMOp MULX   = new VexGeneralPurposeRVMOp("MULX",   P_F2, M_0F38, WIG, 0xF6, VEXOpAssertion.BMI2);
1341         public static final VexGeneralPurposeRVMOp PDEP   = new VexGeneralPurposeRVMOp("PDEP",   P_F2, M_0F38, WIG, 0xF5, VEXOpAssertion.BMI2);
1342         public static final VexGeneralPurposeRVMOp PEXT   = new VexGeneralPurposeRVMOp("PEXT",   P_F3, M_0F38, WIG, 0xF5, VEXOpAssertion.BMI2);
1343         // @formatter:on
1344 
1345         private VexGeneralPurposeRVMOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
1346             super(opcode, pp, mmmmm, w, op, assertion);
1347         }
1348 
1349         @Override
1350         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, Register src2) {
1351             assert assertion.check((AMD64) asm.target.arch, LZ, dst, src1, src2, null);
1352             assert size == AVXSize.DWORD || size == AVXSize.QWORD;
1353             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, size == AVXSize.DWORD ? W0 : W1, false);
1354             asm.emitByte(op);
1355             asm.emitModRM(dst, src2);
1356         }
1357 
1358         @Override
1359         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, AMD64Address src2) {
1360             assert assertion.check((AMD64) asm.target.arch, LZ, dst, src1, null, null);
1361             assert size == AVXSize.DWORD || size == AVXSize.QWORD;
1362             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, size == AVXSize.DWORD ? W0 : W1, false);
1363             asm.emitByte(op);
1364             asm.emitOperandHelper(dst, src2, 0);
1365         }
1366     }
1367 
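    /**
     * VEX-encoded general purpose instructions with an operand order of RMV: the second source
     * operand (e.g. the shift count of SARX, SHRX and SHLX) is passed in the VEX.vvvv field.
     */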
1368     public static final class VexGeneralPurposeRMVOp extends VexOp {
1369         // @formatter:off
1370         public static final VexGeneralPurposeRMVOp BEXTR  = new VexGeneralPurposeRMVOp("BEXTR",  P_,   M_0F38, WIG, 0xF7, VEXOpAssertion.BMI1);
1371         public static final VexGeneralPurposeRMVOp BZHI   = new VexGeneralPurposeRMVOp("BZHI",   P_,   M_0F38, WIG, 0xF5, VEXOpAssertion.BMI2);
1372         public static final VexGeneralPurposeRMVOp SARX   = new VexGeneralPurposeRMVOp("SARX",   P_F3, M_0F38, WIG, 0xF7, VEXOpAssertion.BMI2);
1373         public static final VexGeneralPurposeRMVOp SHRX   = new VexGeneralPurposeRMVOp("SHRX",   P_F2, M_0F38, WIG, 0xF7, VEXOpAssertion.BMI2);
1374         public static final VexGeneralPurposeRMVOp SHLX   = new VexGeneralPurposeRMVOp("SHLX",   P_66, M_0F38, WIG, 0xF7, VEXOpAssertion.BMI2);
1375         // @formatter:on
1376 
1377         private VexGeneralPurposeRMVOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
1378             super(opcode, pp, mmmmm, w, op, assertion);
1379         }
1380 
1381         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, Register src2) {
1382             assert assertion.check((AMD64) asm.target.arch, LZ, dst, src2, src1, null);
1383             assert size == AVXSize.DWORD || size == AVXSize.QWORD;
1384             asm.vexPrefix(dst, src2, src1, size, pp, mmmmm, size == AVXSize.DWORD ? W0 : W1, false);
1385             asm.emitByte(op);
1386             asm.emitModRM(dst, src1);
1387         }
1388 
1389         public void emit(AMD64Assembler asm, AVXSize size, Register dst, AMD64Address src1, Register src2) {
1390             assert assertion.check((AMD64) asm.target.arch, LZ, dst, src2, null, null);
1391             assert size == AVXSize.DWORD || size == AVXSize.QWORD;
1392             asm.vexPrefix(dst, src2, src1, size, pp, mmmmm, size == AVXSize.DWORD ? W0 : W1, false);
1393             asm.emitByte(op);
1394             asm.emitOperandHelper(dst, src1, 0);
1395         }
1396     }
1397 
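    /**
     * VEX-encoded general purpose unary instructions, e.g. the BMI1 instructions BLSI, BLSMSK and
     * BLSR. The destination register is encoded in the VEX.vvvv field; the opcode is disambiguated
     * via the {@code ext} opcode extension in the ModRM.reg field.
     */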
1398     public static final class VexGeneralPurposeRMOp extends VexRMOp {
1399         // @formatter:off
1400         public static final VexGeneralPurposeRMOp BLSI    = new VexGeneralPurposeRMOp("BLSI",   P_,    M_0F38, WIG, 0xF3, 3, VEXOpAssertion.BMI1);
1401         public static final VexGeneralPurposeRMOp BLSMSK  = new VexGeneralPurposeRMOp("BLSMSK", P_,    M_0F38, WIG, 0xF3, 2, VEXOpAssertion.BMI1);
1402         public static final VexGeneralPurposeRMOp BLSR    = new VexGeneralPurposeRMOp("BLSR",   P_,    M_0F38, WIG, 0xF3, 1, VEXOpAssertion.BMI1);
1403         // @formatter:on
1404         private final int ext;
1405 
1406         private VexGeneralPurposeRMOp(String opcode, int pp, int mmmmm, int w, int op, int ext, VEXOpAssertion assertion) {
1407             super(opcode, pp, mmmmm, w, op, assertion);
1408             this.ext = ext;
1409         }
1410 
1411         @Override
1412         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src) {
1413             assert assertion.check((AMD64) asm.target.arch, size, dst, null, null);
1414             asm.vexPrefix(AMD64.cpuRegisters[ext], dst, src, size, pp, mmmmm, size == AVXSize.DWORD ? W0 : W1, false);
1415             asm.emitByte(op);
1416             asm.emitModRM(ext, src);
1417         }
1418 
1419         @Override
1420         public void emit(AMD64Assembler asm, AVXSize size, Register dst, AMD64Address src) {
1421             assert assertion.check((AMD64) asm.target.arch, size, dst, null, null);
1422             asm.vexPrefix(AMD64.cpuRegisters[ext], dst, src, size, pp, mmmmm, size == AVXSize.DWORD ? W0 : W1, false);
1423             asm.emitByte(op);
1424             asm.emitOperandHelper(ext, src, 0);
1425         }
1426     }
1427 
1428     /**
1429      * VEX-encoded shift instructions with an operand order of either RVM or VMI.
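     * <p>
     * A sketch, assuming {@code asm} and XMM registers {@code dst}, {@code src} and {@code count}
     * are in scope; both forms shift {@code src} and write the result to {@code dst}:
     *
     * <pre>
     * VexShiftOp.VPSLLD.emit(asm, AVXSize.XMM, dst, src, count); // RVM: shift count in a register
     * VexShiftOp.VPSLLD.emit(asm, AVXSize.XMM, dst, src, 4);     // VMI: immediate shift count
     * </pre>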
1430      */
1431     public static final class VexShiftOp extends VexRVMOp implements VexRRIOp {
1432         // @formatter:off
1433         public static final VexShiftOp VPSRLW = new VexShiftOp("VPSRLW", P_66, M_0F, WIG, 0xD1, 0x71, 2);
1434         public static final VexShiftOp VPSRLD = new VexShiftOp("VPSRLD", P_66, M_0F, WIG, 0xD2, 0x72, 2);
1435         public static final VexShiftOp VPSRLQ = new VexShiftOp("VPSRLQ", P_66, M_0F, WIG, 0xD3, 0x73, 2);
1436         public static final VexShiftOp VPSRAW = new VexShiftOp("VPSRAW", P_66, M_0F, WIG, 0xE1, 0x71, 4);
1437         public static final VexShiftOp VPSRAD = new VexShiftOp("VPSRAD", P_66, M_0F, WIG, 0xE2, 0x72, 4);
1438         public static final VexShiftOp VPSLLW = new VexShiftOp("VPSLLW", P_66, M_0F, WIG, 0xF1, 0x71, 6);
1439         public static final VexShiftOp VPSLLD = new VexShiftOp("VPSLLD", P_66, M_0F, WIG, 0xF2, 0x72, 6);
1440         public static final VexShiftOp VPSLLQ = new VexShiftOp("VPSLLQ", P_66, M_0F, WIG, 0xF3, 0x73, 6);
1441         // @formatter:on
1442 
1443         private final int immOp;
1444         private final int r;
1445 
1446         private VexShiftOp(String opcode, int pp, int mmmmm, int w, int op, int immOp, int r) {
1447             super(opcode, pp, mmmmm, w, op, VEXOpAssertion.AVX1_2);
1448             this.immOp = immOp;
1449             this.r = r;
1450         }
1451 
1452         @Override
1453         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src, int imm8) {
1454             assert assertion.check((AMD64) asm.target.arch, size, null, dst, src);
1455             asm.vexPrefix(null, dst, src, size, pp, mmmmm, w, false);
1456             asm.emitByte(immOp);
1457             asm.emitModRM(r, src);
1458             asm.emitByte(imm8);
1459         }
1460     }
1461 
1462     public static final class VexMaskMoveOp extends VexOp {
1463         // @formatter:off
1464         public static final VexMaskMoveOp VMASKMOVPS = new VexMaskMoveOp("VMASKMOVPS", P_66, M_0F38, W0, 0x2C, 0x2E);
1465         public static final VexMaskMoveOp VMASKMOVPD = new VexMaskMoveOp("VMASKMOVPD", P_66, M_0F38, W0, 0x2D, 0x2F);
1466         public static final VexMaskMoveOp VPMASKMOVD = new VexMaskMoveOp("VPMASKMOVD", P_66, M_0F38, W0, 0x8C, 0x8E, VEXOpAssertion.AVX2);
1467         public static final VexMaskMoveOp VPMASKMOVQ = new VexMaskMoveOp("VPMASKMOVQ", P_66, M_0F38, W1, 0x8C, 0x8E, VEXOpAssertion.AVX2);
1468         // @formatter:on
1469 
1470         private final int opReverse;
1471 
1472         private VexMaskMoveOp(String opcode, int pp, int mmmmm, int w, int op, int opReverse) {
1473             this(opcode, pp, mmmmm, w, op, opReverse, VEXOpAssertion.AVX1);
1474         }
1475 
1476         private VexMaskMoveOp(String opcode, int pp, int mmmmm, int w, int op, int opReverse, VEXOpAssertion assertion) {
1477             super(opcode, pp, mmmmm, w, op, assertion);
1478             this.opReverse = opReverse;
1479         }
1480 
1481         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register mask, AMD64Address src) {
1482             assert assertion.check((AMD64) asm.target.arch, size, dst, mask, null);
1483             asm.vexPrefix(dst, mask, src, size, pp, mmmmm, w, false);
1484             asm.emitByte(op);
1485             asm.emitOperandHelper(dst, src, 0);
1486         }
1487 
1488         public void emit(AMD64Assembler asm, AVXSize size, AMD64Address dst, Register mask, Register src) {
1489             assert assertion.check((AMD64) asm.target.arch, size, src, mask, null);
1490             asm.vexPrefix(src, mask, dst, size, pp, mmmmm, w, false);
1491             asm.emitByte(opReverse);
1492             asm.emitOperandHelper(src, dst, 0);
1493         }
1494     }
1495 
1496     /**
1497      * VEX-encoded instructions with an operand order of RVMI.
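     * <p>
     * A sketch, assuming {@code asm}, YMM registers {@code dst} and {@code src1} and an XMM
     * register {@code src2} are in scope: inserting {@code src2} into the upper 128-bit lane of
     * {@code src1} could be emitted as
     *
     * <pre>
     * VexRVMIOp.VINSERTF128.emit(asm, AVXSize.YMM, dst, src1, src2, 1); // imm8 = 1: high lane
     * </pre>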
1498      */
1499     public static final class VexRVMIOp extends VexOp {
1500         // @formatter:off
1501         public static final VexRVMIOp VSHUFPS     = new VexRVMIOp("VSHUFPS",     P_,   M_0F,   WIG, 0xC6);
1502         public static final VexRVMIOp VSHUFPD     = new VexRVMIOp("VSHUFPD",     P_66, M_0F,   WIG, 0xC6);
1503         public static final VexRVMIOp VINSERTF128 = new VexRVMIOp("VINSERTF128", P_66, M_0F3A, W0,  0x18, VEXOpAssertion.AVX1_256ONLY);
1504         public static final VexRVMIOp VINSERTI128 = new VexRVMIOp("VINSERTI128", P_66, M_0F3A, W0,  0x38, VEXOpAssertion.AVX2_256ONLY);
1505         // @formatter:on
1506 
1507         private VexRVMIOp(String opcode, int pp, int mmmmm, int w, int op) {
1508             this(opcode, pp, mmmmm, w, op, VEXOpAssertion.AVX1);
1509         }
1510 
1511         private VexRVMIOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
1512             super(opcode, pp, mmmmm, w, op, assertion);
1513         }
1514 
1515         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, Register src2, int imm8) {
1516             assert assertion.check((AMD64) asm.target.arch, size, dst, src1, src2);
1517             assert (imm8 & 0xFF) == imm8;
1518             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w, false);
1519             asm.emitByte(op);
1520             asm.emitModRM(dst, src2);
1521             asm.emitByte(imm8);
1522         }
1523 
1524         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, AMD64Address src2, int imm8) {
1525             assert assertion.check((AMD64) asm.target.arch, size, dst, src1, null);
1526             assert (imm8 & 0xFF) == imm8;
1527             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w, false);
1528             asm.emitByte(op);
1529             asm.emitOperandHelper(dst, src2, 1);
1530             asm.emitByte(imm8);
1531         }
1532     }
1533 
1534     /**
1535      * VEX-encoded comparison operation with an operand order of RVMI. The immediate operand is a
1536      * comparison operator.
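     * <p>
     * A sketch, assuming {@code asm} and XMM registers {@code dst}, {@code src1} and {@code src2}
     * are in scope: computing a per-element mask {@code dst = (src1 == src2)}, with unordered
     * (NaN) elements comparing false, could be emitted as
     *
     * <pre>
     * VexFloatCompareOp.VCMPPS.emit(asm, AVXSize.XMM, dst, src1, src2, Predicate.EQ_OQ);
     * </pre>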
1537      */
1538     public static final class VexFloatCompareOp extends VexOp {
1539         // @formatter:off
1540         public static final VexFloatCompareOp VCMPPS = new VexFloatCompareOp("VCMPPS", P_,   M_0F, WIG, 0xC2);
1541         public static final VexFloatCompareOp VCMPPD = new VexFloatCompareOp("VCMPPD", P_66, M_0F, WIG, 0xC2);
        public static final VexFloatCompareOp VCMPSS = new VexFloatCompareOp("VCMPSS", P_F3, M_0F, WIG, 0xC2);
1543         public static final VexFloatCompareOp VCMPSD = new VexFloatCompareOp("VCMPSD", P_F2, M_0F, WIG, 0xC2);
1544         // @formatter:on
1545 
1546         public enum Predicate {
1547             EQ_OQ(0x00),
1548             LT_OS(0x01),
1549             LE_OS(0x02),
1550             UNORD_Q(0x03),
1551             NEQ_UQ(0x04),
1552             NLT_US(0x05),
1553             NLE_US(0x06),
1554             ORD_Q(0x07),
1555             EQ_UQ(0x08),
1556             NGE_US(0x09),
1557             NGT_US(0x0a),
1558             FALSE_OQ(0x0b),
1559             NEQ_OQ(0x0c),
1560             GE_OS(0x0d),
1561             GT_OS(0x0e),
1562             TRUE_UQ(0x0f),
1563             EQ_OS(0x10),
1564             LT_OQ(0x11),
1565             LE_OQ(0x12),
1566             UNORD_S(0x13),
1567             NEQ_US(0x14),
1568             NLT_UQ(0x15),
1569             NLE_UQ(0x16),
1570             ORD_S(0x17),
1571             EQ_US(0x18),
1572             NGE_UQ(0x19),
1573             NGT_UQ(0x1a),
1574             FALSE_OS(0x1b),
1575             NEQ_OS(0x1c),
1576             GE_OQ(0x1d),
1577             GT_OQ(0x1e),
1578             TRUE_US(0x1f);
1579 
            private final int imm8;
1581 
1582             Predicate(int imm8) {
1583                 this.imm8 = imm8;
1584             }
1585 
1586             public static Predicate getPredicate(Condition condition, boolean unorderedIsTrue) {
1587                 if (unorderedIsTrue) {
1588                     switch (condition) {
1589                         case EQ:
1590                             return EQ_UQ;
1591                         case NE:
1592                             return NEQ_UQ;
1593                         case LT:
1594                             return NGE_UQ;
1595                         case LE:
1596                             return NGT_UQ;
1597                         case GT:
1598                             return NLE_UQ;
1599                         case GE:
1600                             return NLT_UQ;
1601                         default:
1602                             throw GraalError.shouldNotReachHere();
1603                     }
1604                 } else {
1605                     switch (condition) {
1606                         case EQ:
1607                             return EQ_OQ;
1608                         case NE:
1609                             return NEQ_OQ;
1610                         case LT:
1611                             return LT_OQ;
1612                         case LE:
1613                             return LE_OQ;
1614                         case GT:
1615                             return GT_OQ;
1616                         case GE:
1617                             return GE_OQ;
1618                         default:
1619                             throw GraalError.shouldNotReachHere();
1620                     }
1621                 }
1622             }
1623         }
1624 
1625         private VexFloatCompareOp(String opcode, int pp, int mmmmm, int w, int op) {
1626             super(opcode, pp, mmmmm, w, op, VEXOpAssertion.AVX1);
1627         }
1628 
1629         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, Register src2, Predicate p) {
1630             assert assertion.check((AMD64) asm.target.arch, size, dst, src1, src2);
1631             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w, false);
1632             asm.emitByte(op);
1633             asm.emitModRM(dst, src2);
1634             asm.emitByte(p.imm8);
1635         }
1636 
1637         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, AMD64Address src2, Predicate p) {
1638             assert assertion.check((AMD64) asm.target.arch, size, dst, src1, null);
1639             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w, false);
1640             asm.emitByte(op);
1641             asm.emitOperandHelper(dst, src2, 1);
1642             asm.emitByte(p.imm8);
1643         }
1644     }
1645 
1646     public final void addl(AMD64Address dst, int imm32) {
1647         ADD.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
1648     }
1649 
1650     public final void addl(Register dst, int imm32) {
1651         ADD.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
1652     }
1653 
1654     public final void addl(Register dst, Register src) {
1655         ADD.rmOp.emit(this, DWORD, dst, src);
1656     }
1657 
1658     public final void addpd(Register dst, Register src) {
1659         SSEOp.ADD.emit(this, PD, dst, src);
1660     }
1661 
1662     public final void addpd(Register dst, AMD64Address src) {
1663         SSEOp.ADD.emit(this, PD, dst, src);
1664     }
1665 
1666     public final void addsd(Register dst, Register src) {
1667         SSEOp.ADD.emit(this, SD, dst, src);
1668     }
1669 
1670     public final void addsd(Register dst, AMD64Address src) {
1671         SSEOp.ADD.emit(this, SD, dst, src);
1672     }
1673 
1674     private void addrNop4() {
1675         // 4 bytes: NOP DWORD PTR [EAX+0]
1676         emitByte(0x0F);
1677         emitByte(0x1F);
1678         emitByte(0x40); // emitRm(cbuf, 0x1, EAXEnc, EAXEnc);
        emitByte(0); // 8-bit offset (1 byte)
1680     }
1681 
1682     private void addrNop5() {
        // 5 bytes: NOP DWORD PTR [EAX+EAX*0+0] 8-bit offset
1684         emitByte(0x0F);
1685         emitByte(0x1F);
1686         emitByte(0x44); // emitRm(cbuf, 0x1, EAXEnc, 0x4);
1687         emitByte(0x00); // emitRm(cbuf, 0x0, EAXEnc, EAXEnc);
        emitByte(0); // 8-bit offset (1 byte)
1689     }
1690 
1691     private void addrNop7() {
        // 7 bytes: NOP DWORD PTR [EAX+0] 32-bit offset
1693         emitByte(0x0F);
1694         emitByte(0x1F);
1695         emitByte(0x80); // emitRm(cbuf, 0x2, EAXEnc, EAXEnc);
        emitInt(0); // 32-bit offset (4 bytes)
1697     }
1698 
1699     private void addrNop8() {
        // 8 bytes: NOP DWORD PTR [EAX+EAX*0+0] 32-bit offset
1701         emitByte(0x0F);
1702         emitByte(0x1F);
1703         emitByte(0x84); // emitRm(cbuf, 0x2, EAXEnc, 0x4);
1704         emitByte(0x00); // emitRm(cbuf, 0x0, EAXEnc, EAXEnc);
        emitInt(0); // 32-bit offset (4 bytes)
1706     }
1707 
1708     public final void andl(Register dst, int imm32) {
1709         AND.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
1710     }
1711 
1712     public final void andl(Register dst, Register src) {
1713         AND.rmOp.emit(this, DWORD, dst, src);
1714     }
1715 
1716     public final void andpd(Register dst, Register src) {
1717         SSEOp.AND.emit(this, PD, dst, src);
1718     }
1719 
1720     public final void andpd(Register dst, AMD64Address src) {
1721         SSEOp.AND.emit(this, PD, dst, src);
1722     }
1723 
1724     public final void bsfq(Register dst, Register src) {
1725         prefixq(dst, src);
1726         emitByte(0x0F);
1727         emitByte(0xBC);
1728         emitModRM(dst, src);
1729     }
1730 
1731     public final void bsrl(Register dst, Register src) {
1732         prefix(dst, src);
1733         emitByte(0x0F);
1734         emitByte(0xBD);
1735         emitModRM(dst, src);
1736     }
1737 
1738     public final void bswapl(Register reg) {
1739         prefix(reg);
1740         emitByte(0x0F);
1741         emitModRM(1, reg);
1742     }
1743 
1744     public final void cdql() {
1745         emitByte(0x99);
1746     }
1747 
1748     public final void cmovl(ConditionFlag cc, Register dst, Register src) {
1749         prefix(dst, src);
1750         emitByte(0x0F);
1751         emitByte(0x40 | cc.getValue());
1752         emitModRM(dst, src);
1753     }
1754 
1755     public final void cmovl(ConditionFlag cc, Register dst, AMD64Address src) {
1756         prefix(src, dst);
1757         emitByte(0x0F);
1758         emitByte(0x40 | cc.getValue());
1759         emitOperandHelper(dst, src, 0);
1760     }
1761 
1762     public final void cmpl(Register dst, int imm32) {
1763         CMP.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
1764     }
1765 
1766     public final void cmpl(Register dst, Register src) {
1767         CMP.rmOp.emit(this, DWORD, dst, src);
1768     }
1769 
1770     public final void cmpl(Register dst, AMD64Address src) {
1771         CMP.rmOp.emit(this, DWORD, dst, src);
1772     }
1773 
1774     public final void cmpl(AMD64Address dst, int imm32) {
1775         CMP.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
1776     }
1777 
1778     /**
     * The 8-bit cmpxchg compares the value at adr with the contents of al; if they are equal, reg
     * is stored into adr, otherwise the value at adr is loaded into al. The ZF flag is set if the
     * compared values were equal, and cleared otherwise.
1782      */
1783     public final void cmpxchgb(Register reg, AMD64Address adr) { // cmpxchg
1784         prefixb(adr, reg);
1785         emitByte(0x0F);
1786         emitByte(0xB0);
1787         emitOperandHelper(reg, adr, 0);
1788     }
1789 
1790     /**
     * The 16-bit cmpxchg compares the value at adr with the contents of ax; if they are equal, reg
     * is stored into adr, otherwise the value at adr is loaded into ax. The ZF flag is set if the
     * compared values were equal, and cleared otherwise.
1794      */
1795     public final void cmpxchgw(Register reg, AMD64Address adr) { // cmpxchg
1796         emitByte(0x66); // Switch to 16-bit mode.
1797         prefix(adr, reg);
1798         emitByte(0x0F);
1799         emitByte(0xB1);
1800         emitOperandHelper(reg, adr, 0);
1801     }
1802 
1803     /**
     * The 32-bit cmpxchg compares the value at adr with the contents of eax; if they are equal,
     * reg is stored into adr, otherwise the value at adr is loaded into eax. The ZF flag is set if
     * the compared values were equal, and cleared otherwise.
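     * <p>
     * A sketch of a typical atomic compare-and-swap built on this instruction, assuming
     * hypothetical {@code newValue} register and {@code address} operands, with the expected
     * value already loaded into eax:
     *
     * <pre>
     * lock();
     * cmpxchgl(newValue, address); // on success, ZF is set and address holds newValue
     * </pre>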
1807      */
1808     public final void cmpxchgl(Register reg, AMD64Address adr) { // cmpxchg
1809         prefix(adr, reg);
1810         emitByte(0x0F);
1811         emitByte(0xB1);
1812         emitOperandHelper(reg, adr, 0);
1813     }
1814 
1815     public final void cvtsi2sdl(Register dst, Register src) {
1816         SSEOp.CVTSI2SD.emit(this, DWORD, dst, src);
1817     }
1818 
1819     public final void cvttsd2sil(Register dst, Register src) {
1820         SSEOp.CVTTSD2SI.emit(this, DWORD, dst, src);
1821     }
1822 
1823     public final void decl(AMD64Address dst) {
1824         prefix(dst);
1825         emitByte(0xFF);
1826         emitOperandHelper(1, dst, 0);
1827     }
1828 
1829     public final void divsd(Register dst, Register src) {
1830         SSEOp.DIV.emit(this, SD, dst, src);
1831     }
1832 
1833     public final void hlt() {
1834         emitByte(0xF4);
1835     }
1836 
1837     public final void imull(Register dst, Register src, int value) {
1838         if (isByte(value)) {
1839             AMD64RMIOp.IMUL_SX.emit(this, DWORD, dst, src, value);
1840         } else {
1841             AMD64RMIOp.IMUL.emit(this, DWORD, dst, src, value);
1842         }
1843     }
1844 
1845     public final void incl(AMD64Address dst) {
1846         prefix(dst);
1847         emitByte(0xFF);
1848         emitOperandHelper(0, dst, 0);
1849     }
1850 
1851     public void jcc(ConditionFlag cc, int jumpTarget, boolean forceDisp32) {
1852         int shortSize = 2;
1853         int longSize = 6;
1854         long disp = jumpTarget - position();
1855         if (!forceDisp32 && isByte(disp - shortSize)) {
1856             // 0111 tttn #8-bit disp
1857             emitByte(0x70 | cc.getValue());
1858             emitByte((int) ((disp - shortSize) & 0xFF));
1859         } else {
1860             // 0000 1111 1000 tttn #32-bit disp
1861             assert isInt(disp - longSize) : "must be 32bit offset (call4)";
1862             emitByte(0x0F);
1863             emitByte(0x80 | cc.getValue());
1864             emitInt((int) (disp - longSize));
1865         }
1866     }
1867 
1868     public final void jcc(ConditionFlag cc, Label l) {
1869         assert (0 <= cc.getValue()) && (cc.getValue() < 16) : "illegal cc";
1870         if (l.isBound()) {
1871             jcc(cc, l.position(), false);
1872         } else {
            // Note: we could eliminate conditional jumps to this jump if the condition
            // is the same; however, that seems to be a rather unlikely case.
            // Note: use jccb() if the label to be bound is very close, to get
            // an 8-bit displacement.
1877             l.addPatchAt(position());
1878             emitByte(0x0F);
1879             emitByte(0x80 | cc.getValue());
1880             emitInt(0);
1881         }
1882 
1883     }
1884 
1885     public final void jccb(ConditionFlag cc, Label l) {
1886         if (l.isBound()) {
1887             int shortSize = 2;
1888             int entry = l.position();
            assert isByte(entry - (position() + shortSize)) : "Displacement too large for a short jmp";
1890             long disp = entry - position();
1891             // 0111 tttn #8-bit disp
1892             emitByte(0x70 | cc.getValue());
1893             emitByte((int) ((disp - shortSize) & 0xFF));
1894         } else {
1895             l.addPatchAt(position());
1896             emitByte(0x70 | cc.getValue());
1897             emitByte(0);
1898         }
1899     }
1900 
1901     public final void jmp(int jumpTarget, boolean forceDisp32) {
1902         int shortSize = 2;
1903         int longSize = 5;
1904         long disp = jumpTarget - position();
1905         if (!forceDisp32 && isByte(disp - shortSize)) {
1906             emitByte(0xEB);
1907             emitByte((int) ((disp - shortSize) & 0xFF));
1908         } else {
1909             emitByte(0xE9);
1910             emitInt((int) (disp - longSize));
1911         }
1912     }
1913 
1914     @Override
1915     public final void jmp(Label l) {
1916         if (l.isBound()) {
1917             jmp(l.position(), false);
1918         } else {
1919             // By default, forward jumps are always 32-bit displacements, since
1920             // we can't yet know where the label will be bound. If you're sure that
1921             // the forward jump will not run beyond 256 bytes, use jmpb to
1922             // force an 8-bit displacement.
1923 
1924             l.addPatchAt(position());
1925             emitByte(0xE9);
1926             emitInt(0);
1927         }
1928     }
1929 
1930     public final void jmp(Register entry) {
1931         prefix(entry);
1932         emitByte(0xFF);
1933         emitModRM(4, entry);
1934     }
1935 
1936     public final void jmp(AMD64Address adr) {
1937         prefix(adr);
1938         emitByte(0xFF);
1939         emitOperandHelper(AMD64.rsp, adr, 0);
1940     }
1941 
1942     public final void jmpb(Label l) {
1943         if (l.isBound()) {
1944             int shortSize = 2;
1945             int entry = l.position();
            assert isByte((entry - position()) + shortSize) : "Displacement too large for a short jmp";
1947             long offs = entry - position();
1948             emitByte(0xEB);
1949             emitByte((int) ((offs - shortSize) & 0xFF));
1950         } else {
1951 
1952             l.addPatchAt(position());
1953             emitByte(0xEB);
1954             emitByte(0);
1955         }
1956     }
1957 
1958     public final void lead(Register dst, AMD64Address src) {
1959         prefix(src, dst);
1960         emitByte(0x8D);
1961         emitOperandHelper(dst, src, 0);
1962     }
1963 
1964     public final void leaq(Register dst, AMD64Address src) {
1965         prefixq(src, dst);
1966         emitByte(0x8D);
1967         emitOperandHelper(dst, src, 0);
1968     }
1969 
1970     public final void leave() {
1971         emitByte(0xC9);
1972     }
1973 
1974     public final void lock() {
1975         emitByte(0xF0);
1976     }
1977 
1978     public final void movapd(Register dst, Register src) {
1979         assert inRC(XMM, dst) && inRC(XMM, src);
1980         simdPrefix(dst, Register.None, src, PD, P_0F, false);
1981         emitByte(0x28);
1982         emitModRM(dst, src);
1983     }
1984 
1985     public final void movaps(Register dst, Register src) {
1986         assert inRC(XMM, dst) && inRC(XMM, src);
1987         simdPrefix(dst, Register.None, src, PS, P_0F, false);
1988         emitByte(0x28);
1989         emitModRM(dst, src);
1990     }
1991 
1992     public final void movb(AMD64Address dst, int imm8) {
1993         prefix(dst);
1994         emitByte(0xC6);
1995         emitOperandHelper(0, dst, 1);
1996         emitByte(imm8);
1997     }
1998 
1999     public final void movb(AMD64Address dst, Register src) {
2000         assert inRC(CPU, src) : "must have byte register";
2001         prefixb(dst, src);
2002         emitByte(0x88);
2003         emitOperandHelper(src, dst, 0);
2004     }
2005 
2006     public final void movl(Register dst, int imm32) {
2007         movl(dst, imm32, false);
2008     }
2009 
2010     public final void movl(Register dst, int imm32, boolean annotateImm) {
2011         int insnPos = position();
2012         prefix(dst);
2013         emitByte(0xB8 + encode(dst));
2014         int immPos = position();
2015         emitInt(imm32);
2016         int nextInsnPos = position();
2017         if (annotateImm && codePatchingAnnotationConsumer != null) {
2018             codePatchingAnnotationConsumer.accept(new ImmediateOperandAnnotation(insnPos, immPos, nextInsnPos - immPos, nextInsnPos));
2019         }
2020     }
2021 
2022     public final void movl(Register dst, Register src) {
2023         prefix(dst, src);
2024         emitByte(0x8B);
2025         emitModRM(dst, src);
2026     }
2027 
2028     public final void movl(Register dst, AMD64Address src) {
2029         prefix(src, dst);
2030         emitByte(0x8B);
2031         emitOperandHelper(dst, src, 0);
2032     }
2033 
2034     /**
     * @param wide use a 4-byte encoding for displacements that would normally fit in a byte
2036      */
2037     public final void movl(Register dst, AMD64Address src, boolean wide) {
2038         prefix(src, dst);
2039         emitByte(0x8B);
2040         emitOperandHelper(dst, src, wide, 0);
2041     }
2042 
2043     public final void movl(AMD64Address dst, int imm32) {
2044         prefix(dst);
2045         emitByte(0xC7);
2046         emitOperandHelper(0, dst, 4);
2047         emitInt(imm32);
2048     }
2049 
2050     public final void movl(AMD64Address dst, Register src) {
2051         prefix(dst, src);
2052         emitByte(0x89);
2053         emitOperandHelper(src, dst, 0);
2054     }
2055 
2056     /**
     * Newer CPUs require the use of movsd and movss to avoid a partial register stall when loading
     * from memory. On old Opterons, however, movlpd is used instead of movsd. The selection is done in
2059      * {@link AMD64MacroAssembler#movdbl(Register, AMD64Address)} and
2060      * {@link AMD64MacroAssembler#movflt(Register, Register)}.
2061      */
2062     public final void movlpd(Register dst, AMD64Address src) {
2063         assert inRC(XMM, dst);
2064         simdPrefix(dst, dst, src, PD, P_0F, false);
2065         emitByte(0x12);
2066         emitOperandHelper(dst, src, 0);
2067     }
2068 
2069     public final void movlhps(Register dst, Register src) {
2070         assert inRC(XMM, dst) && inRC(XMM, src);
2071         simdPrefix(dst, src, src, PS, P_0F, false);
2072         emitByte(0x16);
2073         emitModRM(dst, src);
2074     }
2075 
2076     public final void movq(Register dst, AMD64Address src) {
2077         movq(dst, src, false);
2078     }
2079 
2080     public final void movq(Register dst, AMD64Address src, boolean force4BytesDisplacement) {
2081         if (inRC(XMM, dst)) {
2082             // Insn: MOVQ xmm, r/m64
2083             // Code: F3 0F 7E /r
            // An alternative instruction would be 66 REX.W 0F 6E /r. We prefer the REX.W-free
            // format because it allows us to emit the 2-byte-prefixed VEX encoding of the
            // instruction when applicable.
2087             simdPrefix(dst, Register.None, src, SS, P_0F, false);
2088             emitByte(0x7E);
2089             emitOperandHelper(dst, src, force4BytesDisplacement, 0);
2090         } else {
2091             // gpr version of movq
2092             prefixq(src, dst);
2093             emitByte(0x8B);
2094             emitOperandHelper(dst, src, force4BytesDisplacement, 0);
2095         }
2096     }
2097 
2098     public final void movq(Register dst, Register src) {
2099         assert inRC(CPU, dst) && inRC(CPU, src);
2100         prefixq(dst, src);
2101         emitByte(0x8B);
2102         emitModRM(dst, src);
2103     }
2104 
2105     public final void movq(AMD64Address dst, Register src) {
2106         if (inRC(XMM, src)) {
2107             // Insn: MOVQ r/m64, xmm
2108             // Code: 66 0F D6 /r
            // An alternative instruction would be 66 REX.W 0F 7E /r. We prefer the REX.W-free
            // format because it allows us to emit the 2-byte-prefixed VEX encoding of the
            // instruction when applicable.
2112             simdPrefix(src, Register.None, dst, PD, P_0F, false);
2113             emitByte(0xD6);
2114             emitOperandHelper(src, dst, 0);
2115         } else {
2116             // gpr version of movq
2117             prefixq(dst, src);
2118             emitByte(0x89);
2119             emitOperandHelper(src, dst, 0);
2120         }
2121     }
2122 
2123     public final void movsbl(Register dst, AMD64Address src) {
2124         prefix(src, dst);
2125         emitByte(0x0F);
2126         emitByte(0xBE);
2127         emitOperandHelper(dst, src, 0);
2128     }
2129 
2130     public final void movsbl(Register dst, Register src) {
2131         prefix(dst, false, src, true);
2132         emitByte(0x0F);
2133         emitByte(0xBE);
2134         emitModRM(dst, src);
2135     }
2136 
2137     public final void movsbq(Register dst, AMD64Address src) {
2138         prefixq(src, dst);
2139         emitByte(0x0F);
2140         emitByte(0xBE);
2141         emitOperandHelper(dst, src, 0);
2142     }
2143 
2144     public final void movsbq(Register dst, Register src) {
2145         prefixq(dst, src);
2146         emitByte(0x0F);
2147         emitByte(0xBE);
2148         emitModRM(dst, src);
2149     }
2150 
2151     public final void movsd(Register dst, Register src) {
2152         AMD64RMOp.MOVSD.emit(this, SD, dst, src);
2153     }
2154 
2155     public final void movsd(Register dst, AMD64Address src) {
2156         AMD64RMOp.MOVSD.emit(this, SD, dst, src);
2157     }
2158 
2159     public final void movsd(AMD64Address dst, Register src) {
2160         AMD64MROp.MOVSD.emit(this, SD, dst, src);
2161     }
2162 
2163     public final void movss(Register dst, Register src) {
2164         AMD64RMOp.MOVSS.emit(this, SS, dst, src);
2165     }
2166 
2167     public final void movss(Register dst, AMD64Address src) {
2168         AMD64RMOp.MOVSS.emit(this, SS, dst, src);
2169     }
2170 
2171     public final void movss(AMD64Address dst, Register src) {
2172         AMD64MROp.MOVSS.emit(this, SS, dst, src);
2173     }
2174 
2175     public final void mulpd(Register dst, Register src) {
2176         SSEOp.MUL.emit(this, PD, dst, src);
2177     }
2178 
2179     public final void mulpd(Register dst, AMD64Address src) {
2180         SSEOp.MUL.emit(this, PD, dst, src);
2181     }
2182 
2183     public final void mulsd(Register dst, Register src) {
2184         SSEOp.MUL.emit(this, SD, dst, src);
2185     }
2186 
2187     public final void mulsd(Register dst, AMD64Address src) {
2188         SSEOp.MUL.emit(this, SD, dst, src);
2189     }
2190 
2191     public final void mulss(Register dst, Register src) {
2192         SSEOp.MUL.emit(this, SS, dst, src);
2193     }
2194 
2195     public final void movswl(Register dst, AMD64Address src) {
2196         prefix(src, dst);
2197         emitByte(0x0F);
2198         emitByte(0xBF);
2199         emitOperandHelper(dst, src, 0);
2200     }
2201 
2202     public final void movw(AMD64Address dst, int imm16) {
2203         emitByte(0x66); // switch to 16-bit mode
2204         prefix(dst);
2205         emitByte(0xC7);
2206         emitOperandHelper(0, dst, 2);
2207         emitShort(imm16);
2208     }
2209 
2210     public final void movw(AMD64Address dst, Register src) {
2211         emitByte(0x66);
2212         prefix(dst, src);
2213         emitByte(0x89);
2214         emitOperandHelper(src, dst, 0);
2215     }
2216 
2217     public final void movzbl(Register dst, AMD64Address src) {
2218         prefix(src, dst);
2219         emitByte(0x0F);
2220         emitByte(0xB6);
2221         emitOperandHelper(dst, src, 0);
2222     }
2223 
2224     public final void movzbl(Register dst, Register src) {
2225         AMD64RMOp.MOVZXB.emit(this, DWORD, dst, src);
2226     }
2227 
2228     public final void movzbq(Register dst, Register src) {
2229         AMD64RMOp.MOVZXB.emit(this, QWORD, dst, src);
2230     }
2231 
2232     public final void movzwl(Register dst, AMD64Address src) {
2233         prefix(src, dst);
2234         emitByte(0x0F);
2235         emitByte(0xB7);
2236         emitOperandHelper(dst, src, 0);
2237     }
2238 
2239     public final void negl(Register dst) {
2240         NEG.emit(this, DWORD, dst);
2241     }
2242 
2243     public final void notl(Register dst) {
2244         NOT.emit(this, DWORD, dst);
2245     }
2246 
2247     public final void notq(Register dst) {
2248         NOT.emit(this, QWORD, dst);
2249     }
2250 
2251     @Override
2252     public final void ensureUniquePC() {
2253         nop();
2254     }
2255 
2256     public final void nop() {
2257         nop(1);
2258     }
2259 
2260     public void nop(int count) {
2261         int i = count;
2262         if (UseNormalNop) {
2263             assert i > 0 : " ";
            // The fancy nops aren't currently recognized by debuggers, making it a
            // pain to disassemble code while debugging. If asserts are on, speed is
            // clearly not an issue, so simply use the traditional single-byte nop
            // for alignment.
2268 
2269             for (; i > 0; i--) {
2270                 emitByte(0x90);
2271             }
2272             return;
2273         }
2274 
2275         if (UseAddressNop) {
2276             //
            // Using multi-byte nops "0x0F 0x1F [address]" for AMD.
2278             // 1: 0x90
2279             // 2: 0x66 0x90
2280             // 3: 0x66 0x66 0x90 (don't use "0x0F 0x1F 0x00" - need patching safe padding)
2281             // 4: 0x0F 0x1F 0x40 0x00
2282             // 5: 0x0F 0x1F 0x44 0x00 0x00
2283             // 6: 0x66 0x0F 0x1F 0x44 0x00 0x00
2284             // 7: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
2285             // 8: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
2286             // 9: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
2287             // 10: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
2288             // 11: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
2289 
            // The rest of the encoding is AMD-specific - use consecutive address nops
2291 
2292             // 12: 0x66 0x0F 0x1F 0x44 0x00 0x00 0x66 0x0F 0x1F 0x44 0x00 0x00
2293             // 13: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00 0x66 0x0F 0x1F 0x44 0x00 0x00
2294             // 14: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
2295             // 15: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
2296             // 16: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
2297             // Size prefixes (0x66) are added for larger sizes
2298 
2299             while (i >= 22) {
2300                 i -= 11;
2301                 emitByte(0x66); // size prefix
2302                 emitByte(0x66); // size prefix
2303                 emitByte(0x66); // size prefix
2304                 addrNop8();
2305             }
            // Generate the first nop for sizes between 12 and 21
2307             switch (i) {
2308                 case 21:
2309                     i -= 11;
2310                     emitByte(0x66); // size prefix
2311                     emitByte(0x66); // size prefix
2312                     emitByte(0x66); // size prefix
2313                     addrNop8();
2314                     break;
2315                 case 20:
2316                 case 19:
2317                     i -= 10;
2318                     emitByte(0x66); // size prefix
2319                     emitByte(0x66); // size prefix
2320                     addrNop8();
2321                     break;
2322                 case 18:
2323                 case 17:
2324                     i -= 9;
2325                     emitByte(0x66); // size prefix
2326                     addrNop8();
2327                     break;
2328                 case 16:
2329                 case 15:
2330                     i -= 8;
2331                     addrNop8();
2332                     break;
2333                 case 14:
2334                 case 13:
2335                     i -= 7;
2336                     addrNop7();
2337                     break;
2338                 case 12:
2339                     i -= 6;
2340                     emitByte(0x66); // size prefix
2341                     addrNop5();
2342                     break;
2343                 default:
2344                     assert i < 12;
2345             }
2346 
2347             // Generate the second nop for sizes between 1 and 11
2348             switch (i) {
2349                 case 11:
2350                     emitByte(0x66); // size prefix
2351                     emitByte(0x66); // size prefix
2352                     emitByte(0x66); // size prefix
2353                     addrNop8();
2354                     break;
2355                 case 10:
2356                     emitByte(0x66); // size prefix
2357                     emitByte(0x66); // size prefix
2358                     addrNop8();
2359                     break;
2360                 case 9:
2361                     emitByte(0x66); // size prefix
2362                     addrNop8();
2363                     break;
2364                 case 8:
2365                     addrNop8();
2366                     break;
2367                 case 7:
2368                     addrNop7();
2369                     break;
2370                 case 6:
2371                     emitByte(0x66); // size prefix
2372                     addrNop5();
2373                     break;
2374                 case 5:
2375                     addrNop5();
2376                     break;
2377                 case 4:
2378                     addrNop4();
2379                     break;
2380                 case 3:
2381                     // Don't use "0x0F 0x1F 0x00" - patching-safe padding is needed
2382                     emitByte(0x66); // size prefix
2383                     emitByte(0x66); // size prefix
2384                     emitByte(0x90); // nop
2385                     break;
2386                 case 2:
2387                     emitByte(0x66); // size prefix
2388                     emitByte(0x90); // nop
2389                     break;
2390                 case 1:
2391                     emitByte(0x90); // nop
2392                     break;
2393                 default:
2394                     assert i == 0;
2395             }
2396             return;
2397         }
2398 
2399         // Fall back to nops with size prefixes ("0x66 0x90").
2400         // From the AMD Optimization Guide:
2401         // 1: 0x90
2402         // 2: 0x66 0x90
2403         // 3: 0x66 0x66 0x90
2404         // 4: 0x66 0x66 0x66 0x90
2405         // 5: 0x66 0x66 0x90 0x66 0x90
2406         // 6: 0x66 0x66 0x90 0x66 0x66 0x90
2407         // 7: 0x66 0x66 0x66 0x90 0x66 0x66 0x90
2408         // 8: 0x66 0x66 0x66 0x90 0x66 0x66 0x66 0x90
2409         // 9: 0x66 0x66 0x90 0x66 0x66 0x90 0x66 0x66 0x90
2410         // 10: 0x66 0x66 0x66 0x90 0x66 0x66 0x90 0x66 0x66 0x90
2411         //
2412         while (i > 12) {
2413             i -= 4;
2414             emitByte(0x66); // size prefix
2415             emitByte(0x66);
2416             emitByte(0x66);
2417             emitByte(0x90); // nop
2418         }
2419         // 1 - 12 nops
2420         if (i > 8) {
2421             if (i > 9) {
2422                 i -= 1;
2423                 emitByte(0x66);
2424             }
2425             i -= 3;
2426             emitByte(0x66);
2427             emitByte(0x66);
2428             emitByte(0x90);
2429         }
2430         // 1 - 8 nops
2431         if (i > 4) {
2432             if (i > 6) {
2433                 i -= 1;
2434                 emitByte(0x66);
2435             }
2436             i -= 3;
2437             emitByte(0x66);
2438             emitByte(0x66);
2439             emitByte(0x90);
2440         }
2441         switch (i) {
2442             case 4:
2443                 emitByte(0x66);
2444                 emitByte(0x66);
2445                 emitByte(0x66);
2446                 emitByte(0x90);
2447                 break;
2448             case 3:
2449                 emitByte(0x66);
2450                 emitByte(0x66);
2451                 emitByte(0x90);
2452                 break;
2453             case 2:
2454                 emitByte(0x66);
2455                 emitByte(0x90);
2456                 break;
2457             case 1:
2458                 emitByte(0x90);
2459                 break;
2460             default:
2461                 assert i == 0;
2462         }
2463     }
2464 
2465     public final void orl(Register dst, Register src) {
2466         OR.rmOp.emit(this, DWORD, dst, src);
2467     }
2468 
2469     public final void orl(Register dst, int imm32) {
2470         OR.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
2471     }
2472 
2473     // Insn: VPACKUSWB xmm1, xmm2, xmm3/m128
2474     // -----
2475     // Insn: VPACKUSWB xmm1, xmm1, xmm2
2476 
2477     public final void packuswb(Register dst, Register src) {
2478         assert inRC(XMM, dst) && inRC(XMM, src);
2479         // Code: VEX.NDS.128.66.0F.WIG 67 /r
2480         simdPrefix(dst, dst, src, PD, P_0F, false);
2481         emitByte(0x67);
2482         emitModRM(dst, src);
2483     }
2484 
2485     public final void pop(Register dst) {
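             // POP r64: 58 +rd (operand size defaults to 64-bit in long mode)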
2486         prefix(dst);
2487         emitByte(0x58 + encode(dst));
2488     }
2489 
2490     public void popfq() {
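             // POPFQ: 9D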
2491         emitByte(0x9D);
2492     }
2493 
2494     public final void ptest(Register dst, Register src) {
2495         assert supports(CPUFeature.SSE4_1);
2496         assert inRC(XMM, dst) && inRC(XMM, src);
2497         simdPrefix(dst, Register.None, src, PD, P_0F38, false);
2498         emitByte(0x17);
2499         emitModRM(dst, src);
2500     }
2501 
2502     public final void pcmpeqb(Register dst, Register src) {
2503         assert supports(CPUFeature.SSE2);
2504         assert inRC(XMM, dst) && inRC(XMM, src);
2505         simdPrefix(dst, dst, src, PD, P_0F, false);
2506         emitByte(0x74);
2507         emitModRM(dst, src);
2508     }
2509 
2510     public final void pcmpeqw(Register dst, Register src) {
2511         assert supports(CPUFeature.SSE2);
2512         assert inRC(XMM, dst) && inRC(XMM, src);
2513         simdPrefix(dst, dst, src, PD, P_0F, false);
2514         emitByte(0x75);
2515         emitModRM(dst, src);
2516     }
2517 
2518     public final void pcmpeqd(Register dst, Register src) {
2519         assert supports(CPUFeature.SSE2);
2520         assert inRC(XMM, dst) && inRC(XMM, src);
2521         simdPrefix(dst, dst, src, PD, P_0F, false);
2522         emitByte(0x76);
2523         emitModRM(dst, src);
2524     }
2525 
2526     public final void pcmpestri(Register dst, AMD64Address src, int imm8) {
2527         assert supports(CPUFeature.SSE4_2);
2528         assert inRC(XMM, dst);
2529         simdPrefix(dst, Register.None, src, PD, P_0F3A, false);
2530         emitByte(0x61);
2531         emitOperandHelper(dst, src, 0);
2532         emitByte(imm8);
2533     }
2534 
2535     public final void pcmpestri(Register dst, Register src, int imm8) {
2536         assert supports(CPUFeature.SSE4_2);
2537         assert inRC(XMM, dst) && inRC(XMM, src);
2538         simdPrefix(dst, Register.None, src, PD, P_0F3A, false);
2539         emitByte(0x61);
2540         emitModRM(dst, src);
2541         emitByte(imm8);
2542     }
2543 
2544     public final void pmovmskb(Register dst, Register src) {
2545         assert supports(CPUFeature.SSE2);
2546         assert inRC(CPU, dst) && inRC(XMM, src);
2547         simdPrefix(dst, Register.None, src, PD, P_0F, false);
2548         emitByte(0xD7);
2549         emitModRM(dst, src);
2550     }
2551 
2552     // Insn: VPMOVZXBW xmm1, xmm2/m64
2553 
2554     public final void pmovzxbw(Register dst, AMD64Address src) {
2555         assert supports(CPUFeature.SSE4_1);
2556         assert inRC(XMM, dst);
2557         simdPrefix(dst, Register.None, src, PD, P_0F38, false);
2558         emitByte(0x30);
2559         emitOperandHelper(dst, src, 0);
2560     }
2561 
2562     public final void pmovzxbw(Register dst, Register src) {
2563         assert supports(CPUFeature.SSE4_1);
2564         assert inRC(XMM, dst) && inRC(XMM, src);
2565         simdPrefix(dst, Register.None, src, PD, P_0F38, false);
2566         emitByte(0x30);
2567         emitModRM(dst, src);
2568     }
2569 
2570     public final void push(Register src) {
2571         prefix(src);
2572         emitByte(0x50 + encode(src));
2573     }
2574 
2575     public void pushfq() {
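             // PUSHFQ: 9C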
2576         emitByte(0x9C);
2577     }
2578 
2579     public final void paddd(Register dst, Register src) {
2580         assert inRC(XMM, dst) && inRC(XMM, src);
2581         simdPrefix(dst, dst, src, PD, P_0F, false);
2582         emitByte(0xFE);
2583         emitModRM(dst, src);
2584     }
2585 
2586     public final void paddq(Register dst, Register src) {
2587         assert inRC(XMM, dst) && inRC(XMM, src);
2588         simdPrefix(dst, dst, src, PD, P_0F, false);
2589         emitByte(0xD4);
2590         emitModRM(dst, src);
2591     }
2592 
2593     public final void pextrw(Register dst, Register src, int imm8) {
2594         assert inRC(CPU, dst) && inRC(XMM, src);
2595         simdPrefix(dst, Register.None, src, PD, P_0F, false);
2596         emitByte(0xC5);
2597         emitModRM(dst, src);
2598         emitByte(imm8);
2599     }
2600 
2601     public final void pinsrw(Register dst, Register src, int imm8) {
2602         assert inRC(XMM, dst) && inRC(CPU, src);
2603         simdPrefix(dst, dst, src, PD, P_0F, false);
2604         emitByte(0xC4);
2605         emitModRM(dst, src);
2606         emitByte(imm8);
2607     }
2608 
2609     public final void por(Register dst, Register src) {
2610         assert inRC(XMM, dst) && inRC(XMM, src);
2611         simdPrefix(dst, dst, src, PD, P_0F, false);
2612         emitByte(0xEB);
2613         emitModRM(dst, src);
2614     }
2615 
2616     public final void pand(Register dst, Register src) {
2617         assert inRC(XMM, dst) && inRC(XMM, src);
2618         simdPrefix(dst, dst, src, PD, P_0F, false);
2619         emitByte(0xDB);
2620         emitModRM(dst, src);
2621     }
2622 
2623     public final void pxor(Register dst, Register src) {
2624         assert inRC(XMM, dst) && inRC(XMM, src);
2625         simdPrefix(dst, dst, src, PD, P_0F, false);
2626         emitByte(0xEF);
2627         emitModRM(dst, src);
2628     }
2629 
2630     public final void pslld(Register dst, int imm8) {
2631         assert isUByte(imm8) : "invalid value";
2632         assert inRC(XMM, dst);
2633         // XMM6 is for /6 encoding: 66 0F 72 /6 ib
2634         simdPrefix(AMD64.xmm6, dst, dst, PD, P_0F, false);
2635         emitByte(0x72);
2636         emitModRM(6, dst);
2637         emitByte(imm8 & 0xFF);
2638     }
2639 
2640     public final void psllq(Register dst, Register shift) {
2641         assert inRC(XMM, dst) && inRC(XMM, shift);
2642         simdPrefix(dst, dst, shift, PD, P_0F, false);
2643         emitByte(0xF3);
2644         emitModRM(dst, shift);
2645     }
2646 
2647     public final void psllq(Register dst, int imm8) {
2648         assert isUByte(imm8) : "invalid value";
2649         assert inRC(XMM, dst);
2650         // XMM6 is for /6 encoding: 66 0F 73 /6 ib
2651         simdPrefix(AMD64.xmm6, dst, dst, PD, P_0F, false);
2652         emitByte(0x73);
2653         emitModRM(6, dst);
2654         emitByte(imm8);
2655     }
2656 
2657     public final void psrad(Register dst, int imm8) {
2658         assert isUByte(imm8) : "invalid value";
2659         assert inRC(XMM, dst);
2660         // XMM4 is for /4 encoding: 66 0F 72 /4 ib
2661         simdPrefix(AMD64.xmm4, dst, dst, PD, P_0F, false);
2662         emitByte(0x72);
2663         emitModRM(4, dst);
2664         emitByte(imm8);
2665     }
2666 
2667     public final void psrld(Register dst, int imm8) {
2668         assert isUByte(imm8) : "invalid value";
2669         assert inRC(XMM, dst);
2670         // XMM2 is for /2 encoding: 66 0F 72 /2 ib
2671         simdPrefix(AMD64.xmm2, dst, dst, PD, P_0F, false);
2672         emitByte(0x72);
2673         emitModRM(2, dst);
2674         emitByte(imm8);
2675     }
2676 
2677     public final void psrlq(Register dst, int imm8) {
2678         assert isUByte(imm8) : "invalid value";
2679         assert inRC(XMM, dst);
2680         // XMM2 is for /2 encoding: 66 0F 73 /2 ib
2681         simdPrefix(AMD64.xmm2, dst, dst, PD, P_0F, false);
2682         emitByte(0x73);
2683         emitModRM(2, dst);
2684         emitByte(imm8);
2685     }
2686 
2687     public final void psrldq(Register dst, int imm8) {
2688         assert isUByte(imm8) : "invalid value";
2689         assert inRC(XMM, dst);
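             // XMM3 is for /3 encoding: 66 0F 73 /3 ib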
2690         simdPrefix(AMD64.xmm3, dst, dst, PD, P_0F, false);
2691         emitByte(0x73);
2692         emitModRM(3, dst);
2693         emitByte(imm8);
2694     }
2695 
2696     public final void pshufb(Register dst, Register src) {
2697         assert supports(CPUFeature.SSSE3);
2698         assert inRC(XMM, dst) && inRC(XMM, src);
2699         simdPrefix(dst, dst, src, PD, P_0F38, false);
2700         emitByte(0x00);
2701         emitModRM(dst, src);
2702     }
2703 
2704     public final void pshuflw(Register dst, Register src, int imm8) {
2705         assert supports(CPUFeature.SSE2);
2706         assert isUByte(imm8) : "invalid value";
2707         assert inRC(XMM, dst) && inRC(XMM, src);
2708         simdPrefix(dst, Register.None, src, SD, P_0F, false);
2709         emitByte(0x70);
2710         emitModRM(dst, src);
2711         emitByte(imm8);
2712     }
2713 
2714     public final void pshufd(Register dst, Register src, int imm8) {
2715         assert isUByte(imm8) : "invalid value";
2716         assert inRC(XMM, dst) && inRC(XMM, src);
2717         simdPrefix(dst, Register.None, src, PD, P_0F, false);
2718         emitByte(0x70);
2719         emitModRM(dst, src);
2720         emitByte(imm8);
2721     }
2722 
2723     public final void psubd(Register dst, Register src) {
2724         assert inRC(XMM, dst) && inRC(XMM, src);
2725         simdPrefix(dst, dst, src, PD, P_0F, false);
2726         emitByte(0xFA);
2727         emitModRM(dst, src);
2728     }
2729 
2730     public final void punpcklbw(Register dst, Register src) {
2731         assert supports(CPUFeature.SSE2);
2732         assert inRC(XMM, dst) && inRC(XMM, src);
2733         simdPrefix(dst, dst, src, PD, P_0F, false);
2734         emitByte(0x60);
2735         emitModRM(dst, src);
2736     }
2737 
2738     public final void rcpps(Register dst, Register src) {
2739         assert inRC(XMM, dst) && inRC(XMM, src);
2740         simdPrefix(dst, Register.None, src, PS, P_0F, false);
2741         emitByte(0x53);
2742         emitModRM(dst, src);
2743     }
2744 
2745     public final void ret(int imm16) {
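             // C3 (near RET) or C2 iw (near RET, then pop imm16 bytes)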
2746         if (imm16 == 0) {
2747             emitByte(0xC3);
2748         } else {
2749             emitByte(0xC2);
2750             emitShort(imm16);
2751         }
2752     }
2753 
2754     public final void sarl(Register dst, int imm8) {
2755         assert isShiftCount(imm8 >> 1) : "illegal shift count";
2756         prefix(dst);
2757         if (imm8 == 1) {
2758             emitByte(0xD1);
2759             emitModRM(7, dst);
2760         } else {
2761             emitByte(0xC1);
2762             emitModRM(7, dst);
2763             emitByte(imm8);
2764         }
2765     }
2766 
2767     public final void shll(Register dst, int imm8) {
2768         assert isShiftCount(imm8 >> 1) : "illegal shift count";
2769         prefix(dst);
2770         if (imm8 == 1) {
2771             emitByte(0xD1);
2772             emitModRM(4, dst);
2773         } else {
2774             emitByte(0xC1);
2775             emitModRM(4, dst);
2776             emitByte(imm8);
2777         }
2778     }
2779 
2780     public final void shll(Register dst) {
2781         // Multiply dst by 2, CL times.
2782         prefix(dst);
2783         emitByte(0xD3);
2784         emitModRM(4, dst);
2785     }
2786 
2787     // Insn: SHLX r32a, r/m32, r32b
2788 
2789     public final void shlxl(Register dst, Register src1, Register src2) {
2790         VexGeneralPurposeRMVOp.SHLX.emit(this, AVXSize.DWORD, dst, src1, src2);
2791     }
2792 
2793     public final void shrl(Register dst, int imm8) {
2794         assert isShiftCount(imm8 >> 1) : "illegal shift count";
2795         prefix(dst);
2796         emitByte(0xC1);
2797         emitModRM(5, dst);
2798         emitByte(imm8);
2799     }
2800 
2801     public final void shrl(Register dst) {
2802         // Unsigned divide dst by 2, CL times.
2803         prefix(dst);
2804         emitByte(0xD3);
2805         emitModRM(5, dst);
2806     }
2807 
2808     public final void subl(AMD64Address dst, int imm32) {
2809         SUB.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
2810     }
2811 
2812     public final void subl(Register dst, int imm32) {
2813         SUB.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
2814     }
2815 
2816     public final void subl(Register dst, Register src) {
2817         SUB.rmOp.emit(this, DWORD, dst, src);
2818     }
2819 
2820     public final void subpd(Register dst, Register src) {
2821         SSEOp.SUB.emit(this, PD, dst, src);
2822     }
2823 
2824     public final void subsd(Register dst, Register src) {
2825         SSEOp.SUB.emit(this, SD, dst, src);
2826     }
2827 
2828     public final void subsd(Register dst, AMD64Address src) {
2829         SSEOp.SUB.emit(this, SD, dst, src);
2830     }
2831 
2832     public final void testl(Register dst, int imm32) {
2833         // not using emitArith because TEST
2834         // doesn't support sign-extension of
2835         // 8-bit operands
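             // rax (encoding 0) can use the short form A9 id (TEST EAX, imm32)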
2836         if (dst.encoding == 0) {
2837             emitByte(0xA9);
2838         } else {
2839             prefix(dst);
2840             emitByte(0xF7);
2841             emitModRM(0, dst);
2842         }
2843         emitInt(imm32);
2844     }
2845 
2846     public final void testl(Register dst, Register src) {
2847         prefix(dst, src);
2848         emitByte(0x85);
2849         emitModRM(dst, src);
2850     }
2851 
2852     public final void testl(Register dst, AMD64Address src) {
2853         prefix(src, dst);
2854         emitByte(0x85);
2855         emitOperandHelper(dst, src, 0);
2856     }
2857 
2858     public final void unpckhpd(Register dst, Register src) {
2859         assert inRC(XMM, dst) && inRC(XMM, src);
2860         simdPrefix(dst, dst, src, PD, P_0F, false);
2861         emitByte(0x15);
2862         emitModRM(dst, src);
2863     }
2864 
2865     public final void unpcklpd(Register dst, Register src) {
2866         assert inRC(XMM, dst) && inRC(XMM, src);
2867         simdPrefix(dst, dst, src, PD, P_0F, false);
2868         emitByte(0x14);
2869         emitModRM(dst, src);
2870     }
2871 
2872     public final void xorl(Register dst, Register src) {
2873         XOR.rmOp.emit(this, DWORD, dst, src);
2874     }
2875 
2876     public final void xorpd(Register dst, Register src) {
2877         SSEOp.XOR.emit(this, PD, dst, src);
2878     }
2879 
2880     public final void xorps(Register dst, Register src) {
2881         SSEOp.XOR.emit(this, PS, dst, src);
2882     }
2883 
2884     protected final void decl(Register dst) {
2885         // Use two-byte form (one-byte form is a REX prefix in 64-bit mode)
2886         prefix(dst);
2887         emitByte(0xFF);
2888         emitModRM(1, dst);
2889     }
2890 
2891     protected final void incl(Register dst) {
2892         // Use two-byte form (one-byte form is a REX prefix in 64-bit mode)
2893         prefix(dst);
2894         emitByte(0xFF);
2895         emitModRM(0, dst);
2896     }
2897 
2898     public final void addq(Register dst, int imm32) {
2899         ADD.getMIOpcode(QWORD, isByte(imm32)).emit(this, QWORD, dst, imm32);
2900     }
2901 
2902     public final void addq(AMD64Address dst, int imm32) {
2903         ADD.getMIOpcode(QWORD, isByte(imm32)).emit(this, QWORD, dst, imm32);
2904     }
2905 
2906     public final void addq(Register dst, Register src) {
2907         ADD.rmOp.emit(this, QWORD, dst, src);
2908     }
2909 
2910     public final void addq(AMD64Address dst, Register src) {
2911         ADD.mrOp.emit(this, QWORD, dst, src);
2912     }
2913 
2914     public final void andq(Register dst, int imm32) {
2915         AND.getMIOpcode(QWORD, isByte(imm32)).emit(this, QWORD, dst, imm32);
2916     }
2917 
2918     public final void bsrq(Register dst, Register src) {
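             // BSR r64, r/m64: REX.W 0F BD /r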
2919         prefixq(dst, src);
2920         emitByte(0x0F);
2921         emitByte(0xBD);
2922         emitModRM(dst, src);
2923     }
2924 
2925     public final void bswapq(Register reg) {
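             // BSWAP r64: REX.W 0F C8 +rd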
2926         prefixq(reg);
2927         emitByte(0x0F);
2928         emitByte(0xC8 + encode(reg));
2929     }
2930 
2931     public final void cdqq() {
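             // CQO (REX.W 99): sign-extend RAX into RDX:RAX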
2932         rexw();
2933         emitByte(0x99);
2934     }
2935 
2936     public final void cmovq(ConditionFlag cc, Register dst, Register src) {
2937         prefixq(dst, src);
2938         emitByte(0x0F);
2939         emitByte(0x40 | cc.getValue());
2940         emitModRM(dst, src);
2941     }
2942 
2943     public final void setb(ConditionFlag cc, Register dst) {
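             // SETcc r/m8: 0F 90+cc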
2944         prefix(dst, true);
2945         emitByte(0x0F);
2946         emitByte(0x90 | cc.getValue());
2947         emitModRM(0, dst);
2948     }
2949 
2950     public final void cmovq(ConditionFlag cc, Register dst, AMD64Address src) {
2951         prefixq(src, dst);
2952         emitByte(0x0F);
2953         emitByte(0x40 | cc.getValue());
2954         emitOperandHelper(dst, src, 0);
2955     }
2956 
2957     public final void cmpq(Register dst, int imm32) {
2958         CMP.getMIOpcode(QWORD, isByte(imm32)).emit(this, QWORD, dst, imm32);
2959     }
2960 
2961     public final void cmpq(Register dst, Register src) {
2962         CMP.rmOp.emit(this, QWORD, dst, src);
2963     }
2964 
2965     public final void cmpq(Register dst, AMD64Address src) {
2966         CMP.rmOp.emit(this, QWORD, dst, src);
2967     }
2968 
2969     public final void cmpxchgq(Register reg, AMD64Address adr) {
2970         prefixq(adr, reg);
2971         emitByte(0x0F);
2972         emitByte(0xB1);
2973         emitOperandHelper(reg, adr, 0);
2974     }
2975 
2976     public final void cvtdq2pd(Register dst, Register src) {
2977         assert inRC(XMM, dst) && inRC(XMM, src);
2978         simdPrefix(dst, Register.None, src, SS, P_0F, false);
2979         emitByte(0xE6);
2980         emitModRM(dst, src);
2981     }
2982 
2983     public final void cvtsi2sdq(Register dst, Register src) {
2984         SSEOp.CVTSI2SD.emit(this, QWORD, dst, src);
2985     }
2986 
2987     public final void cvttsd2siq(Register dst, Register src) {
2988         SSEOp.CVTTSD2SI.emit(this, QWORD, dst, src);
2989     }
2990 
2991     public final void cvttpd2dq(Register dst, Register src) {
2992         assert inRC(XMM, dst) && inRC(XMM, src);
2993         simdPrefix(dst, Register.None, src, PD, P_0F, false);
2994         emitByte(0xE6);
2995         emitModRM(dst, src);
2996     }
2997 
2998     public final void decq(Register dst) {
2999         // Use two-byte form (one-byte form is a REX prefix in 64-bit mode)
3000         prefixq(dst);
3001         emitByte(0xFF);
3002         emitModRM(1, dst);
3003     }
3004 
3005     public final void decq(AMD64Address dst) {
3006         DEC.emit(this, QWORD, dst);
3007     }
3008 
3009     public final void imulq(Register dst, Register src) {
3010         prefixq(dst, src);
3011         emitByte(0x0F);
3012         emitByte(0xAF);
3013         emitModRM(dst, src);
3014     }
3015 
3016     public final void incq(Register dst) {
3017         // Don't use this directly; use the macro assembler's incrementq() instead.
3018         // Use two-byte form (one-byte form is a REX prefix in 64-bit mode)
3019         prefixq(dst);
3020         emitByte(0xFF);
3021         emitModRM(0, dst);
3022     }
3023 
3024     public final void incq(AMD64Address dst) {
3025         INC.emit(this, QWORD, dst);
3026     }
3027 
3028     public final void movq(Register dst, long imm64) {
3029         movq(dst, imm64, false);
3030     }
3031 
3032     public final void movq(Register dst, long imm64, boolean annotateImm) {
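             // MOV r64, imm64: REX.W B8 +rd io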
3033         int insnPos = position();
3034         prefixq(dst);
3035         emitByte(0xB8 + encode(dst));
3036         int immPos = position();
3037         emitLong(imm64);
3038         int nextInsnPos = position();
3039         if (annotateImm && codePatchingAnnotationConsumer != null) {
3040             codePatchingAnnotationConsumer.accept(new ImmediateOperandAnnotation(insnPos, immPos, nextInsnPos - immPos, nextInsnPos));
3041         }
3042     }
3043 
3044     public final void movslq(Register dst, int imm32) {
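             // despite the name, this emits MOV r/m64, imm32 (REX.W C7 /0 id);
             // the 32-bit immediate is sign-extended to 64 bits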
3045         prefixq(dst);
3046         emitByte(0xC7);
3047         emitModRM(0, dst);
3048         emitInt(imm32);
3049     }
3050 
3051     public final void movdq(Register dst, AMD64Address src) {
3052         AMD64RMOp.MOVQ.emit(this, QWORD, dst, src);
3053     }
3054 
3055     public final void movdq(AMD64Address dst, Register src) {
3056         AMD64MROp.MOVQ.emit(this, QWORD, dst, src);
3057     }
3058 
3059     public final void movdq(Register dst, Register src) {
3060         if (inRC(XMM, dst) && inRC(CPU, src)) {
3061             AMD64RMOp.MOVQ.emit(this, QWORD, dst, src);
3062         } else if (inRC(XMM, src) && inRC(CPU, dst)) {
3063             AMD64MROp.MOVQ.emit(this, QWORD, dst, src);
3064         } else {
3065             throw new InternalError("should not reach here");
3066         }
3067     }
3068 
3069     public final void movdl(Register dst, Register src) {
3070         if (inRC(XMM, dst) && inRC(CPU, src)) {
3071             AMD64RMOp.MOVD.emit(this, DWORD, dst, src);
3072         } else if (inRC(XMM, src) && inRC(CPU, dst)) {
3073             AMD64MROp.MOVD.emit(this, DWORD, dst, src);
3074         } else {
3075             throw new InternalError("should not reach here");
3076         }
3077     }
3078 
3079     public final void movdl(Register dst, AMD64Address src) {
3080         AMD64RMOp.MOVD.emit(this, DWORD, dst, src);
3081     }
3082 
3083     public final void movddup(Register dst, Register src) {
3084         assert supports(CPUFeature.SSE3);
3085         assert inRC(XMM, dst) && inRC(XMM, src);
3086         simdPrefix(dst, Register.None, src, SD, P_0F, false);
3087         emitByte(0x12);
3088         emitModRM(dst, src);
3089     }
3090 
3091     public final void movdqu(Register dst, AMD64Address src) {
3092         assert inRC(XMM, dst);
3093         simdPrefix(dst, Register.None, src, SS, P_0F, false);
3094         emitByte(0x6F);
3095         emitOperandHelper(dst, src, 0);
3096     }
3097 
3098     public final void movdqu(Register dst, Register src) {
3099         assert inRC(XMM, dst) && inRC(XMM, src);
3100         simdPrefix(dst, Register.None, src, SS, P_0F, false);
3101         emitByte(0x6F);
3102         emitModRM(dst, src);
3103     }
3104 
3105     // Insn: VMOVDQU xmm2/m128, xmm1
3106 
3107     public final void movdqu(AMD64Address dst, Register src) {
3108         assert inRC(XMM, src);
3109         // Code: VEX.128.F3.0F.WIG 7F /r
3110         simdPrefix(src, Register.None, dst, SS, P_0F, false);
3111         emitByte(0x7F);
3112         emitOperandHelper(src, dst, 0);
3113     }
3114 
3115     public final void movslq(AMD64Address dst, int imm32) {
3116         prefixq(dst);
3117         emitByte(0xC7);
3118         emitOperandHelper(0, dst, 4);
3119         emitInt(imm32);
3120     }
3121 
3122     public final void movslq(Register dst, AMD64Address src) {
3123         prefixq(src, dst);
3124         emitByte(0x63);
3125         emitOperandHelper(dst, src, 0);
3126     }
3127 
3128     public final void movslq(Register dst, Register src) {
3129         prefixq(dst, src);
3130         emitByte(0x63);
3131         emitModRM(dst, src);
3132     }
3133 
3134     public final void negq(Register dst) {
3135         prefixq(dst);
3136         emitByte(0xF7);
3137         emitModRM(3, dst);
3138     }
3139 
3140     public final void orq(Register dst, Register src) {
3141         OR.rmOp.emit(this, QWORD, dst, src);
3142     }
3143 
3144     public final void shlq(Register dst, int imm8) {
3145         assert isShiftCount(imm8 >> 1) : "illegal shift count";
3146         prefixq(dst);
3147         if (imm8 == 1) {
3148             emitByte(0xD1);
3149             emitModRM(4, dst);
3150         } else {
3151             emitByte(0xC1);
3152             emitModRM(4, dst);
3153             emitByte(imm8);
3154         }
3155     }
3156 
3157     public final void shlq(Register dst) {
3158         // Multiply dst by 2, CL times.
3159         prefixq(dst);
3160         emitByte(0xD3);
3161         emitModRM(4, dst);
3162     }
3163 
3164     public final void shrq(Register dst, int imm8) {
3165         assert isShiftCount(imm8 >> 1) : "illegal shift count";
3166         prefixq(dst);
3167         if (imm8 == 1) {
3168             emitByte(0xD1);
3169             emitModRM(5, dst);
3170         } else {
3171             emitByte(0xC1);
3172             emitModRM(5, dst);
3173             emitByte(imm8);
3174         }
3175     }
3176 
3177     public final void shrq(Register dst) {
3178         // Unsigned divide dst by 2, CL times.
3179         prefixq(dst);
3180         emitByte(0xD3);
3181         emitModRM(5, dst);
3182     }
3183 
3184     public final void sbbq(Register dst, Register src) {
3185         SBB.rmOp.emit(this, QWORD, dst, src);
3186     }
3187 
3188     public final void subq(Register dst, int imm32) {
3189         SUB.getMIOpcode(QWORD, isByte(imm32)).emit(this, QWORD, dst, imm32);
3190     }
3191 
3192     public final void subq(AMD64Address dst, int imm32) {
3193         SUB.getMIOpcode(QWORD, isByte(imm32)).emit(this, QWORD, dst, imm32);
3194     }
3195 
3196     public final void subqWide(Register dst, int imm32) {
3197         // don't use the sign-extending version; force a 32-bit immediate
3198         SUB.getMIOpcode(QWORD, false).emit(this, QWORD, dst, imm32);
3199     }
3200 
3201     public final void subq(Register dst, Register src) {
3202         SUB.rmOp.emit(this, QWORD, dst, src);
3203     }
3204 
3205     public final void testq(Register dst, Register src) {
3206         prefixq(dst, src);
3207         emitByte(0x85);
3208         emitModRM(dst, src);
3209     }
3210 
3211     public final void btrq(Register src, int imm8) {
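             // BTR r/m64, imm8: REX.W 0F BA /6 ib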
3212         prefixq(src);
3213         emitByte(0x0F);
3214         emitByte(0xBA);
3215         emitModRM(6, src);
3216         emitByte(imm8);
3217     }
3218 
3219     public final void xaddb(AMD64Address dst, Register src) {
3220         prefixb(dst, src);
3221         emitByte(0x0F);
3222         emitByte(0xC0);
3223         emitOperandHelper(src, dst, 0);
3224     }
3225 
3226     public final void xaddw(AMD64Address dst, Register src) {
3227         emitByte(0x66); // operand-size override: force 16-bit operands
3228         prefix(dst, src);
3229         emitByte(0x0F);
3230         emitByte(0xC1);
3231         emitOperandHelper(src, dst, 0);
3232     }
3233 
3234     public final void xaddl(AMD64Address dst, Register src) {
3235         prefix(dst, src);
3236         emitByte(0x0F);
3237         emitByte(0xC1);
3238         emitOperandHelper(src, dst, 0);
3239     }
3240 
3241     public final void xaddq(AMD64Address dst, Register src) {
3242         prefixq(dst, src);
3243         emitByte(0x0F);
3244         emitByte(0xC1);
3245         emitOperandHelper(src, dst, 0);
3246     }
3247 
3248     public final void xchgb(Register dst, AMD64Address src) {
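             // XCHG with a memory operand is implicitly locked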
3249         prefixb(src, dst);
3250         emitByte(0x86);
3251         emitOperandHelper(dst, src, 0);
3252     }
3253 
3254     public final void xchgw(Register dst, AMD64Address src) {
3255         emitByte(0x66);
3256         prefix(src, dst);
3257         emitByte(0x87);
3258         emitOperandHelper(dst, src, 0);
3259     }
3260 
3261     public final void xchgl(Register dst, AMD64Address src) {
3262         prefix(src, dst);
3263         emitByte(0x87);
3264         emitOperandHelper(dst, src, 0);
3265     }
3266 
3267     public final void xchgq(Register dst, AMD64Address src) {
3268         prefixq(src, dst);
3269         emitByte(0x87);
3270         emitOperandHelper(dst, src, 0);
3271     }
3272 
3273     public final void membar(int barriers) {
3274         if (target.isMP) {
3275             // We only have to handle StoreLoad
3276             if ((barriers & STORE_LOAD) != 0) {
3277                 // All usable chips support "locked" instructions, which suffice
3278                 // as barriers and are much faster than the alternative of
3279                 // using the cpuid instruction. Here we use a locked "add [rsp], 0",
3280                 // which is conveniently a no-op apart from clobbering the
3281                 // flags.
3282                 // Any change to this code may require revisiting other places
3283                 // in the code where this idiom is used, in particular the
3284                 // orderAccess code.
3285                 lock();
3286                 addl(new AMD64Address(AMD64.rsp, 0), 0); // Assert the lock# signal here
3287             }
3288         }
3289     }
3290 
3291     @Override
3292     protected final void patchJumpTarget(int branch, int branchTarget) {
3293         int op = getByte(branch);
3294         assert op == 0xE8 // call
3295                         || op == 0x00 // jump table entry
3296                         || op == 0xE9 // jmp
3297                         || op == 0xEB // short jmp
3298                         || (op & 0xF0) == 0x70 // short jcc
3299                         || op == 0x0F && (getByte(branch + 1) & 0xF0) == 0x80 // jcc
3300         : "Invalid opcode at patch point branch=" + branch + ", branchTarget=" + branchTarget + ", op=" + op;
3301 
3302         if (op == 0x00) {
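                 // rewrite the jump table entry as an offset relative to the table base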
3303             int offsetToJumpTableBase = getShort(branch + 1);
3304             int jumpTableBase = branch - offsetToJumpTableBase;
3305             int imm32 = branchTarget - jumpTableBase;
3306             emitInt(imm32, branch);
3307         } else if (op == 0xEB || (op & 0xF0) == 0x70) {
3308 
3309             // short offset operators (jmp and jcc)
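                 // the displacement is relative to the end of the 2-byte instruction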
3310             final int imm8 = branchTarget - (branch + 2);
3311             /*
3312              * Since a wrongly patched short branch can potentially lead to code that runs but
3313              * behaves badly, we always fail with an exception here instead of an assert.
3314              */
3315             if (!NumUtil.isByte(imm8)) {
3316                 throw new InternalError("branch displacement out of range: " + imm8);
3317             }
3318             emitByte(imm8, branch + 1);
3319 
3320         } else {
3321 
3322             int off = 1;
3323             if (op == 0x0F) {
3324                 off = 2;
3325             }
3326 
3327             int imm32 = branchTarget - (branch + 4 + off);
3328             emitInt(imm32, branch + off);
3329         }
3330     }
3331 
3332     public void nullCheck(AMD64Address address) {
3333         testl(AMD64.rax, address);
3334     }
3335 
3336     @Override
3337     public void align(int modulus) {
3338         if (position() % modulus != 0) {
3339             nop(modulus - (position() % modulus));
3340         }
3341     }
3342 
3343     /**
3344      * Emits a direct call instruction. Note that the actual call target is not specified, because
3345      * all calls need patching anyway. Therefore, 0 is emitted as the call target, and the user is
3346      * responsible for adding the call address to the appropriate patching tables.
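          * <p>
          * A typical use (sketch only; {@code recordCallSite} stands in for whatever bookkeeping
          * the caller's patching table requires):
          *
          * <pre>
          * int before = asm.position();
          * asm.call(); // emits E8 00 00 00 00
          * recordCallSite(before, asm.position(), target);
          * </pre>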
3347      */
3348     public final void call() {
3349         annotatePatchingImmediate(1, 4);
3350         emitByte(0xE8);
3351         emitInt(0);
3352     }
3353 
3354     public final void call(Register src) {
3355         prefix(src);
3356         emitByte(0xFF);
3357         emitModRM(2, src);
3358     }
3359 
3360     public final void int3() {
3361         emitByte(0xCC);
3362     }
3363 
3364     public final void pause() {
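             // PAUSE (F3 90): spin-wait loop hint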
3365         emitByte(0xF3);
3366         emitByte(0x90);
3367     }
3368 
3369     private void emitx87(int b1, int b2, int i) {
3370         assert 0 <= i && i < 8 : "illegal stack offset";
3371         emitByte(b1);
3372         emitByte(b2 + i);
3373     }
3374 
3375     public final void fldd(AMD64Address src) {
3376         emitByte(0xDD);
3377         emitOperandHelper(0, src, 0);
3378     }
3379 
3380     public final void flds(AMD64Address src) {
3381         emitByte(0xD9);
3382         emitOperandHelper(0, src, 0);
3383     }
3384 
3385     public final void fldln2() {
3386         emitByte(0xD9);
3387         emitByte(0xED);
3388     }
3389 
3390     public final void fldlg2() {
3391         emitByte(0xD9);
3392         emitByte(0xEC);
3393     }
3394 
3395     public final void fyl2x() {
3396         emitByte(0xD9);
3397         emitByte(0xF1);
3398     }
3399 
3400     public final void fstps(AMD64Address dst) {
3401         emitByte(0xD9);
3402         emitOperandHelper(3, dst, 0);
3403     }
3404 
3405     public final void fstpd(AMD64Address dst) {
3406         emitByte(0xDD);
3407         emitOperandHelper(3, dst, 0);
3408     }
3409 
3410     private void emitFPUArith(int b1, int b2, int i) {
3411         assert 0 <= i && i < 8 : "illegal FPU register: " + i;
3412         emitByte(b1);
3413         emitByte(b2 + i);
3414     }
3415 
3416     public void ffree(int i) {
3417         emitFPUArith(0xDD, 0xC0, i);
3418     }
3419 
3420     public void fincstp() {
3421         emitByte(0xD9);
3422         emitByte(0xF7);
3423     }
3424 
3425     public void fxch(int i) {
3426         emitFPUArith(0xD9, 0xC8, i);
3427     }
3428 
3429     public void fnstswAX() {
3430         emitByte(0xDF);
3431         emitByte(0xE0);
3432     }
3433 
3434     public void fwait() {
3435         emitByte(0x9B);
3436     }
3437 
3438     public void fprem() {
3439         emitByte(0xD9);
3440         emitByte(0xF8);
3441     }
3442 
3443     public final void fsin() {
3444         emitByte(0xD9);
3445         emitByte(0xFE);
3446     }
3447 
3448     public final void fcos() {
3449         emitByte(0xD9);
3450         emitByte(0xFF);
3451     }
3452 
3453     public final void fptan() {
3454         emitByte(0xD9);
3455         emitByte(0xF2);
3456     }
3457 
3458     public final void fstp(int i) {
3459         emitx87(0xDD, 0xD8, i);
3460     }
3461 
3462     @Override
3463     public AMD64Address makeAddress(Register base, int displacement) {
3464         return new AMD64Address(base, displacement);
3465     }
3466 
3467     @Override
3468     public AMD64Address getPlaceholder(int instructionStartPosition) {
3469         return new AMD64Address(AMD64.rip, Register.None, Scale.Times1, 0, instructionStartPosition);
3470     }
3471 
3472     private void prefetchPrefix(AMD64Address src) {
3473         prefix(src);
3474         emitByte(0x0F);
3475     }
3476 
3477     public void prefetchnta(AMD64Address src) {
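             // PREFETCHNTA: 0F 18 /0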
3478         prefetchPrefix(src);
3479         emitByte(0x18);
3480         emitOperandHelper(0, src, 0);
3481     }
3482 
3483     void prefetchr(AMD64Address src) {
3484         assert supports(CPUFeature.AMD_3DNOW_PREFETCH);
3485         prefetchPrefix(src);
3486         emitByte(0x0D);
3487         emitOperandHelper(0, src, 0);
3488     }
3489 
3490     public void prefetcht0(AMD64Address src) {
3491         assert supports(CPUFeature.SSE);
3492         prefetchPrefix(src);
3493         emitByte(0x18);
3494         emitOperandHelper(1, src, 0);
3495     }
3496 
3497     public void prefetcht1(AMD64Address src) {
3498         assert supports(CPUFeature.SSE);
3499         prefetchPrefix(src);
3500         emitByte(0x18);
3501         emitOperandHelper(2, src, 0);
3502     }
3503 
3504     public void prefetcht2(AMD64Address src) {
3505         assert supports(CPUFeature.SSE);
3506         prefix(src);
3507         emitByte(0x0F);
3508         emitByte(0x18);
3509         emitOperandHelper(3, src, 0);
3510     }
3511 
3512     public void prefetchw(AMD64Address src) {
3513         assert supports(CPUFeature.AMD_3DNOW_PREFETCH);
3514         prefix(src);
3515         emitByte(0x0F);
3516         emitByte(0x0D);
3517         emitOperandHelper(1, src, 0);
3518     }
3519 
3520     public void rdtsc() {
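             // RDTSC: 0F 31, loads the time-stamp counter into EDX:EAX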
3521         emitByte(0x0F);
3522         emitByte(0x31);
3523     }
3524 
3525     /**
3526      * Emits an instruction which is considered to be illegal. This is used if we deliberately want
3527      * to crash the program (e.g. while debugging).
3528      */
3529     public void illegal() {
3530         emitByte(0x0F);
3531         emitByte(0x0B);
3532     }
3533 
3534     public void lfence() {
3535         emitByte(0x0F);
3536         emitByte(0xAE);
3537         emitByte(0xE8);
3538     }
3539 
3540     public final void vptest(Register dst, Register src) {
3541         VexRMOp.VPTEST.emit(this, AVXSize.YMM, dst, src);
3542     }
3543 
3544     public final void vpxor(Register dst, Register nds, Register src) {
3545         VexRVMOp.VPXOR.emit(this, AVXSize.YMM, dst, nds, src);
3546     }
3547 
3548     public final void vpxor(Register dst, Register nds, AMD64Address src) {
3549         VexRVMOp.VPXOR.emit(this, AVXSize.YMM, dst, nds, src);
3550     }
3551 
3552     public final void vmovdqu(Register dst, AMD64Address src) {
3553         VexMoveOp.VMOVDQU.emit(this, AVXSize.YMM, dst, src);
3554     }
3555 
3556     public final void vmovdqu(AMD64Address dst, Register src) {
3557         assert inRC(XMM, src);
3558         VexMoveOp.VMOVDQU.emit(this, AVXSize.YMM, dst, src);
3559     }
3560 
3561     public final void vpmovzxbw(Register dst, AMD64Address src) {
3562         assert supports(CPUFeature.AVX2);
3563         VexRMOp.VPMOVZXBW.emit(this, AVXSize.YMM, dst, src);
3564     }
3565 
3566     public final void vzeroupper() {
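             // VZEROUPPER: VEX.128.0F.WIG 77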
3567         emitVEX(L128, P_, M_0F, W0, 0, 0, true);
3568         emitByte(0x77);
3569     }
3570 
3571     // Insn: KORTESTD k1, k2
3572 
3573     // Sets ZF if the OR of the two operands is all zeros, and CF if it is all ones
3574     public final void kortestd(Register src1, Register src2) {
3575         assert supports(CPUFeature.AVX512BW);
3576         assert inRC(MASK, src1) && inRC(MASK, src2);
3577         // Code: VEX.L0.66.0F.W1 98 /r
3578         vexPrefix(src1, Register.None, src2, AVXSize.XMM, P_66, M_0F, W1, true);
3579         emitByte(0x98);
3580         emitModRM(src1, src2);
3581     }
3582 
3583     // Insn: KORTESTQ k1, k2
3584 
3585     // Sets ZF if the OR of the two operands is all zeros, and CF if it is all ones
3586     public final void kortestq(Register src1, Register src2) {
3587         assert supports(CPUFeature.AVX512BW);
3588         assert inRC(MASK, src1) && inRC(MASK, src2);
3589         // Code: VEX.L0.0F.W1 98 /r
3590         vexPrefix(src1, Register.None, src2, AVXSize.XMM, P_, M_0F, W1, true);
3591         emitByte(0x98);
3592         emitModRM(src1, src2);
3593     }
3594 
3595     public final void kmovd(Register dst, Register src) {
3596         assert supports(CPUFeature.AVX512BW);
3597         assert inRC(MASK, dst) || inRC(CPU, dst);
3598         assert inRC(MASK, src) || inRC(CPU, src);
3599         assert !(inRC(CPU, dst) && inRC(CPU, src));
3600 
3601         if (inRC(MASK, dst)) {
3602             if (inRC(MASK, src)) {
3603                 // kmovd(KRegister dst, KRegister src):
3604                 // Insn: KMOVD k1, k2/m32
3605                 // Code: VEX.L0.66.0F.W1 90 /r
3606                 vexPrefix(dst, Register.None, src, AVXSize.XMM, P_66, M_0F, W1, true);
3607                 emitByte(0x90);
3608                 emitModRM(dst, src);
3609             } else {
3610                 // kmovd(KRegister dst, Register src)
3611                 // Insn: KMOVD k1, r32
3612                 // Code: VEX.L0.F2.0F.W0 92 /r
3613                 vexPrefix(dst, Register.None, src, AVXSize.XMM, P_F2, M_0F, W0, true);
3614                 emitByte(0x92);
3615                 emitModRM(dst, src);
3616             }
3617         } else {
3618             if (inRC(MASK, src)) {
3619                 // kmovd(Register dst, KRegister src)
3620                 // Insn: KMOVD r32, k1
3621                 // Code: VEX.L0.F2.0F.W0 93 /r
3622                 vexPrefix(dst, Register.None, src, AVXSize.XMM, P_F2, M_0F, W0, true);
3623                 emitByte(0x93);
3624                 emitModRM(dst, src);
3625             } else {
3626                 throw GraalError.shouldNotReachHere();
3627             }
3628         }
3629     }
3630 
3631     public final void kmovq(Register dst, Register src) {
3632         assert supports(CPUFeature.AVX512BW);
3633         assert inRC(MASK, dst) || inRC(CPU, dst);
3634         assert inRC(MASK, src) || inRC(CPU, src);
3635         assert !(inRC(CPU, dst) && inRC(CPU, src));
3636 
3637         if (inRC(MASK, dst)) {
3638             if (inRC(MASK, src)) {
3639                 // kmovq(KRegister dst, KRegister src):
3640                 // Insn: KMOVQ k1, k2/m64
3641                 // Code: VEX.L0.0F.W1 90 /r
3642                 vexPrefix(dst, Register.None, src, AVXSize.XMM, P_, M_0F, W1, true);
3643                 emitByte(0x90);
3644                 emitModRM(dst, src);
3645             } else {
3646                 // kmovq(KRegister dst, Register src)
3647                 // Insn: KMOVQ k1, r64
3648                 // Code: VEX.L0.F2.0F.W1 92 /r
3649                 vexPrefix(dst, Register.None, src, AVXSize.XMM, P_F2, M_0F, W1, true);
3650                 emitByte(0x92);
3651                 emitModRM(dst, src);
3652             }
3653         } else {
3654             if (inRC(MASK, src)) {
3655                 // kmovq(Register dst, KRegister src)
3656                 // Insn: KMOVQ r64, k1
3657                 // Code: VEX.L0.F2.0F.W1 93 /r
3658                 vexPrefix(dst, Register.None, src, AVXSize.XMM, P_F2, M_0F, W1, true);
3659                 emitByte(0x93);
3660                 emitModRM(dst, src);
3661             } else {
3662                 throw GraalError.shouldNotReachHere();
3663             }
3664         }
3665     }
3666 
3667     // Insn: KTESTD k1, k2
3668 
3669     public final void ktestd(Register src1, Register src2) {
3670         assert supports(CPUFeature.AVX512BW);
3671         assert inRC(MASK, src1) && inRC(MASK, src2);
3672         // Code: VEX.L0.66.0F.W1 99 /r
3673         vexPrefix(src1, Register.None, src2, AVXSize.XMM, P_66, M_0F, W1, true);
3674         emitByte(0x99);
3675         emitModRM(src1, src2);
3676     }
3677 
3678     public final void evmovdqu64(Register dst, AMD64Address src) {
3679         assert supports(CPUFeature.AVX512F);
3680         assert inRC(XMM, dst);
3681         evexPrefix(dst, Register.None, Register.None, src, AVXSize.ZMM, P_F3, M_0F, W1, Z0, B0);
3682         emitByte(0x6F);
3683         emitEVEXOperandHelper(dst, src, 0, EVEXTuple.FVM.getDisp8ScalingFactor(AVXSize.ZMM));
3684     }
3685 
3686     // Insn: VPMOVZXBW zmm1, m256
3687 
3688     public final void evpmovzxbw(Register dst, AMD64Address src) {
3689         assert supports(CPUFeature.AVX512BW);
3690         assert inRC(XMM, dst);
3691         // Code: EVEX.512.66.0F38.WIG 30 /r
3692         evexPrefix(dst, Register.None, Register.None, src, AVXSize.ZMM, P_66, M_0F38, WIG, Z0, B0);
3693         emitByte(0x30);
3694         emitEVEXOperandHelper(dst, src, 0, EVEXTuple.HVM.getDisp8ScalingFactor(AVXSize.ZMM));
3695     }
3696 
3697     public final void evpcmpeqb(Register kdst, Register nds, AMD64Address src) {
3698         assert supports(CPUFeature.AVX512BW);
3699         assert inRC(MASK, kdst) && inRC(XMM, nds);
3700         evexPrefix(kdst, Register.None, nds, src, AVXSize.ZMM, P_66, M_0F, WIG, Z0, B0);
3701         emitByte(0x74);
3702         emitEVEXOperandHelper(kdst, src, 0, EVEXTuple.FVM.getDisp8ScalingFactor(AVXSize.ZMM));
3703     }
3704 
3705     // Insn: VMOVDQU16 zmm1 {k1}{z}, zmm2/m512
3706     // -----
3707     // Insn: VMOVDQU16 zmm1, m512
3708 
3709     public final void evmovdqu16(Register dst, AMD64Address src) {
3710         assert supports(CPUFeature.AVX512BW);
3711         assert inRC(XMM, dst);
3712         // Code: EVEX.512.F2.0F.W1 6F /r
3713         evexPrefix(dst, Register.None, Register.None, src, AVXSize.ZMM, P_F2, M_0F, W1, Z0, B0);
3714         emitByte(0x6F);
3715         emitEVEXOperandHelper(dst, src, 0, EVEXTuple.FVM.getDisp8ScalingFactor(AVXSize.ZMM));
3716     }
3717 
3718     // Insn: VMOVDQU16 zmm1, k1:z, m512
3719 
3720     public final void evmovdqu16(Register dst, Register mask, AMD64Address src) {
3721         assert supports(CPUFeature.AVX512BW);
3722         assert inRC(XMM, dst) && inRC(MASK, mask);
3723         // Code: EVEX.512.F2.0F.W1 6F /r
3724         evexPrefix(dst, mask, Register.None, src, AVXSize.ZMM, P_F2, M_0F, W1, Z1, B0);
3725         emitByte(0x6F);
3726         emitEVEXOperandHelper(dst, src, 0, EVEXTuple.FVM.getDisp8ScalingFactor(AVXSize.ZMM));
3727     }
3728 
3729     // Insn: VMOVDQU16 zmm2/m512 {k1}{z}, zmm1
3730     // -----
3731     // Insn: VMOVDQU16 m512, zmm1
3732 
3733     public final void evmovdqu16(AMD64Address dst, Register src) {
3734         assert supports(CPUFeature.AVX512BW);
3735         assert inRC(XMM, src);
3736         // Code: EVEX.512.F2.0F.W1 7F /r
3737         evexPrefix(src, Register.None, Register.None, dst, AVXSize.ZMM, P_F2, M_0F, W1, Z0, B0);
3738         emitByte(0x7F);
3739         emitEVEXOperandHelper(src, dst, 0, EVEXTuple.FVM.getDisp8ScalingFactor(AVXSize.ZMM));
3740     }
3741 
3742     // Insn: VMOVDQU16 m512, k1, zmm1
3743 
3744     public final void evmovdqu16(AMD64Address dst, Register mask, Register src) {
3745         assert supports(CPUFeature.AVX512BW);
3746         assert inRC(MASK, mask) && inRC(XMM, src);
3747         // Code: EVEX.512.F2.0F.W1 7F /r
3748         evexPrefix(src, mask, Register.None, dst, AVXSize.ZMM, P_F2, M_0F, W1, Z0, B0);
3749         emitByte(0x7F);
3750         emitEVEXOperandHelper(src, dst, 0, EVEXTuple.FVM.getDisp8ScalingFactor(AVXSize.ZMM));
3751     }
3752 
3753     // Insn: VPBROADCASTW zmm1 {k1}{z}, reg
3754     // -----
3755     // Insn: VPBROADCASTW zmm1, reg
3756 
3757     public final void evpbroadcastw(Register dst, Register src) {
3758         assert supports(CPUFeature.AVX512BW);
3759         assert inRC(XMM, dst) && inRC(CPU, src);
3760         // Code: EVEX.512.66.0F38.W0 7B /r
3761         evexPrefix(dst, Register.None, Register.None, src, AVXSize.ZMM, P_66, M_0F38, W0, Z0, B0);
3762         emitByte(0x7B);
3763         emitModRM(dst, src);
3764     }
3765 
3766     // Insn: VPCMPUW k1 {k2}, zmm2, zmm3/m512, imm8
3767     // -----
3768     // Insn: VPCMPUW k1, zmm2, zmm3, imm8
3769 
3770     public final void evpcmpuw(Register kdst, Register nds, Register src, int vcc) {
3771         assert supports(CPUFeature.AVX512BW);
3772         assert inRC(MASK, kdst) && inRC(XMM, nds) && inRC(XMM, src);
3773         // Code: EVEX.NDS.512.66.0F3A.W1 3E /r ib
3774         evexPrefix(kdst, Register.None, nds, src, AVXSize.ZMM, P_66, M_0F3A, W1, Z0, B0);
3775         emitByte(0x3E);
3776         emitModRM(kdst, src);
3777         emitByte(vcc);
3778     }
3779 
3780     // Insn: VPCMPUW k1 {k2}, zmm2, zmm3/m512, imm8
3781     // -----
3782     // Insn: VPCMPUW k1, k2, zmm2, zmm3, imm8
3783 
3784     public final void evpcmpuw(Register kdst, Register mask, Register nds, Register src, int vcc) {
3785         assert supports(CPUFeature.AVX512BW);
3786         assert inRC(MASK, kdst) && inRC(MASK, mask);
3787         assert inRC(XMM, nds) && inRC(XMM, src);
3788         // Code: EVEX.NDS.512.66.0F3A.W1 3E /r ib
3789         evexPrefix(kdst, mask, nds, src, AVXSize.ZMM, P_66, M_0F3A, W1, Z0, B0);
3790         emitByte(0x3E);
3791         emitModRM(kdst, src);
3792         emitByte(vcc);
3793     }
3794 
3795     // Insn: VPMOVWB ymm1/m256 {k1}{z}, zmm2
3796     // -----
3797     // Insn: VPMOVWB m256, zmm2
3798 
3799     public final void evpmovwb(AMD64Address dst, Register src) {
3800         assert supports(CPUFeature.AVX512BW);
3801         assert inRC(XMM, src);
3802         // Code: EVEX.512.F3.0F38.W0 30 /r
3803         evexPrefix(src, Register.None, Register.None, dst, AVXSize.ZMM, P_F3, M_0F38, W0, Z0, B0);
3804         emitByte(0x30);
3805         emitEVEXOperandHelper(src, dst, 0, EVEXTuple.HVM.getDisp8ScalingFactor(AVXSize.ZMM));
3806     }
3807 
3808     // Insn: VPMOVWB m256, k1, zmm2
3809 
3810     public final void evpmovwb(AMD64Address dst, Register mask, Register src) {
3811         assert supports(CPUFeature.AVX512BW);
3812         assert inRC(MASK, mask) && inRC(XMM, src);
3813         // Code: EVEX.512.F3.0F38.W0 30 /r
3814         evexPrefix(src, mask, Register.None, dst, AVXSize.ZMM, P_F3, M_0F38, W0, Z0, B0);
3815         emitByte(0x30);
3816         emitEVEXOperandHelper(src, dst, 0, EVEXTuple.HVM.getDisp8ScalingFactor(AVXSize.ZMM));
3817     }
3818 
3819     // Insn: VPMOVZXBW zmm1 {k1}{z}, ymm2/m256
3820     // -----
3821     // Insn: VPMOVZXBW zmm1, k1, m256
3822 
3823     public final void evpmovzxbw(Register dst, Register mask, AMD64Address src) {
3824         assert supports(CPUFeature.AVX512BW);
3825         assert inRC(MASK, mask) && inRC(XMM, dst);
3826         // Code: EVEX.512.66.0F38.WIG 30 /r
3827         evexPrefix(dst, mask, Register.None, src, AVXSize.ZMM, P_66, M_0F38, WIG, Z0, B0);
3828         emitByte(0x30);
3829         emitEVEXOperandHelper(dst, src, 0, EVEXTuple.HVM.getDisp8ScalingFactor(AVXSize.ZMM));
3830     }
3831 
3832 }