1 /*
   2  * Copyright (c) 2009, 2019, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  */
  23 
  24 
  25 package org.graalvm.compiler.asm.amd64;
  26 
  27 import static jdk.vm.ci.amd64.AMD64.CPU;
  28 import static jdk.vm.ci.amd64.AMD64.MASK;
  29 import static jdk.vm.ci.amd64.AMD64.XMM;
  30 import static jdk.vm.ci.code.MemoryBarriers.STORE_LOAD;
  31 import static org.graalvm.compiler.asm.amd64.AMD64AsmOptions.UseAddressNop;
  32 import static org.graalvm.compiler.asm.amd64.AMD64AsmOptions.UseNormalNop;
  33 import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.ADD;
  34 import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.AND;
  35 import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.CMP;
  36 import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.OR;
  37 import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.SBB;
  38 import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.SUB;
  39 import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.XOR;
  40 import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64MOp.DEC;
  41 import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64MOp.INC;
  42 import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64MOp.NEG;
  43 import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64MOp.NOT;
  44 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.EVEXPrefixConfig.B0;
  45 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.EVEXPrefixConfig.Z0;
  46 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.EVEXPrefixConfig.Z1;
  47 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.BYTE;
  48 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.DWORD;
  49 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.PD;
  50 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.PS;
  51 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.QWORD;
  52 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.SD;
  53 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.SS;
  54 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.WORD;
  55 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.L128;
  56 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.L256;
  57 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.LZ;
  58 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.M_0F;
  59 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.M_0F38;
  60 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.M_0F3A;
  61 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.P_;
  62 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.P_66;
  63 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.P_F2;
  64 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.P_F3;
  65 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.W0;
  66 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.W1;
  67 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.WIG;
  68 import static org.graalvm.compiler.core.common.NumUtil.isByte;
  69 import static org.graalvm.compiler.core.common.NumUtil.isInt;
  70 import static org.graalvm.compiler.core.common.NumUtil.isShiftCount;
  71 import static org.graalvm.compiler.core.common.NumUtil.isUByte;
  72 
  73 import java.util.EnumSet;
  74 
  75 import org.graalvm.compiler.asm.Label;
  76 import org.graalvm.compiler.asm.amd64.AMD64Address.Scale;
  77 import org.graalvm.compiler.asm.amd64.AVXKind.AVXSize;
  78 import org.graalvm.compiler.core.common.NumUtil;
  79 import org.graalvm.compiler.core.common.calc.Condition;
  80 import org.graalvm.compiler.debug.GraalError;
  81 
  82 import jdk.vm.ci.amd64.AMD64;
  83 import jdk.vm.ci.amd64.AMD64.CPUFeature;
  84 import jdk.vm.ci.code.Register;
  85 import jdk.vm.ci.code.Register.RegisterCategory;
  86 import jdk.vm.ci.code.TargetDescription;
  87 
  88 /**
  89  * This class implements an assembler that can encode most X86 instructions.
  90  */
  91 public class AMD64Assembler extends AMD64BaseAssembler {
  92 
  93     /**
  94      * Constructs an assembler for the AMD64 architecture.
  95      */
  96     public AMD64Assembler(TargetDescription target) {
  97         super(target);
  98     }
  99 
 100     /**
 101      * The x86 condition codes used for conditional jumps/moves.
 102      */
 103     public enum ConditionFlag {
 104         Zero(0x4, "|zero|"),
 105         NotZero(0x5, "|nzero|"),
 106         Equal(0x4, "="),
 107         NotEqual(0x5, "!="),
 108         Less(0xc, "<"),
 109         LessEqual(0xe, "<="),
 110         Greater(0xf, ">"),
 111         GreaterEqual(0xd, ">="),
 112         Below(0x2, "|<|"),
 113         BelowEqual(0x6, "|<=|"),
 114         Above(0x7, "|>|"),
 115         AboveEqual(0x3, "|>=|"),
 116         Overflow(0x0, "|of|"),
 117         NoOverflow(0x1, "|nof|"),
 118         CarrySet(0x2, "|carry|"),
 119         CarryClear(0x3, "|ncarry|"),
 120         Negative(0x8, "|neg|"),
 121         Positive(0x9, "|pos|"),
 122         Parity(0xa, "|par|"),
 123         NoParity(0xb, "|npar|");
 124 
 125         private final int value;
 126         private final String operator;
 127 
 128         ConditionFlag(int value, String operator) {
 129             this.value = value;
 130             this.operator = operator;
 131         }
 132 
 133         public ConditionFlag negate() {
 134             switch (this) {
 135                 case Zero:
 136                     return NotZero;
 137                 case NotZero:
 138                     return Zero;
 139                 case Equal:
 140                     return NotEqual;
 141                 case NotEqual:
 142                     return Equal;
 143                 case Less:
 144                     return GreaterEqual;
 145                 case LessEqual:
 146                     return Greater;
 147                 case Greater:
 148                     return LessEqual;
 149                 case GreaterEqual:
 150                     return Less;
 151                 case Below:
 152                     return AboveEqual;
 153                 case BelowEqual:
 154                     return Above;
 155                 case Above:
 156                     return BelowEqual;
 157                 case AboveEqual:
 158                     return Below;
 159                 case Overflow:
 160                     return NoOverflow;
 161                 case NoOverflow:
 162                     return Overflow;
 163                 case CarrySet:
 164                     return CarryClear;
 165                 case CarryClear:
 166                     return CarrySet;
 167                 case Negative:
 168                     return Positive;
 169                 case Positive:
 170                     return Negative;
 171                 case Parity:
 172                     return NoParity;
 173                 case NoParity:
 174                     return Parity;
 175             }
 176             throw new IllegalArgumentException();
 177         }
 178 
 179         public int getValue() {
 180             return value;
 181         }
 182 
 183         @Override
 184         public String toString() {
 185             return operator;
 186         }
 187     }
 188 
 189     /**
 190      * Operand size and register type constraints.
 191      */
 192     private enum OpAssertion {
 193         ByteAssertion(CPU, CPU, BYTE),
 194         ByteOrLargerAssertion(CPU, CPU, BYTE, WORD, DWORD, QWORD),
 195         WordOrLargerAssertion(CPU, CPU, WORD, DWORD, QWORD),
 196         DwordOrLargerAssertion(CPU, CPU, DWORD, QWORD),
 197         WordOrDwordAssertion(CPU, CPU, WORD, QWORD),
 198         QwordAssertion(CPU, CPU, QWORD),
 199         FloatAssertion(XMM, XMM, SS, SD, PS, PD),
 200         PackedFloatAssertion(XMM, XMM, PS, PD),
 201         SingleAssertion(XMM, XMM, SS),
 202         DoubleAssertion(XMM, XMM, SD),
 203         PackedDoubleAssertion(XMM, XMM, PD),
 204         IntToFloatAssertion(XMM, CPU, DWORD, QWORD),
 205         FloatToIntAssertion(CPU, XMM, DWORD, QWORD);
 206 
 207         private final RegisterCategory resultCategory;
 208         private final RegisterCategory inputCategory;
 209         private final OperandSize[] allowedSizes;
 210 
 211         OpAssertion(RegisterCategory resultCategory, RegisterCategory inputCategory, OperandSize... allowedSizes) {
 212             this.resultCategory = resultCategory;
 213             this.inputCategory = inputCategory;
 214             this.allowedSizes = allowedSizes;
 215         }
 216 
 217         protected boolean checkOperands(AMD64Op op, OperandSize size, Register resultReg, Register inputReg) {
 218             assert resultReg == null || resultCategory.equals(resultReg.getRegisterCategory()) : "invalid result register " + resultReg + " used in " + op;
 219             assert inputReg == null || inputCategory.equals(inputReg.getRegisterCategory()) : "invalid input register " + inputReg + " used in " + op;
 220 
 221             for (OperandSize s : allowedSizes) {
 222                 if (size == s) {
 223                     return true;
 224                 }
 225             }
 226 
 227             assert false : "invalid operand size " + size + " used in " + op;
 228             return false;
 229         }
 230 
 231     }
 232 
 233     protected static final int P_0F = 0x0F;
 234     protected static final int P_0F38 = 0x380F;
 235     protected static final int P_0F3A = 0x3A0F;
 236 
 237     /**
 238      * Base class for AMD64 opcodes.
 239      */
 240     public static class AMD64Op {
 241 
 242         private final String opcode;
 243 
 244         protected final int prefix1;
 245         protected final int prefix2;
 246         protected final int op;
 247 
 248         private final boolean dstIsByte;
 249         private final boolean srcIsByte;
 250 
 251         private final OpAssertion assertion;
 252         private final CPUFeature feature;
 253 
 254         protected AMD64Op(String opcode, int prefix1, int prefix2, int op, OpAssertion assertion, CPUFeature feature) {
 255             this(opcode, prefix1, prefix2, op, assertion == OpAssertion.ByteAssertion, assertion == OpAssertion.ByteAssertion, assertion, feature);
 256         }
 257 
 258         protected AMD64Op(String opcode, int prefix1, int prefix2, int op, boolean dstIsByte, boolean srcIsByte, OpAssertion assertion, CPUFeature feature) {
 259             this.opcode = opcode;
 260             this.prefix1 = prefix1;
 261             this.prefix2 = prefix2;
 262             this.op = op;
 263 
 264             this.dstIsByte = dstIsByte;
 265             this.srcIsByte = srcIsByte;
 266 
 267             this.assertion = assertion;
 268             this.feature = feature;
 269         }
 270 
 271         protected final void emitOpcode(AMD64Assembler asm, OperandSize size, int rxb, int dstEnc, int srcEnc) {
 272             if (prefix1 != 0) {
 273                 asm.emitByte(prefix1);
 274             }
 275             if (size.getSizePrefix() != 0) {
 276                 asm.emitByte(size.getSizePrefix());
 277             }
 278             int rexPrefix = 0x40 | rxb;
 279             if (size == QWORD) {
 280                 rexPrefix |= 0x08;
 281             }
 282             if (rexPrefix != 0x40 || (dstIsByte && dstEnc >= 4) || (srcIsByte && srcEnc >= 4)) {
 283                 asm.emitByte(rexPrefix);
 284             }
 285             if (prefix2 > 0xFF) {
 286                 asm.emitShort(prefix2);
 287             } else if (prefix2 > 0) {
 288                 asm.emitByte(prefix2);
 289             }
 290             asm.emitByte(op);
 291         }
 292 
 293         protected final boolean verify(AMD64Assembler asm, OperandSize size, Register resultReg, Register inputReg) {
 294             assert feature == null || asm.supports(feature) : String.format("unsupported feature %s required for %s", feature, opcode);
 295             assert assertion.checkOperands(this, size, resultReg, inputReg);
 296             return true;
 297         }
 298 
 299         public OperandSize[] getAllowedSizes() {
 300             return assertion.allowedSizes;
 301         }
 302 
 303         protected final boolean isSSEInstruction() {
 304             if (feature == null) {
 305                 return false;
 306             }
 307             switch (feature) {
 308                 case SSE:
 309                 case SSE2:
 310                 case SSE3:
 311                 case SSSE3:
 312                 case SSE4A:
 313                 case SSE4_1:
 314                 case SSE4_2:
 315                     return true;
 316                 default:
 317                     return false;
 318             }
 319         }
 320 
 321         public final OpAssertion getAssertion() {
 322             return assertion;
 323         }
 324 
 325         @Override
 326         public String toString() {
 327             return opcode;
 328         }
 329     }
 330 
 331     /**
 332      * Base class for AMD64 opcodes with immediate operands.
 333      */
 334     public static class AMD64ImmOp extends AMD64Op {
 335 
 336         private final boolean immIsByte;
 337 
 338         protected AMD64ImmOp(String opcode, boolean immIsByte, int prefix, int op, OpAssertion assertion) {
 339             this(opcode, immIsByte, prefix, op, assertion, null);
 340         }
 341 
 342         protected AMD64ImmOp(String opcode, boolean immIsByte, int prefix, int op, OpAssertion assertion, CPUFeature feature) {
 343             super(opcode, 0, prefix, op, assertion, feature);
 344             this.immIsByte = immIsByte;
 345         }
 346 
 347         protected final void emitImmediate(AMD64Assembler asm, OperandSize size, int imm) {
 348             if (immIsByte) {
 349                 assert imm == (byte) imm;
 350                 asm.emitByte(imm);
 351             } else {
 352                 size.emitImmediate(asm, imm);
 353             }
 354         }
 355 
 356         protected final int immediateSize(OperandSize size) {
 357             if (immIsByte) {
 358                 return 1;
 359             } else {
 360                 return size.getBytes();
 361             }
 362         }
 363     }
 364 
 365     /**
 366      * Opcode with operand order of either RM or MR for 2 address forms.
 367      */
 368     public abstract static class AMD64RROp extends AMD64Op {
 369 
 370         protected AMD64RROp(String opcode, int prefix1, int prefix2, int op, OpAssertion assertion, CPUFeature feature) {
 371             super(opcode, prefix1, prefix2, op, assertion, feature);
 372         }
 373 
 374         protected AMD64RROp(String opcode, int prefix1, int prefix2, int op, boolean dstIsByte, boolean srcIsByte, OpAssertion assertion, CPUFeature feature) {
 375             super(opcode, prefix1, prefix2, op, dstIsByte, srcIsByte, assertion, feature);
 376         }
 377 
 378         public abstract void emit(AMD64Assembler asm, OperandSize size, Register dst, Register src);
 379     }
 380 
 381     /**
 382      * Opcode with operand order of RM.
 383      */
 384     public static class AMD64RMOp extends AMD64RROp {
 385         // @formatter:off
 386         public static final AMD64RMOp IMUL   = new AMD64RMOp("IMUL",         P_0F, 0xAF, OpAssertion.ByteOrLargerAssertion);
 387         public static final AMD64RMOp BSF    = new AMD64RMOp("BSF",          P_0F, 0xBC);
 388         public static final AMD64RMOp BSR    = new AMD64RMOp("BSR",          P_0F, 0xBD);
 389         // POPCNT, TZCNT, and LZCNT support word operation. However, the legacy size prefix should
 390         // be emitted before the mandatory prefix 0xF3. Since we are not emitting bit count for
 391         // 16-bit operands, here we simply use DwordOrLargerAssertion.
 392         public static final AMD64RMOp POPCNT = new AMD64RMOp("POPCNT", 0xF3, P_0F, 0xB8, OpAssertion.DwordOrLargerAssertion, CPUFeature.POPCNT);
 393         public static final AMD64RMOp TZCNT  = new AMD64RMOp("TZCNT",  0xF3, P_0F, 0xBC, OpAssertion.DwordOrLargerAssertion, CPUFeature.BMI1);
 394         public static final AMD64RMOp LZCNT  = new AMD64RMOp("LZCNT",  0xF3, P_0F, 0xBD, OpAssertion.DwordOrLargerAssertion, CPUFeature.LZCNT);
 395         public static final AMD64RMOp MOVZXB = new AMD64RMOp("MOVZXB",       P_0F, 0xB6, false, true, OpAssertion.WordOrLargerAssertion);
 396         public static final AMD64RMOp MOVZX  = new AMD64RMOp("MOVZX",        P_0F, 0xB7, OpAssertion.DwordOrLargerAssertion);
 397         public static final AMD64RMOp MOVSXB = new AMD64RMOp("MOVSXB",       P_0F, 0xBE, false, true, OpAssertion.WordOrLargerAssertion);
 398         public static final AMD64RMOp MOVSX  = new AMD64RMOp("MOVSX",        P_0F, 0xBF, OpAssertion.DwordOrLargerAssertion);
 399         public static final AMD64RMOp MOVSXD = new AMD64RMOp("MOVSXD",             0x63, OpAssertion.QwordAssertion);
 400         public static final AMD64RMOp MOVB   = new AMD64RMOp("MOVB",               0x8A, OpAssertion.ByteAssertion);
 401         public static final AMD64RMOp MOV    = new AMD64RMOp("MOV",                0x8B);
 402         public static final AMD64RMOp CMP    = new AMD64RMOp("CMP",                0x3B);
 403 
 404         // MOVD/MOVQ and MOVSS/MOVSD are the same opcode, just with different operand size prefix
 405         public static final AMD64RMOp MOVD   = new AMD64RMOp("MOVD",   0x66, P_0F, 0x6E, OpAssertion.IntToFloatAssertion, CPUFeature.SSE2);
 406         public static final AMD64RMOp MOVQ   = new AMD64RMOp("MOVQ",   0x66, P_0F, 0x6E, OpAssertion.IntToFloatAssertion, CPUFeature.SSE2);
 407         public static final AMD64RMOp MOVSS  = new AMD64RMOp("MOVSS",        P_0F, 0x10, OpAssertion.FloatAssertion, CPUFeature.SSE);
 408         public static final AMD64RMOp MOVSD  = new AMD64RMOp("MOVSD",        P_0F, 0x10, OpAssertion.FloatAssertion, CPUFeature.SSE);
 409 
 410         // TEST is documented as MR operation, but it's symmetric, and using it as RM operation is more convenient.
 411         public static final AMD64RMOp TESTB  = new AMD64RMOp("TEST",               0x84, OpAssertion.ByteAssertion);
 412         public static final AMD64RMOp TEST   = new AMD64RMOp("TEST",               0x85);
 413         // @formatter:on
 414 
 415         protected AMD64RMOp(String opcode, int op) {
 416             this(opcode, 0, op);
 417         }
 418 
 419         protected AMD64RMOp(String opcode, int op, OpAssertion assertion) {
 420             this(opcode, 0, op, assertion);
 421         }
 422 
 423         protected AMD64RMOp(String opcode, int prefix, int op) {
 424             this(opcode, 0, prefix, op, null);
 425         }
 426 
 427         protected AMD64RMOp(String opcode, int prefix, int op, OpAssertion assertion) {
 428             this(opcode, 0, prefix, op, assertion, null);
 429         }
 430 
 431         protected AMD64RMOp(String opcode, int prefix, int op, OpAssertion assertion, CPUFeature feature) {
 432             this(opcode, 0, prefix, op, assertion, feature);
 433         }
 434 
 435         protected AMD64RMOp(String opcode, int prefix, int op, boolean dstIsByte, boolean srcIsByte, OpAssertion assertion) {
 436             super(opcode, 0, prefix, op, dstIsByte, srcIsByte, assertion, null);
 437         }
 438 
 439         protected AMD64RMOp(String opcode, int prefix1, int prefix2, int op, CPUFeature feature) {
 440             this(opcode, prefix1, prefix2, op, OpAssertion.WordOrLargerAssertion, feature);
 441         }
 442 
 443         protected AMD64RMOp(String opcode, int prefix1, int prefix2, int op, OpAssertion assertion, CPUFeature feature) {
 444             super(opcode, prefix1, prefix2, op, assertion, feature);
 445         }
 446 
 447         @Override
 448         public final void emit(AMD64Assembler asm, OperandSize size, Register dst, Register src) {
 449             assert verify(asm, size, dst, src);
 450             if (isSSEInstruction()) {
 451                 Register nds = Register.None;
 452                 switch (op) {
 453                     case 0x10:
 454                     case 0x51:
 455                         if ((size == SS) || (size == SD)) {
 456                             nds = dst;
 457                         }
 458                         break;
 459                     case 0x2A:
 460                     case 0x54:
 461                     case 0x55:
 462                     case 0x56:
 463                     case 0x57:
 464                     case 0x58:
 465                     case 0x59:
 466                     case 0x5A:
 467                     case 0x5C:
 468                     case 0x5D:
 469                     case 0x5E:
 470                     case 0x5F:
 471                         nds = dst;
 472                         break;
 473                     default:
 474                         break;
 475                 }
 476                 asm.simdPrefix(dst, nds, src, size, prefix1, prefix2, size == QWORD);
 477                 asm.emitByte(op);
 478                 asm.emitModRM(dst, src);
 479             } else {
 480                 emitOpcode(asm, size, getRXB(dst, src), dst.encoding, src.encoding);
 481                 asm.emitModRM(dst, src);
 482             }
 483         }
 484 
 485         public final void emit(AMD64Assembler asm, OperandSize size, Register dst, AMD64Address src) {
 486             assert verify(asm, size, dst, null);
 487             if (isSSEInstruction()) {
 488                 Register nds = Register.None;
 489                 switch (op) {
 490                     case 0x51:
 491                         if ((size == SS) || (size == SD)) {
 492                             nds = dst;
 493                         }
 494                         break;
 495                     case 0x2A:
 496                     case 0x54:
 497                     case 0x55:
 498                     case 0x56:
 499                     case 0x57:
 500                     case 0x58:
 501                     case 0x59:
 502                     case 0x5A:
 503                     case 0x5C:
 504                     case 0x5D:
 505                     case 0x5E:
 506                     case 0x5F:
 507                         nds = dst;
 508                         break;
 509                     default:
 510                         break;
 511                 }
 512                 asm.simdPrefix(dst, nds, src, size, prefix1, prefix2, size == QWORD);
 513                 asm.emitByte(op);
 514                 asm.emitOperandHelper(dst, src, 0);
 515             } else {
 516                 emitOpcode(asm, size, getRXB(dst, src), dst.encoding, 0);
 517                 asm.emitOperandHelper(dst, src, 0);
 518             }
 519         }
 520     }
 521 
 522     /**
 523      * Opcode with operand order of MR.
 524      */
 525     public static class AMD64MROp extends AMD64RROp {
 526         // @formatter:off
 527         public static final AMD64MROp MOVB   = new AMD64MROp("MOVB",               0x88, OpAssertion.ByteAssertion);
 528         public static final AMD64MROp MOV    = new AMD64MROp("MOV",                0x89);
 529 
 530         // MOVD and MOVQ are the same opcode, just with different operand size prefix
 531         // Note that as MR opcodes, they have reverse operand order, so the IntToFloatingAssertion must be used.
 532         public static final AMD64MROp MOVD   = new AMD64MROp("MOVD",   0x66, P_0F, 0x7E, OpAssertion.IntToFloatAssertion, CPUFeature.SSE2);
 533         public static final AMD64MROp MOVQ   = new AMD64MROp("MOVQ",   0x66, P_0F, 0x7E, OpAssertion.IntToFloatAssertion, CPUFeature.SSE2);
 534 
 535         // MOVSS and MOVSD are the same opcode, just with different operand size prefix
 536         public static final AMD64MROp MOVSS  = new AMD64MROp("MOVSS",        P_0F, 0x11, OpAssertion.FloatAssertion, CPUFeature.SSE);
 537         public static final AMD64MROp MOVSD  = new AMD64MROp("MOVSD",        P_0F, 0x11, OpAssertion.FloatAssertion, CPUFeature.SSE);
 538         // @formatter:on
 539 
 540         protected AMD64MROp(String opcode, int op) {
 541             this(opcode, 0, op);
 542         }
 543 
 544         protected AMD64MROp(String opcode, int op, OpAssertion assertion) {
 545             this(opcode, 0, op, assertion);
 546         }
 547 
 548         protected AMD64MROp(String opcode, int prefix, int op) {
 549             this(opcode, prefix, op, OpAssertion.WordOrLargerAssertion);
 550         }
 551 
 552         protected AMD64MROp(String opcode, int prefix, int op, OpAssertion assertion) {
 553             this(opcode, prefix, op, assertion, null);
 554         }
 555 
 556         protected AMD64MROp(String opcode, int prefix, int op, OpAssertion assertion, CPUFeature feature) {
 557             this(opcode, 0, prefix, op, assertion, feature);
 558         }
 559 
 560         protected AMD64MROp(String opcode, int prefix1, int prefix2, int op, OpAssertion assertion, CPUFeature feature) {
 561             super(opcode, prefix1, prefix2, op, assertion, feature);
 562         }
 563 
 564         @Override
 565         public final void emit(AMD64Assembler asm, OperandSize size, Register dst, Register src) {
 566             assert verify(asm, size, src, dst);
 567             if (isSSEInstruction()) {
 568                 Register nds = Register.None;
 569                 switch (op) {
 570                     case 0x11:
 571                         if ((size == SS) || (size == SD)) {
 572                             nds = src;
 573                         }
 574                         break;
 575                     default:
 576                         break;
 577                 }
 578                 asm.simdPrefix(src, nds, dst, size, prefix1, prefix2, size == QWORD);
 579                 asm.emitByte(op);
 580                 asm.emitModRM(src, dst);
 581             } else {
 582                 emitOpcode(asm, size, getRXB(src, dst), src.encoding, dst.encoding);
 583                 asm.emitModRM(src, dst);
 584             }
 585         }
 586 
 587         public final void emit(AMD64Assembler asm, OperandSize size, AMD64Address dst, Register src) {
 588             assert verify(asm, size, src, null);
 589             if (isSSEInstruction()) {
 590                 asm.simdPrefix(src, Register.None, dst, size, prefix1, prefix2, size == QWORD);
 591                 asm.emitByte(op);
 592             } else {
 593                 emitOpcode(asm, size, getRXB(src, dst), src.encoding, 0);
 594             }
 595             asm.emitOperandHelper(src, dst, 0);
 596         }
 597     }
 598 
 599     /**
 600      * Opcodes with operand order of M.
 601      */
 602     public static class AMD64MOp extends AMD64Op {
 603         // @formatter:off
 604         public static final AMD64MOp NOT  = new AMD64MOp("NOT",  0xF7, 2);
 605         public static final AMD64MOp NEG  = new AMD64MOp("NEG",  0xF7, 3);
 606         public static final AMD64MOp MUL  = new AMD64MOp("MUL",  0xF7, 4);
 607         public static final AMD64MOp IMUL = new AMD64MOp("IMUL", 0xF7, 5);
 608         public static final AMD64MOp DIV  = new AMD64MOp("DIV",  0xF7, 6);
 609         public static final AMD64MOp IDIV = new AMD64MOp("IDIV", 0xF7, 7);
 610         public static final AMD64MOp INC  = new AMD64MOp("INC",  0xFF, 0);
 611         public static final AMD64MOp DEC  = new AMD64MOp("DEC",  0xFF, 1);
 612         public static final AMD64MOp PUSH = new AMD64MOp("PUSH", 0xFF, 6);
 613         public static final AMD64MOp POP  = new AMD64MOp("POP",  0x8F, 0, OpAssertion.WordOrDwordAssertion);
 614         // @formatter:on
 615 
 616         private final int ext;
 617 
 618         protected AMD64MOp(String opcode, int op, int ext) {
 619             this(opcode, 0, op, ext);
 620         }
 621 
 622         protected AMD64MOp(String opcode, int prefix, int op, int ext) {
 623             this(opcode, prefix, op, ext, OpAssertion.WordOrLargerAssertion);
 624         }
 625 
 626         protected AMD64MOp(String opcode, int op, int ext, OpAssertion assertion) {
 627             this(opcode, 0, op, ext, assertion);
 628         }
 629 
 630         protected AMD64MOp(String opcode, int prefix, int op, int ext, OpAssertion assertion) {
 631             super(opcode, 0, prefix, op, assertion, null);
 632             this.ext = ext;
 633         }
 634 
 635         public final void emit(AMD64Assembler asm, OperandSize size, Register dst) {
 636             assert verify(asm, size, dst, null);
 637             emitOpcode(asm, size, getRXB(null, dst), 0, dst.encoding);
 638             asm.emitModRM(ext, dst);
 639         }
 640 
 641         public final void emit(AMD64Assembler asm, OperandSize size, AMD64Address dst) {
 642             assert verify(asm, size, null, null);
 643             emitOpcode(asm, size, getRXB(null, dst), 0, 0);
 644             asm.emitOperandHelper(ext, dst, 0);
 645         }
 646     }
 647 
 648     /**
 649      * Opcodes with operand order of MI.
 650      */
 651     public static class AMD64MIOp extends AMD64ImmOp {
 652         // @formatter:off
 653         public static final AMD64MIOp MOVB = new AMD64MIOp("MOVB", true,  0xC6, 0, OpAssertion.ByteAssertion);
 654         public static final AMD64MIOp MOV  = new AMD64MIOp("MOV",  false, 0xC7, 0);
 655         public static final AMD64MIOp TEST = new AMD64MIOp("TEST", false, 0xF7, 0);
 656         // @formatter:on
 657 
 658         private final int ext;
 659 
 660         protected AMD64MIOp(String opcode, boolean immIsByte, int op, int ext) {
 661             this(opcode, immIsByte, op, ext, OpAssertion.WordOrLargerAssertion);
 662         }
 663 
 664         protected AMD64MIOp(String opcode, boolean immIsByte, int op, int ext, OpAssertion assertion) {
 665             this(opcode, immIsByte, 0, op, ext, assertion);
 666         }
 667 
 668         protected AMD64MIOp(String opcode, boolean immIsByte, int prefix, int op, int ext, OpAssertion assertion) {
 669             super(opcode, immIsByte, prefix, op, assertion);
 670             this.ext = ext;
 671         }
 672 
 673         public final void emit(AMD64Assembler asm, OperandSize size, Register dst, int imm) {
 674             emit(asm, size, dst, imm, false);
 675         }
 676 
 677         public final void emit(AMD64Assembler asm, OperandSize size, Register dst, int imm, boolean annotateImm) {
 678             assert verify(asm, size, dst, null);
 679             int insnPos = asm.position();
 680             emitOpcode(asm, size, getRXB(null, dst), 0, dst.encoding);
 681             asm.emitModRM(ext, dst);
 682             int immPos = asm.position();
 683             emitImmediate(asm, size, imm);
 684             int nextInsnPos = asm.position();
 685             if (annotateImm && asm.codePatchingAnnotationConsumer != null) {
 686                 asm.codePatchingAnnotationConsumer.accept(new ImmediateOperandAnnotation(insnPos, immPos, nextInsnPos - immPos, nextInsnPos));
 687             }
 688         }
 689 
 690         public final void emit(AMD64Assembler asm, OperandSize size, AMD64Address dst, int imm) {
 691             emit(asm, size, dst, imm, false);
 692         }
 693 
 694         public final void emit(AMD64Assembler asm, OperandSize size, AMD64Address dst, int imm, boolean annotateImm) {
 695             assert verify(asm, size, null, null);
 696             int insnPos = asm.position();
 697             emitOpcode(asm, size, getRXB(null, dst), 0, 0);
 698             asm.emitOperandHelper(ext, dst, immediateSize(size));
 699             int immPos = asm.position();
 700             emitImmediate(asm, size, imm);
 701             int nextInsnPos = asm.position();
 702             if (annotateImm && asm.codePatchingAnnotationConsumer != null) {
 703                 asm.codePatchingAnnotationConsumer.accept(new ImmediateOperandAnnotation(insnPos, immPos, nextInsnPos - immPos, nextInsnPos));
 704             }
 705         }
 706     }
 707 
 708     /**
 709      * Opcodes with operand order of RMI.
 710      *
 711      * We only have one form of round as the operation is always treated with single variant input,
 712      * making its extension to 3 address forms redundant.
 713      */
 714     public static class AMD64RMIOp extends AMD64ImmOp {
 715         // @formatter:off
 716         public static final AMD64RMIOp IMUL    = new AMD64RMIOp("IMUL", false, 0x69);
 717         public static final AMD64RMIOp IMUL_SX = new AMD64RMIOp("IMUL", true,  0x6B);
 718         public static final AMD64RMIOp ROUNDSS = new AMD64RMIOp("ROUNDSS", true, P_0F3A, 0x0A, OpAssertion.PackedDoubleAssertion, CPUFeature.SSE4_1);
 719         public static final AMD64RMIOp ROUNDSD = new AMD64RMIOp("ROUNDSD", true, P_0F3A, 0x0B, OpAssertion.PackedDoubleAssertion, CPUFeature.SSE4_1);
 720         // @formatter:on
 721 
 722         protected AMD64RMIOp(String opcode, boolean immIsByte, int op) {
 723             this(opcode, immIsByte, 0, op, OpAssertion.WordOrLargerAssertion, null);
 724         }
 725 
 726         protected AMD64RMIOp(String opcode, boolean immIsByte, int prefix, int op, OpAssertion assertion, CPUFeature feature) {
 727             super(opcode, immIsByte, prefix, op, assertion, feature);
 728         }
 729 
 730         public final void emit(AMD64Assembler asm, OperandSize size, Register dst, Register src, int imm) {
 731             assert verify(asm, size, dst, src);
 732             if (isSSEInstruction()) {
 733                 Register nds = Register.None;
 734                 switch (op) {
 735                     case 0x0A:
 736                     case 0x0B:
 737                         nds = dst;
 738                         break;
 739                     default:
 740                         break;
 741                 }
 742                 asm.simdPrefix(dst, nds, src, size, prefix1, prefix2, false);
 743                 asm.emitByte(op);
 744                 asm.emitModRM(dst, src);
 745             } else {
 746                 emitOpcode(asm, size, getRXB(dst, src), dst.encoding, src.encoding);
 747                 asm.emitModRM(dst, src);
 748             }
 749             emitImmediate(asm, size, imm);
 750         }
 751 
 752         public final void emit(AMD64Assembler asm, OperandSize size, Register dst, AMD64Address src, int imm) {
 753             assert verify(asm, size, dst, null);
 754             if (isSSEInstruction()) {
 755                 Register nds = Register.None;
 756                 switch (op) {
 757                     case 0x0A:
 758                     case 0x0B:
 759                         nds = dst;
 760                         break;
 761                     default:
 762                         break;
 763                 }
 764                 asm.simdPrefix(dst, nds, src, size, prefix1, prefix2, false);
 765                 asm.emitByte(op);
 766             } else {
 767                 emitOpcode(asm, size, getRXB(dst, src), dst.encoding, 0);
 768             }
 769             asm.emitOperandHelper(dst, src, immediateSize(size));
 770             emitImmediate(asm, size, imm);
 771         }
 772     }
 773 
 774     public static class SSEOp extends AMD64RMOp {
 775         // @formatter:off
 776         public static final SSEOp CVTSI2SS  = new SSEOp("CVTSI2SS",  0xF3, P_0F, 0x2A, OpAssertion.IntToFloatAssertion);
 777         public static final SSEOp CVTSI2SD  = new SSEOp("CVTSI2SD",  0xF2, P_0F, 0x2A, OpAssertion.IntToFloatAssertion);
 778         public static final SSEOp CVTTSS2SI = new SSEOp("CVTTSS2SI", 0xF3, P_0F, 0x2C, OpAssertion.FloatToIntAssertion);
 779         public static final SSEOp CVTTSD2SI = new SSEOp("CVTTSD2SI", 0xF2, P_0F, 0x2C, OpAssertion.FloatToIntAssertion);
 780         public static final SSEOp UCOMIS    = new SSEOp("UCOMIS",          P_0F, 0x2E, OpAssertion.PackedFloatAssertion);
 781         public static final SSEOp SQRT      = new SSEOp("SQRT",            P_0F, 0x51);
 782         public static final SSEOp AND       = new SSEOp("AND",             P_0F, 0x54, OpAssertion.PackedFloatAssertion);
 783         public static final SSEOp ANDN      = new SSEOp("ANDN",            P_0F, 0x55, OpAssertion.PackedFloatAssertion);
 784         public static final SSEOp OR        = new SSEOp("OR",              P_0F, 0x56, OpAssertion.PackedFloatAssertion);
 785         public static final SSEOp XOR       = new SSEOp("XOR",             P_0F, 0x57, OpAssertion.PackedFloatAssertion);
 786         public static final SSEOp ADD       = new SSEOp("ADD",             P_0F, 0x58);
 787         public static final SSEOp MUL       = new SSEOp("MUL",             P_0F, 0x59);
 788         public static final SSEOp CVTSS2SD  = new SSEOp("CVTSS2SD",        P_0F, 0x5A, OpAssertion.SingleAssertion);
 789         public static final SSEOp CVTSD2SS  = new SSEOp("CVTSD2SS",        P_0F, 0x5A, OpAssertion.DoubleAssertion);
 790         public static final SSEOp SUB       = new SSEOp("SUB",             P_0F, 0x5C);
 791         public static final SSEOp MIN       = new SSEOp("MIN",             P_0F, 0x5D);
 792         public static final SSEOp DIV       = new SSEOp("DIV",             P_0F, 0x5E);
 793         public static final SSEOp MAX       = new SSEOp("MAX",             P_0F, 0x5F);
 794         // @formatter:on
 795 
 796         protected SSEOp(String opcode, int prefix, int op) {
 797             this(opcode, prefix, op, OpAssertion.FloatAssertion);
 798         }
 799 
 800         protected SSEOp(String opcode, int prefix, int op, OpAssertion assertion) {
 801             this(opcode, 0, prefix, op, assertion);
 802         }
 803 
 804         protected SSEOp(String opcode, int mandatoryPrefix, int prefix, int op, OpAssertion assertion) {
 805             super(opcode, mandatoryPrefix, prefix, op, assertion, CPUFeature.SSE2);
 806         }
 807     }
 808 
 809     /**
 810      * Arithmetic operation with operand order of RM, MR or MI.
 811      */
 812     public static final class AMD64BinaryArithmetic {
 813         // @formatter:off
 814         public static final AMD64BinaryArithmetic ADD = new AMD64BinaryArithmetic("ADD", 0);
 815         public static final AMD64BinaryArithmetic OR  = new AMD64BinaryArithmetic("OR",  1);
 816         public static final AMD64BinaryArithmetic ADC = new AMD64BinaryArithmetic("ADC", 2);
 817         public static final AMD64BinaryArithmetic SBB = new AMD64BinaryArithmetic("SBB", 3);
 818         public static final AMD64BinaryArithmetic AND = new AMD64BinaryArithmetic("AND", 4);
 819         public static final AMD64BinaryArithmetic SUB = new AMD64BinaryArithmetic("SUB", 5);
 820         public static final AMD64BinaryArithmetic XOR = new AMD64BinaryArithmetic("XOR", 6);
 821         public static final AMD64BinaryArithmetic CMP = new AMD64BinaryArithmetic("CMP", 7);
 822         // @formatter:on
 823 
 824         private final AMD64MIOp byteImmOp;
 825         private final AMD64MROp byteMrOp;
 826         private final AMD64RMOp byteRmOp;
 827 
 828         private final AMD64MIOp immOp;
 829         private final AMD64MIOp immSxOp;
 830         private final AMD64MROp mrOp;
 831         private final AMD64RMOp rmOp;
 832 
 833         private AMD64BinaryArithmetic(String opcode, int code) {
 834             int baseOp = code << 3;
 835 
 836             byteImmOp = new AMD64MIOp(opcode, true, 0, 0x80, code, OpAssertion.ByteAssertion);
 837             byteMrOp = new AMD64MROp(opcode, 0, baseOp, OpAssertion.ByteAssertion);
 838             byteRmOp = new AMD64RMOp(opcode, 0, baseOp | 0x02, OpAssertion.ByteAssertion);
 839 
 840             immOp = new AMD64MIOp(opcode, false, 0, 0x81, code, OpAssertion.WordOrLargerAssertion);
 841             immSxOp = new AMD64MIOp(opcode, true, 0, 0x83, code, OpAssertion.WordOrLargerAssertion);
 842             mrOp = new AMD64MROp(opcode, 0, baseOp | 0x01, OpAssertion.WordOrLargerAssertion);
 843             rmOp = new AMD64RMOp(opcode, 0, baseOp | 0x03, OpAssertion.WordOrLargerAssertion);
 844         }
 845 
 846         public AMD64MIOp getMIOpcode(OperandSize size, boolean sx) {
 847             if (size == BYTE) {
 848                 return byteImmOp;
 849             } else if (sx) {
 850                 return immSxOp;
 851             } else {
 852                 return immOp;
 853             }
 854         }
 855 
 856         public AMD64MROp getMROpcode(OperandSize size) {
 857             if (size == BYTE) {
 858                 return byteMrOp;
 859             } else {
 860                 return mrOp;
 861             }
 862         }
 863 
 864         public AMD64RMOp getRMOpcode(OperandSize size) {
 865             if (size == BYTE) {
 866                 return byteRmOp;
 867             } else {
 868                 return rmOp;
 869             }
 870         }
 871     }
 872 
 873     /**
 874      * Shift operation with operand order of M1, MC or MI.
 875      */
 876     public static final class AMD64Shift {
 877         // @formatter:off
 878         public static final AMD64Shift ROL = new AMD64Shift("ROL", 0);
 879         public static final AMD64Shift ROR = new AMD64Shift("ROR", 1);
 880         public static final AMD64Shift RCL = new AMD64Shift("RCL", 2);
 881         public static final AMD64Shift RCR = new AMD64Shift("RCR", 3);
 882         public static final AMD64Shift SHL = new AMD64Shift("SHL", 4);
 883         public static final AMD64Shift SHR = new AMD64Shift("SHR", 5);
 884         public static final AMD64Shift SAR = new AMD64Shift("SAR", 7);
 885         // @formatter:on
 886 
 887         public final AMD64MOp m1Op;
 888         public final AMD64MOp mcOp;
 889         public final AMD64MIOp miOp;
 890 
 891         private AMD64Shift(String opcode, int code) {
 892             m1Op = new AMD64MOp(opcode, 0, 0xD1, code, OpAssertion.WordOrLargerAssertion);
 893             mcOp = new AMD64MOp(opcode, 0, 0xD3, code, OpAssertion.WordOrLargerAssertion);
 894             miOp = new AMD64MIOp(opcode, true, 0, 0xC1, code, OpAssertion.WordOrLargerAssertion);
 895         }
 896     }
 897 
 898     private enum VEXOpAssertion {
 899         AVX1(CPUFeature.AVX, CPUFeature.AVX),
 900         AVX1_2(CPUFeature.AVX, CPUFeature.AVX2),
 901         AVX2(CPUFeature.AVX2, CPUFeature.AVX2),
 902         AVX1_128ONLY(CPUFeature.AVX, null),
 903         AVX1_256ONLY(null, CPUFeature.AVX),
 904         AVX2_256ONLY(null, CPUFeature.AVX2),
 905         XMM_CPU(CPUFeature.AVX, null, XMM, null, CPU, null),
 906         XMM_XMM_CPU(CPUFeature.AVX, null, XMM, XMM, CPU, null),
 907         CPU_XMM(CPUFeature.AVX, null, CPU, null, XMM, null),
 908         AVX1_2_CPU_XMM(CPUFeature.AVX, CPUFeature.AVX2, CPU, null, XMM, null),
 909         BMI1(CPUFeature.BMI1, null, CPU, CPU, CPU, null),
 910         BMI2(CPUFeature.BMI2, null, CPU, CPU, CPU, null);
 911 
 912         private final CPUFeature l128feature;
 913         private final CPUFeature l256feature;
 914 
 915         private final RegisterCategory rCategory;
 916         private final RegisterCategory vCategory;
 917         private final RegisterCategory mCategory;
 918         private final RegisterCategory imm8Category;
 919 
 920         VEXOpAssertion(CPUFeature l128feature, CPUFeature l256feature) {
 921             this(l128feature, l256feature, XMM, XMM, XMM, XMM);
 922         }
 923 
 924         VEXOpAssertion(CPUFeature l128feature, CPUFeature l256feature, RegisterCategory rCategory, RegisterCategory vCategory, RegisterCategory mCategory, RegisterCategory imm8Category) {
 925             this.l128feature = l128feature;
 926             this.l256feature = l256feature;
 927             this.rCategory = rCategory;
 928             this.vCategory = vCategory;
 929             this.mCategory = mCategory;
 930             this.imm8Category = imm8Category;
 931         }
 932 
 933         public boolean check(AMD64 arch, AVXSize size, Register r, Register v, Register m) {
 934             return check(arch, getLFlag(size), r, v, m, null);
 935         }
 936 
 937         public boolean check(AMD64 arch, AVXSize size, Register r, Register v, Register m, Register imm8) {
 938             return check(arch, getLFlag(size), r, v, m, imm8);
 939         }
 940 
 941         public boolean check(AMD64 arch, int l, Register r, Register v, Register m, Register imm8) {
 942             switch (l) {
 943                 case L128:
 944                     assert l128feature != null && arch.getFeatures().contains(l128feature) : "emitting illegal 128 bit instruction";
 945                     break;
 946                 case L256:
 947                     assert l256feature != null && arch.getFeatures().contains(l256feature) : "emitting illegal 256 bit instruction";
 948                     break;
 949             }
 950             if (r != null) {
 951                 assert r.getRegisterCategory().equals(rCategory);
 952             }
 953             if (v != null) {
 954                 assert v.getRegisterCategory().equals(vCategory);
 955             }
 956             if (m != null) {
 957                 assert m.getRegisterCategory().equals(mCategory);
 958             }
 959             if (imm8 != null) {
 960                 assert imm8.getRegisterCategory().equals(imm8Category);
 961             }
 962             return true;
 963         }
 964 
 965         public boolean supports(EnumSet<CPUFeature> features, AVXSize avxSize) {
 966             switch (avxSize) {
 967                 case XMM:
 968                     return l128feature != null && features.contains(l128feature);
 969                 case YMM:
 970                     return l256feature != null && features.contains(l256feature);
 971                 default:
 972                     throw GraalError.shouldNotReachHere();
 973             }
 974         }
 975     }
 976 
 977     /**
 978      * Base class for VEX-encoded instructions.
 979      */
 980     public static class VexOp {
 981         protected final int pp;
 982         protected final int mmmmm;
 983         protected final int w;
 984         protected final int op;
 985 
 986         private final String opcode;
 987         protected final VEXOpAssertion assertion;
 988 
 989         protected VexOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
 990             this.pp = pp;
 991             this.mmmmm = mmmmm;
 992             this.w = w;
 993             this.op = op;
 994             this.opcode = opcode;
 995             this.assertion = assertion;
 996         }
 997 
 998         public final boolean isSupported(AMD64Assembler vasm, AVXSize size) {
 999             return assertion.supports(((AMD64) vasm.target.arch).getFeatures(), size);
1000         }
1001 
1002         @Override
1003         public String toString() {
1004             return opcode;
1005         }
1006     }
1007 
1008     /**
1009      * VEX-encoded instructions with an operand order of RM, but the M operand must be a register.
1010      */
1011     public static class VexRROp extends VexOp {
1012         // @formatter:off
1013         public static final VexRROp VMASKMOVDQU = new VexRROp("VMASKMOVDQU", P_66, M_0F, WIG, 0xF7, VEXOpAssertion.AVX1_128ONLY);
1014         // @formatter:on
1015 
1016         protected VexRROp(String opcode, int pp, int mmmmm, int w, int op) {
1017             this(opcode, pp, mmmmm, w, op, VEXOpAssertion.AVX1);
1018         }
1019 
1020         protected VexRROp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
1021             super(opcode, pp, mmmmm, w, op, assertion);
1022         }
1023 
1024         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src) {
1025             assert assertion.check((AMD64) asm.target.arch, size, dst, null, src);
1026             assert op != 0x1A || op != 0x5A;
1027             asm.vexPrefix(dst, Register.None, src, size, pp, mmmmm, w, false);
1028             asm.emitByte(op);
1029             asm.emitModRM(dst, src);
1030         }
1031     }
1032 
1033     /**
1034      * VEX-encoded instructions with an operand order of RM.
1035      */
1036     public static class VexRMOp extends VexRROp {
1037         // @formatter:off
1038         public static final VexRMOp VCVTTSS2SI      = new VexRMOp("VCVTTSS2SI",      P_F3, M_0F,   W0,  0x2C, VEXOpAssertion.CPU_XMM);
1039         public static final VexRMOp VCVTTSS2SQ      = new VexRMOp("VCVTTSS2SQ",      P_F3, M_0F,   W1,  0x2C, VEXOpAssertion.CPU_XMM);
1040         public static final VexRMOp VCVTTSD2SI      = new VexRMOp("VCVTTSD2SI",      P_F2, M_0F,   W0,  0x2C, VEXOpAssertion.CPU_XMM);
1041         public static final VexRMOp VCVTTSD2SQ      = new VexRMOp("VCVTTSD2SQ",      P_F2, M_0F,   W1,  0x2C, VEXOpAssertion.CPU_XMM);
1042         public static final VexRMOp VCVTPS2PD       = new VexRMOp("VCVTPS2PD",       P_,   M_0F,   WIG, 0x5A);
1043         public static final VexRMOp VCVTPD2PS       = new VexRMOp("VCVTPD2PS",       P_66, M_0F,   WIG, 0x5A);
1044         public static final VexRMOp VCVTDQ2PS       = new VexRMOp("VCVTDQ2PS",       P_,   M_0F,   WIG, 0x5B);
1045         public static final VexRMOp VCVTTPS2DQ      = new VexRMOp("VCVTTPS2DQ",      P_F3, M_0F,   WIG, 0x5B);
1046         public static final VexRMOp VCVTTPD2DQ      = new VexRMOp("VCVTTPD2DQ",      P_66, M_0F,   WIG, 0xE6);
1047         public static final VexRMOp VCVTDQ2PD       = new VexRMOp("VCVTDQ2PD",       P_F3, M_0F,   WIG, 0xE6);
1048         public static final VexRMOp VBROADCASTSS    = new VexRMOp("VBROADCASTSS",    P_66, M_0F38, W0,  0x18);
1049         public static final VexRMOp VBROADCASTSD    = new VexRMOp("VBROADCASTSD",    P_66, M_0F38, W0,  0x19, VEXOpAssertion.AVX1_256ONLY);
1050         public static final VexRMOp VBROADCASTF128  = new VexRMOp("VBROADCASTF128",  P_66, M_0F38, W0,  0x1A, VEXOpAssertion.AVX1_256ONLY);
1051         public static final VexRMOp VPBROADCASTI128 = new VexRMOp("VPBROADCASTI128", P_66, M_0F38, W0,  0x5A, VEXOpAssertion.AVX2_256ONLY);
1052         public static final VexRMOp VPBROADCASTB    = new VexRMOp("VPBROADCASTB",    P_66, M_0F38, W0,  0x78, VEXOpAssertion.AVX2);
1053         public static final VexRMOp VPBROADCASTW    = new VexRMOp("VPBROADCASTW",    P_66, M_0F38, W0,  0x79, VEXOpAssertion.AVX2);
1054         public static final VexRMOp VPBROADCASTD    = new VexRMOp("VPBROADCASTD",    P_66, M_0F38, W0,  0x58, VEXOpAssertion.AVX2);
1055         public static final VexRMOp VPBROADCASTQ    = new VexRMOp("VPBROADCASTQ",    P_66, M_0F38, W0,  0x59, VEXOpAssertion.AVX2);
1056         public static final VexRMOp VPMOVMSKB       = new VexRMOp("VPMOVMSKB",       P_66, M_0F,   WIG, 0xD7, VEXOpAssertion.AVX1_2_CPU_XMM);
1057         public static final VexRMOp VPMOVSXBW       = new VexRMOp("VPMOVSXBW",       P_66, M_0F38, WIG, 0x20);
1058         public static final VexRMOp VPMOVSXBD       = new VexRMOp("VPMOVSXBD",       P_66, M_0F38, WIG, 0x21);
1059         public static final VexRMOp VPMOVSXBQ       = new VexRMOp("VPMOVSXBQ",       P_66, M_0F38, WIG, 0x22);
1060         public static final VexRMOp VPMOVSXWD       = new VexRMOp("VPMOVSXWD",       P_66, M_0F38, WIG, 0x23);
1061         public static final VexRMOp VPMOVSXWQ       = new VexRMOp("VPMOVSXWQ",       P_66, M_0F38, WIG, 0x24);
1062         public static final VexRMOp VPMOVSXDQ       = new VexRMOp("VPMOVSXDQ",       P_66, M_0F38, WIG, 0x25);
1063         public static final VexRMOp VPMOVZXBW       = new VexRMOp("VPMOVZXBW",       P_66, M_0F38, WIG, 0x30);
1064         public static final VexRMOp VPMOVZXBD       = new VexRMOp("VPMOVZXBD",       P_66, M_0F38, WIG, 0x31);
1065         public static final VexRMOp VPMOVZXBQ       = new VexRMOp("VPMOVZXBQ",       P_66, M_0F38, WIG, 0x32);
1066         public static final VexRMOp VPMOVZXWD       = new VexRMOp("VPMOVZXWD",       P_66, M_0F38, WIG, 0x33);
1067         public static final VexRMOp VPMOVZXWQ       = new VexRMOp("VPMOVZXWQ",       P_66, M_0F38, WIG, 0x34);
1068         public static final VexRMOp VPMOVZXDQ       = new VexRMOp("VPMOVZXDQ",       P_66, M_0F38, WIG, 0x35);
1069         public static final VexRMOp VPTEST          = new VexRMOp("VPTEST",          P_66, M_0F38, WIG, 0x17);
1070         public static final VexRMOp VSQRTPD         = new VexRMOp("VSQRTPD",         P_66, M_0F,   WIG, 0x51);
1071         public static final VexRMOp VSQRTPS         = new VexRMOp("VSQRTPS",         P_,   M_0F,   WIG, 0x51);
1072         public static final VexRMOp VSQRTSD         = new VexRMOp("VSQRTSD",         P_F2, M_0F,   WIG, 0x51);
1073         public static final VexRMOp VSQRTSS         = new VexRMOp("VSQRTSS",         P_F3, M_0F,   WIG, 0x51);
1074         public static final VexRMOp VUCOMISS        = new VexRMOp("VUCOMISS",        P_,   M_0F,   WIG, 0x2E);
1075         public static final VexRMOp VUCOMISD        = new VexRMOp("VUCOMISD",        P_66, M_0F,   WIG, 0x2E);
1076         // @formatter:on
1077 
1078         protected VexRMOp(String opcode, int pp, int mmmmm, int w, int op) {
1079             this(opcode, pp, mmmmm, w, op, VEXOpAssertion.AVX1);
1080         }
1081 
1082         protected VexRMOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
1083             super(opcode, pp, mmmmm, w, op, assertion);
1084         }
1085 
1086         public void emit(AMD64Assembler asm, AVXSize size, Register dst, AMD64Address src) {
1087             assert assertion.check((AMD64) asm.target.arch, size, dst, null, null);
1088             asm.vexPrefix(dst, Register.None, src, size, pp, mmmmm, w, false);
1089             asm.emitByte(op);
1090             asm.emitOperandHelper(dst, src, 0);
1091         }
1092     }
1093 
1094     /**
1095      * VEX-encoded move instructions.
1096      * <p>
1097      * These instructions have two opcodes: op is the forward move instruction with an operand order
1098      * of RM, and opReverse is the reverse move instruction with an operand order of MR.
1099      */
1100     public static final class VexMoveOp extends VexRMOp {
1101         // @formatter:off
1102         public static final VexMoveOp VMOVDQA = new VexMoveOp("VMOVDQA", P_66, M_0F, WIG, 0x6F, 0x7F);
1103         public static final VexMoveOp VMOVDQU = new VexMoveOp("VMOVDQU", P_F3, M_0F, WIG, 0x6F, 0x7F);
1104         public static final VexMoveOp VMOVAPS = new VexMoveOp("VMOVAPS", P_,   M_0F, WIG, 0x28, 0x29);
1105         public static final VexMoveOp VMOVAPD = new VexMoveOp("VMOVAPD", P_66, M_0F, WIG, 0x28, 0x29);
1106         public static final VexMoveOp VMOVUPS = new VexMoveOp("VMOVUPS", P_,   M_0F, WIG, 0x10, 0x11);
1107         public static final VexMoveOp VMOVUPD = new VexMoveOp("VMOVUPD", P_66, M_0F, WIG, 0x10, 0x11);
1108         public static final VexMoveOp VMOVSS  = new VexMoveOp("VMOVSS",  P_F3, M_0F, WIG, 0x10, 0x11);
1109         public static final VexMoveOp VMOVSD  = new VexMoveOp("VMOVSD",  P_F2, M_0F, WIG, 0x10, 0x11);
1110         public static final VexMoveOp VMOVD   = new VexMoveOp("VMOVD",   P_66, M_0F, W0,  0x6E, 0x7E, VEXOpAssertion.XMM_CPU);
1111         public static final VexMoveOp VMOVQ   = new VexMoveOp("VMOVQ",   P_66, M_0F, W1,  0x6E, 0x7E, VEXOpAssertion.XMM_CPU);
1112         // @formatter:on
1113 
1114         private final int opReverse;
1115 
1116         private VexMoveOp(String opcode, int pp, int mmmmm, int w, int op, int opReverse) {
1117             this(opcode, pp, mmmmm, w, op, opReverse, VEXOpAssertion.AVX1);
1118         }
1119 
1120         private VexMoveOp(String opcode, int pp, int mmmmm, int w, int op, int opReverse, VEXOpAssertion assertion) {
1121             super(opcode, pp, mmmmm, w, op, assertion);
1122             this.opReverse = opReverse;
1123         }
1124 
1125         public void emit(AMD64Assembler asm, AVXSize size, AMD64Address dst, Register src) {
1126             assert assertion.check((AMD64) asm.target.arch, size, src, null, null);
1127             asm.vexPrefix(src, Register.None, dst, size, pp, mmmmm, w, false);
1128             asm.emitByte(opReverse);
1129             asm.emitOperandHelper(src, dst, 0);
1130         }
1131 
1132         public void emitReverse(AMD64Assembler asm, AVXSize size, Register dst, Register src) {
1133             assert assertion.check((AMD64) asm.target.arch, size, src, null, dst);
1134             asm.vexPrefix(src, Register.None, dst, size, pp, mmmmm, w, false);
1135             asm.emitByte(opReverse);
1136             asm.emitModRM(src, dst);
1137         }
1138     }
1139 
1140     public interface VexRRIOp {
1141         void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src, int imm8);
1142     }
1143 
1144     /**
1145      * VEX-encoded instructions with an operand order of RMI.
1146      */
1147     public static final class VexRMIOp extends VexOp implements VexRRIOp {
1148         // @formatter:off
1149         public static final VexRMIOp VPERMQ   = new VexRMIOp("VPERMQ",   P_66, M_0F3A, W1,  0x00, VEXOpAssertion.AVX2_256ONLY);
1150         public static final VexRMIOp VPSHUFLW = new VexRMIOp("VPSHUFLW", P_F2, M_0F,   WIG, 0x70, VEXOpAssertion.AVX1_2);
1151         public static final VexRMIOp VPSHUFHW = new VexRMIOp("VPSHUFHW", P_F3, M_0F,   WIG, 0x70, VEXOpAssertion.AVX1_2);
1152         public static final VexRMIOp VPSHUFD  = new VexRMIOp("VPSHUFD",  P_66, M_0F,   WIG, 0x70, VEXOpAssertion.AVX1_2);
1153         // @formatter:on
1154 
1155         private VexRMIOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
1156             super(opcode, pp, mmmmm, w, op, assertion);
1157         }
1158 
1159         @Override
1160         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src, int imm8) {
1161             assert assertion.check((AMD64) asm.target.arch, size, dst, null, src);
1162             asm.vexPrefix(dst, Register.None, src, size, pp, mmmmm, w, false);
1163             asm.emitByte(op);
1164             asm.emitModRM(dst, src);
1165             asm.emitByte(imm8);
1166         }
1167 
1168         public void emit(AMD64Assembler asm, AVXSize size, Register dst, AMD64Address src, int imm8) {
1169             assert assertion.check((AMD64) asm.target.arch, size, dst, null, null);
1170             asm.vexPrefix(dst, Register.None, src, size, pp, mmmmm, w, false);
1171             asm.emitByte(op);
1172             asm.emitOperandHelper(dst, src, 1);
1173             asm.emitByte(imm8);
1174         }
1175     }
1176 
1177     /**
1178      * VEX-encoded instructions with an operand order of MRI.
1179      */
1180     public static final class VexMRIOp extends VexOp implements VexRRIOp {
1181         // @formatter:off
1182         public static final VexMRIOp VEXTRACTF128 = new VexMRIOp("VEXTRACTF128", P_66, M_0F3A, W0, 0x19, VEXOpAssertion.AVX1_256ONLY);
1183         public static final VexMRIOp VEXTRACTI128 = new VexMRIOp("VEXTRACTI128", P_66, M_0F3A, W0, 0x39, VEXOpAssertion.AVX2_256ONLY);
1184         public static final VexMRIOp VPEXTRB      = new VexMRIOp("VPEXTRB",      P_66, M_0F3A, W0, 0x14, VEXOpAssertion.XMM_CPU);
1185         public static final VexMRIOp VPEXTRW      = new VexMRIOp("VPEXTRW",      P_66, M_0F3A, W0, 0x15, VEXOpAssertion.XMM_CPU);
1186         public static final VexMRIOp VPEXTRD      = new VexMRIOp("VPEXTRD",      P_66, M_0F3A, W0, 0x16, VEXOpAssertion.XMM_CPU);
1187         public static final VexMRIOp VPEXTRQ      = new VexMRIOp("VPEXTRQ",      P_66, M_0F3A, W1, 0x16, VEXOpAssertion.XMM_CPU);
1188         // @formatter:on
1189 
1190         private VexMRIOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
1191             super(opcode, pp, mmmmm, w, op, assertion);
1192         }
1193 
1194         @Override
1195         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src, int imm8) {
1196             assert assertion.check((AMD64) asm.target.arch, size, src, null, dst);
1197             asm.vexPrefix(src, Register.None, dst, size, pp, mmmmm, w, false);
1198             asm.emitByte(op);
1199             asm.emitModRM(src, dst);
1200             asm.emitByte(imm8);
1201         }
1202 
1203         public void emit(AMD64Assembler asm, AVXSize size, AMD64Address dst, Register src, int imm8) {
1204             assert assertion.check((AMD64) asm.target.arch, size, src, null, null);
1205             asm.vexPrefix(src, Register.None, dst, size, pp, mmmmm, w, false);
1206             asm.emitByte(op);
1207             asm.emitOperandHelper(src, dst, 1);
1208             asm.emitByte(imm8);
1209         }
1210     }
1211 
1212     /**
1213      * VEX-encoded instructions with an operand order of RVMR.
1214      */
1215     public static class VexRVMROp extends VexOp {
1216         // @formatter:off
1217         public static final VexRVMROp VPBLENDVB  = new VexRVMROp("VPBLENDVB",  P_66, M_0F3A, W0, 0x4C, VEXOpAssertion.AVX1_2);
1218         public static final VexRVMROp VPBLENDVPS = new VexRVMROp("VPBLENDVPS", P_66, M_0F3A, W0, 0x4A, VEXOpAssertion.AVX1);
1219         public static final VexRVMROp VPBLENDVPD = new VexRVMROp("VPBLENDVPD", P_66, M_0F3A, W0, 0x4B, VEXOpAssertion.AVX1);
1220         // @formatter:on
1221 
1222         protected VexRVMROp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
1223             super(opcode, pp, mmmmm, w, op, assertion);
1224         }
1225 
1226         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register mask, Register src1, Register src2) {
1227             assert assertion.check((AMD64) asm.target.arch, size, dst, mask, src1, src2);
1228             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w, false);
1229             asm.emitByte(op);
1230             asm.emitModRM(dst, src2);
1231             asm.emitByte(mask.encoding() << 4);
1232         }
1233 
1234         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register mask, Register src1, AMD64Address src2) {
1235             assert assertion.check((AMD64) asm.target.arch, size, dst, mask, src1, null);
1236             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w, false);
1237             asm.emitByte(op);
1238             asm.emitOperandHelper(dst, src2, 0);
1239             asm.emitByte(mask.encoding() << 4);
1240         }
1241     }
1242 
1243     /**
1244      * VEX-encoded instructions with an operand order of RVM.
1245      */
1246     public static class VexRVMOp extends VexOp {
1247         // @formatter:off
1248         public static final VexRVMOp VANDPS    = new VexRVMOp("VANDPS",    P_,   M_0F,   WIG, 0x54);
1249         public static final VexRVMOp VANDPD    = new VexRVMOp("VANDPD",    P_66, M_0F,   WIG, 0x54);
1250         public static final VexRVMOp VANDNPS   = new VexRVMOp("VANDNPS",   P_,   M_0F,   WIG, 0x55);
1251         public static final VexRVMOp VANDNPD   = new VexRVMOp("VANDNPD",   P_66, M_0F,   WIG, 0x55);
1252         public static final VexRVMOp VORPS     = new VexRVMOp("VORPS",     P_,   M_0F,   WIG, 0x56);
1253         public static final VexRVMOp VORPD     = new VexRVMOp("VORPD",     P_66, M_0F,   WIG, 0x56);
1254         public static final VexRVMOp VXORPS    = new VexRVMOp("VXORPS",    P_,   M_0F,   WIG, 0x57);
1255         public static final VexRVMOp VXORPD    = new VexRVMOp("VXORPD",    P_66, M_0F,   WIG, 0x57);
1256         public static final VexRVMOp VADDPS    = new VexRVMOp("VADDPS",    P_,   M_0F,   WIG, 0x58);
1257         public static final VexRVMOp VADDPD    = new VexRVMOp("VADDPD",    P_66, M_0F,   WIG, 0x58);
1258         public static final VexRVMOp VADDSS    = new VexRVMOp("VADDSS",    P_F3, M_0F,   WIG, 0x58);
1259         public static final VexRVMOp VADDSD    = new VexRVMOp("VADDSD",    P_F2, M_0F,   WIG, 0x58);
1260         public static final VexRVMOp VMULPS    = new VexRVMOp("VMULPS",    P_,   M_0F,   WIG, 0x59);
1261         public static final VexRVMOp VMULPD    = new VexRVMOp("VMULPD",    P_66, M_0F,   WIG, 0x59);
1262         public static final VexRVMOp VMULSS    = new VexRVMOp("VMULSS",    P_F3, M_0F,   WIG, 0x59);
1263         public static final VexRVMOp VMULSD    = new VexRVMOp("VMULSD",    P_F2, M_0F,   WIG, 0x59);
1264         public static final VexRVMOp VSUBPS    = new VexRVMOp("VSUBPS",    P_,   M_0F,   WIG, 0x5C);
1265         public static final VexRVMOp VSUBPD    = new VexRVMOp("VSUBPD",    P_66, M_0F,   WIG, 0x5C);
1266         public static final VexRVMOp VSUBSS    = new VexRVMOp("VSUBSS",    P_F3, M_0F,   WIG, 0x5C);
1267         public static final VexRVMOp VSUBSD    = new VexRVMOp("VSUBSD",    P_F2, M_0F,   WIG, 0x5C);
1268         public static final VexRVMOp VMINPS    = new VexRVMOp("VMINPS",    P_,   M_0F,   WIG, 0x5D);
1269         public static final VexRVMOp VMINPD    = new VexRVMOp("VMINPD",    P_66, M_0F,   WIG, 0x5D);
1270         public static final VexRVMOp VMINSS    = new VexRVMOp("VMINSS",    P_F3, M_0F,   WIG, 0x5D);
1271         public static final VexRVMOp VMINSD    = new VexRVMOp("VMINSD",    P_F2, M_0F,   WIG, 0x5D);
1272         public static final VexRVMOp VDIVPS    = new VexRVMOp("VDIVPS",    P_,   M_0F,   WIG, 0x5E);
1273         public static final VexRVMOp VDIVPD    = new VexRVMOp("VDIVPD",    P_66, M_0F,   WIG, 0x5E);
1274         public static final VexRVMOp VDIVSS    = new VexRVMOp("VDIVPS",    P_F3, M_0F,   WIG, 0x5E);
1275         public static final VexRVMOp VDIVSD    = new VexRVMOp("VDIVPD",    P_F2, M_0F,   WIG, 0x5E);
1276         public static final VexRVMOp VMAXPS    = new VexRVMOp("VMAXPS",    P_,   M_0F,   WIG, 0x5F);
1277         public static final VexRVMOp VMAXPD    = new VexRVMOp("VMAXPD",    P_66, M_0F,   WIG, 0x5F);
1278         public static final VexRVMOp VMAXSS    = new VexRVMOp("VMAXSS",    P_F3, M_0F,   WIG, 0x5F);
1279         public static final VexRVMOp VMAXSD    = new VexRVMOp("VMAXSD",    P_F2, M_0F,   WIG, 0x5F);
1280         public static final VexRVMOp VADDSUBPS = new VexRVMOp("VADDSUBPS", P_F2, M_0F,   WIG, 0xD0);
1281         public static final VexRVMOp VADDSUBPD = new VexRVMOp("VADDSUBPD", P_66, M_0F,   WIG, 0xD0);
1282         public static final VexRVMOp VPAND     = new VexRVMOp("VPAND",     P_66, M_0F,   WIG, 0xDB, VEXOpAssertion.AVX1_2);
1283         public static final VexRVMOp VPOR      = new VexRVMOp("VPOR",      P_66, M_0F,   WIG, 0xEB, VEXOpAssertion.AVX1_2);
1284         public static final VexRVMOp VPXOR     = new VexRVMOp("VPXOR",     P_66, M_0F,   WIG, 0xEF, VEXOpAssertion.AVX1_2);
1285         public static final VexRVMOp VPADDB    = new VexRVMOp("VPADDB",    P_66, M_0F,   WIG, 0xFC, VEXOpAssertion.AVX1_2);
1286         public static final VexRVMOp VPADDW    = new VexRVMOp("VPADDW",    P_66, M_0F,   WIG, 0xFD, VEXOpAssertion.AVX1_2);
1287         public static final VexRVMOp VPADDD    = new VexRVMOp("VPADDD",    P_66, M_0F,   WIG, 0xFE, VEXOpAssertion.AVX1_2);
1288         public static final VexRVMOp VPADDQ    = new VexRVMOp("VPADDQ",    P_66, M_0F,   WIG, 0xD4, VEXOpAssertion.AVX1_2);
1289         public static final VexRVMOp VPMULHUW  = new VexRVMOp("VPMULHUW",  P_66, M_0F,   WIG, 0xE4, VEXOpAssertion.AVX1_2);
1290         public static final VexRVMOp VPMULHW   = new VexRVMOp("VPMULHW",   P_66, M_0F,   WIG, 0xE5, VEXOpAssertion.AVX1_2);
1291         public static final VexRVMOp VPMULLW   = new VexRVMOp("VPMULLW",   P_66, M_0F,   WIG, 0xD5, VEXOpAssertion.AVX1_2);
1292         public static final VexRVMOp VPMULLD   = new VexRVMOp("VPMULLD",   P_66, M_0F38, WIG, 0x40, VEXOpAssertion.AVX1_2);
1293         public static final VexRVMOp VPSUBB    = new VexRVMOp("VPSUBB",    P_66, M_0F,   WIG, 0xF8, VEXOpAssertion.AVX1_2);
1294         public static final VexRVMOp VPSUBW    = new VexRVMOp("VPSUBW",    P_66, M_0F,   WIG, 0xF9, VEXOpAssertion.AVX1_2);
1295         public static final VexRVMOp VPSUBD    = new VexRVMOp("VPSUBD",    P_66, M_0F,   WIG, 0xFA, VEXOpAssertion.AVX1_2);
1296         public static final VexRVMOp VPSUBQ    = new VexRVMOp("VPSUBQ",    P_66, M_0F,   WIG, 0xFB, VEXOpAssertion.AVX1_2);
1297         public static final VexRVMOp VPSHUFB   = new VexRVMOp("VPSHUFB",   P_66, M_0F38, WIG, 0x00, VEXOpAssertion.AVX1_2);
1298         public static final VexRVMOp VCVTSD2SS = new VexRVMOp("VCVTSD2SS", P_F2, M_0F,   WIG, 0x5A);
1299         public static final VexRVMOp VCVTSS2SD = new VexRVMOp("VCVTSS2SD", P_F3, M_0F,   WIG, 0x5A);
1300         public static final VexRVMOp VCVTSI2SD = new VexRVMOp("VCVTSI2SD", P_F2, M_0F,   W0,  0x2A, VEXOpAssertion.XMM_XMM_CPU);
1301         public static final VexRVMOp VCVTSQ2SD = new VexRVMOp("VCVTSQ2SD", P_F2, M_0F,   W1,  0x2A, VEXOpAssertion.XMM_XMM_CPU);
1302         public static final VexRVMOp VCVTSI2SS = new VexRVMOp("VCVTSI2SS", P_F3, M_0F,   W0,  0x2A, VEXOpAssertion.XMM_XMM_CPU);
1303         public static final VexRVMOp VCVTSQ2SS = new VexRVMOp("VCVTSQ2SS", P_F3, M_0F,   W1,  0x2A, VEXOpAssertion.XMM_XMM_CPU);
1304         public static final VexRVMOp VPCMPEQB  = new VexRVMOp("VPCMPEQB",  P_66, M_0F,   WIG, 0x74, VEXOpAssertion.AVX1_2);
1305         public static final VexRVMOp VPCMPEQW  = new VexRVMOp("VPCMPEQW",  P_66, M_0F,   WIG, 0x75, VEXOpAssertion.AVX1_2);
1306         public static final VexRVMOp VPCMPEQD  = new VexRVMOp("VPCMPEQD",  P_66, M_0F,   WIG, 0x76, VEXOpAssertion.AVX1_2);
1307         public static final VexRVMOp VPCMPEQQ  = new VexRVMOp("VPCMPEQQ",  P_66, M_0F38, WIG, 0x29, VEXOpAssertion.AVX1_2);
1308         public static final VexRVMOp VPCMPGTB  = new VexRVMOp("VPCMPGTB",  P_66, M_0F,   WIG, 0x64, VEXOpAssertion.AVX1_2);
1309         public static final VexRVMOp VPCMPGTW  = new VexRVMOp("VPCMPGTW",  P_66, M_0F,   WIG, 0x65, VEXOpAssertion.AVX1_2);
1310         public static final VexRVMOp VPCMPGTD  = new VexRVMOp("VPCMPGTD",  P_66, M_0F,   WIG, 0x66, VEXOpAssertion.AVX1_2);
1311         public static final VexRVMOp VPCMPGTQ  = new VexRVMOp("VPCMPGTQ",  P_66, M_0F38, WIG, 0x37, VEXOpAssertion.AVX1_2);
1312         // @formatter:on
1313 
1314         private VexRVMOp(String opcode, int pp, int mmmmm, int w, int op) {
1315             this(opcode, pp, mmmmm, w, op, VEXOpAssertion.AVX1);
1316         }
1317 
1318         protected VexRVMOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
1319             super(opcode, pp, mmmmm, w, op, assertion);
1320         }
1321 
1322         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, Register src2) {
1323             assert assertion.check((AMD64) asm.target.arch, size, dst, src1, src2);
1324             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w, false);
1325             asm.emitByte(op);
1326             asm.emitModRM(dst, src2);
1327         }
1328 
1329         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, AMD64Address src2) {
1330             assert assertion.check((AMD64) asm.target.arch, size, dst, src1, null);
1331             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w, false);
1332             asm.emitByte(op);
1333             asm.emitOperandHelper(dst, src2, 0);
1334         }
1335     }
1336 
1337     public static final class VexGeneralPurposeRVMOp extends VexRVMOp {
1338         // @formatter:off
1339         public static final VexGeneralPurposeRVMOp ANDN   = new VexGeneralPurposeRVMOp("ANDN",   P_,   M_0F38, WIG, 0xF2, VEXOpAssertion.BMI1);
1340         public static final VexGeneralPurposeRVMOp MULX   = new VexGeneralPurposeRVMOp("MULX",   P_F2, M_0F38, WIG, 0xF6, VEXOpAssertion.BMI2);
1341         public static final VexGeneralPurposeRVMOp PDEP   = new VexGeneralPurposeRVMOp("PDEP",   P_F2, M_0F38, WIG, 0xF5, VEXOpAssertion.BMI2);
1342         public static final VexGeneralPurposeRVMOp PEXT   = new VexGeneralPurposeRVMOp("PEXT",   P_F3, M_0F38, WIG, 0xF5, VEXOpAssertion.BMI2);
1343         // @formatter:on
1344 
1345         private VexGeneralPurposeRVMOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
1346             super(opcode, pp, mmmmm, w, op, assertion);
1347         }
1348 
1349         @Override
1350         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, Register src2) {
1351             assert assertion.check((AMD64) asm.target.arch, LZ, dst, src1, src2, null);
1352             assert size == AVXSize.DWORD || size == AVXSize.QWORD;
1353             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, size == AVXSize.DWORD ? W0 : W1, false);
1354             asm.emitByte(op);
1355             asm.emitModRM(dst, src2);
1356         }
1357 
1358         @Override
1359         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, AMD64Address src2) {
1360             assert assertion.check((AMD64) asm.target.arch, LZ, dst, src1, null, null);
1361             assert size == AVXSize.DWORD || size == AVXSize.QWORD;
1362             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, size == AVXSize.DWORD ? W0 : W1, false);
1363             asm.emitByte(op);
1364             asm.emitOperandHelper(dst, src2, 0);
1365         }
1366     }
1367 
1368     public static final class VexGeneralPurposeRMVOp extends VexOp {
1369         // @formatter:off
1370         public static final VexGeneralPurposeRMVOp BEXTR  = new VexGeneralPurposeRMVOp("BEXTR",  P_,   M_0F38, WIG, 0xF7, VEXOpAssertion.BMI1);
1371         public static final VexGeneralPurposeRMVOp BZHI   = new VexGeneralPurposeRMVOp("BZHI",   P_,   M_0F38, WIG, 0xF5, VEXOpAssertion.BMI2);
1372         public static final VexGeneralPurposeRMVOp SARX   = new VexGeneralPurposeRMVOp("SARX",   P_F3, M_0F38, WIG, 0xF7, VEXOpAssertion.BMI2);
1373         public static final VexGeneralPurposeRMVOp SHRX   = new VexGeneralPurposeRMVOp("SHRX",   P_F2, M_0F38, WIG, 0xF7, VEXOpAssertion.BMI2);
1374         public static final VexGeneralPurposeRMVOp SHLX   = new VexGeneralPurposeRMVOp("SHLX",   P_66, M_0F38, WIG, 0xF7, VEXOpAssertion.BMI2);
1375         // @formatter:on
1376 
1377         private VexGeneralPurposeRMVOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
1378             super(opcode, pp, mmmmm, w, op, assertion);
1379         }
1380 
1381         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, Register src2) {
1382             assert assertion.check((AMD64) asm.target.arch, LZ, dst, src2, src1, null);
1383             assert size == AVXSize.DWORD || size == AVXSize.QWORD;
1384             asm.vexPrefix(dst, src2, src1, size, pp, mmmmm, size == AVXSize.DWORD ? W0 : W1, false);
1385             asm.emitByte(op);
1386             asm.emitModRM(dst, src1);
1387         }
1388 
1389         public void emit(AMD64Assembler asm, AVXSize size, Register dst, AMD64Address src1, Register src2) {
1390             assert assertion.check((AMD64) asm.target.arch, LZ, dst, src2, null, null);
1391             assert size == AVXSize.DWORD || size == AVXSize.QWORD;
1392             asm.vexPrefix(dst, src2, src1, size, pp, mmmmm, size == AVXSize.DWORD ? W0 : W1, false);
1393             asm.emitByte(op);
1394             asm.emitOperandHelper(dst, src1, 0);
1395         }
1396     }
1397 
1398     public static final class VexGeneralPurposeRMOp extends VexRMOp {
1399         // @formatter:off
1400         public static final VexGeneralPurposeRMOp BLSI    = new VexGeneralPurposeRMOp("BLSI",   P_,    M_0F38, WIG, 0xF3, 3, VEXOpAssertion.BMI1);
1401         public static final VexGeneralPurposeRMOp BLSMSK  = new VexGeneralPurposeRMOp("BLSMSK", P_,    M_0F38, WIG, 0xF3, 2, VEXOpAssertion.BMI1);
1402         public static final VexGeneralPurposeRMOp BLSR    = new VexGeneralPurposeRMOp("BLSR",   P_,    M_0F38, WIG, 0xF3, 1, VEXOpAssertion.BMI1);
1403         // @formatter:on
1404         private final int ext;
1405 
1406         private VexGeneralPurposeRMOp(String opcode, int pp, int mmmmm, int w, int op, int ext, VEXOpAssertion assertion) {
1407             super(opcode, pp, mmmmm, w, op, assertion);
1408             this.ext = ext;
1409         }
1410 
1411         @Override
1412         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src) {
1413             assert assertion.check((AMD64) asm.target.arch, size, dst, null, null);
1414             asm.vexPrefix(AMD64.cpuRegisters[ext], dst, src, size, pp, mmmmm, size == AVXSize.DWORD ? W0 : W1, false);
1415             asm.emitByte(op);
1416             asm.emitModRM(ext, src);
1417         }
1418 
1419         @Override
1420         public void emit(AMD64Assembler asm, AVXSize size, Register dst, AMD64Address src) {
1421             assert assertion.check((AMD64) asm.target.arch, size, dst, null, null);
1422             asm.vexPrefix(AMD64.cpuRegisters[ext], dst, src, size, pp, mmmmm, size == AVXSize.DWORD ? W0 : W1, false);
1423             asm.emitByte(op);
1424             asm.emitOperandHelper(ext, src, 0);
1425         }
1426     }
1427 
1428     /**
1429      * VEX-encoded shift instructions with an operand order of either RVM or VMI.
1430      */
1431     public static final class VexShiftOp extends VexRVMOp implements VexRRIOp {
1432         // @formatter:off
1433         public static final VexShiftOp VPSRLW = new VexShiftOp("VPSRLW", P_66, M_0F, WIG, 0xD1, 0x71, 2);
1434         public static final VexShiftOp VPSRLD = new VexShiftOp("VPSRLD", P_66, M_0F, WIG, 0xD2, 0x72, 2);
1435         public static final VexShiftOp VPSRLQ = new VexShiftOp("VPSRLQ", P_66, M_0F, WIG, 0xD3, 0x73, 2);
1436         public static final VexShiftOp VPSRAW = new VexShiftOp("VPSRAW", P_66, M_0F, WIG, 0xE1, 0x71, 4);
1437         public static final VexShiftOp VPSRAD = new VexShiftOp("VPSRAD", P_66, M_0F, WIG, 0xE2, 0x72, 4);
1438         public static final VexShiftOp VPSLLW = new VexShiftOp("VPSLLW", P_66, M_0F, WIG, 0xF1, 0x71, 6);
1439         public static final VexShiftOp VPSLLD = new VexShiftOp("VPSLLD", P_66, M_0F, WIG, 0xF2, 0x72, 6);
1440         public static final VexShiftOp VPSLLQ = new VexShiftOp("VPSLLQ", P_66, M_0F, WIG, 0xF3, 0x73, 6);
1441         // @formatter:on
1442 
1443         private final int immOp;
1444         private final int r;
1445 
1446         private VexShiftOp(String opcode, int pp, int mmmmm, int w, int op, int immOp, int r) {
1447             super(opcode, pp, mmmmm, w, op, VEXOpAssertion.AVX1_2);
1448             this.immOp = immOp;
1449             this.r = r;
1450         }
1451 
1452         @Override
1453         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src, int imm8) {
1454             assert assertion.check((AMD64) asm.target.arch, size, null, dst, src);
1455             asm.vexPrefix(null, dst, src, size, pp, mmmmm, w, false);
1456             asm.emitByte(immOp);
1457             asm.emitModRM(r, src);
1458             asm.emitByte(imm8);
1459         }
1460     }
1461 
1462     public static final class VexMaskMoveOp extends VexOp {
1463         // @formatter:off
1464         public static final VexMaskMoveOp VMASKMOVPS = new VexMaskMoveOp("VMASKMOVPS", P_66, M_0F38, W0, 0x2C, 0x2E);
1465         public static final VexMaskMoveOp VMASKMOVPD = new VexMaskMoveOp("VMASKMOVPD", P_66, M_0F38, W0, 0x2D, 0x2F);
1466         public static final VexMaskMoveOp VPMASKMOVD = new VexMaskMoveOp("VPMASKMOVD", P_66, M_0F38, W0, 0x8C, 0x8E, VEXOpAssertion.AVX2);
1467         public static final VexMaskMoveOp VPMASKMOVQ = new VexMaskMoveOp("VPMASKMOVQ", P_66, M_0F38, W1, 0x8C, 0x8E, VEXOpAssertion.AVX2);
1468         // @formatter:on
1469 
1470         private final int opReverse;
1471 
1472         private VexMaskMoveOp(String opcode, int pp, int mmmmm, int w, int op, int opReverse) {
1473             this(opcode, pp, mmmmm, w, op, opReverse, VEXOpAssertion.AVX1);
1474         }
1475 
1476         private VexMaskMoveOp(String opcode, int pp, int mmmmm, int w, int op, int opReverse, VEXOpAssertion assertion) {
1477             super(opcode, pp, mmmmm, w, op, assertion);
1478             this.opReverse = opReverse;
1479         }
1480 
1481         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register mask, AMD64Address src) {
1482             assert assertion.check((AMD64) asm.target.arch, size, dst, mask, null);
1483             asm.vexPrefix(dst, mask, src, size, pp, mmmmm, w, false);
1484             asm.emitByte(op);
1485             asm.emitOperandHelper(dst, src, 0);
1486         }
1487 
1488         public void emit(AMD64Assembler asm, AVXSize size, AMD64Address dst, Register mask, Register src) {
1489             assert assertion.check((AMD64) asm.target.arch, size, src, mask, null);
1490             asm.vexPrefix(src, mask, dst, size, pp, mmmmm, w, false);
1491             asm.emitByte(opReverse);
1492             asm.emitOperandHelper(src, dst, 0);
1493         }
1494     }
1495 
1496     /**
1497      * VEX-encoded instructions with an operand order of RVMI.
1498      */
1499     public static final class VexRVMIOp extends VexOp {
1500         // @formatter:off
1501         public static final VexRVMIOp VSHUFPS     = new VexRVMIOp("VSHUFPS",     P_,   M_0F,   WIG, 0xC6);
1502         public static final VexRVMIOp VSHUFPD     = new VexRVMIOp("VSHUFPD",     P_66, M_0F,   WIG, 0xC6);
1503         public static final VexRVMIOp VINSERTF128 = new VexRVMIOp("VINSERTF128", P_66, M_0F3A, W0,  0x18, VEXOpAssertion.AVX1_256ONLY);
1504         public static final VexRVMIOp VINSERTI128 = new VexRVMIOp("VINSERTI128", P_66, M_0F3A, W0,  0x38, VEXOpAssertion.AVX2_256ONLY);
1505         // @formatter:on
1506 
1507         private VexRVMIOp(String opcode, int pp, int mmmmm, int w, int op) {
1508             this(opcode, pp, mmmmm, w, op, VEXOpAssertion.AVX1);
1509         }
1510 
1511         private VexRVMIOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
1512             super(opcode, pp, mmmmm, w, op, assertion);
1513         }
1514 
1515         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, Register src2, int imm8) {
1516             assert assertion.check((AMD64) asm.target.arch, size, dst, src1, src2);
1517             assert (imm8 & 0xFF) == imm8;
1518             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w, false);
1519             asm.emitByte(op);
1520             asm.emitModRM(dst, src2);
1521             asm.emitByte(imm8);
1522         }
1523 
1524         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, AMD64Address src2, int imm8) {
1525             assert assertion.check((AMD64) asm.target.arch, size, dst, src1, null);
1526             assert (imm8 & 0xFF) == imm8;
1527             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w, false);
1528             asm.emitByte(op);
1529             asm.emitOperandHelper(dst, src2, 1);
1530             asm.emitByte(imm8);
1531         }
1532     }
1533 
1534     /**
1535      * VEX-encoded comparison operation with an operand order of RVMI. The immediate operand is a
1536      * comparison operator.
1537      */
1538     public static final class VexFloatCompareOp extends VexOp {
1539         // @formatter:off
1540         public static final VexFloatCompareOp VCMPPS = new VexFloatCompareOp("VCMPPS", P_,   M_0F, WIG, 0xC2);
1541         public static final VexFloatCompareOp VCMPPD = new VexFloatCompareOp("VCMPPD", P_66, M_0F, WIG, 0xC2);
1542         public static final VexFloatCompareOp VCMPSS = new VexFloatCompareOp("VCMPSS", P_F2, M_0F, WIG, 0xC2);
1543         public static final VexFloatCompareOp VCMPSD = new VexFloatCompareOp("VCMPSD", P_F2, M_0F, WIG, 0xC2);
1544         // @formatter:on
1545 
1546         public enum Predicate {
1547             EQ_OQ(0x00),
1548             LT_OS(0x01),
1549             LE_OS(0x02),
1550             UNORD_Q(0x03),
1551             NEQ_UQ(0x04),
1552             NLT_US(0x05),
1553             NLE_US(0x06),
1554             ORD_Q(0x07),
1555             EQ_UQ(0x08),
1556             NGE_US(0x09),
1557             NGT_US(0x0a),
1558             FALSE_OQ(0x0b),
1559             NEQ_OQ(0x0c),
1560             GE_OS(0x0d),
1561             GT_OS(0x0e),
1562             TRUE_UQ(0x0f),
1563             EQ_OS(0x10),
1564             LT_OQ(0x11),
1565             LE_OQ(0x12),
1566             UNORD_S(0x13),
1567             NEQ_US(0x14),
1568             NLT_UQ(0x15),
1569             NLE_UQ(0x16),
1570             ORD_S(0x17),
1571             EQ_US(0x18),
1572             NGE_UQ(0x19),
1573             NGT_UQ(0x1a),
1574             FALSE_OS(0x1b),
1575             NEQ_OS(0x1c),
1576             GE_OQ(0x1d),
1577             GT_OQ(0x1e),
1578             TRUE_US(0x1f);
1579 
1580             private int imm8;
1581 
1582             Predicate(int imm8) {
1583                 this.imm8 = imm8;
1584             }
1585 
1586             public static Predicate getPredicate(Condition condition, boolean unorderedIsTrue) {
1587                 if (unorderedIsTrue) {
1588                     switch (condition) {
1589                         case EQ:
1590                             return EQ_UQ;
1591                         case NE:
1592                             return NEQ_UQ;
1593                         case LT:
1594                             return NGE_UQ;
1595                         case LE:
1596                             return NGT_UQ;
1597                         case GT:
1598                             return NLE_UQ;
1599                         case GE:
1600                             return NLT_UQ;
1601                         default:
1602                             throw GraalError.shouldNotReachHere();
1603                     }
1604                 } else {
1605                     switch (condition) {
1606                         case EQ:
1607                             return EQ_OQ;
1608                         case NE:
1609                             return NEQ_OQ;
1610                         case LT:
1611                             return LT_OQ;
1612                         case LE:
1613                             return LE_OQ;
1614                         case GT:
1615                             return GT_OQ;
1616                         case GE:
1617                             return GE_OQ;
1618                         default:
1619                             throw GraalError.shouldNotReachHere();
1620                     }
1621                 }
1622             }
1623         }
1624 
1625         private VexFloatCompareOp(String opcode, int pp, int mmmmm, int w, int op) {
1626             super(opcode, pp, mmmmm, w, op, VEXOpAssertion.AVX1);
1627         }
1628 
1629         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, Register src2, Predicate p) {
1630             assert assertion.check((AMD64) asm.target.arch, size, dst, src1, src2);
1631             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w, false);
1632             asm.emitByte(op);
1633             asm.emitModRM(dst, src2);
1634             asm.emitByte(p.imm8);
1635         }
1636 
1637         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, AMD64Address src2, Predicate p) {
1638             assert assertion.check((AMD64) asm.target.arch, size, dst, src1, null);
1639             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w, false);
1640             asm.emitByte(op);
1641             asm.emitOperandHelper(dst, src2, 1);
1642             asm.emitByte(p.imm8);
1643         }
1644     }
1645 
1646     public final void addl(AMD64Address dst, int imm32) {
1647         ADD.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
1648     }
1649 
1650     public final void addl(Register dst, int imm32) {
1651         ADD.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
1652     }
1653 
1654     public final void addl(Register dst, Register src) {
1655         ADD.rmOp.emit(this, DWORD, dst, src);
1656     }
1657 
1658     public final void addpd(Register dst, Register src) {
1659         SSEOp.ADD.emit(this, PD, dst, src);
1660     }
1661 
1662     public final void addpd(Register dst, AMD64Address src) {
1663         SSEOp.ADD.emit(this, PD, dst, src);
1664     }
1665 
1666     public final void addsd(Register dst, Register src) {
1667         SSEOp.ADD.emit(this, SD, dst, src);
1668     }
1669 
1670     public final void addsd(Register dst, AMD64Address src) {
1671         SSEOp.ADD.emit(this, SD, dst, src);
1672     }
1673 
1674     private void addrNop4() {
1675         // 4 bytes: NOP DWORD PTR [EAX+0]
1676         emitByte(0x0F);
1677         emitByte(0x1F);
1678         emitByte(0x40); // emitRm(cbuf, 0x1, EAXEnc, EAXEnc);
1679         emitByte(0); // 8-bits offset (1 byte)
1680     }
1681 
1682     private void addrNop5() {
1683         // 5 bytes: NOP DWORD PTR [EAX+EAX*0+0] 8-bits offset
1684         emitByte(0x0F);
1685         emitByte(0x1F);
1686         emitByte(0x44); // emitRm(cbuf, 0x1, EAXEnc, 0x4);
1687         emitByte(0x00); // emitRm(cbuf, 0x0, EAXEnc, EAXEnc);
1688         emitByte(0); // 8-bits offset (1 byte)
1689     }
1690 
1691     private void addrNop7() {
1692         // 7 bytes: NOP DWORD PTR [EAX+0] 32-bits offset
1693         emitByte(0x0F);
1694         emitByte(0x1F);
1695         emitByte(0x80); // emitRm(cbuf, 0x2, EAXEnc, EAXEnc);
1696         emitInt(0); // 32-bits offset (4 bytes)
1697     }
1698 
1699     private void addrNop8() {
1700         // 8 bytes: NOP DWORD PTR [EAX+EAX*0+0] 32-bits offset
1701         emitByte(0x0F);
1702         emitByte(0x1F);
1703         emitByte(0x84); // emitRm(cbuf, 0x2, EAXEnc, 0x4);
1704         emitByte(0x00); // emitRm(cbuf, 0x0, EAXEnc, EAXEnc);
1705         emitInt(0); // 32-bits offset (4 bytes)
1706     }
1707 
1708     public final void andl(Register dst, int imm32) {
1709         AND.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
1710     }
1711 
1712     public final void andl(Register dst, Register src) {
1713         AND.rmOp.emit(this, DWORD, dst, src);
1714     }
1715 
1716     public final void andpd(Register dst, Register src) {
1717         SSEOp.AND.emit(this, PD, dst, src);
1718     }
1719 
1720     public final void andpd(Register dst, AMD64Address src) {
1721         SSEOp.AND.emit(this, PD, dst, src);
1722     }
1723 
1724     public final void bsfq(Register dst, Register src) {
1725         prefixq(dst, src);
1726         emitByte(0x0F);
1727         emitByte(0xBC);
1728         emitModRM(dst, src);
1729     }
1730 
1731     public final void bsrl(Register dst, Register src) {
1732         prefix(dst, src);
1733         emitByte(0x0F);
1734         emitByte(0xBD);
1735         emitModRM(dst, src);
1736     }
1737 
1738     public final void bswapl(Register reg) {
1739         prefix(reg);
1740         emitByte(0x0F);
1741         emitModRM(1, reg);
1742     }
1743 
1744     public final void cdql() {
1745         emitByte(0x99);
1746     }
1747 
1748     public final void cmovl(ConditionFlag cc, Register dst, Register src) {
1749         prefix(dst, src);
1750         emitByte(0x0F);
1751         emitByte(0x40 | cc.getValue());
1752         emitModRM(dst, src);
1753     }
1754 
1755     public final void cmovl(ConditionFlag cc, Register dst, AMD64Address src) {
1756         prefix(src, dst);
1757         emitByte(0x0F);
1758         emitByte(0x40 | cc.getValue());
1759         emitOperandHelper(dst, src, 0);
1760     }
1761 
1762     public final void cmpb(Register dst, Register src) {
1763         CMP.byteRmOp.emit(this, BYTE, dst, src);
1764     }
1765 
1766     public final void cmpw(Register dst, Register src) {
1767         CMP.rmOp.emit(this, WORD, dst, src);
1768     }
1769 
1770     public final void cmpl(Register dst, int imm32) {
1771         CMP.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
1772     }
1773 
1774     public final void cmpl(Register dst, Register src) {
1775         CMP.rmOp.emit(this, DWORD, dst, src);
1776     }
1777 
1778     public final void cmpl(Register dst, AMD64Address src) {
1779         CMP.rmOp.emit(this, DWORD, dst, src);
1780     }
1781 
1782     public final void cmpl(AMD64Address dst, int imm32) {
1783         CMP.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
1784     }
1785 
1786     /**
1787      * The 8-bit cmpxchg compares the value at adr with the contents of X86.rax, and stores reg into
1788      * adr if so; otherwise, the value at adr is loaded into X86.rax,. The ZF is set if the compared
1789      * values were equal, and cleared otherwise.
1790      */
1791     public final void cmpxchgb(Register reg, AMD64Address adr) { // cmpxchg
1792         prefixb(adr, reg);
1793         emitByte(0x0F);
1794         emitByte(0xB0);
1795         emitOperandHelper(reg, adr, 0);
1796     }
1797 
1798     /**
1799      * The 16-bit cmpxchg compares the value at adr with the contents of X86.rax, and stores reg
1800      * into adr if so; otherwise, the value at adr is loaded into X86.rax,. The ZF is set if the
1801      * compared values were equal, and cleared otherwise.
1802      */
1803     public final void cmpxchgw(Register reg, AMD64Address adr) { // cmpxchg
1804         emitByte(0x66); // Switch to 16-bit mode.
1805         prefix(adr, reg);
1806         emitByte(0x0F);
1807         emitByte(0xB1);
1808         emitOperandHelper(reg, adr, 0);
1809     }
1810 
1811     /**
1812      * The 32-bit cmpxchg compares the value at adr with the contents of X86.rax, and stores reg
1813      * into adr if so; otherwise, the value at adr is loaded into X86.rax,. The ZF is set if the
1814      * compared values were equal, and cleared otherwise.
1815      */
1816     public final void cmpxchgl(Register reg, AMD64Address adr) { // cmpxchg
1817         prefix(adr, reg);
1818         emitByte(0x0F);
1819         emitByte(0xB1);
1820         emitOperandHelper(reg, adr, 0);
1821     }
1822 
1823     public final void cvtsi2sdl(Register dst, Register src) {
1824         SSEOp.CVTSI2SD.emit(this, DWORD, dst, src);
1825     }
1826 
1827     public final void cvttsd2sil(Register dst, Register src) {
1828         SSEOp.CVTTSD2SI.emit(this, DWORD, dst, src);
1829     }
1830 
1831     public final void decl(AMD64Address dst) {
1832         prefix(dst);
1833         emitByte(0xFF);
1834         emitOperandHelper(1, dst, 0);
1835     }
1836 
1837     public final void divsd(Register dst, Register src) {
1838         SSEOp.DIV.emit(this, SD, dst, src);
1839     }
1840 
1841     public final void hlt() {
1842         emitByte(0xF4);
1843     }
1844 
1845     public final void imull(Register dst, Register src, int value) {
1846         if (isByte(value)) {
1847             AMD64RMIOp.IMUL_SX.emit(this, DWORD, dst, src, value);
1848         } else {
1849             AMD64RMIOp.IMUL.emit(this, DWORD, dst, src, value);
1850         }
1851     }
1852 
1853     public final void incl(AMD64Address dst) {
1854         prefix(dst);
1855         emitByte(0xFF);
1856         emitOperandHelper(0, dst, 0);
1857     }
1858 
1859     public void jcc(ConditionFlag cc, int jumpTarget, boolean forceDisp32) {
1860         int shortSize = 2;
1861         int longSize = 6;
1862         long disp = jumpTarget - position();
1863         if (!forceDisp32 && isByte(disp - shortSize)) {
1864             // 0111 tttn #8-bit disp
1865             emitByte(0x70 | cc.getValue());
1866             emitByte((int) ((disp - shortSize) & 0xFF));
1867         } else {
1868             // 0000 1111 1000 tttn #32-bit disp
1869             assert isInt(disp - longSize) : "must be 32bit offset (call4)";
1870             emitByte(0x0F);
1871             emitByte(0x80 | cc.getValue());
1872             emitInt((int) (disp - longSize));
1873         }
1874     }
1875 
1876     public final void jcc(ConditionFlag cc, Label l) {
1877         assert (0 <= cc.getValue()) && (cc.getValue() < 16) : "illegal cc";
1878         if (l.isBound()) {
1879             jcc(cc, l.position(), false);
1880         } else {
1881             // Note: could eliminate cond. jumps to this jump if condition
1882             // is the same however, seems to be rather unlikely case.
1883             // Note: use jccb() if label to be bound is very close to get
1884             // an 8-bit displacement
1885             l.addPatchAt(position());
1886             emitByte(0x0F);
1887             emitByte(0x80 | cc.getValue());
1888             emitInt(0);
1889         }
1890 
1891     }
1892 
1893     public final void jccb(ConditionFlag cc, Label l) {
1894         if (l.isBound()) {
1895             int shortSize = 2;
1896             int entry = l.position();
1897             assert isByte(entry - (position() + shortSize)) : "Dispacement too large for a short jmp";
1898             long disp = entry - position();
1899             // 0111 tttn #8-bit disp
1900             emitByte(0x70 | cc.getValue());
1901             emitByte((int) ((disp - shortSize) & 0xFF));
1902         } else {
1903             l.addPatchAt(position());
1904             emitByte(0x70 | cc.getValue());
1905             emitByte(0);
1906         }
1907     }
1908 
1909     public final void jmp(int jumpTarget, boolean forceDisp32) {
1910         int shortSize = 2;
1911         int longSize = 5;
1912         long disp = jumpTarget - position();
1913         if (!forceDisp32 && isByte(disp - shortSize)) {
1914             emitByte(0xEB);
1915             emitByte((int) ((disp - shortSize) & 0xFF));
1916         } else {
1917             emitByte(0xE9);
1918             emitInt((int) (disp - longSize));
1919         }
1920     }
1921 
1922     @Override
1923     public final void jmp(Label l) {
1924         if (l.isBound()) {
1925             jmp(l.position(), false);
1926         } else {
1927             // By default, forward jumps are always 32-bit displacements, since
1928             // we can't yet know where the label will be bound. If you're sure that
1929             // the forward jump will not run beyond 256 bytes, use jmpb to
1930             // force an 8-bit displacement.
1931 
1932             l.addPatchAt(position());
1933             emitByte(0xE9);
1934             emitInt(0);
1935         }
1936     }
1937 
1938     public final void jmp(Register entry) {
1939         prefix(entry);
1940         emitByte(0xFF);
1941         emitModRM(4, entry);
1942     }
1943 
1944     public final void jmp(AMD64Address adr) {
1945         prefix(adr);
1946         emitByte(0xFF);
1947         emitOperandHelper(AMD64.rsp, adr, 0);
1948     }
1949 
1950     public final void jmpb(Label l) {
1951         if (l.isBound()) {
1952             int shortSize = 2;
1953             int entry = l.position();
1954             assert isByte((entry - position()) + shortSize) : "Dispacement too large for a short jmp";
1955             long offs = entry - position();
1956             emitByte(0xEB);
1957             emitByte((int) ((offs - shortSize) & 0xFF));
1958         } else {
1959 
1960             l.addPatchAt(position());
1961             emitByte(0xEB);
1962             emitByte(0);
1963         }
1964     }
1965 
1966     public final void lead(Register dst, AMD64Address src) {
1967         prefix(src, dst);
1968         emitByte(0x8D);
1969         emitOperandHelper(dst, src, 0);
1970     }
1971 
1972     public final void leaq(Register dst, AMD64Address src) {
1973         prefixq(src, dst);
1974         emitByte(0x8D);
1975         emitOperandHelper(dst, src, 0);
1976     }
1977 
1978     public final void leave() {
1979         emitByte(0xC9);
1980     }
1981 
1982     public final void lock() {
1983         emitByte(0xF0);
1984     }
1985 
1986     public final void movapd(Register dst, Register src) {
1987         assert inRC(XMM, dst) && inRC(XMM, src);
1988         simdPrefix(dst, Register.None, src, PD, P_0F, false);
1989         emitByte(0x28);
1990         emitModRM(dst, src);
1991     }
1992 
1993     public final void movaps(Register dst, Register src) {
1994         assert inRC(XMM, dst) && inRC(XMM, src);
1995         simdPrefix(dst, Register.None, src, PS, P_0F, false);
1996         emitByte(0x28);
1997         emitModRM(dst, src);
1998     }
1999 
2000     public final void movb(AMD64Address dst, int imm8) {
2001         prefix(dst);
2002         emitByte(0xC6);
2003         emitOperandHelper(0, dst, 1);
2004         emitByte(imm8);
2005     }
2006 
2007     public final void movb(AMD64Address dst, Register src) {
2008         assert inRC(CPU, src) : "must have byte register";
2009         prefixb(dst, src);
2010         emitByte(0x88);
2011         emitOperandHelper(src, dst, 0);
2012     }
2013 
2014     public final void movl(Register dst, int imm32) {
2015         movl(dst, imm32, false);
2016     }
2017 
2018     public final void movl(Register dst, int imm32, boolean annotateImm) {
2019         int insnPos = position();
2020         prefix(dst);
2021         emitByte(0xB8 + encode(dst));
2022         int immPos = position();
2023         emitInt(imm32);
2024         int nextInsnPos = position();
2025         if (annotateImm && codePatchingAnnotationConsumer != null) {
2026             codePatchingAnnotationConsumer.accept(new ImmediateOperandAnnotation(insnPos, immPos, nextInsnPos - immPos, nextInsnPos));
2027         }
2028     }
2029 
2030     public final void movl(Register dst, Register src) {
2031         prefix(dst, src);
2032         emitByte(0x8B);
2033         emitModRM(dst, src);
2034     }
2035 
2036     public final void movl(Register dst, AMD64Address src) {
2037         prefix(src, dst);
2038         emitByte(0x8B);
2039         emitOperandHelper(dst, src, 0);
2040     }
2041 
2042     /**
2043      * @param wide use 4 byte encoding for displacements that would normally fit in a byte
2044      */
2045     public final void movl(Register dst, AMD64Address src, boolean wide) {
2046         prefix(src, dst);
2047         emitByte(0x8B);
2048         emitOperandHelper(dst, src, wide, 0);
2049     }
2050 
2051     public final void movl(AMD64Address dst, int imm32) {
2052         prefix(dst);
2053         emitByte(0xC7);
2054         emitOperandHelper(0, dst, 4);
2055         emitInt(imm32);
2056     }
2057 
2058     public final void movl(AMD64Address dst, Register src) {
2059         prefix(dst, src);
2060         emitByte(0x89);
2061         emitOperandHelper(src, dst, 0);
2062     }
2063 
2064     /**
2065      * New CPUs require use of movsd and movss to avoid partial register stall when loading from
2066      * memory. But for old Opteron use movlpd instead of movsd. The selection is done in
2067      * {@link AMD64MacroAssembler#movdbl(Register, AMD64Address)} and
2068      * {@link AMD64MacroAssembler#movflt(Register, Register)}.
2069      */
2070     public final void movlpd(Register dst, AMD64Address src) {
2071         assert inRC(XMM, dst);
2072         simdPrefix(dst, dst, src, PD, P_0F, false);
2073         emitByte(0x12);
2074         emitOperandHelper(dst, src, 0);
2075     }
2076 
2077     public final void movlhps(Register dst, Register src) {
2078         assert inRC(XMM, dst) && inRC(XMM, src);
2079         simdPrefix(dst, src, src, PS, P_0F, false);
2080         emitByte(0x16);
2081         emitModRM(dst, src);
2082     }
2083 
2084     public final void movq(Register dst, AMD64Address src) {
2085         movq(dst, src, false);
2086     }
2087 
2088     public final void movq(Register dst, AMD64Address src, boolean force4BytesDisplacement) {
2089         if (inRC(XMM, dst)) {
2090             // Insn: MOVQ xmm, r/m64
2091             // Code: F3 0F 7E /r
2092             // An alternative instruction would be 66 REX.W 0F 6E /r. We prefer the REX.W free
2093             // format, because it would allow us to emit 2-bytes-prefixed vex-encoding instruction
2094             // when applicable.
2095             simdPrefix(dst, Register.None, src, SS, P_0F, false);
2096             emitByte(0x7E);
2097             emitOperandHelper(dst, src, force4BytesDisplacement, 0);
2098         } else {
2099             // gpr version of movq
2100             prefixq(src, dst);
2101             emitByte(0x8B);
2102             emitOperandHelper(dst, src, force4BytesDisplacement, 0);
2103         }
2104     }
2105 
2106     public final void movq(Register dst, Register src) {
2107         assert inRC(CPU, dst) && inRC(CPU, src);
2108         prefixq(dst, src);
2109         emitByte(0x8B);
2110         emitModRM(dst, src);
2111     }
2112 
2113     public final void movq(AMD64Address dst, Register src) {
2114         if (inRC(XMM, src)) {
2115             // Insn: MOVQ r/m64, xmm
2116             // Code: 66 0F D6 /r
2117             // An alternative instruction would be 66 REX.W 0F 7E /r. We prefer the REX.W free
2118             // format, because it would allow us to emit 2-bytes-prefixed vex-encoding instruction
2119             // when applicable.
2120             simdPrefix(src, Register.None, dst, PD, P_0F, false);
2121             emitByte(0xD6);
2122             emitOperandHelper(src, dst, 0);
2123         } else {
2124             // gpr version of movq
2125             prefixq(dst, src);
2126             emitByte(0x89);
2127             emitOperandHelper(src, dst, 0);
2128         }
2129     }
2130 
2131     public final void movsbl(Register dst, AMD64Address src) {
2132         prefix(src, dst);
2133         emitByte(0x0F);
2134         emitByte(0xBE);
2135         emitOperandHelper(dst, src, 0);
2136     }
2137 
2138     public final void movsbl(Register dst, Register src) {
2139         prefix(dst, false, src, true);
2140         emitByte(0x0F);
2141         emitByte(0xBE);
2142         emitModRM(dst, src);
2143     }
2144 
2145     public final void movsbq(Register dst, AMD64Address src) {
2146         prefixq(src, dst);
2147         emitByte(0x0F);
2148         emitByte(0xBE);
2149         emitOperandHelper(dst, src, 0);
2150     }
2151 
2152     public final void movsbq(Register dst, Register src) {
2153         prefixq(dst, src);
2154         emitByte(0x0F);
2155         emitByte(0xBE);
2156         emitModRM(dst, src);
2157     }
2158 
2159     public final void movsd(Register dst, Register src) {
2160         AMD64RMOp.MOVSD.emit(this, SD, dst, src);
2161     }
2162 
2163     public final void movsd(Register dst, AMD64Address src) {
2164         AMD64RMOp.MOVSD.emit(this, SD, dst, src);
2165     }
2166 
2167     public final void movsd(AMD64Address dst, Register src) {
2168         AMD64MROp.MOVSD.emit(this, SD, dst, src);
2169     }
2170 
2171     public final void movss(Register dst, Register src) {
2172         AMD64RMOp.MOVSS.emit(this, SS, dst, src);
2173     }
2174 
2175     public final void movss(Register dst, AMD64Address src) {
2176         AMD64RMOp.MOVSS.emit(this, SS, dst, src);
2177     }
2178 
2179     public final void movss(AMD64Address dst, Register src) {
2180         AMD64MROp.MOVSS.emit(this, SS, dst, src);
2181     }
2182 
2183     public final void mulpd(Register dst, Register src) {
2184         SSEOp.MUL.emit(this, PD, dst, src);
2185     }
2186 
2187     public final void mulpd(Register dst, AMD64Address src) {
2188         SSEOp.MUL.emit(this, PD, dst, src);
2189     }
2190 
2191     public final void mulsd(Register dst, Register src) {
2192         SSEOp.MUL.emit(this, SD, dst, src);
2193     }
2194 
2195     public final void mulsd(Register dst, AMD64Address src) {
2196         SSEOp.MUL.emit(this, SD, dst, src);
2197     }
2198 
2199     public final void mulss(Register dst, Register src) {
2200         SSEOp.MUL.emit(this, SS, dst, src);
2201     }
2202 
2203     public final void movswl(Register dst, AMD64Address src) {
2204         prefix(src, dst);
2205         emitByte(0x0F);
2206         emitByte(0xBF);
2207         emitOperandHelper(dst, src, 0);
2208     }
2209 
2210     public final void movw(AMD64Address dst, int imm16) {
2211         emitByte(0x66); // switch to 16-bit mode
2212         prefix(dst);
2213         emitByte(0xC7);
2214         emitOperandHelper(0, dst, 2);
2215         emitShort(imm16);
2216     }
2217 
2218     public final void movw(AMD64Address dst, Register src) {
2219         emitByte(0x66);
2220         prefix(dst, src);
2221         emitByte(0x89);
2222         emitOperandHelper(src, dst, 0);
2223     }
2224 
2225     public final void movzbl(Register dst, AMD64Address src) {
2226         prefix(src, dst);
2227         emitByte(0x0F);
2228         emitByte(0xB6);
2229         emitOperandHelper(dst, src, 0);
2230     }
2231 
2232     public final void movzbl(Register dst, Register src) {
2233         AMD64RMOp.MOVZXB.emit(this, DWORD, dst, src);
2234     }
2235 
2236     public final void movzbq(Register dst, Register src) {
2237         AMD64RMOp.MOVZXB.emit(this, QWORD, dst, src);
2238     }
2239 
2240     public final void movzwl(Register dst, AMD64Address src) {
2241         prefix(src, dst);
2242         emitByte(0x0F);
2243         emitByte(0xB7);
2244         emitOperandHelper(dst, src, 0);
2245     }
2246 
2247     public final void negl(Register dst) {
2248         NEG.emit(this, DWORD, dst);
2249     }
2250 
2251     public final void notl(Register dst) {
2252         NOT.emit(this, DWORD, dst);
2253     }
2254 
2255     public final void notq(Register dst) {
2256         NOT.emit(this, QWORD, dst);
2257     }
2258 
2259     @Override
2260     public final void ensureUniquePC() {
2261         nop();
2262     }
2263 
2264     public final void nop() {
2265         nop(1);
2266     }
2267 
2268     public void nop(int count) {
2269         int i = count;
2270         if (UseNormalNop) {
2271             assert i > 0 : " ";
2272             // The fancy nops aren't currently recognized by debuggers making it a
2273             // pain to disassemble code while debugging. If assert are on clearly
2274             // speed is not an issue so simply use the single byte traditional nop
2275             // to do alignment.
2276 
2277             for (; i > 0; i--) {
2278                 emitByte(0x90);
2279             }
2280             return;
2281         }
2282 
2283         if (UseAddressNop) {
2284             //
2285             // Using multi-bytes nops "0x0F 0x1F [Address]" for AMD.
2286             // 1: 0x90
2287             // 2: 0x66 0x90
2288             // 3: 0x66 0x66 0x90 (don't use "0x0F 0x1F 0x00" - need patching safe padding)
2289             // 4: 0x0F 0x1F 0x40 0x00
2290             // 5: 0x0F 0x1F 0x44 0x00 0x00
2291             // 6: 0x66 0x0F 0x1F 0x44 0x00 0x00
2292             // 7: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
2293             // 8: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
2294             // 9: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
2295             // 10: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
2296             // 11: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
2297 
2298             // The rest coding is AMD specific - use consecutive Address nops
2299 
2300             // 12: 0x66 0x0F 0x1F 0x44 0x00 0x00 0x66 0x0F 0x1F 0x44 0x00 0x00
2301             // 13: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00 0x66 0x0F 0x1F 0x44 0x00 0x00
2302             // 14: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
2303             // 15: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
2304             // 16: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
2305             // Size prefixes (0x66) are added for larger sizes
2306 
2307             while (i >= 22) {
2308                 i -= 11;
2309                 emitByte(0x66); // size prefix
2310                 emitByte(0x66); // size prefix
2311                 emitByte(0x66); // size prefix
2312                 addrNop8();
2313             }
2314             // Generate first nop for size between 21-12
2315             switch (i) {
2316                 case 21:
2317                     i -= 11;
2318                     emitByte(0x66); // size prefix
2319                     emitByte(0x66); // size prefix
2320                     emitByte(0x66); // size prefix
2321                     addrNop8();
2322                     break;
2323                 case 20:
2324                 case 19:
2325                     i -= 10;
2326                     emitByte(0x66); // size prefix
2327                     emitByte(0x66); // size prefix
2328                     addrNop8();
2329                     break;
2330                 case 18:
2331                 case 17:
2332                     i -= 9;
2333                     emitByte(0x66); // size prefix
2334                     addrNop8();
2335                     break;
2336                 case 16:
2337                 case 15:
2338                     i -= 8;
2339                     addrNop8();
2340                     break;
2341                 case 14:
2342                 case 13:
2343                     i -= 7;
2344                     addrNop7();
2345                     break;
2346                 case 12:
2347                     i -= 6;
2348                     emitByte(0x66); // size prefix
2349                     addrNop5();
2350                     break;
2351                 default:
2352                     assert i < 12;
2353             }
2354 
2355             // Generate second nop for size between 11-1
2356             switch (i) {
2357                 case 11:
2358                     emitByte(0x66); // size prefix
2359                     emitByte(0x66); // size prefix
2360                     emitByte(0x66); // size prefix
2361                     addrNop8();
2362                     break;
2363                 case 10:
2364                     emitByte(0x66); // size prefix
2365                     emitByte(0x66); // size prefix
2366                     addrNop8();
2367                     break;
2368                 case 9:
2369                     emitByte(0x66); // size prefix
2370                     addrNop8();
2371                     break;
2372                 case 8:
2373                     addrNop8();
2374                     break;
2375                 case 7:
2376                     addrNop7();
2377                     break;
2378                 case 6:
2379                     emitByte(0x66); // size prefix
2380                     addrNop5();
2381                     break;
2382                 case 5:
2383                     addrNop5();
2384                     break;
2385                 case 4:
2386                     addrNop4();
2387                     break;
2388                 case 3:
2389                     // Don't use "0x0F 0x1F 0x00" - need patching safe padding
2390                     emitByte(0x66); // size prefix
2391                     emitByte(0x66); // size prefix
2392                     emitByte(0x90); // nop
2393                     break;
2394                 case 2:
2395                     emitByte(0x66); // size prefix
2396                     emitByte(0x90); // nop
2397                     break;
2398                 case 1:
2399                     emitByte(0x90); // nop
2400                     break;
2401                 default:
2402                     assert i == 0;
2403             }
2404             return;
2405         }
2406 
2407         // Using nops with size prefixes "0x66 0x90".
2408         // From AMD Optimization Guide:
2409         // 1: 0x90
2410         // 2: 0x66 0x90
2411         // 3: 0x66 0x66 0x90
2412         // 4: 0x66 0x66 0x66 0x90
2413         // 5: 0x66 0x66 0x90 0x66 0x90
2414         // 6: 0x66 0x66 0x90 0x66 0x66 0x90
2415         // 7: 0x66 0x66 0x66 0x90 0x66 0x66 0x90
2416         // 8: 0x66 0x66 0x66 0x90 0x66 0x66 0x66 0x90
2417         // 9: 0x66 0x66 0x90 0x66 0x66 0x90 0x66 0x66 0x90
2418         // 10: 0x66 0x66 0x66 0x90 0x66 0x66 0x90 0x66 0x66 0x90
2419         //
2420         while (i > 12) {
2421             i -= 4;
2422             emitByte(0x66); // size prefix
2423             emitByte(0x66);
2424             emitByte(0x66);
2425             emitByte(0x90); // nop
2426         }
2427         // 1 - 12 nops
2428         if (i > 8) {
2429             if (i > 9) {
2430                 i -= 1;
2431                 emitByte(0x66);
2432             }
2433             i -= 3;
2434             emitByte(0x66);
2435             emitByte(0x66);
2436             emitByte(0x90);
2437         }
2438         // 1 - 8 nops
2439         if (i > 4) {
2440             if (i > 6) {
2441                 i -= 1;
2442                 emitByte(0x66);
2443             }
2444             i -= 3;
2445             emitByte(0x66);
2446             emitByte(0x66);
2447             emitByte(0x90);
2448         }
2449         switch (i) {
2450             case 4:
2451                 emitByte(0x66);
2452                 emitByte(0x66);
2453                 emitByte(0x66);
2454                 emitByte(0x90);
2455                 break;
2456             case 3:
2457                 emitByte(0x66);
2458                 emitByte(0x66);
2459                 emitByte(0x90);
2460                 break;
2461             case 2:
2462                 emitByte(0x66);
2463                 emitByte(0x90);
2464                 break;
2465             case 1:
2466                 emitByte(0x90);
2467                 break;
2468             default:
2469                 assert i == 0;
2470         }
2471     }
2472 
2473     public final void orl(Register dst, Register src) {
2474         OR.rmOp.emit(this, DWORD, dst, src);
2475     }
2476 
2477     public final void orl(Register dst, int imm32) {
2478         OR.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
2479     }
2480 
2481     // Insn: VPACKUSWB xmm1, xmm2, xmm3/m128
2482     // -----
2483     // Insn: VPACKUSWB xmm1, xmm1, xmm2
2484 
2485     public final void packuswb(Register dst, Register src) {
2486         assert inRC(XMM, dst) && inRC(XMM, src);
2487         // Code: VEX.NDS.128.66.0F.WIG 67 /r
2488         simdPrefix(dst, dst, src, PD, P_0F, false);
2489         emitByte(0x67);
2490         emitModRM(dst, src);
2491     }
2492 
2493     public final void pop(Register dst) {
2494         prefix(dst);
2495         emitByte(0x58 + encode(dst));
2496     }
2497 
2498     public void popfq() {
2499         emitByte(0x9D);
2500     }
2501 
2502     public final void ptest(Register dst, Register src) {
2503         assert supports(CPUFeature.SSE4_1);
2504         assert inRC(XMM, dst) && inRC(XMM, src);
2505         simdPrefix(dst, Register.None, src, PD, P_0F38, false);
2506         emitByte(0x17);
2507         emitModRM(dst, src);
2508     }
2509 
2510     public final void pcmpeqb(Register dst, Register src) {
2511         assert supports(CPUFeature.SSE2);
2512         assert inRC(XMM, dst) && inRC(XMM, src);
2513         simdPrefix(dst, dst, src, PD, P_0F, false);
2514         emitByte(0x74);
2515         emitModRM(dst, src);
2516     }
2517 
2518     public final void pcmpeqw(Register dst, Register src) {
2519         assert supports(CPUFeature.SSE2);
2520         assert inRC(XMM, dst) && inRC(XMM, src);
2521         simdPrefix(dst, dst, src, PD, P_0F, false);
2522         emitByte(0x75);
2523         emitModRM(dst, src);
2524     }
2525 
2526     public final void pcmpeqd(Register dst, Register src) {
2527         assert supports(CPUFeature.SSE2);
2528         assert dst.getRegisterCategory().equals(XMM) && src.getRegisterCategory().equals(XMM);
2529         simdPrefix(dst, dst, src, PD, P_0F, false);
2530         emitByte(0x76);
2531         emitModRM(dst, src);
2532     }
2533 
2534     public final void pcmpestri(Register dst, AMD64Address src, int imm8) {
2535         assert supports(CPUFeature.SSE4_2);
2536         assert inRC(XMM, dst);
2537         simdPrefix(dst, Register.None, src, PD, P_0F3A, false);
2538         emitByte(0x61);
2539         emitOperandHelper(dst, src, 0);
2540         emitByte(imm8);
2541     }
2542 
2543     public final void pcmpestri(Register dst, Register src, int imm8) {
2544         assert supports(CPUFeature.SSE4_2);
2545         assert inRC(XMM, dst) && inRC(XMM, src);
2546         simdPrefix(dst, Register.None, src, PD, P_0F3A, false);
2547         emitByte(0x61);
2548         emitModRM(dst, src);
2549         emitByte(imm8);
2550     }
2551 
2552     public final void pmovmskb(Register dst, Register src) {
2553         assert supports(CPUFeature.SSE2);
2554         assert inRC(CPU, dst) && inRC(XMM, src);
2555         simdPrefix(dst, Register.None, src, PD, P_0F, false);
2556         emitByte(0xD7);
2557         emitModRM(dst, src);
2558     }
2559 
2560     // Insn: VPMOVZXBW xmm1, xmm2/m64
2561 
2562     public final void pmovzxbw(Register dst, AMD64Address src) {
2563         assert supports(CPUFeature.SSE4_1);
2564         assert inRC(XMM, dst);
2565         simdPrefix(dst, Register.None, src, PD, P_0F38, false);
2566         emitByte(0x30);
2567         emitOperandHelper(dst, src, 0);
2568     }
2569 
2570     public final void pmovzxbw(Register dst, Register src) {
2571         assert supports(CPUFeature.SSE4_1);
2572         assert inRC(XMM, dst) && inRC(XMM, src);
2573         simdPrefix(dst, Register.None, src, PD, P_0F38, false);
2574         emitByte(0x30);
2575         emitModRM(dst, src);
2576     }
2577 
2578     public final void push(Register src) {
2579         prefix(src);
2580         emitByte(0x50 + encode(src));
2581     }
2582 
2583     public void pushfq() {
2584         emitByte(0x9c);
2585     }
2586 
2587     public final void paddd(Register dst, Register src) {
2588         assert inRC(XMM, dst) && inRC(XMM, src);
2589         simdPrefix(dst, dst, src, PD, P_0F, false);
2590         emitByte(0xFE);
2591         emitModRM(dst, src);
2592     }
2593 
2594     public final void paddq(Register dst, Register src) {
2595         assert inRC(XMM, dst) && inRC(XMM, src);
2596         simdPrefix(dst, dst, src, PD, P_0F, false);
2597         emitByte(0xD4);
2598         emitModRM(dst, src);
2599     }
2600 
2601     public final void pextrw(Register dst, Register src, int imm8) {
2602         assert inRC(CPU, dst) && inRC(XMM, src);
2603         simdPrefix(dst, Register.None, src, PD, P_0F, false);
2604         emitByte(0xC5);
2605         emitModRM(dst, src);
2606         emitByte(imm8);
2607     }
2608 
2609     public final void pinsrw(Register dst, Register src, int imm8) {
2610         assert inRC(XMM, dst) && inRC(CPU, src);
2611         simdPrefix(dst, dst, src, PD, P_0F, false);
2612         emitByte(0xC4);
2613         emitModRM(dst, src);
2614         emitByte(imm8);
2615     }
2616 
2617     public final void por(Register dst, Register src) {
2618         assert inRC(XMM, dst) && inRC(XMM, src);
2619         simdPrefix(dst, dst, src, PD, P_0F, false);
2620         emitByte(0xEB);
2621         emitModRM(dst, src);
2622     }
2623 
2624     public final void pand(Register dst, Register src) {
2625         assert inRC(XMM, dst) && inRC(XMM, src);
2626         simdPrefix(dst, dst, src, PD, P_0F, false);
2627         emitByte(0xDB);
2628         emitModRM(dst, src);
2629     }
2630 
2631     public final void pxor(Register dst, Register src) {
2632         assert inRC(XMM, dst) && inRC(XMM, src);
2633         simdPrefix(dst, dst, src, PD, P_0F, false);
2634         emitByte(0xEF);
2635         emitModRM(dst, src);
2636     }
2637 
2638     public final void pslld(Register dst, int imm8) {
2639         assert isUByte(imm8) : "invalid value";
2640         assert inRC(XMM, dst);
2641         // XMM6 is for /6 encoding: 66 0F 72 /6 ib
2642         simdPrefix(AMD64.xmm6, dst, dst, PD, P_0F, false);
2643         emitByte(0x72);
2644         emitModRM(6, dst);
2645         emitByte(imm8 & 0xFF);
2646     }
2647 
2648     public final void psllq(Register dst, Register shift) {
2649         assert inRC(XMM, dst) && inRC(XMM, shift);
2650         simdPrefix(dst, dst, shift, PD, P_0F, false);
2651         emitByte(0xF3);
2652         emitModRM(dst, shift);
2653     }
2654 
2655     public final void psllq(Register dst, int imm8) {
2656         assert isUByte(imm8) : "invalid value";
2657         assert inRC(XMM, dst);
2658         // XMM6 is for /6 encoding: 66 0F 73 /6 ib
2659         simdPrefix(AMD64.xmm6, dst, dst, PD, P_0F, false);
2660         emitByte(0x73);
2661         emitModRM(6, dst);
2662         emitByte(imm8);
2663     }
2664 
2665     public final void psrad(Register dst, int imm8) {
2666         assert isUByte(imm8) : "invalid value";
2667         assert inRC(XMM, dst);
2668         // XMM4 is for /4 encoding: 66 0F 72 /4 ib
2669         simdPrefix(AMD64.xmm4, dst, dst, PD, P_0F, false);
2670         emitByte(0x72);
2671         emitModRM(4, dst);
2672         emitByte(imm8);
2673     }
2674 
2675     public final void psrld(Register dst, int imm8) {
2676         assert isUByte(imm8) : "invalid value";
2677         assert inRC(XMM, dst);
2678         // XMM2 is for /2 encoding: 66 0F 72 /2 ib
2679         simdPrefix(AMD64.xmm2, dst, dst, PD, P_0F, false);
2680         emitByte(0x72);
2681         emitModRM(2, dst);
2682         emitByte(imm8);
2683     }
2684 
2685     public final void psrlq(Register dst, int imm8) {
2686         assert isUByte(imm8) : "invalid value";
2687         assert inRC(XMM, dst);
2688         // XMM2 is for /2 encoding: 66 0F 73 /2 ib
2689         simdPrefix(AMD64.xmm2, dst, dst, PD, P_0F, false);
2690         emitByte(0x73);
2691         emitModRM(2, dst);
2692         emitByte(imm8);
2693     }
2694 
2695     public final void psrldq(Register dst, int imm8) {
2696         assert isUByte(imm8) : "invalid value";
2697         assert inRC(XMM, dst);
2698         simdPrefix(AMD64.xmm3, dst, dst, PD, P_0F, false);
2699         emitByte(0x73);
2700         emitModRM(3, dst);
2701         emitByte(imm8);
2702     }
2703 
2704     public final void pshufb(Register dst, Register src) {
2705         assert supports(CPUFeature.SSSE3);
2706         assert inRC(XMM, dst) && inRC(XMM, src);
2707         simdPrefix(dst, dst, src, PD, P_0F38, false);
2708         emitByte(0x00);
2709         emitModRM(dst, src);
2710     }
2711 
2712     public final void pshuflw(Register dst, Register src, int imm8) {
2713         assert supports(CPUFeature.SSE2);
2714         assert isUByte(imm8) : "invalid value";
2715         assert inRC(XMM, dst) && inRC(XMM, src);
2716         simdPrefix(dst, Register.None, src, SD, P_0F, false);
2717         emitByte(0x70);
2718         emitModRM(dst, src);
2719         emitByte(imm8);
2720     }
2721 
2722     public final void pshufd(Register dst, Register src, int imm8) {
2723         assert isUByte(imm8) : "invalid value";
2724         assert inRC(XMM, dst) && inRC(XMM, src);
2725         simdPrefix(dst, Register.None, src, PD, P_0F, false);
2726         emitByte(0x70);
2727         emitModRM(dst, src);
2728         emitByte(imm8);
2729     }
2730 
2731     public final void psubd(Register dst, Register src) {
2732         assert inRC(XMM, dst) && inRC(XMM, src);
2733         simdPrefix(dst, dst, src, PD, P_0F, false);
2734         emitByte(0xFA);
2735         emitModRM(dst, src);
2736     }
2737 
2738     public final void punpcklbw(Register dst, Register src) {
2739         assert supports(CPUFeature.SSE2);
2740         assert inRC(XMM, dst) && inRC(XMM, src);
2741         simdPrefix(dst, dst, src, PD, P_0F, false);
2742         emitByte(0x60);
2743         emitModRM(dst, src);
2744     }
2745 
2746     public final void rcpps(Register dst, Register src) {
2747         assert inRC(XMM, dst) && inRC(XMM, src);
2748         simdPrefix(dst, Register.None, src, PS, P_0F, false);
2749         emitByte(0x53);
2750         emitModRM(dst, src);
2751     }
2752 
2753     public final void ret(int imm16) {
2754         if (imm16 == 0) {
2755             emitByte(0xC3);
2756         } else {
2757             emitByte(0xC2);
2758             emitShort(imm16);
2759         }
2760     }
2761 
2762     public final void sarl(Register dst, int imm8) {
2763         prefix(dst);
2764         assert isShiftCount(imm8 >> 1) : "illegal shift count";
2765         if (imm8 == 1) {
2766             emitByte(0xD1);
2767             emitModRM(7, dst);
2768         } else {
2769             emitByte(0xC1);
2770             emitModRM(7, dst);
2771             emitByte(imm8);
2772         }
2773     }
2774 
2775     public final void shll(Register dst, int imm8) {
2776         assert isShiftCount(imm8 >> 1) : "illegal shift count";
2777         prefix(dst);
2778         if (imm8 == 1) {
2779             emitByte(0xD1);
2780             emitModRM(4, dst);
2781         } else {
2782             emitByte(0xC1);
2783             emitModRM(4, dst);
2784             emitByte(imm8);
2785         }
2786     }
2787 
2788     public final void shll(Register dst) {
2789         // Multiply dst by 2, CL times.
2790         prefix(dst);
2791         emitByte(0xD3);
2792         emitModRM(4, dst);
2793     }
2794 
2795     // Insn: SHLX r32a, r/m32, r32b
2796 
2797     public final void shlxl(Register dst, Register src1, Register src2) {
2798         VexGeneralPurposeRMVOp.SHLX.emit(this, AVXSize.DWORD, dst, src1, src2);
2799     }
2800 
2801     public final void shrl(Register dst, int imm8) {
2802         assert isShiftCount(imm8 >> 1) : "illegal shift count";
2803         prefix(dst);
2804         emitByte(0xC1);
2805         emitModRM(5, dst);
2806         emitByte(imm8);
2807     }
2808 
2809     public final void shrl(Register dst) {
2810         // Unsigned divide dst by 2, CL times.
2811         prefix(dst);
2812         emitByte(0xD3);
2813         emitModRM(5, dst);
2814     }
2815 
2816     public final void subl(AMD64Address dst, int imm32) {
2817         SUB.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
2818     }
2819 
2820     public final void subl(Register dst, int imm32) {
2821         SUB.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
2822     }
2823 
2824     public final void subl(Register dst, Register src) {
2825         SUB.rmOp.emit(this, DWORD, dst, src);
2826     }
2827 
2828     public final void subpd(Register dst, Register src) {
2829         SSEOp.SUB.emit(this, PD, dst, src);
2830     }
2831 
2832     public final void subsd(Register dst, Register src) {
2833         SSEOp.SUB.emit(this, SD, dst, src);
2834     }
2835 
2836     public final void subsd(Register dst, AMD64Address src) {
2837         SSEOp.SUB.emit(this, SD, dst, src);
2838     }
2839 
2840     public final void testl(Register dst, int imm32) {
2841         // not using emitArith because test
2842         // doesn't support sign-extension of
2843         // 8bit operands
2844         if (dst.encoding == 0) {
2845             emitByte(0xA9);
2846         } else {
2847             prefix(dst);
2848             emitByte(0xF7);
2849             emitModRM(0, dst);
2850         }
2851         emitInt(imm32);
2852     }
2853 
2854     public final void testl(Register dst, Register src) {
2855         prefix(dst, src);
2856         emitByte(0x85);
2857         emitModRM(dst, src);
2858     }
2859 
2860     public final void testl(Register dst, AMD64Address src) {
2861         prefix(src, dst);
2862         emitByte(0x85);
2863         emitOperandHelper(dst, src, 0);
2864     }
2865 
2866     public final void unpckhpd(Register dst, Register src) {
2867         assert inRC(XMM, dst) && inRC(XMM, src);
2868         simdPrefix(dst, dst, src, PD, P_0F, false);
2869         emitByte(0x15);
2870         emitModRM(dst, src);
2871     }
2872 
2873     public final void unpcklpd(Register dst, Register src) {
2874         assert inRC(XMM, dst) && inRC(XMM, src);
2875         simdPrefix(dst, dst, src, PD, P_0F, false);
2876         emitByte(0x14);
2877         emitModRM(dst, src);
2878     }
2879 
2880     public final void xorl(Register dst, Register src) {
2881         XOR.rmOp.emit(this, DWORD, dst, src);
2882     }
2883 
2884     public final void xorpd(Register dst, Register src) {
2885         SSEOp.XOR.emit(this, PD, dst, src);
2886     }
2887 
2888     public final void xorps(Register dst, Register src) {
2889         SSEOp.XOR.emit(this, PS, dst, src);
2890     }
2891 
2892     protected final void decl(Register dst) {
2893         // Use two-byte form (one-byte form is a REX prefix in 64-bit mode)
2894         prefix(dst);
2895         emitByte(0xFF);
2896         emitModRM(1, dst);
2897     }
2898 
2899     protected final void incl(Register dst) {
2900         // Use two-byte form (one-byte from is a REX prefix in 64-bit mode)
2901         prefix(dst);
2902         emitByte(0xFF);
2903         emitModRM(0, dst);
2904     }
2905 
2906     public final void addq(Register dst, int imm32) {
2907         ADD.getMIOpcode(QWORD, isByte(imm32)).emit(this, QWORD, dst, imm32);
2908     }
2909 
2910     public final void addq(AMD64Address dst, int imm32) {
2911         ADD.getMIOpcode(QWORD, isByte(imm32)).emit(this, QWORD, dst, imm32);
2912     }
2913 
2914     public final void addq(Register dst, Register src) {
2915         ADD.rmOp.emit(this, QWORD, dst, src);
2916     }
2917 
2918     public final void addq(AMD64Address dst, Register src) {
2919         ADD.mrOp.emit(this, QWORD, dst, src);
2920     }
2921 
2922     public final void andq(Register dst, int imm32) {
2923         AND.getMIOpcode(QWORD, isByte(imm32)).emit(this, QWORD, dst, imm32);
2924     }
2925 
2926     public final void bsrq(Register dst, Register src) {
2927         prefixq(dst, src);
2928         emitByte(0x0F);
2929         emitByte(0xBD);
2930         emitModRM(dst, src);
2931     }
2932 
2933     public final void bswapq(Register reg) {
2934         prefixq(reg);
2935         emitByte(0x0F);
2936         emitByte(0xC8 + encode(reg));
2937     }
2938 
2939     public final void cdqq() {
2940         rexw();
2941         emitByte(0x99);
2942     }
2943 
2944     public final void cmovq(ConditionFlag cc, Register dst, Register src) {
2945         prefixq(dst, src);
2946         emitByte(0x0F);
2947         emitByte(0x40 | cc.getValue());
2948         emitModRM(dst, src);
2949     }
2950 
2951     public final void setb(ConditionFlag cc, Register dst) {
2952         prefix(dst, true);
2953         emitByte(0x0F);
2954         emitByte(0x90 | cc.getValue());
2955         emitModRM(0, dst);
2956     }
2957 
2958     public final void cmovq(ConditionFlag cc, Register dst, AMD64Address src) {
2959         prefixq(src, dst);
2960         emitByte(0x0F);
2961         emitByte(0x40 | cc.getValue());
2962         emitOperandHelper(dst, src, 0);
2963     }
2964 
2965     public final void cmpq(Register dst, int imm32) {
2966         CMP.getMIOpcode(QWORD, isByte(imm32)).emit(this, QWORD, dst, imm32);
2967     }
2968 
2969     public final void cmpq(Register dst, Register src) {
2970         CMP.rmOp.emit(this, QWORD, dst, src);
2971     }
2972 
2973     public final void cmpq(Register dst, AMD64Address src) {
2974         CMP.rmOp.emit(this, QWORD, dst, src);
2975     }
2976 
2977     public final void cmpxchgq(Register reg, AMD64Address adr) {
2978         prefixq(adr, reg);
2979         emitByte(0x0F);
2980         emitByte(0xB1);
2981         emitOperandHelper(reg, adr, 0);
2982     }
2983 
2984     public final void cvtdq2pd(Register dst, Register src) {
2985         assert inRC(XMM, dst) && inRC(XMM, src);
2986         simdPrefix(dst, Register.None, src, SS, P_0F, false);
2987         emitByte(0xE6);
2988         emitModRM(dst, src);
2989     }
2990 
2991     public final void cvtsi2sdq(Register dst, Register src) {
2992         SSEOp.CVTSI2SD.emit(this, QWORD, dst, src);
2993     }
2994 
2995     public final void cvttsd2siq(Register dst, Register src) {
2996         SSEOp.CVTTSD2SI.emit(this, QWORD, dst, src);
2997     }
2998 
2999     public final void cvttpd2dq(Register dst, Register src) {
3000         assert inRC(XMM, dst) && inRC(XMM, src);
3001         simdPrefix(dst, Register.None, src, PD, P_0F, false);
3002         emitByte(0xE6);
3003         emitModRM(dst, src);
3004     }
3005 
3006     public final void decq(Register dst) {
3007         // Use two-byte form (one-byte form is a REX prefix in 64-bit mode)
3008         prefixq(dst);
3009         emitByte(0xFF);
3010         emitModRM(1, dst);
3011     }
3012 
3013     public final void decq(AMD64Address dst) {
3014         DEC.emit(this, QWORD, dst);
3015     }
3016 
3017     public final void imulq(Register dst, Register src) {
3018         prefixq(dst, src);
3019         emitByte(0x0F);
3020         emitByte(0xAF);
3021         emitModRM(dst, src);
3022     }
3023 
3024     public final void incq(Register dst) {
3025         // Don't use it directly. Use Macroincrementq() instead.
3026         // Use two-byte form (one-byte from is a REX prefix in 64-bit mode)
3027         prefixq(dst);
3028         emitByte(0xFF);
3029         emitModRM(0, dst);
3030     }
3031 
3032     public final void incq(AMD64Address dst) {
3033         INC.emit(this, QWORD, dst);
3034     }
3035 
3036     public final void movq(Register dst, long imm64) {
3037         movq(dst, imm64, false);
3038     }
3039 
3040     public final void movq(Register dst, long imm64, boolean annotateImm) {
3041         int insnPos = position();
3042         prefixq(dst);
3043         emitByte(0xB8 + encode(dst));
3044         int immPos = position();
3045         emitLong(imm64);
3046         int nextInsnPos = position();
3047         if (annotateImm && codePatchingAnnotationConsumer != null) {
3048             codePatchingAnnotationConsumer.accept(new ImmediateOperandAnnotation(insnPos, immPos, nextInsnPos - immPos, nextInsnPos));
3049         }
3050     }
3051 
3052     public final void movslq(Register dst, int imm32) {
3053         prefixq(dst);
3054         emitByte(0xC7);
3055         emitModRM(0, dst);
3056         emitInt(imm32);
3057     }
3058 
3059     public final void movdq(Register dst, AMD64Address src) {
3060         AMD64RMOp.MOVQ.emit(this, QWORD, dst, src);
3061     }
3062 
3063     public final void movdq(AMD64Address dst, Register src) {
3064         AMD64MROp.MOVQ.emit(this, QWORD, dst, src);
3065     }
3066 
3067     public final void movdq(Register dst, Register src) {
3068         if (inRC(XMM, dst) && inRC(CPU, src)) {
3069             AMD64RMOp.MOVQ.emit(this, QWORD, dst, src);
3070         } else if (inRC(XMM, src) && inRC(CPU, dst)) {
3071             AMD64MROp.MOVQ.emit(this, QWORD, dst, src);
3072         } else {
3073             throw new InternalError("should not reach here");
3074         }
3075     }
3076 
3077     public final void movdl(Register dst, Register src) {
3078         if (inRC(XMM, dst) && inRC(CPU, src)) {
3079             AMD64RMOp.MOVD.emit(this, DWORD, dst, src);
3080         } else if (inRC(XMM, src) && inRC(CPU, dst)) {
3081             AMD64MROp.MOVD.emit(this, DWORD, dst, src);
3082         } else {
3083             throw new InternalError("should not reach here");
3084         }
3085     }
3086 
3087     public final void movdl(Register dst, AMD64Address src) {
3088         AMD64RMOp.MOVD.emit(this, DWORD, dst, src);
3089     }
3090 
3091     public final void movddup(Register dst, Register src) {
3092         assert supports(CPUFeature.SSE3);
3093         assert inRC(XMM, dst) && inRC(XMM, src);
3094         simdPrefix(dst, Register.None, src, SD, P_0F, false);
3095         emitByte(0x12);
3096         emitModRM(dst, src);
3097     }
3098 
3099     public final void movdqu(Register dst, AMD64Address src) {
3100         assert inRC(XMM, dst);
3101         simdPrefix(dst, Register.None, src, SS, P_0F, false);
3102         emitByte(0x6F);
3103         emitOperandHelper(dst, src, 0);
3104     }
3105 
3106     public final void movdqu(Register dst, Register src) {
3107         assert inRC(XMM, dst) && inRC(XMM, src);
3108         simdPrefix(dst, Register.None, src, SS, P_0F, false);
3109         emitByte(0x6F);
3110         emitModRM(dst, src);
3111     }
3112 
3113     // Insn: VMOVDQU xmm2/m128, xmm1
3114 
3115     public final void movdqu(AMD64Address dst, Register src) {
3116         assert inRC(XMM, src);
3117         // Code: VEX.128.F3.0F.WIG 7F /r
3118         simdPrefix(src, Register.None, dst, SS, P_0F, false);
3119         emitByte(0x7F);
3120         emitOperandHelper(src, dst, 0);
3121     }
3122 
3123     public final void movslq(AMD64Address dst, int imm32) {
3124         prefixq(dst);
3125         emitByte(0xC7);
3126         emitOperandHelper(0, dst, 4);
3127         emitInt(imm32);
3128     }
3129 
3130     public final void movslq(Register dst, AMD64Address src) {
3131         prefixq(src, dst);
3132         emitByte(0x63);
3133         emitOperandHelper(dst, src, 0);
3134     }
3135 
3136     public final void movslq(Register dst, Register src) {
3137         prefixq(dst, src);
3138         emitByte(0x63);
3139         emitModRM(dst, src);
3140     }
3141 
3142     public final void negq(Register dst) {
3143         prefixq(dst);
3144         emitByte(0xF7);
3145         emitModRM(3, dst);
3146     }
3147 
3148     public final void orq(Register dst, Register src) {
3149         OR.rmOp.emit(this, QWORD, dst, src);
3150     }
3151 
3152     public final void shlq(Register dst, int imm8) {
3153         assert isShiftCount(imm8 >> 1) : "illegal shift count";
3154         prefixq(dst);
3155         if (imm8 == 1) {
3156             emitByte(0xD1);
3157             emitModRM(4, dst);
3158         } else {
3159             emitByte(0xC1);
3160             emitModRM(4, dst);
3161             emitByte(imm8);
3162         }
3163     }
3164 
3165     public final void shlq(Register dst) {
3166         // Multiply dst by 2, CL times.
3167         prefixq(dst);
3168         emitByte(0xD3);
3169         emitModRM(4, dst);
3170     }
3171 
3172     public final void shrq(Register dst, int imm8) {
3173         assert isShiftCount(imm8 >> 1) : "illegal shift count";
3174         prefixq(dst);
3175         if (imm8 == 1) {
3176             emitByte(0xD1);
3177             emitModRM(5, dst);
3178         } else {
3179             emitByte(0xC1);
3180             emitModRM(5, dst);
3181             emitByte(imm8);
3182         }
3183     }
3184 
3185     public final void shrq(Register dst) {
3186         prefixq(dst);
3187         emitByte(0xD3);
3188         // Unsigned divide dst by 2, CL times.
3189         emitModRM(5, dst);
3190     }
3191 
3192     public final void sbbq(Register dst, Register src) {
3193         SBB.rmOp.emit(this, QWORD, dst, src);
3194     }
3195 
3196     public final void subq(Register dst, int imm32) {
3197         SUB.getMIOpcode(QWORD, isByte(imm32)).emit(this, QWORD, dst, imm32);
3198     }
3199 
3200     public final void subq(AMD64Address dst, int imm32) {
3201         SUB.getMIOpcode(QWORD, isByte(imm32)).emit(this, QWORD, dst, imm32);
3202     }
3203 
3204     public final void subqWide(Register dst, int imm32) {
3205         // don't use the sign-extending version, forcing a 32-bit immediate
3206         SUB.getMIOpcode(QWORD, false).emit(this, QWORD, dst, imm32);
3207     }
3208 
3209     public final void subq(Register dst, Register src) {
3210         SUB.rmOp.emit(this, QWORD, dst, src);
3211     }
3212 
3213     public final void testq(Register dst, Register src) {
3214         prefixq(dst, src);
3215         emitByte(0x85);
3216         emitModRM(dst, src);
3217     }
3218 
3219     public final void btrq(Register src, int imm8) {
3220         prefixq(src);
3221         emitByte(0x0F);
3222         emitByte(0xBA);
3223         emitModRM(6, src);
3224         emitByte(imm8);
3225     }
3226 
3227     public final void xaddb(AMD64Address dst, Register src) {
3228         prefixb(dst, src);
3229         emitByte(0x0F);
3230         emitByte(0xC0);
3231         emitOperandHelper(src, dst, 0);
3232     }
3233 
3234     public final void xaddw(AMD64Address dst, Register src) {
3235         emitByte(0x66); // Switch to 16-bit mode.
3236         prefix(dst, src);
3237         emitByte(0x0F);
3238         emitByte(0xC1);
3239         emitOperandHelper(src, dst, 0);
3240     }
3241 
3242     public final void xaddl(AMD64Address dst, Register src) {
3243         prefix(dst, src);
3244         emitByte(0x0F);
3245         emitByte(0xC1);
3246         emitOperandHelper(src, dst, 0);
3247     }
3248 
3249     public final void xaddq(AMD64Address dst, Register src) {
3250         prefixq(dst, src);
3251         emitByte(0x0F);
3252         emitByte(0xC1);
3253         emitOperandHelper(src, dst, 0);
3254     }
3255 
3256     public final void xchgb(Register dst, AMD64Address src) {
3257         prefixb(src, dst);
3258         emitByte(0x86);
3259         emitOperandHelper(dst, src, 0);
3260     }
3261 
3262     public final void xchgw(Register dst, AMD64Address src) {
3263         emitByte(0x66);
3264         prefix(src, dst);
3265         emitByte(0x87);
3266         emitOperandHelper(dst, src, 0);
3267     }
3268 
3269     public final void xchgl(Register dst, AMD64Address src) {
3270         prefix(src, dst);
3271         emitByte(0x87);
3272         emitOperandHelper(dst, src, 0);
3273     }
3274 
3275     public final void xchgq(Register dst, AMD64Address src) {
3276         prefixq(src, dst);
3277         emitByte(0x87);
3278         emitOperandHelper(dst, src, 0);
3279     }
3280 
3281     public final void membar(int barriers) {
3282         if (target.isMP) {
3283             // We only have to handle StoreLoad
3284             if ((barriers & STORE_LOAD) != 0) {
3285                 // All usable chips support "locked" instructions which suffice
3286                 // as barriers, and are much faster than the alternative of
3287                 // using cpuid instruction. We use here a locked add [rsp],0.
3288                 // This is conveniently otherwise a no-op except for blowing
3289                 // flags.
3290                 // Any change to this code may need to revisit other places in
3291                 // the code where this idiom is used, in particular the
3292                 // orderAccess code.
3293                 lock();
3294                 addl(new AMD64Address(AMD64.rsp, 0), 0); // Assert the lock# signal here
3295             }
3296         }
3297     }
3298 
3299     @Override
3300     protected final void patchJumpTarget(int branch, int branchTarget) {
3301         int op = getByte(branch);
3302         assert op == 0xE8 // call
3303                         || op == 0x00 // jump table entry
3304                         || op == 0xE9 // jmp
3305                         || op == 0xEB // short jmp
3306                         || (op & 0xF0) == 0x70 // short jcc
3307                         || op == 0x0F && (getByte(branch + 1) & 0xF0) == 0x80 // jcc
3308         : "Invalid opcode at patch point branch=" + branch + ", branchTarget=" + branchTarget + ", op=" + op;
3309 
3310         if (op == 0x00) {
3311             int offsetToJumpTableBase = getShort(branch + 1);
3312             int jumpTableBase = branch - offsetToJumpTableBase;
3313             int imm32 = branchTarget - jumpTableBase;
3314             emitInt(imm32, branch);
3315         } else if (op == 0xEB || (op & 0xF0) == 0x70) {
3316 
3317             // short offset operators (jmp and jcc)
3318             final int imm8 = branchTarget - (branch + 2);
3319             /*
3320              * Since a wrongly patched short branch can potentially lead to working but really bad
3321              * behaving code we should always fail with an exception instead of having an assert.
3322              */
3323             if (!NumUtil.isByte(imm8)) {
3324                 throw new InternalError("branch displacement out of range: " + imm8);
3325             }
3326             emitByte(imm8, branch + 1);
3327 
3328         } else {
3329 
3330             int off = 1;
3331             if (op == 0x0F) {
3332                 off = 2;
3333             }
3334 
3335             int imm32 = branchTarget - (branch + 4 + off);
3336             emitInt(imm32, branch + off);
3337         }
3338     }
3339 
3340     public void nullCheck(AMD64Address address) {
3341         testl(AMD64.rax, address);
3342     }
3343 
3344     @Override
3345     public void align(int modulus) {
3346         if (position() % modulus != 0) {
3347             nop(modulus - (position() % modulus));
3348         }
3349     }
3350 
3351     /**
3352      * Emits a direct call instruction. Note that the actual call target is not specified, because
3353      * all calls need patching anyway. Therefore, 0 is emitted as the call target, and the user is
3354      * responsible to add the call address to the appropriate patching tables.
3355      */
3356     public final void call() {
3357         annotatePatchingImmediate(1, 4);
3358         emitByte(0xE8);
3359         emitInt(0);
3360     }
3361 
3362     public final void call(Register src) {
3363         prefix(src);
3364         emitByte(0xFF);
3365         emitModRM(2, src);
3366     }
3367 
3368     public final void int3() {
3369         emitByte(0xCC);
3370     }
3371 
3372     public final void pause() {
3373         emitByte(0xF3);
3374         emitByte(0x90);
3375     }
3376 
3377     private void emitx87(int b1, int b2, int i) {
3378         assert 0 <= i && i < 8 : "illegal stack offset";
3379         emitByte(b1);
3380         emitByte(b2 + i);
3381     }
3382 
3383     public final void fldd(AMD64Address src) {
3384         emitByte(0xDD);
3385         emitOperandHelper(0, src, 0);
3386     }
3387 
3388     public final void flds(AMD64Address src) {
3389         emitByte(0xD9);
3390         emitOperandHelper(0, src, 0);
3391     }
3392 
3393     public final void fldln2() {
3394         emitByte(0xD9);
3395         emitByte(0xED);
3396     }
3397 
3398     public final void fldlg2() {
3399         emitByte(0xD9);
3400         emitByte(0xEC);
3401     }
3402 
3403     public final void fyl2x() {
3404         emitByte(0xD9);
3405         emitByte(0xF1);
3406     }
3407 
3408     public final void fstps(AMD64Address src) {
3409         emitByte(0xD9);
3410         emitOperandHelper(3, src, 0);
3411     }
3412 
3413     public final void fstpd(AMD64Address src) {
3414         emitByte(0xDD);
3415         emitOperandHelper(3, src, 0);
3416     }
3417 
3418     private void emitFPUArith(int b1, int b2, int i) {
3419         assert 0 <= i && i < 8 : "illegal FPU register: " + i;
3420         emitByte(b1);
3421         emitByte(b2 + i);
3422     }
3423 
3424     public void ffree(int i) {
3425         emitFPUArith(0xDD, 0xC0, i);
3426     }
3427 
3428     public void fincstp() {
3429         emitByte(0xD9);
3430         emitByte(0xF7);
3431     }
3432 
3433     public void fxch(int i) {
3434         emitFPUArith(0xD9, 0xC8, i);
3435     }
3436 
3437     public void fnstswAX() {
3438         emitByte(0xDF);
3439         emitByte(0xE0);
3440     }
3441 
3442     public void fwait() {
3443         emitByte(0x9B);
3444     }
3445 
3446     public void fprem() {
3447         emitByte(0xD9);
3448         emitByte(0xF8);
3449     }
3450 
3451     public final void fsin() {
3452         emitByte(0xD9);
3453         emitByte(0xFE);
3454     }
3455 
3456     public final void fcos() {
3457         emitByte(0xD9);
3458         emitByte(0xFF);
3459     }
3460 
3461     public final void fptan() {
3462         emitByte(0xD9);
3463         emitByte(0xF2);
3464     }
3465 
3466     public final void fstp(int i) {
3467         emitx87(0xDD, 0xD8, i);
3468     }
3469 
3470     @Override
3471     public AMD64Address makeAddress(Register base, int displacement) {
3472         return new AMD64Address(base, displacement);
3473     }
3474 
3475     @Override
3476     public AMD64Address getPlaceholder(int instructionStartPosition) {
3477         return new AMD64Address(AMD64.rip, Register.None, Scale.Times1, 0, instructionStartPosition);
3478     }
3479 
3480     private void prefetchPrefix(AMD64Address src) {
3481         prefix(src);
3482         emitByte(0x0F);
3483     }
3484 
3485     public void prefetchnta(AMD64Address src) {
3486         prefetchPrefix(src);
3487         emitByte(0x18);
3488         emitOperandHelper(0, src, 0);
3489     }
3490 
3491     void prefetchr(AMD64Address src) {
3492         assert supports(CPUFeature.AMD_3DNOW_PREFETCH);
3493         prefetchPrefix(src);
3494         emitByte(0x0D);
3495         emitOperandHelper(0, src, 0);
3496     }
3497 
3498     public void prefetcht0(AMD64Address src) {
3499         assert supports(CPUFeature.SSE);
3500         prefetchPrefix(src);
3501         emitByte(0x18);
3502         emitOperandHelper(1, src, 0);
3503     }
3504 
3505     public void prefetcht1(AMD64Address src) {
3506         assert supports(CPUFeature.SSE);
3507         prefetchPrefix(src);
3508         emitByte(0x18);
3509         emitOperandHelper(2, src, 0);
3510     }
3511 
3512     public void prefetcht2(AMD64Address src) {
3513         assert supports(CPUFeature.SSE);
3514         prefix(src);
3515         emitByte(0x0f);
3516         emitByte(0x18);
3517         emitOperandHelper(3, src, 0);
3518     }
3519 
3520     public void prefetchw(AMD64Address src) {
3521         assert supports(CPUFeature.AMD_3DNOW_PREFETCH);
3522         prefix(src);
3523         emitByte(0x0f);
3524         emitByte(0x0D);
3525         emitOperandHelper(1, src, 0);
3526     }
3527 
3528     public void rdtsc() {
3529         emitByte(0x0F);
3530         emitByte(0x31);
3531     }
3532 
3533     /**
3534      * Emits an instruction which is considered to be illegal. This is used if we deliberately want
3535      * to crash the program (debugging etc.).
3536      */
3537     public void illegal() {
3538         emitByte(0x0f);
3539         emitByte(0x0b);
3540     }
3541 
3542     public void lfence() {
3543         emitByte(0x0f);
3544         emitByte(0xae);
3545         emitByte(0xe8);
3546     }
3547 
3548     public final void vptest(Register dst, Register src) {
3549         VexRMOp.VPTEST.emit(this, AVXSize.YMM, dst, src);
3550     }
3551 
3552     public final void vpxor(Register dst, Register nds, Register src) {
3553         VexRVMOp.VPXOR.emit(this, AVXSize.YMM, dst, nds, src);
3554     }
3555 
3556     public final void vpxor(Register dst, Register nds, AMD64Address src) {
3557         VexRVMOp.VPXOR.emit(this, AVXSize.YMM, dst, nds, src);
3558     }
3559 
3560     public final void vmovdqu(Register dst, AMD64Address src) {
3561         VexMoveOp.VMOVDQU.emit(this, AVXSize.YMM, dst, src);
3562     }
3563 
3564     public final void vmovdqu(AMD64Address dst, Register src) {
3565         assert inRC(XMM, src);
3566         VexMoveOp.VMOVDQU.emit(this, AVXSize.YMM, dst, src);
3567     }
3568 
3569     public final void vpmovzxbw(Register dst, AMD64Address src) {
3570         assert supports(CPUFeature.AVX2);
3571         VexRMOp.VPMOVZXBW.emit(this, AVXSize.YMM, dst, src);
3572     }
3573 
3574     public final void vzeroupper() {
3575         emitVEX(L128, P_, M_0F, W0, 0, 0, true);
3576         emitByte(0x77);
3577     }
3578 
3579     // Insn: KORTESTD k1, k2
3580 
3581     // This instruction produces ZF or CF flags
3582     public final void kortestd(Register src1, Register src2) {
3583         assert supports(CPUFeature.AVX512BW);
3584         assert inRC(MASK, src1) && inRC(MASK, src2);
3585         // Code: VEX.L0.66.0F.W1 98 /r
3586         vexPrefix(src1, Register.None, src2, AVXSize.XMM, P_66, M_0F, W1, true);
3587         emitByte(0x98);
3588         emitModRM(src1, src2);
3589     }
3590 
3591     // Insn: KORTESTQ k1, k2
3592 
3593     // This instruction produces ZF or CF flags
3594     public final void kortestq(Register src1, Register src2) {
3595         assert supports(CPUFeature.AVX512BW);
3596         assert inRC(MASK, src1) && inRC(MASK, src2);
3597         // Code: VEX.L0.0F.W1 98 /r
3598         vexPrefix(src1, Register.None, src2, AVXSize.XMM, P_, M_0F, W1, true);
3599         emitByte(0x98);
3600         emitModRM(src1, src2);
3601     }
3602 
3603     public final void kmovd(Register dst, Register src) {
3604         assert supports(CPUFeature.AVX512BW);
3605         assert inRC(MASK, dst) || inRC(CPU, dst);
3606         assert inRC(MASK, src) || inRC(CPU, src);
3607         assert !(inRC(CPU, dst) && inRC(CPU, src));
3608 
3609         if (inRC(MASK, dst)) {
3610             if (inRC(MASK, src)) {
3611                 // kmovd(KRegister dst, KRegister src):
3612                 // Insn: KMOVD k1, k2/m32
3613                 // Code: VEX.L0.66.0F.W1 90 /r
3614                 vexPrefix(dst, Register.None, src, AVXSize.XMM, P_66, M_0F, W1, true);
3615                 emitByte(0x90);
3616                 emitModRM(dst, src);
3617             } else {
3618                 // kmovd(KRegister dst, Register src)
3619                 // Insn: KMOVD k1, r32
3620                 // Code: VEX.L0.F2.0F.W0 92 /r
3621                 vexPrefix(dst, Register.None, src, AVXSize.XMM, P_F2, M_0F, W0, true);
3622                 emitByte(0x92);
3623                 emitModRM(dst, src);
3624             }
3625         } else {
3626             if (inRC(MASK, src)) {
3627                 // kmovd(Register dst, KRegister src)
3628                 // Insn: KMOVD r32, k1
3629                 // Code: VEX.L0.F2.0F.W0 93 /r
3630                 vexPrefix(dst, Register.None, src, AVXSize.XMM, P_F2, M_0F, W0, true);
3631                 emitByte(0x93);
3632                 emitModRM(dst, src);
3633             } else {
3634                 throw GraalError.shouldNotReachHere();
3635             }
3636         }
3637     }
3638 
3639     public final void kmovq(Register dst, Register src) {
3640         assert supports(CPUFeature.AVX512BW);
3641         assert inRC(MASK, dst) || inRC(CPU, dst);
3642         assert inRC(MASK, src) || inRC(CPU, src);
3643         assert !(inRC(CPU, dst) && inRC(CPU, src));
3644 
3645         if (inRC(MASK, dst)) {
3646             if (inRC(MASK, src)) {
3647                 // kmovq(KRegister dst, KRegister src):
3648                 // Insn: KMOVQ k1, k2/m64
3649                 // Code: VEX.L0.0F.W1 90 /r
3650                 vexPrefix(dst, Register.None, src, AVXSize.XMM, P_, M_0F, W1, true);
3651                 emitByte(0x90);
3652                 emitModRM(dst, src);
3653             } else {
3654                 // kmovq(KRegister dst, Register src)
3655                 // Insn: KMOVQ k1, r64
3656                 // Code: VEX.L0.F2.0F.W1 92 /r
3657                 vexPrefix(dst, Register.None, src, AVXSize.XMM, P_F2, M_0F, W1, true);
3658                 emitByte(0x92);
3659                 emitModRM(dst, src);
3660             }
3661         } else {
3662             if (inRC(MASK, src)) {
3663                 // kmovq(Register dst, KRegister src)
3664                 // Insn: KMOVQ r64, k1
3665                 // Code: VEX.L0.F2.0F.W1 93 /r
3666                 vexPrefix(dst, Register.None, src, AVXSize.XMM, P_F2, M_0F, W1, true);
3667                 emitByte(0x93);
3668                 emitModRM(dst, src);
3669             } else {
3670                 throw GraalError.shouldNotReachHere();
3671             }
3672         }
3673     }
3674 
3675     // Insn: KTESTD k1, k2
3676 
3677     public final void ktestd(Register src1, Register src2) {
3678         assert supports(CPUFeature.AVX512BW);
3679         assert inRC(MASK, src1) && inRC(MASK, src2);
3680         // Code: VEX.L0.66.0F.W1 99 /r
3681         vexPrefix(src1, Register.None, src2, AVXSize.XMM, P_66, M_0F, W1, true);
3682         emitByte(0x99);
3683         emitModRM(src1, src2);
3684     }
3685 
3686     public final void evmovdqu64(Register dst, AMD64Address src) {
3687         assert supports(CPUFeature.AVX512F);
3688         assert inRC(XMM, dst);
3689         evexPrefix(dst, Register.None, Register.None, src, AVXSize.ZMM, P_F3, M_0F, W1, Z0, B0);
3690         emitByte(0x6F);
3691         emitEVEXOperandHelper(dst, src, 0, EVEXTuple.FVM.getDisp8ScalingFactor(AVXSize.ZMM));
3692     }
3693 
3694     // Insn: VPMOVZXBW zmm1, m256
3695 
3696     public final void evpmovzxbw(Register dst, AMD64Address src) {
3697         assert supports(CPUFeature.AVX512BW);
3698         assert inRC(XMM, dst);
3699         // Code: EVEX.512.66.0F38.WIG 30 /r
3700         evexPrefix(dst, Register.None, Register.None, src, AVXSize.ZMM, P_66, M_0F38, WIG, Z0, B0);
3701         emitByte(0x30);
3702         emitEVEXOperandHelper(dst, src, 0, EVEXTuple.HVM.getDisp8ScalingFactor(AVXSize.ZMM));
3703     }
3704 
3705     public final void evpcmpeqb(Register kdst, Register nds, AMD64Address src) {
3706         assert supports(CPUFeature.AVX512BW);
3707         assert inRC(MASK, kdst) && inRC(XMM, nds);
3708         evexPrefix(kdst, Register.None, nds, src, AVXSize.ZMM, P_66, M_0F, WIG, Z0, B0);
3709         emitByte(0x74);
3710         emitEVEXOperandHelper(kdst, src, 0, EVEXTuple.FVM.getDisp8ScalingFactor(AVXSize.ZMM));
3711     }
3712 
3713     // Insn: VMOVDQU16 zmm1 {k1}{z}, zmm2/m512
3714     // -----
3715     // Insn: VMOVDQU16 zmm1, m512
3716 
3717     public final void evmovdqu16(Register dst, AMD64Address src) {
3718         assert supports(CPUFeature.AVX512BW);
3719         assert inRC(XMM, dst);
3720         // Code: EVEX.512.F2.0F.W1 6F /r
3721         evexPrefix(dst, Register.None, Register.None, src, AVXSize.ZMM, P_F2, M_0F, W1, Z0, B0);
3722         emitByte(0x6F);
3723         emitEVEXOperandHelper(dst, src, 0, EVEXTuple.FVM.getDisp8ScalingFactor(AVXSize.ZMM));
3724     }
3725 
3726     // Insn: VMOVDQU16 zmm1, k1:z, m512
3727 
3728     public final void evmovdqu16(Register dst, Register mask, AMD64Address src) {
3729         assert supports(CPUFeature.AVX512BW);
3730         assert inRC(XMM, dst) && inRC(MASK, mask);
3731         // Code: EVEX.512.F2.0F.W1 6F /r
3732         evexPrefix(dst, mask, Register.None, src, AVXSize.ZMM, P_F2, M_0F, W1, Z1, B0);
3733         emitByte(0x6F);
3734         emitEVEXOperandHelper(dst, src, 0, EVEXTuple.FVM.getDisp8ScalingFactor(AVXSize.ZMM));
3735     }
3736 
3737     // Insn: VMOVDQU16 zmm2/m512 {k1}{z}, zmm1
3738     // -----
3739     // Insn: VMOVDQU16 m512, zmm1
3740 
3741     public final void evmovdqu16(AMD64Address dst, Register src) {
3742         assert supports(CPUFeature.AVX512BW);
3743         assert inRC(XMM, src);
3744         // Code: EVEX.512.F2.0F.W1 7F /r
3745         evexPrefix(src, Register.None, Register.None, dst, AVXSize.ZMM, P_F2, M_0F, W1, Z0, B0);
3746         emitByte(0x7F);
3747         emitEVEXOperandHelper(src, dst, 0, EVEXTuple.FVM.getDisp8ScalingFactor(AVXSize.ZMM));
3748     }
3749 
3750     // Insn: VMOVDQU16 m512, k1, zmm1
3751 
3752     public final void evmovdqu16(AMD64Address dst, Register mask, Register src) {
3753         assert supports(CPUFeature.AVX512BW);
3754         assert inRC(MASK, mask) && inRC(XMM, src);
3755         // Code: EVEX.512.F2.0F.W1 7F /r
3756         evexPrefix(src, mask, Register.None, dst, AVXSize.ZMM, P_F2, M_0F, W1, Z0, B0);
3757         emitByte(0x7F);
3758         emitEVEXOperandHelper(src, dst, 0, EVEXTuple.FVM.getDisp8ScalingFactor(AVXSize.ZMM));
3759     }
3760 
3761     // Insn: VPBROADCASTW zmm1 {k1}{z}, reg
3762     // -----
3763     // Insn: VPBROADCASTW zmm1, reg
3764 
3765     public final void evpbroadcastw(Register dst, Register src) {
3766         assert supports(CPUFeature.AVX512BW);
3767         assert inRC(XMM, dst) && inRC(CPU, src);
3768         // Code: EVEX.512.66.0F38.W0 7B /r
3769         evexPrefix(dst, Register.None, Register.None, src, AVXSize.ZMM, P_66, M_0F38, W0, Z0, B0);
3770         emitByte(0x7B);
3771         emitModRM(dst, src);
3772     }
3773 
3774     // Insn: VPCMPUW k1 {k2}, zmm2, zmm3/m512, imm8
3775     // -----
3776     // Insn: VPCMPUW k1, zmm2, zmm3, imm8
3777 
3778     public final void evpcmpuw(Register kdst, Register nds, Register src, int vcc) {
3779         assert supports(CPUFeature.AVX512BW);
3780         assert inRC(MASK, kdst) && inRC(XMM, nds) && inRC(XMM, src);
3781         // Code: EVEX.NDS.512.66.0F3A.W1 3E /r ib
3782         evexPrefix(kdst, Register.None, nds, src, AVXSize.ZMM, P_66, M_0F3A, W1, Z0, B0);
3783         emitByte(0x3E);
3784         emitModRM(kdst, src);
3785         emitByte(vcc);
3786     }
3787 
3788     // Insn: VPCMPUW k1 {k2}, zmm2, zmm3/m512, imm8
3789     // -----
3790     // Insn: VPCMPUW k1, k2, zmm2, zmm3, imm8
3791 
3792     public final void evpcmpuw(Register kdst, Register mask, Register nds, Register src, int vcc) {
3793         assert supports(CPUFeature.AVX512BW);
3794         assert inRC(MASK, kdst) && inRC(MASK, mask);
3795         assert inRC(XMM, nds) && inRC(XMM, src);
3796         // Code: EVEX.NDS.512.66.0F3A.W1 3E /r ib
3797         evexPrefix(kdst, mask, nds, src, AVXSize.ZMM, P_66, M_0F3A, W1, Z0, B0);
3798         emitByte(0x3E);
3799         emitModRM(kdst, src);
3800         emitByte(vcc);
3801     }
3802 
3803     // Insn: VPMOVWB ymm1/m256 {k1}{z}, zmm2
3804     // -----
3805     // Insn: VPMOVWB m256, zmm2
3806 
3807     public final void evpmovwb(AMD64Address dst, Register src) {
3808         assert supports(CPUFeature.AVX512BW);
3809         assert inRC(XMM, src);
3810         // Code: EVEX.512.F3.0F38.W0 30 /r
3811         evexPrefix(src, Register.None, Register.None, dst, AVXSize.ZMM, P_F3, M_0F38, W0, Z0, B0);
3812         emitByte(0x30);
3813         emitEVEXOperandHelper(src, dst, 0, EVEXTuple.HVM.getDisp8ScalingFactor(AVXSize.ZMM));
3814     }
3815 
3816     // Insn: VPMOVWB m256, k1, zmm2
3817 
3818     public final void evpmovwb(AMD64Address dst, Register mask, Register src) {
3819         assert supports(CPUFeature.AVX512BW);
3820         assert inRC(MASK, mask) && inRC(XMM, src);
3821         // Code: EVEX.512.F3.0F38.W0 30 /r
3822         evexPrefix(src, mask, Register.None, dst, AVXSize.ZMM, P_F3, M_0F38, W0, Z0, B0);
3823         emitByte(0x30);
3824         emitEVEXOperandHelper(src, dst, 0, EVEXTuple.HVM.getDisp8ScalingFactor(AVXSize.ZMM));
3825     }
3826 
3827     // Insn: VPMOVZXBW zmm1 {k1}{z}, ymm2/m256
3828     // -----
3829     // Insn: VPMOVZXBW zmm1, k1, m256
3830 
3831     public final void evpmovzxbw(Register dst, Register mask, AMD64Address src) {
3832         assert supports(CPUFeature.AVX512BW);
3833         assert inRC(MASK, mask) && inRC(XMM, dst);
3834         // Code: EVEX.512.66.0F38.WIG 30 /r
3835         evexPrefix(dst, mask, Register.None, src, AVXSize.ZMM, P_66, M_0F38, WIG, Z0, B0);
3836         emitByte(0x30);
3837         emitEVEXOperandHelper(dst, src, 0, EVEXTuple.HVM.getDisp8ScalingFactor(AVXSize.ZMM));
3838     }
3839 
3840 }