1 /*
   2  * Copyright (c) 2009, 2019, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  */
  23 
  24 
  25 package org.graalvm.compiler.asm.amd64;
  26 
  27 import static jdk.vm.ci.amd64.AMD64.CPU;
  28 import static jdk.vm.ci.amd64.AMD64.MASK;
  29 import static jdk.vm.ci.amd64.AMD64.XMM;
  30 import static jdk.vm.ci.amd64.AMD64.CPUFeature.AVX512BW;
  31 import static jdk.vm.ci.amd64.AMD64.CPUFeature.AVX512CD;
  32 import static jdk.vm.ci.amd64.AMD64.CPUFeature.AVX512DQ;
  33 import static jdk.vm.ci.amd64.AMD64.CPUFeature.AVX512F;
  34 import static jdk.vm.ci.amd64.AMD64.CPUFeature.AVX512VL;
  35 import static jdk.vm.ci.code.MemoryBarriers.STORE_LOAD;
  36 import static org.graalvm.compiler.asm.amd64.AMD64AsmOptions.UseAddressNop;
  37 import static org.graalvm.compiler.asm.amd64.AMD64AsmOptions.UseIntelNops;
  38 import static org.graalvm.compiler.asm.amd64.AMD64AsmOptions.UseNormalNop;
  39 import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.ADD;
  40 import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.AND;
  41 import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.CMP;
  42 import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.OR;
  43 import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.SBB;
  44 import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.SUB;
  45 import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.XOR;
  46 import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64MOp.DEC;
  47 import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64MOp.INC;
  48 import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64MOp.NEG;
  49 import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64MOp.NOT;
  50 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.EVEXPrefixConfig.B0;
  51 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.EVEXPrefixConfig.Z0;
  52 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.EVEXPrefixConfig.Z1;
  53 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.BYTE;
  54 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.DWORD;
  55 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.PD;
  56 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.PS;
  57 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.QWORD;
  58 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.SD;
  59 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.SS;
  60 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.WORD;
  61 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.L128;
  62 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.L256;
  63 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.L512;
  64 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.LZ;
  65 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.M_0F;
  66 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.M_0F38;
  67 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.M_0F3A;
  68 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.P_;
  69 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.P_66;
  70 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.P_F2;
  71 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.P_F3;
  72 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.W0;
  73 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.W1;
  74 import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.WIG;
  75 import static org.graalvm.compiler.core.common.NumUtil.isByte;
  76 import static org.graalvm.compiler.core.common.NumUtil.isInt;
  77 import static org.graalvm.compiler.core.common.NumUtil.isShiftCount;
  78 import static org.graalvm.compiler.core.common.NumUtil.isUByte;
  79 
  80 import java.util.EnumSet;
  81 
  82 import org.graalvm.compiler.asm.Label;
  83 import org.graalvm.compiler.asm.amd64.AMD64Address.Scale;
  84 import org.graalvm.compiler.asm.amd64.AVXKind.AVXSize;
  85 import org.graalvm.compiler.core.common.calc.Condition;
  86 import org.graalvm.compiler.debug.GraalError;
  87 
  88 import jdk.vm.ci.amd64.AMD64;
  89 import jdk.vm.ci.amd64.AMD64.CPUFeature;
  90 import jdk.vm.ci.code.Register;
  91 import jdk.vm.ci.code.Register.RegisterCategory;
  92 import jdk.vm.ci.code.TargetDescription;
  93 
  94 /**
  95  * This class implements an assembler that can encode most X86 instructions.
  96  */
  97 public class AMD64Assembler extends AMD64BaseAssembler {
  98 
  99     /**
 100      * Constructs an assembler for the AMD64 architecture.
 101      */
 102     public AMD64Assembler(TargetDescription target) {
 103         super(target);
 104     }
 105 
 106     /**
 107      * The x86 condition codes used for conditional jumps/moves.
 108      */
 109     public enum ConditionFlag {
 110         Zero(0x4, "|zero|"),
 111         NotZero(0x5, "|nzero|"),
 112         Equal(0x4, "="),
 113         NotEqual(0x5, "!="),
 114         Less(0xc, "<"),
 115         LessEqual(0xe, "<="),
 116         Greater(0xf, ">"),
 117         GreaterEqual(0xd, ">="),
 118         Below(0x2, "|<|"),
 119         BelowEqual(0x6, "|<=|"),
 120         Above(0x7, "|>|"),
 121         AboveEqual(0x3, "|>=|"),
 122         Overflow(0x0, "|of|"),
 123         NoOverflow(0x1, "|nof|"),
 124         CarrySet(0x2, "|carry|"),
 125         CarryClear(0x3, "|ncarry|"),
 126         Negative(0x8, "|neg|"),
 127         Positive(0x9, "|pos|"),
 128         Parity(0xa, "|par|"),
 129         NoParity(0xb, "|npar|");
 130 
 131         private final int value;
 132         private final String operator;
 133 
 134         ConditionFlag(int value, String operator) {
 135             this.value = value;
 136             this.operator = operator;
 137         }
 138 
 139         public ConditionFlag negate() {
 140             switch (this) {
 141                 case Zero:
 142                     return NotZero;
 143                 case NotZero:
 144                     return Zero;
 145                 case Equal:
 146                     return NotEqual;
 147                 case NotEqual:
 148                     return Equal;
 149                 case Less:
 150                     return GreaterEqual;
 151                 case LessEqual:
 152                     return Greater;
 153                 case Greater:
 154                     return LessEqual;
 155                 case GreaterEqual:
 156                     return Less;
 157                 case Below:
 158                     return AboveEqual;
 159                 case BelowEqual:
 160                     return Above;
 161                 case Above:
 162                     return BelowEqual;
 163                 case AboveEqual:
 164                     return Below;
 165                 case Overflow:
 166                     return NoOverflow;
 167                 case NoOverflow:
 168                     return Overflow;
 169                 case CarrySet:
 170                     return CarryClear;
 171                 case CarryClear:
 172                     return CarrySet;
 173                 case Negative:
 174                     return Positive;
 175                 case Positive:
 176                     return Negative;
 177                 case Parity:
 178                     return NoParity;
 179                 case NoParity:
 180                     return Parity;
 181             }
 182             throw new IllegalArgumentException();
 183         }
 184 
 185         public int getValue() {
 186             return value;
 187         }
 188 
 189         @Override
 190         public String toString() {
 191             return operator;
 192         }
 193     }
 194 
 195     /**
 196      * Operand size and register type constraints.
 197      */
 198     private enum OpAssertion {
 199         ByteAssertion(CPU, CPU, BYTE),
 200         ByteOrLargerAssertion(CPU, CPU, BYTE, WORD, DWORD, QWORD),
 201         WordOrLargerAssertion(CPU, CPU, WORD, DWORD, QWORD),
 202         DwordOrLargerAssertion(CPU, CPU, DWORD, QWORD),
 203         WordOrDwordAssertion(CPU, CPU, WORD, QWORD),
 204         QwordAssertion(CPU, CPU, QWORD),
 205         FloatAssertion(XMM, XMM, SS, SD, PS, PD),
 206         PackedFloatAssertion(XMM, XMM, PS, PD),
 207         SingleAssertion(XMM, XMM, SS),
 208         DoubleAssertion(XMM, XMM, SD),
 209         PackedDoubleAssertion(XMM, XMM, PD),
 210         IntToFloatAssertion(XMM, CPU, DWORD, QWORD),
 211         FloatToIntAssertion(CPU, XMM, DWORD, QWORD);
 212 
 213         private final RegisterCategory resultCategory;
 214         private final RegisterCategory inputCategory;
 215         private final OperandSize[] allowedSizes;
 216 
 217         OpAssertion(RegisterCategory resultCategory, RegisterCategory inputCategory, OperandSize... allowedSizes) {
 218             this.resultCategory = resultCategory;
 219             this.inputCategory = inputCategory;
 220             this.allowedSizes = allowedSizes;
 221         }
 222 
 223         protected boolean checkOperands(AMD64Op op, OperandSize size, Register resultReg, Register inputReg) {
 224             assert resultReg == null || resultCategory.equals(resultReg.getRegisterCategory()) : "invalid result register " + resultReg + " used in " + op;
 225             assert inputReg == null || inputCategory.equals(inputReg.getRegisterCategory()) : "invalid input register " + inputReg + " used in " + op;
 226 
 227             for (OperandSize s : allowedSizes) {
 228                 if (size == s) {
 229                     return true;
 230                 }
 231             }
 232 
 233             assert false : "invalid operand size " + size + " used in " + op;
 234             return false;
 235         }
 236 
 237     }
 238 
 239     protected static final int P_0F = 0x0F;
 240     protected static final int P_0F38 = 0x380F;
 241     protected static final int P_0F3A = 0x3A0F;
 242 
 243     /**
 244      * Base class for AMD64 opcodes.
 245      */
 246     public static class AMD64Op {
 247 
 248         private final String opcode;
 249 
 250         protected final int prefix1;
 251         protected final int prefix2;
 252         protected final int op;
 253 
 254         private final boolean dstIsByte;
 255         private final boolean srcIsByte;
 256 
 257         private final OpAssertion assertion;
 258         private final CPUFeature feature;
 259 
 260         protected AMD64Op(String opcode, int prefix1, int prefix2, int op, OpAssertion assertion, CPUFeature feature) {
 261             this(opcode, prefix1, prefix2, op, assertion == OpAssertion.ByteAssertion, assertion == OpAssertion.ByteAssertion, assertion, feature);
 262         }
 263 
 264         protected AMD64Op(String opcode, int prefix1, int prefix2, int op, boolean dstIsByte, boolean srcIsByte, OpAssertion assertion, CPUFeature feature) {
 265             this.opcode = opcode;
 266             this.prefix1 = prefix1;
 267             this.prefix2 = prefix2;
 268             this.op = op;
 269 
 270             this.dstIsByte = dstIsByte;
 271             this.srcIsByte = srcIsByte;
 272 
 273             this.assertion = assertion;
 274             this.feature = feature;
 275         }
 276 
 277         protected final void emitOpcode(AMD64Assembler asm, OperandSize size, int rxb, int dstEnc, int srcEnc) {
 278             if (prefix1 != 0) {
 279                 asm.emitByte(prefix1);
 280             }
 281             if (size.getSizePrefix() != 0) {
 282                 asm.emitByte(size.getSizePrefix());
 283             }
 284             int rexPrefix = 0x40 | rxb;
 285             if (size == QWORD) {
 286                 rexPrefix |= 0x08;
 287             }
 288             if (rexPrefix != 0x40 || (dstIsByte && dstEnc >= 4) || (srcIsByte && srcEnc >= 4)) {
 289                 asm.emitByte(rexPrefix);
 290             }
 291             if (prefix2 > 0xFF) {
 292                 asm.emitShort(prefix2);
 293             } else if (prefix2 > 0) {
 294                 asm.emitByte(prefix2);
 295             }
 296             asm.emitByte(op);
 297         }
 298 
 299         protected final boolean verify(AMD64Assembler asm, OperandSize size, Register resultReg, Register inputReg) {
 300             assert feature == null || asm.supports(feature) : String.format("unsupported feature %s required for %s", feature, opcode);
 301             assert assertion.checkOperands(this, size, resultReg, inputReg);
 302             return true;
 303         }
 304 
 305         public OperandSize[] getAllowedSizes() {
 306             return assertion.allowedSizes;
 307         }
 308 
 309         protected final boolean isSSEInstruction() {
 310             if (feature == null) {
 311                 return false;
 312             }
 313             switch (feature) {
 314                 case SSE:
 315                 case SSE2:
 316                 case SSE3:
 317                 case SSSE3:
 318                 case SSE4A:
 319                 case SSE4_1:
 320                 case SSE4_2:
 321                     return true;
 322                 default:
 323                     return false;
 324             }
 325         }
 326 
 327         public final OpAssertion getAssertion() {
 328             return assertion;
 329         }
 330 
 331         @Override
 332         public String toString() {
 333             return opcode;
 334         }
 335     }
 336 
 337     /**
 338      * Base class for AMD64 opcodes with immediate operands.
 339      */
 340     public static class AMD64ImmOp extends AMD64Op {
 341 
 342         private final boolean immIsByte;
 343 
 344         protected AMD64ImmOp(String opcode, boolean immIsByte, int prefix, int op, OpAssertion assertion) {
 345             this(opcode, immIsByte, prefix, op, assertion, null);
 346         }
 347 
 348         protected AMD64ImmOp(String opcode, boolean immIsByte, int prefix, int op, OpAssertion assertion, CPUFeature feature) {
 349             super(opcode, 0, prefix, op, assertion, feature);
 350             this.immIsByte = immIsByte;
 351         }
 352 
 353         protected final void emitImmediate(AMD64Assembler asm, OperandSize size, int imm) {
 354             if (immIsByte) {
 355                 assert imm == (byte) imm;
 356                 asm.emitByte(imm);
 357             } else {
 358                 size.emitImmediate(asm, imm);
 359             }
 360         }
 361 
 362         protected final int immediateSize(OperandSize size) {
 363             if (immIsByte) {
 364                 return 1;
 365             } else {
 366                 return size.getBytes();
 367             }
 368         }
 369     }
 370 
 371     /**
 372      * Opcode with operand order of either RM or MR for 2 address forms.
 373      */
 374     public abstract static class AMD64RROp extends AMD64Op {
 375 
 376         protected AMD64RROp(String opcode, int prefix1, int prefix2, int op, OpAssertion assertion, CPUFeature feature) {
 377             super(opcode, prefix1, prefix2, op, assertion, feature);
 378         }
 379 
 380         protected AMD64RROp(String opcode, int prefix1, int prefix2, int op, boolean dstIsByte, boolean srcIsByte, OpAssertion assertion, CPUFeature feature) {
 381             super(opcode, prefix1, prefix2, op, dstIsByte, srcIsByte, assertion, feature);
 382         }
 383 
 384         public abstract void emit(AMD64Assembler asm, OperandSize size, Register dst, Register src);
 385     }
 386 
 387     /**
 388      * Opcode with operand order of RM.
 389      */
 390     public static class AMD64RMOp extends AMD64RROp {
 391         // @formatter:off
 392         public static final AMD64RMOp IMUL   = new AMD64RMOp("IMUL",         P_0F, 0xAF, OpAssertion.ByteOrLargerAssertion);
 393         public static final AMD64RMOp BSF    = new AMD64RMOp("BSF",          P_0F, 0xBC);
 394         public static final AMD64RMOp BSR    = new AMD64RMOp("BSR",          P_0F, 0xBD);
 395         // POPCNT, TZCNT, and LZCNT support word operation. However, the legacy size prefix should
 396         // be emitted before the mandatory prefix 0xF3. Since we are not emitting bit count for
 397         // 16-bit operands, here we simply use DwordOrLargerAssertion.
 398         public static final AMD64RMOp POPCNT = new AMD64RMOp("POPCNT", 0xF3, P_0F, 0xB8, OpAssertion.DwordOrLargerAssertion, CPUFeature.POPCNT);
 399         public static final AMD64RMOp TZCNT  = new AMD64RMOp("TZCNT",  0xF3, P_0F, 0xBC, OpAssertion.DwordOrLargerAssertion, CPUFeature.BMI1);
 400         public static final AMD64RMOp LZCNT  = new AMD64RMOp("LZCNT",  0xF3, P_0F, 0xBD, OpAssertion.DwordOrLargerAssertion, CPUFeature.LZCNT);
 401         public static final AMD64RMOp MOVZXB = new AMD64RMOp("MOVZXB",       P_0F, 0xB6, false, true, OpAssertion.WordOrLargerAssertion);
 402         public static final AMD64RMOp MOVZX  = new AMD64RMOp("MOVZX",        P_0F, 0xB7, OpAssertion.DwordOrLargerAssertion);
 403         public static final AMD64RMOp MOVSXB = new AMD64RMOp("MOVSXB",       P_0F, 0xBE, false, true, OpAssertion.WordOrLargerAssertion);
 404         public static final AMD64RMOp MOVSX  = new AMD64RMOp("MOVSX",        P_0F, 0xBF, OpAssertion.DwordOrLargerAssertion);
 405         public static final AMD64RMOp MOVSXD = new AMD64RMOp("MOVSXD",             0x63, OpAssertion.QwordAssertion);
 406         public static final AMD64RMOp MOVB   = new AMD64RMOp("MOVB",               0x8A, OpAssertion.ByteAssertion);
 407         public static final AMD64RMOp MOV    = new AMD64RMOp("MOV",                0x8B);
 408         public static final AMD64RMOp CMP    = new AMD64RMOp("CMP",                0x3B);
 409 
 410         // MOVD/MOVQ and MOVSS/MOVSD are the same opcode, just with different operand size prefix
 411         public static final AMD64RMOp MOVD   = new AMD64RMOp("MOVD",   0x66, P_0F, 0x6E, OpAssertion.IntToFloatAssertion, CPUFeature.SSE2);
 412         public static final AMD64RMOp MOVQ   = new AMD64RMOp("MOVQ",   0x66, P_0F, 0x6E, OpAssertion.IntToFloatAssertion, CPUFeature.SSE2);
 413         public static final AMD64RMOp MOVSS  = new AMD64RMOp("MOVSS",        P_0F, 0x10, OpAssertion.FloatAssertion, CPUFeature.SSE);
 414         public static final AMD64RMOp MOVSD  = new AMD64RMOp("MOVSD",        P_0F, 0x10, OpAssertion.FloatAssertion, CPUFeature.SSE);
 415 
 416         // TEST is documented as MR operation, but it's symmetric, and using it as RM operation is more convenient.
 417         public static final AMD64RMOp TESTB  = new AMD64RMOp("TEST",               0x84, OpAssertion.ByteAssertion);
 418         public static final AMD64RMOp TEST   = new AMD64RMOp("TEST",               0x85);
 419         // @formatter:on
 420 
 421         protected AMD64RMOp(String opcode, int op) {
 422             this(opcode, 0, op);
 423         }
 424 
 425         protected AMD64RMOp(String opcode, int op, OpAssertion assertion) {
 426             this(opcode, 0, op, assertion);
 427         }
 428 
 429         protected AMD64RMOp(String opcode, int prefix, int op) {
 430             this(opcode, 0, prefix, op, null);
 431         }
 432 
 433         protected AMD64RMOp(String opcode, int prefix, int op, OpAssertion assertion) {
 434             this(opcode, 0, prefix, op, assertion, null);
 435         }
 436 
 437         protected AMD64RMOp(String opcode, int prefix, int op, OpAssertion assertion, CPUFeature feature) {
 438             this(opcode, 0, prefix, op, assertion, feature);
 439         }
 440 
 441         protected AMD64RMOp(String opcode, int prefix, int op, boolean dstIsByte, boolean srcIsByte, OpAssertion assertion) {
 442             super(opcode, 0, prefix, op, dstIsByte, srcIsByte, assertion, null);
 443         }
 444 
 445         protected AMD64RMOp(String opcode, int prefix1, int prefix2, int op, CPUFeature feature) {
 446             this(opcode, prefix1, prefix2, op, OpAssertion.WordOrLargerAssertion, feature);
 447         }
 448 
 449         protected AMD64RMOp(String opcode, int prefix1, int prefix2, int op, OpAssertion assertion, CPUFeature feature) {
 450             super(opcode, prefix1, prefix2, op, assertion, feature);
 451         }
 452 
 453         @Override
 454         public final void emit(AMD64Assembler asm, OperandSize size, Register dst, Register src) {
 455             assert verify(asm, size, dst, src);
 456             if (isSSEInstruction()) {
 457                 Register nds = Register.None;
 458                 switch (op) {
 459                     case 0x10:
 460                     case 0x51:
 461                         if ((size == SS) || (size == SD)) {
 462                             nds = dst;
 463                         }
 464                         break;
 465                     case 0x2A:
 466                     case 0x54:
 467                     case 0x55:
 468                     case 0x56:
 469                     case 0x57:
 470                     case 0x58:
 471                     case 0x59:
 472                     case 0x5A:
 473                     case 0x5C:
 474                     case 0x5D:
 475                     case 0x5E:
 476                     case 0x5F:
 477                         nds = dst;
 478                         break;
 479                     default:
 480                         break;
 481                 }
 482                 asm.simdPrefix(dst, nds, src, size, prefix1, prefix2, size == QWORD);
 483                 asm.emitByte(op);
 484                 asm.emitModRM(dst, src);
 485             } else {
 486                 emitOpcode(asm, size, getRXB(dst, src), dst.encoding, src.encoding);
 487                 asm.emitModRM(dst, src);
 488             }
 489         }
 490 
 491         public final void emit(AMD64Assembler asm, OperandSize size, Register dst, AMD64Address src) {
 492             assert verify(asm, size, dst, null);
 493             if (isSSEInstruction()) {
 494                 Register nds = Register.None;
 495                 switch (op) {
 496                     case 0x51:
 497                         if ((size == SS) || (size == SD)) {
 498                             nds = dst;
 499                         }
 500                         break;
 501                     case 0x2A:
 502                     case 0x54:
 503                     case 0x55:
 504                     case 0x56:
 505                     case 0x57:
 506                     case 0x58:
 507                     case 0x59:
 508                     case 0x5A:
 509                     case 0x5C:
 510                     case 0x5D:
 511                     case 0x5E:
 512                     case 0x5F:
 513                         nds = dst;
 514                         break;
 515                     default:
 516                         break;
 517                 }
 518                 asm.simdPrefix(dst, nds, src, size, prefix1, prefix2, size == QWORD);
 519                 asm.emitByte(op);
 520                 asm.emitOperandHelper(dst, src, 0);
 521             } else {
 522                 emitOpcode(asm, size, getRXB(dst, src), dst.encoding, 0);
 523                 asm.emitOperandHelper(dst, src, 0);
 524             }
 525         }
 526     }
 527 
 528     /**
 529      * Opcode with operand order of MR.
 530      */
 531     public static class AMD64MROp extends AMD64RROp {
 532         // @formatter:off
 533         public static final AMD64MROp MOVB   = new AMD64MROp("MOVB",               0x88, OpAssertion.ByteAssertion);
 534         public static final AMD64MROp MOV    = new AMD64MROp("MOV",                0x89);
 535 
 536         // MOVD and MOVQ are the same opcode, just with different operand size prefix
 537         // Note that as MR opcodes, they have reverse operand order, so the IntToFloatingAssertion must be used.
 538         public static final AMD64MROp MOVD   = new AMD64MROp("MOVD",   0x66, P_0F, 0x7E, OpAssertion.IntToFloatAssertion, CPUFeature.SSE2);
 539         public static final AMD64MROp MOVQ   = new AMD64MROp("MOVQ",   0x66, P_0F, 0x7E, OpAssertion.IntToFloatAssertion, CPUFeature.SSE2);
 540 
 541         // MOVSS and MOVSD are the same opcode, just with different operand size prefix
 542         public static final AMD64MROp MOVSS  = new AMD64MROp("MOVSS",        P_0F, 0x11, OpAssertion.FloatAssertion, CPUFeature.SSE);
 543         public static final AMD64MROp MOVSD  = new AMD64MROp("MOVSD",        P_0F, 0x11, OpAssertion.FloatAssertion, CPUFeature.SSE);
 544         // @formatter:on
 545 
 546         protected AMD64MROp(String opcode, int op) {
 547             this(opcode, 0, op);
 548         }
 549 
 550         protected AMD64MROp(String opcode, int op, OpAssertion assertion) {
 551             this(opcode, 0, op, assertion);
 552         }
 553 
 554         protected AMD64MROp(String opcode, int prefix, int op) {
 555             this(opcode, prefix, op, OpAssertion.WordOrLargerAssertion);
 556         }
 557 
 558         protected AMD64MROp(String opcode, int prefix, int op, OpAssertion assertion) {
 559             this(opcode, prefix, op, assertion, null);
 560         }
 561 
 562         protected AMD64MROp(String opcode, int prefix, int op, OpAssertion assertion, CPUFeature feature) {
 563             this(opcode, 0, prefix, op, assertion, feature);
 564         }
 565 
 566         protected AMD64MROp(String opcode, int prefix1, int prefix2, int op, OpAssertion assertion, CPUFeature feature) {
 567             super(opcode, prefix1, prefix2, op, assertion, feature);
 568         }
 569 
 570         @Override
 571         public final void emit(AMD64Assembler asm, OperandSize size, Register dst, Register src) {
 572             assert verify(asm, size, src, dst);
 573             if (isSSEInstruction()) {
 574                 Register nds = Register.None;
 575                 switch (op) {
 576                     case 0x11:
 577                         if ((size == SS) || (size == SD)) {
 578                             nds = src;
 579                         }
 580                         break;
 581                     default:
 582                         break;
 583                 }
 584                 asm.simdPrefix(src, nds, dst, size, prefix1, prefix2, size == QWORD);
 585                 asm.emitByte(op);
 586                 asm.emitModRM(src, dst);
 587             } else {
 588                 emitOpcode(asm, size, getRXB(src, dst), src.encoding, dst.encoding);
 589                 asm.emitModRM(src, dst);
 590             }
 591         }
 592 
 593         public final void emit(AMD64Assembler asm, OperandSize size, AMD64Address dst, Register src) {
 594             assert verify(asm, size, src, null);
 595             if (isSSEInstruction()) {
 596                 asm.simdPrefix(src, Register.None, dst, size, prefix1, prefix2, size == QWORD);
 597                 asm.emitByte(op);
 598             } else {
 599                 emitOpcode(asm, size, getRXB(src, dst), src.encoding, 0);
 600             }
 601             asm.emitOperandHelper(src, dst, 0);
 602         }
 603     }
 604 
 605     /**
 606      * Opcodes with operand order of M.
 607      */
 608     public static class AMD64MOp extends AMD64Op {
 609         // @formatter:off
 610         public static final AMD64MOp NOT  = new AMD64MOp("NOT",  0xF7, 2);
 611         public static final AMD64MOp NEG  = new AMD64MOp("NEG",  0xF7, 3);
 612         public static final AMD64MOp MUL  = new AMD64MOp("MUL",  0xF7, 4);
 613         public static final AMD64MOp IMUL = new AMD64MOp("IMUL", 0xF7, 5);
 614         public static final AMD64MOp DIV  = new AMD64MOp("DIV",  0xF7, 6);
 615         public static final AMD64MOp IDIV = new AMD64MOp("IDIV", 0xF7, 7);
 616         public static final AMD64MOp INC  = new AMD64MOp("INC",  0xFF, 0);
 617         public static final AMD64MOp DEC  = new AMD64MOp("DEC",  0xFF, 1);
 618         public static final AMD64MOp PUSH = new AMD64MOp("PUSH", 0xFF, 6);
 619         public static final AMD64MOp POP  = new AMD64MOp("POP",  0x8F, 0, OpAssertion.WordOrDwordAssertion);
 620         // @formatter:on
 621 
 622         private final int ext;
 623 
 624         protected AMD64MOp(String opcode, int op, int ext) {
 625             this(opcode, 0, op, ext);
 626         }
 627 
 628         protected AMD64MOp(String opcode, int prefix, int op, int ext) {
 629             this(opcode, prefix, op, ext, OpAssertion.WordOrLargerAssertion);
 630         }
 631 
 632         protected AMD64MOp(String opcode, int op, int ext, OpAssertion assertion) {
 633             this(opcode, 0, op, ext, assertion);
 634         }
 635 
 636         protected AMD64MOp(String opcode, int prefix, int op, int ext, OpAssertion assertion) {
 637             super(opcode, 0, prefix, op, assertion, null);
 638             this.ext = ext;
 639         }
 640 
 641         public final void emit(AMD64Assembler asm, OperandSize size, Register dst) {
 642             assert verify(asm, size, dst, null);
 643             emitOpcode(asm, size, getRXB(null, dst), 0, dst.encoding);
 644             asm.emitModRM(ext, dst);
 645         }
 646 
 647         public final void emit(AMD64Assembler asm, OperandSize size, AMD64Address dst) {
 648             assert verify(asm, size, null, null);
 649             emitOpcode(asm, size, getRXB(null, dst), 0, 0);
 650             asm.emitOperandHelper(ext, dst, 0);
 651         }
 652     }
 653 
 654     /**
 655      * Opcodes with operand order of MI.
 656      */
 657     public static class AMD64MIOp extends AMD64ImmOp {
 658         // @formatter:off
 659         public static final AMD64MIOp MOVB = new AMD64MIOp("MOVB", true,  0xC6, 0, OpAssertion.ByteAssertion);
 660         public static final AMD64MIOp MOV  = new AMD64MIOp("MOV",  false, 0xC7, 0);
 661         public static final AMD64MIOp TEST = new AMD64MIOp("TEST", false, 0xF7, 0);
 662         // @formatter:on
 663 
 664         private final int ext;
 665 
 666         protected AMD64MIOp(String opcode, boolean immIsByte, int op, int ext) {
 667             this(opcode, immIsByte, op, ext, OpAssertion.WordOrLargerAssertion);
 668         }
 669 
 670         protected AMD64MIOp(String opcode, boolean immIsByte, int op, int ext, OpAssertion assertion) {
 671             this(opcode, immIsByte, 0, op, ext, assertion);
 672         }
 673 
 674         protected AMD64MIOp(String opcode, boolean immIsByte, int prefix, int op, int ext, OpAssertion assertion) {
 675             super(opcode, immIsByte, prefix, op, assertion);
 676             this.ext = ext;
 677         }
 678 
 679         public final void emit(AMD64Assembler asm, OperandSize size, Register dst, int imm) {
 680             emit(asm, size, dst, imm, false);
 681         }
 682 
 683         public final void emit(AMD64Assembler asm, OperandSize size, Register dst, int imm, boolean annotateImm) {
 684             assert verify(asm, size, dst, null);
 685             int insnPos = asm.position();
 686             emitOpcode(asm, size, getRXB(null, dst), 0, dst.encoding);
 687             asm.emitModRM(ext, dst);
 688             int immPos = asm.position();
 689             emitImmediate(asm, size, imm);
 690             int nextInsnPos = asm.position();
 691             if (annotateImm && asm.codePatchingAnnotationConsumer != null) {
 692                 asm.codePatchingAnnotationConsumer.accept(new OperandDataAnnotation(insnPos, immPos, nextInsnPos - immPos, nextInsnPos));
 693             }
 694         }
 695 
 696         public final void emit(AMD64Assembler asm, OperandSize size, AMD64Address dst, int imm) {
 697             emit(asm, size, dst, imm, false);
 698         }
 699 
 700         public final void emit(AMD64Assembler asm, OperandSize size, AMD64Address dst, int imm, boolean annotateImm) {
 701             assert verify(asm, size, null, null);
 702             int insnPos = asm.position();
 703             emitOpcode(asm, size, getRXB(null, dst), 0, 0);
 704             asm.emitOperandHelper(ext, dst, immediateSize(size));
 705             int immPos = asm.position();
 706             emitImmediate(asm, size, imm);
 707             int nextInsnPos = asm.position();
 708             if (annotateImm && asm.codePatchingAnnotationConsumer != null) {
 709                 asm.codePatchingAnnotationConsumer.accept(new OperandDataAnnotation(insnPos, immPos, nextInsnPos - immPos, nextInsnPos));
 710             }
 711         }
 712     }
 713 
 714     /**
 715      * Opcodes with operand order of RMI.
 716      *
 717      * We only have one form of round as the operation is always treated with single variant input,
 718      * making its extension to 3 address forms redundant.
 719      */
 720     public static class AMD64RMIOp extends AMD64ImmOp {
 721         // @formatter:off
 722         public static final AMD64RMIOp IMUL    = new AMD64RMIOp("IMUL", false, 0x69);
 723         public static final AMD64RMIOp IMUL_SX = new AMD64RMIOp("IMUL", true,  0x6B);
 724         public static final AMD64RMIOp ROUNDSS = new AMD64RMIOp("ROUNDSS", true, P_0F3A, 0x0A, OpAssertion.PackedDoubleAssertion, CPUFeature.SSE4_1);
 725         public static final AMD64RMIOp ROUNDSD = new AMD64RMIOp("ROUNDSD", true, P_0F3A, 0x0B, OpAssertion.PackedDoubleAssertion, CPUFeature.SSE4_1);
 726         // @formatter:on
 727 
 728         protected AMD64RMIOp(String opcode, boolean immIsByte, int op) {
 729             this(opcode, immIsByte, 0, op, OpAssertion.WordOrLargerAssertion, null);
 730         }
 731 
 732         protected AMD64RMIOp(String opcode, boolean immIsByte, int prefix, int op, OpAssertion assertion, CPUFeature feature) {
 733             super(opcode, immIsByte, prefix, op, assertion, feature);
 734         }
 735 
 736         public final void emit(AMD64Assembler asm, OperandSize size, Register dst, Register src, int imm) {
 737             assert verify(asm, size, dst, src);
 738             if (isSSEInstruction()) {
 739                 Register nds = Register.None;
 740                 switch (op) {
 741                     case 0x0A:
 742                     case 0x0B:
 743                         nds = dst;
 744                         break;
 745                     default:
 746                         break;
 747                 }
 748                 asm.simdPrefix(dst, nds, src, size, prefix1, prefix2, false);
 749                 asm.emitByte(op);
 750                 asm.emitModRM(dst, src);
 751             } else {
 752                 emitOpcode(asm, size, getRXB(dst, src), dst.encoding, src.encoding);
 753                 asm.emitModRM(dst, src);
 754             }
 755             emitImmediate(asm, size, imm);
 756         }
 757 
 758         public final void emit(AMD64Assembler asm, OperandSize size, Register dst, AMD64Address src, int imm) {
 759             assert verify(asm, size, dst, null);
 760             if (isSSEInstruction()) {
 761                 Register nds = Register.None;
 762                 switch (op) {
 763                     case 0x0A:
 764                     case 0x0B:
 765                         nds = dst;
 766                         break;
 767                     default:
 768                         break;
 769                 }
 770                 asm.simdPrefix(dst, nds, src, size, prefix1, prefix2, false);
 771                 asm.emitByte(op);
 772             } else {
 773                 emitOpcode(asm, size, getRXB(dst, src), dst.encoding, 0);
 774             }
 775             asm.emitOperandHelper(dst, src, immediateSize(size));
 776             emitImmediate(asm, size, imm);
 777         }
 778     }
 779 
 780     public static class SSEOp extends AMD64RMOp {
 781         // @formatter:off
 782         public static final SSEOp CVTSI2SS  = new SSEOp("CVTSI2SS",  0xF3, P_0F, 0x2A, OpAssertion.IntToFloatAssertion);
 783         public static final SSEOp CVTSI2SD  = new SSEOp("CVTSI2SD",  0xF2, P_0F, 0x2A, OpAssertion.IntToFloatAssertion);
 784         public static final SSEOp CVTTSS2SI = new SSEOp("CVTTSS2SI", 0xF3, P_0F, 0x2C, OpAssertion.FloatToIntAssertion);
 785         public static final SSEOp CVTTSD2SI = new SSEOp("CVTTSD2SI", 0xF2, P_0F, 0x2C, OpAssertion.FloatToIntAssertion);
 786         public static final SSEOp UCOMIS    = new SSEOp("UCOMIS",          P_0F, 0x2E, OpAssertion.PackedFloatAssertion);
 787         public static final SSEOp SQRT      = new SSEOp("SQRT",            P_0F, 0x51);
 788         public static final SSEOp AND       = new SSEOp("AND",             P_0F, 0x54, OpAssertion.PackedFloatAssertion);
 789         public static final SSEOp ANDN      = new SSEOp("ANDN",            P_0F, 0x55, OpAssertion.PackedFloatAssertion);
 790         public static final SSEOp OR        = new SSEOp("OR",              P_0F, 0x56, OpAssertion.PackedFloatAssertion);
 791         public static final SSEOp XOR       = new SSEOp("XOR",             P_0F, 0x57, OpAssertion.PackedFloatAssertion);
 792         public static final SSEOp ADD       = new SSEOp("ADD",             P_0F, 0x58);
 793         public static final SSEOp MUL       = new SSEOp("MUL",             P_0F, 0x59);
 794         public static final SSEOp CVTSS2SD  = new SSEOp("CVTSS2SD",        P_0F, 0x5A, OpAssertion.SingleAssertion);
 795         public static final SSEOp CVTSD2SS  = new SSEOp("CVTSD2SS",        P_0F, 0x5A, OpAssertion.DoubleAssertion);
 796         public static final SSEOp SUB       = new SSEOp("SUB",             P_0F, 0x5C);
 797         public static final SSEOp MIN       = new SSEOp("MIN",             P_0F, 0x5D);
 798         public static final SSEOp DIV       = new SSEOp("DIV",             P_0F, 0x5E);
 799         public static final SSEOp MAX       = new SSEOp("MAX",             P_0F, 0x5F);
 800         // @formatter:on
 801 
 802         protected SSEOp(String opcode, int prefix, int op) {
 803             this(opcode, prefix, op, OpAssertion.FloatAssertion);
 804         }
 805 
 806         protected SSEOp(String opcode, int prefix, int op, OpAssertion assertion) {
 807             this(opcode, 0, prefix, op, assertion);
 808         }
 809 
 810         protected SSEOp(String opcode, int mandatoryPrefix, int prefix, int op, OpAssertion assertion) {
 811             super(opcode, mandatoryPrefix, prefix, op, assertion, CPUFeature.SSE2);
 812         }
 813     }
 814 
 815     /**
 816      * Arithmetic operation with operand order of RM, MR or MI.
 817      */
 818     public static final class AMD64BinaryArithmetic {
 819         // @formatter:off
 820         public static final AMD64BinaryArithmetic ADD = new AMD64BinaryArithmetic("ADD", 0);
 821         public static final AMD64BinaryArithmetic OR  = new AMD64BinaryArithmetic("OR",  1);
 822         public static final AMD64BinaryArithmetic ADC = new AMD64BinaryArithmetic("ADC", 2);
 823         public static final AMD64BinaryArithmetic SBB = new AMD64BinaryArithmetic("SBB", 3);
 824         public static final AMD64BinaryArithmetic AND = new AMD64BinaryArithmetic("AND", 4);
 825         public static final AMD64BinaryArithmetic SUB = new AMD64BinaryArithmetic("SUB", 5);
 826         public static final AMD64BinaryArithmetic XOR = new AMD64BinaryArithmetic("XOR", 6);
 827         public static final AMD64BinaryArithmetic CMP = new AMD64BinaryArithmetic("CMP", 7);
 828         // @formatter:on
 829 
 830         private final AMD64MIOp byteImmOp;
 831         private final AMD64MROp byteMrOp;
 832         private final AMD64RMOp byteRmOp;
 833 
 834         private final AMD64MIOp immOp;
 835         private final AMD64MIOp immSxOp;
 836         private final AMD64MROp mrOp;
 837         private final AMD64RMOp rmOp;
 838 
 839         private AMD64BinaryArithmetic(String opcode, int code) {
 840             int baseOp = code << 3;
 841 
 842             byteImmOp = new AMD64MIOp(opcode, true, 0, 0x80, code, OpAssertion.ByteAssertion);
 843             byteMrOp = new AMD64MROp(opcode, 0, baseOp, OpAssertion.ByteAssertion);
 844             byteRmOp = new AMD64RMOp(opcode, 0, baseOp | 0x02, OpAssertion.ByteAssertion);
 845 
 846             immOp = new AMD64MIOp(opcode, false, 0, 0x81, code, OpAssertion.WordOrLargerAssertion);
 847             immSxOp = new AMD64MIOp(opcode, true, 0, 0x83, code, OpAssertion.WordOrLargerAssertion);
 848             mrOp = new AMD64MROp(opcode, 0, baseOp | 0x01, OpAssertion.WordOrLargerAssertion);
 849             rmOp = new AMD64RMOp(opcode, 0, baseOp | 0x03, OpAssertion.WordOrLargerAssertion);
 850         }
 851 
 852         public AMD64MIOp getMIOpcode(OperandSize size, boolean sx) {
 853             if (size == BYTE) {
 854                 return byteImmOp;
 855             } else if (sx) {
 856                 return immSxOp;
 857             } else {
 858                 return immOp;
 859             }
 860         }
 861 
 862         public AMD64MROp getMROpcode(OperandSize size) {
 863             if (size == BYTE) {
 864                 return byteMrOp;
 865             } else {
 866                 return mrOp;
 867             }
 868         }
 869 
 870         public AMD64RMOp getRMOpcode(OperandSize size) {
 871             if (size == BYTE) {
 872                 return byteRmOp;
 873             } else {
 874                 return rmOp;
 875             }
 876         }
 877     }
 878 
 879     /**
 880      * Shift operation with operand order of M1, MC or MI.
 881      */
 882     public static final class AMD64Shift {
 883         // @formatter:off
 884         public static final AMD64Shift ROL = new AMD64Shift("ROL", 0);
 885         public static final AMD64Shift ROR = new AMD64Shift("ROR", 1);
 886         public static final AMD64Shift RCL = new AMD64Shift("RCL", 2);
 887         public static final AMD64Shift RCR = new AMD64Shift("RCR", 3);
 888         public static final AMD64Shift SHL = new AMD64Shift("SHL", 4);
 889         public static final AMD64Shift SHR = new AMD64Shift("SHR", 5);
 890         public static final AMD64Shift SAR = new AMD64Shift("SAR", 7);
 891         // @formatter:on
 892 
 893         public final AMD64MOp m1Op;
 894         public final AMD64MOp mcOp;
 895         public final AMD64MIOp miOp;
 896 
 897         private AMD64Shift(String opcode, int code) {
 898             m1Op = new AMD64MOp(opcode, 0, 0xD1, code, OpAssertion.WordOrLargerAssertion);
 899             mcOp = new AMD64MOp(opcode, 0, 0xD3, code, OpAssertion.WordOrLargerAssertion);
 900             miOp = new AMD64MIOp(opcode, true, 0, 0xC1, code, OpAssertion.WordOrLargerAssertion);
 901         }
 902     }
 903 
 904     private enum EVEXFeatureAssertion {
 905         AVX512F_ALL(EnumSet.of(AVX512F), EnumSet.of(AVX512F), EnumSet.of(AVX512F)),
 906         AVX512F_128ONLY(EnumSet.of(AVX512F), null, null),
 907         AVX512F_VL(EnumSet.of(AVX512F, AVX512VL), EnumSet.of(AVX512F, AVX512VL), EnumSet.of(AVX512F)),
 908         AVX512CD_VL(EnumSet.of(AVX512F, AVX512CD, AVX512VL), EnumSet.of(AVX512F, AVX512CD, AVX512VL), EnumSet.of(AVX512F, AVX512CD)),
 909         AVX512DQ_VL(EnumSet.of(AVX512F, AVX512DQ, AVX512VL), EnumSet.of(AVX512F, AVX512DQ, AVX512VL), EnumSet.of(AVX512F, AVX512DQ)),
 910         AVX512BW_VL(EnumSet.of(AVX512F, AVX512BW, AVX512VL), EnumSet.of(AVX512F, AVX512BW, AVX512VL), EnumSet.of(AVX512F, AVX512BW));
 911 
 912         private final EnumSet<CPUFeature> l128features;
 913         private final EnumSet<CPUFeature> l256features;
 914         private final EnumSet<CPUFeature> l512features;
 915 
 916         EVEXFeatureAssertion(EnumSet<CPUFeature> l128features, EnumSet<CPUFeature> l256features, EnumSet<CPUFeature> l512features) {
 917             this.l128features = l128features;
 918             this.l256features = l256features;
 919             this.l512features = l512features;
 920         }
 921 
 922         public boolean check(AMD64 arch, int l) {
 923             switch (l) {
 924                 case L128:
 925                     assert l128features != null && arch.getFeatures().containsAll(l128features) : "emitting illegal 128 bit instruction";
 926                     break;
 927                 case L256:
 928                     assert l256features != null && arch.getFeatures().containsAll(l256features) : "emitting illegal 256 bit instruction";
 929                     break;
 930                 case L512:
 931                     assert l512features != null && arch.getFeatures().containsAll(l512features) : "emitting illegal 512 bit instruction";
 932                     break;
 933             }
 934             return true;
 935         }
 936 
 937         public boolean supports(EnumSet<CPUFeature> features, AVXSize avxSize) {
 938             switch (avxSize) {
 939                 case XMM:
 940                     return l128features != null && features.containsAll(l128features);
 941                 case YMM:
 942                     return l256features != null && features.containsAll(l256features);
 943                 case ZMM:
 944                     return l512features != null && features.containsAll(l512features);
 945                 default:
 946                     throw GraalError.shouldNotReachHere();
 947             }
 948         }
 949     }
 950 
 951     private enum VEXOpAssertion {
 952         AVX1(CPUFeature.AVX, CPUFeature.AVX, null),
 953         AVX1_2(CPUFeature.AVX, CPUFeature.AVX2, null),
 954         AVX2(CPUFeature.AVX2, CPUFeature.AVX2, null),
 955         AVX1_128ONLY(CPUFeature.AVX, null, null),
 956         AVX1_256ONLY(null, CPUFeature.AVX, null),
 957         AVX2_256ONLY(null, CPUFeature.AVX2, null),
 958         XMM_CPU(CPUFeature.AVX, null, null, XMM, null, CPU, null),
 959         XMM_XMM_CPU(CPUFeature.AVX, null, null, XMM, XMM, CPU, null),
 960         CPU_XMM(CPUFeature.AVX, null, null, CPU, null, XMM, null),
 961         AVX1_2_CPU_XMM(CPUFeature.AVX, CPUFeature.AVX2, null, CPU, null, XMM, null),
 962         BMI1(CPUFeature.BMI1, null, null, CPU, CPU, CPU, null),
 963         BMI2(CPUFeature.BMI2, null, null, CPU, CPU, CPU, null),
 964         FMA(CPUFeature.FMA, null, null, XMM, XMM, XMM, null),
 965 
 966         XMM_CPU_AVX512F_128ONLY(CPUFeature.AVX, null, EVEXFeatureAssertion.AVX512F_128ONLY, XMM, null, CPU, null),
 967         AVX1_AVX512F_ALL(CPUFeature.AVX, CPUFeature.AVX, EVEXFeatureAssertion.AVX512F_ALL),
 968         AVX1_AVX512F_VL(CPUFeature.AVX, CPUFeature.AVX, EVEXFeatureAssertion.AVX512F_VL);
 969 
 970         private final CPUFeature l128feature;
 971         private final CPUFeature l256feature;
 972         private final EVEXFeatureAssertion l512features;
 973 
 974         private final RegisterCategory rCategory;
 975         private final RegisterCategory vCategory;
 976         private final RegisterCategory mCategory;
 977         private final RegisterCategory imm8Category;
 978 
 979         VEXOpAssertion(CPUFeature l128feature, CPUFeature l256feature, EVEXFeatureAssertion l512features) {
 980             this(l128feature, l256feature, l512features, XMM, XMM, XMM, XMM);
 981         }
 982 
 983         VEXOpAssertion(CPUFeature l128feature, CPUFeature l256feature, EVEXFeatureAssertion l512features, RegisterCategory rCategory, RegisterCategory vCategory, RegisterCategory mCategory,
 984                         RegisterCategory imm8Category) {
 985             this.l128feature = l128feature;
 986             this.l256feature = l256feature;
 987             this.l512features = l512features;
 988             this.rCategory = rCategory;
 989             this.vCategory = vCategory;
 990             this.mCategory = mCategory;
 991             this.imm8Category = imm8Category;
 992         }
 993 
 994         public boolean check(AMD64 arch, AVXSize size, Register r, Register v, Register m) {
 995             return check(arch, getLFlag(size), r, v, m, null);
 996         }
 997 
 998         public boolean check(AMD64 arch, AVXSize size, Register r, Register v, Register m, Register imm8) {
 999             return check(arch, getLFlag(size), r, v, m, imm8);
1000         }
1001 
1002         public boolean check(AMD64 arch, int l, Register r, Register v, Register m, Register imm8) {
1003             if (isAVX512Register(r) || isAVX512Register(v) || isAVX512Register(m) || l == L512) {
1004                 assert l512features != null && l512features.check(arch, l);
1005             } else if (l == L128) {
1006                 assert l128feature != null && arch.getFeatures().contains(l128feature) : "emitting illegal 128 bit instruction";
1007             } else if (l == L256) {
1008                 assert l256feature != null && arch.getFeatures().contains(l256feature) : "emitting illegal 256 bit instruction";
1009             }
1010             if (r != null) {
1011                 assert r.getRegisterCategory().equals(rCategory);
1012             }
1013             if (v != null) {
1014                 assert v.getRegisterCategory().equals(vCategory);
1015             }
1016             if (m != null) {
1017                 assert m.getRegisterCategory().equals(mCategory);
1018             }
1019             if (imm8 != null) {
1020                 assert imm8.getRegisterCategory().equals(imm8Category);
1021             }
1022             return true;
1023         }
1024 
1025         public boolean supports(EnumSet<CPUFeature> features, AVXSize avxSize, boolean useZMMRegisters) {
1026             if (useZMMRegisters || avxSize == AVXSize.ZMM) {
1027                 return l512features != null && l512features.supports(features, avxSize);
1028             } else if (avxSize == AVXSize.XMM) {
1029                 return l128feature != null && features.contains(l128feature);
1030             } else if (avxSize == AVXSize.YMM) {
1031                 return l256feature != null && features.contains(l256feature);
1032             }
1033             throw GraalError.shouldNotReachHere();
1034         }
1035     }
1036 
1037     /**
1038      * Base class for VEX-encoded instructions.
1039      */
1040     public static class VexOp {
1041         protected final int pp;
1042         protected final int mmmmm;
1043         protected final int w;
1044         protected final int op;
1045 
1046         private final String opcode;
1047         protected final VEXOpAssertion assertion;
1048 
1049         protected final EVEXTuple evexTuple;
1050         protected final int wEvex;
1051 
1052         protected VexOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion, EVEXTuple evexTuple, int wEvex) {
1053             this.pp = pp;
1054             this.mmmmm = mmmmm;
1055             this.w = w;
1056             this.op = op;
1057             this.opcode = opcode;
1058             this.assertion = assertion;
1059             this.evexTuple = evexTuple;
1060             this.wEvex = wEvex;
1061         }
1062 
1063         protected VexOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
1064             this(opcode, pp, mmmmm, w, op, assertion, EVEXTuple.INVALID, WIG);
1065         }
1066 
1067         public final boolean isSupported(AMD64Assembler vasm, AVXSize size) {
1068             return isSupported(vasm, size, false);
1069         }
1070 
1071         public final boolean isSupported(AMD64Assembler vasm, AVXSize size, boolean useZMMRegisters) {
1072             return assertion.supports(((AMD64) vasm.target.arch).getFeatures(), size, useZMMRegisters);
1073         }
1074 
1075         @Override
1076         public String toString() {
1077             return opcode;
1078         }
1079 
1080         protected final int getDisp8Scale(boolean useEvex, AVXSize size) {
1081             return useEvex ? evexTuple.getDisp8ScalingFactor(size) : DEFAULT_DISP8_SCALE;
1082         }
1083 
1084     }
1085 
1086     /**
1087      * VEX-encoded instructions with an operand order of RM, but the M operand must be a register.
1088      */
1089     public static class VexRROp extends VexOp {
1090         // @formatter:off
1091         public static final VexRROp VMASKMOVDQU = new VexRROp("VMASKMOVDQU", P_66, M_0F, WIG, 0xF7, VEXOpAssertion.AVX1_128ONLY, EVEXTuple.INVALID, WIG);
1092         // @formatter:on
1093 
1094         protected VexRROp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion, EVEXTuple evexTuple, int wEvex) {
1095             super(opcode, pp, mmmmm, w, op, assertion, evexTuple, wEvex);
1096         }
1097 
1098         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src) {
1099             assert assertion.check((AMD64) asm.target.arch, size, dst, null, src);
1100             assert op != 0x1A || op != 0x5A;
1101             asm.vexPrefix(dst, Register.None, src, size, pp, mmmmm, w, wEvex, false);
1102             asm.emitByte(op);
1103             asm.emitModRM(dst, src);
1104         }
1105     }
1106 
1107     /**
1108      * VEX-encoded instructions with an operand order of RM.
1109      */
1110     public static class VexRMOp extends VexRROp {
1111         // @formatter:off
1112         public static final VexRMOp VCVTTSS2SI      = new VexRMOp("VCVTTSS2SI",      P_F3, M_0F,   W0,  0x2C, VEXOpAssertion.CPU_XMM);
1113         public static final VexRMOp VCVTTSS2SQ      = new VexRMOp("VCVTTSS2SQ",      P_F3, M_0F,   W1,  0x2C, VEXOpAssertion.CPU_XMM);
1114         public static final VexRMOp VCVTTSD2SI      = new VexRMOp("VCVTTSD2SI",      P_F2, M_0F,   W0,  0x2C, VEXOpAssertion.CPU_XMM);
1115         public static final VexRMOp VCVTTSD2SQ      = new VexRMOp("VCVTTSD2SQ",      P_F2, M_0F,   W1,  0x2C, VEXOpAssertion.CPU_XMM);
1116         public static final VexRMOp VCVTPS2PD       = new VexRMOp("VCVTPS2PD",       P_,   M_0F,   WIG, 0x5A);
1117         public static final VexRMOp VCVTPD2PS       = new VexRMOp("VCVTPD2PS",       P_66, M_0F,   WIG, 0x5A);
1118         public static final VexRMOp VCVTDQ2PS       = new VexRMOp("VCVTDQ2PS",       P_,   M_0F,   WIG, 0x5B);
1119         public static final VexRMOp VCVTTPS2DQ      = new VexRMOp("VCVTTPS2DQ",      P_F3, M_0F,   WIG, 0x5B);
1120         public static final VexRMOp VCVTTPD2DQ      = new VexRMOp("VCVTTPD2DQ",      P_66, M_0F,   WIG, 0xE6);
1121         public static final VexRMOp VCVTDQ2PD       = new VexRMOp("VCVTDQ2PD",       P_F3, M_0F,   WIG, 0xE6);
1122         public static final VexRMOp VBROADCASTSS    = new VexRMOp("VBROADCASTSS",    P_66, M_0F38, W0,  0x18);
1123         public static final VexRMOp VBROADCASTSD    = new VexRMOp("VBROADCASTSD",    P_66, M_0F38, W0,  0x19, VEXOpAssertion.AVX1_256ONLY);
1124         public static final VexRMOp VBROADCASTF128  = new VexRMOp("VBROADCASTF128",  P_66, M_0F38, W0,  0x1A, VEXOpAssertion.AVX1_256ONLY);
1125         public static final VexRMOp VPBROADCASTI128 = new VexRMOp("VPBROADCASTI128", P_66, M_0F38, W0,  0x5A, VEXOpAssertion.AVX2_256ONLY);
1126         public static final VexRMOp VPBROADCASTB    = new VexRMOp("VPBROADCASTB",    P_66, M_0F38, W0,  0x78, VEXOpAssertion.AVX2);
1127         public static final VexRMOp VPBROADCASTW    = new VexRMOp("VPBROADCASTW",    P_66, M_0F38, W0,  0x79, VEXOpAssertion.AVX2);
1128         public static final VexRMOp VPBROADCASTD    = new VexRMOp("VPBROADCASTD",    P_66, M_0F38, W0,  0x58, VEXOpAssertion.AVX2);
1129         public static final VexRMOp VPBROADCASTQ    = new VexRMOp("VPBROADCASTQ",    P_66, M_0F38, W0,  0x59, VEXOpAssertion.AVX2);
1130         public static final VexRMOp VPMOVMSKB       = new VexRMOp("VPMOVMSKB",       P_66, M_0F,   WIG, 0xD7, VEXOpAssertion.AVX1_2_CPU_XMM);
1131         public static final VexRMOp VPMOVSXBW       = new VexRMOp("VPMOVSXBW",       P_66, M_0F38, WIG, 0x20);
1132         public static final VexRMOp VPMOVSXBD       = new VexRMOp("VPMOVSXBD",       P_66, M_0F38, WIG, 0x21);
1133         public static final VexRMOp VPMOVSXBQ       = new VexRMOp("VPMOVSXBQ",       P_66, M_0F38, WIG, 0x22);
1134         public static final VexRMOp VPMOVSXWD       = new VexRMOp("VPMOVSXWD",       P_66, M_0F38, WIG, 0x23);
1135         public static final VexRMOp VPMOVSXWQ       = new VexRMOp("VPMOVSXWQ",       P_66, M_0F38, WIG, 0x24);
1136         public static final VexRMOp VPMOVSXDQ       = new VexRMOp("VPMOVSXDQ",       P_66, M_0F38, WIG, 0x25);
1137         public static final VexRMOp VPMOVZXBW       = new VexRMOp("VPMOVZXBW",       P_66, M_0F38, WIG, 0x30);
1138         public static final VexRMOp VPMOVZXBD       = new VexRMOp("VPMOVZXBD",       P_66, M_0F38, WIG, 0x31);
1139         public static final VexRMOp VPMOVZXBQ       = new VexRMOp("VPMOVZXBQ",       P_66, M_0F38, WIG, 0x32);
1140         public static final VexRMOp VPMOVZXWD       = new VexRMOp("VPMOVZXWD",       P_66, M_0F38, WIG, 0x33);
1141         public static final VexRMOp VPMOVZXWQ       = new VexRMOp("VPMOVZXWQ",       P_66, M_0F38, WIG, 0x34);
1142         public static final VexRMOp VPMOVZXDQ       = new VexRMOp("VPMOVZXDQ",       P_66, M_0F38, WIG, 0x35);
1143         public static final VexRMOp VPTEST          = new VexRMOp("VPTEST",          P_66, M_0F38, WIG, 0x17);
1144         public static final VexRMOp VSQRTPD         = new VexRMOp("VSQRTPD",         P_66, M_0F,   WIG, 0x51);
1145         public static final VexRMOp VSQRTPS         = new VexRMOp("VSQRTPS",         P_,   M_0F,   WIG, 0x51);
1146         public static final VexRMOp VSQRTSD         = new VexRMOp("VSQRTSD",         P_F2, M_0F,   WIG, 0x51);
1147         public static final VexRMOp VSQRTSS         = new VexRMOp("VSQRTSS",         P_F3, M_0F,   WIG, 0x51);
1148         public static final VexRMOp VUCOMISS        = new VexRMOp("VUCOMISS",        P_,   M_0F,   WIG, 0x2E);
1149         public static final VexRMOp VUCOMISD        = new VexRMOp("VUCOMISD",        P_66, M_0F,   WIG, 0x2E);
1150         // @formatter:on
1151 
1152         protected VexRMOp(String opcode, int pp, int mmmmm, int w, int op) {
1153             this(opcode, pp, mmmmm, w, op, VEXOpAssertion.AVX1, EVEXTuple.INVALID, WIG);
1154         }
1155 
1156         protected VexRMOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
1157             this(opcode, pp, mmmmm, w, op, assertion, EVEXTuple.INVALID, WIG);
1158         }
1159 
1160         protected VexRMOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion, EVEXTuple evexTuple, int wEvex) {
1161             super(opcode, pp, mmmmm, w, op, assertion, evexTuple, wEvex);
1162         }
1163 
1164         public void emit(AMD64Assembler asm, AVXSize size, Register dst, AMD64Address src) {
1165             assert assertion.check((AMD64) asm.target.arch, size, dst, null, null);
1166             boolean useEvex = asm.vexPrefix(dst, Register.None, src, size, pp, mmmmm, w, wEvex, false);
1167             asm.emitByte(op);
1168             asm.emitOperandHelper(dst, src, 0, getDisp8Scale(useEvex, size));
1169         }
1170     }
1171 
1172     /**
1173      * VEX-encoded move instructions.
1174      * <p>
1175      * These instructions have two opcodes: op is the forward move instruction with an operand order
1176      * of RM, and opReverse is the reverse move instruction with an operand order of MR.
1177      */
1178     public static final class VexMoveOp extends VexRMOp {
1179         // @formatter:off
1180         public static final VexMoveOp VMOVDQA32 = new VexMoveOp("VMOVDQA32", P_66, M_0F, WIG, 0x6F, 0x7F, VEXOpAssertion.AVX1_AVX512F_VL,         EVEXTuple.FVM,       W0);
1181         public static final VexMoveOp VMOVDQA64 = new VexMoveOp("VMOVDQA64", P_66, M_0F, WIG, 0x6F, 0x7F, VEXOpAssertion.AVX1_AVX512F_VL,         EVEXTuple.FVM,       W1);
1182         public static final VexMoveOp VMOVDQU32 = new VexMoveOp("VMOVDQU32", P_F3, M_0F, WIG, 0x6F, 0x7F, VEXOpAssertion.AVX1_AVX512F_VL,         EVEXTuple.FVM,       W0);
1183         public static final VexMoveOp VMOVDQU64 = new VexMoveOp("VMOVDQU64", P_F3, M_0F, WIG, 0x6F, 0x7F, VEXOpAssertion.AVX1_AVX512F_VL,         EVEXTuple.FVM,       W1);
1184         public static final VexMoveOp VMOVAPS   = new VexMoveOp("VMOVAPS",   P_,   M_0F, WIG, 0x28, 0x29, VEXOpAssertion.AVX1_AVX512F_VL,         EVEXTuple.FVM,       W0);
1185         public static final VexMoveOp VMOVAPD   = new VexMoveOp("VMOVAPD",   P_66, M_0F, WIG, 0x28, 0x29, VEXOpAssertion.AVX1_AVX512F_VL,         EVEXTuple.FVM,       W1);
1186         public static final VexMoveOp VMOVUPS   = new VexMoveOp("VMOVUPS",   P_,   M_0F, WIG, 0x10, 0x11, VEXOpAssertion.AVX1_AVX512F_VL,         EVEXTuple.FVM,       W0);
1187         public static final VexMoveOp VMOVUPD   = new VexMoveOp("VMOVUPD",   P_66, M_0F, WIG, 0x10, 0x11, VEXOpAssertion.AVX1_AVX512F_VL,         EVEXTuple.FVM,       W1);
1188         public static final VexMoveOp VMOVSS    = new VexMoveOp("VMOVSS",    P_F3, M_0F, WIG, 0x10, 0x11, VEXOpAssertion.AVX1_AVX512F_ALL,        EVEXTuple.T1S_32BIT, W0);
1189         public static final VexMoveOp VMOVSD    = new VexMoveOp("VMOVSD",    P_F2, M_0F, WIG, 0x10, 0x11, VEXOpAssertion.AVX1_AVX512F_ALL,        EVEXTuple.T1S_64BIT, W1);
1190         public static final VexMoveOp VMOVD     = new VexMoveOp("VMOVD",     P_66, M_0F, W0,  0x6E, 0x7E, VEXOpAssertion.XMM_CPU_AVX512F_128ONLY, EVEXTuple.T1F_32BIT, W0);
1191         public static final VexMoveOp VMOVQ     = new VexMoveOp("VMOVQ",     P_66, M_0F, W1,  0x6E, 0x7E, VEXOpAssertion.XMM_CPU_AVX512F_128ONLY, EVEXTuple.T1S_64BIT, W1);
1192         // @formatter:on
1193 
1194         private final int opReverse;
1195 
1196         private VexMoveOp(String opcode, int pp, int mmmmm, int w, int op, int opReverse) {
1197             this(opcode, pp, mmmmm, w, op, opReverse, VEXOpAssertion.AVX1, EVEXTuple.INVALID, WIG);
1198         }
1199 
1200         private VexMoveOp(String opcode, int pp, int mmmmm, int w, int op, int opReverse, VEXOpAssertion assertion) {
1201             this(opcode, pp, mmmmm, w, op, opReverse, assertion, EVEXTuple.INVALID, WIG);
1202         }
1203 
1204         private VexMoveOp(String opcode, int pp, int mmmmm, int w, int op, int opReverse, VEXOpAssertion assertion, EVEXTuple evexTuple, int wEvex) {
1205             super(opcode, pp, mmmmm, w, op, assertion, evexTuple, wEvex);
1206             this.opReverse = opReverse;
1207         }
1208 
1209         public void emit(AMD64Assembler asm, AVXSize size, AMD64Address dst, Register src) {
1210             assert assertion.check((AMD64) asm.target.arch, size, src, null, null);
1211             boolean useEvex = asm.vexPrefix(src, Register.None, dst, size, pp, mmmmm, w, wEvex, false);
1212             asm.emitByte(opReverse);
1213             asm.emitOperandHelper(src, dst, 0, getDisp8Scale(useEvex, size));
1214         }
1215 
1216         public void emitReverse(AMD64Assembler asm, AVXSize size, Register dst, Register src) {
1217             assert assertion.check((AMD64) asm.target.arch, size, src, null, dst);
1218             asm.vexPrefix(src, Register.None, dst, size, pp, mmmmm, w, wEvex, false);
1219             asm.emitByte(opReverse);
1220             asm.emitModRM(src, dst);
1221         }
1222     }
1223 
1224     public interface VexRRIOp {
1225         void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src, int imm8);
1226     }
1227 
1228     /**
1229      * VEX-encoded instructions with an operand order of RMI.
1230      */
1231     public static final class VexRMIOp extends VexOp implements VexRRIOp {
1232         // @formatter:off
1233         public static final VexRMIOp VPERMQ   = new VexRMIOp("VPERMQ",   P_66, M_0F3A, W1,  0x00, VEXOpAssertion.AVX2_256ONLY);
1234         public static final VexRMIOp VPSHUFLW = new VexRMIOp("VPSHUFLW", P_F2, M_0F,   WIG, 0x70, VEXOpAssertion.AVX1_2);
1235         public static final VexRMIOp VPSHUFHW = new VexRMIOp("VPSHUFHW", P_F3, M_0F,   WIG, 0x70, VEXOpAssertion.AVX1_2);
1236         public static final VexRMIOp VPSHUFD  = new VexRMIOp("VPSHUFD",  P_66, M_0F,   WIG, 0x70, VEXOpAssertion.AVX1_2);
1237         // @formatter:on
1238 
1239         private VexRMIOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
1240             super(opcode, pp, mmmmm, w, op, assertion);
1241         }
1242 
1243         @Override
1244         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src, int imm8) {
1245             assert assertion.check((AMD64) asm.target.arch, size, dst, null, src);
1246             asm.vexPrefix(dst, Register.None, src, size, pp, mmmmm, w, wEvex, false);
1247             asm.emitByte(op);
1248             asm.emitModRM(dst, src);
1249             asm.emitByte(imm8);
1250         }
1251 
1252         public void emit(AMD64Assembler asm, AVXSize size, Register dst, AMD64Address src, int imm8) {
1253             assert assertion.check((AMD64) asm.target.arch, size, dst, null, null);
1254             boolean useEvex = asm.vexPrefix(dst, Register.None, src, size, pp, mmmmm, w, wEvex, false);
1255             asm.emitByte(op);
1256             asm.emitOperandHelper(dst, src, 1, getDisp8Scale(useEvex, size));
1257             asm.emitByte(imm8);
1258         }
1259     }
1260 
1261     /**
1262      * VEX-encoded instructions with an operand order of MRI.
1263      */
1264     public static final class VexMRIOp extends VexOp implements VexRRIOp {
1265         // @formatter:off
1266         public static final VexMRIOp VEXTRACTF128 = new VexMRIOp("VEXTRACTF128", P_66, M_0F3A, W0, 0x19, VEXOpAssertion.AVX1_256ONLY);
1267         public static final VexMRIOp VEXTRACTI128 = new VexMRIOp("VEXTRACTI128", P_66, M_0F3A, W0, 0x39, VEXOpAssertion.AVX2_256ONLY);
1268         public static final VexMRIOp VPEXTRB      = new VexMRIOp("VPEXTRB",      P_66, M_0F3A, W0, 0x14, VEXOpAssertion.XMM_CPU);
1269         public static final VexMRIOp VPEXTRW      = new VexMRIOp("VPEXTRW",      P_66, M_0F3A, W0, 0x15, VEXOpAssertion.XMM_CPU);
1270         public static final VexMRIOp VPEXTRD      = new VexMRIOp("VPEXTRD",      P_66, M_0F3A, W0, 0x16, VEXOpAssertion.XMM_CPU);
1271         public static final VexMRIOp VPEXTRQ      = new VexMRIOp("VPEXTRQ",      P_66, M_0F3A, W1, 0x16, VEXOpAssertion.XMM_CPU);
1272         // @formatter:on
1273 
1274         private VexMRIOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
1275             super(opcode, pp, mmmmm, w, op, assertion);
1276         }
1277 
1278         @Override
1279         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src, int imm8) {
1280             assert assertion.check((AMD64) asm.target.arch, size, src, null, dst);
1281             asm.vexPrefix(src, Register.None, dst, size, pp, mmmmm, w, wEvex, false);
1282             asm.emitByte(op);
1283             asm.emitModRM(src, dst);
1284             asm.emitByte(imm8);
1285         }
1286 
1287         public void emit(AMD64Assembler asm, AVXSize size, AMD64Address dst, Register src, int imm8) {
1288             assert assertion.check((AMD64) asm.target.arch, size, src, null, null);
1289             boolean useEvex = asm.vexPrefix(src, Register.None, dst, size, pp, mmmmm, w, wEvex, false);
1290             asm.emitByte(op);
1291             asm.emitOperandHelper(src, dst, 1, getDisp8Scale(useEvex, size));
1292             asm.emitByte(imm8);
1293         }
1294     }
1295 
1296     /**
1297      * VEX-encoded instructions with an operand order of RVMR.
1298      */
1299     public static class VexRVMROp extends VexOp {
1300         // @formatter:off
1301         public static final VexRVMROp VPBLENDVB  = new VexRVMROp("VPBLENDVB",  P_66, M_0F3A, W0, 0x4C, VEXOpAssertion.AVX1_2);
1302         public static final VexRVMROp VPBLENDVPS = new VexRVMROp("VPBLENDVPS", P_66, M_0F3A, W0, 0x4A, VEXOpAssertion.AVX1);
1303         public static final VexRVMROp VPBLENDVPD = new VexRVMROp("VPBLENDVPD", P_66, M_0F3A, W0, 0x4B, VEXOpAssertion.AVX1);
1304         // @formatter:on
1305 
1306         protected VexRVMROp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
1307             super(opcode, pp, mmmmm, w, op, assertion);
1308         }
1309 
1310         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register mask, Register src1, Register src2) {
1311             assert assertion.check((AMD64) asm.target.arch, size, dst, mask, src1, src2);
1312             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w, wEvex, false);
1313             asm.emitByte(op);
1314             asm.emitModRM(dst, src2);
1315             asm.emitByte(mask.encoding() << 4);
1316         }
1317 
1318         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register mask, Register src1, AMD64Address src2) {
1319             assert assertion.check((AMD64) asm.target.arch, size, dst, mask, src1, null);
1320             boolean useEvex = asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w, wEvex, false);
1321             asm.emitByte(op);
1322             asm.emitOperandHelper(dst, src2, 0, getDisp8Scale(useEvex, size));
1323             asm.emitByte(mask.encoding() << 4);
1324         }
1325     }
1326 
1327     /**
1328      * VEX-encoded instructions with an operand order of RVM.
1329      */
1330     public static class VexRVMOp extends VexOp {
1331         // @formatter:off
1332         public static final VexRVMOp VANDPS    = new VexRVMOp("VANDPS",    P_,   M_0F,   WIG, 0x54);
1333         public static final VexRVMOp VANDPD    = new VexRVMOp("VANDPD",    P_66, M_0F,   WIG, 0x54);
1334         public static final VexRVMOp VANDNPS   = new VexRVMOp("VANDNPS",   P_,   M_0F,   WIG, 0x55);
1335         public static final VexRVMOp VANDNPD   = new VexRVMOp("VANDNPD",   P_66, M_0F,   WIG, 0x55);
1336         public static final VexRVMOp VORPS     = new VexRVMOp("VORPS",     P_,   M_0F,   WIG, 0x56);
1337         public static final VexRVMOp VORPD     = new VexRVMOp("VORPD",     P_66, M_0F,   WIG, 0x56);
1338         public static final VexRVMOp VXORPS    = new VexRVMOp("VXORPS",    P_,   M_0F,   WIG, 0x57);
1339         public static final VexRVMOp VXORPD    = new VexRVMOp("VXORPD",    P_66, M_0F,   WIG, 0x57);
1340         public static final VexRVMOp VADDPS    = new VexRVMOp("VADDPS",    P_,   M_0F,   WIG, 0x58);
1341         public static final VexRVMOp VADDPD    = new VexRVMOp("VADDPD",    P_66, M_0F,   WIG, 0x58);
1342         public static final VexRVMOp VADDSS    = new VexRVMOp("VADDSS",    P_F3, M_0F,   WIG, 0x58);
1343         public static final VexRVMOp VADDSD    = new VexRVMOp("VADDSD",    P_F2, M_0F,   WIG, 0x58);
1344         public static final VexRVMOp VMULPS    = new VexRVMOp("VMULPS",    P_,   M_0F,   WIG, 0x59);
1345         public static final VexRVMOp VMULPD    = new VexRVMOp("VMULPD",    P_66, M_0F,   WIG, 0x59);
1346         public static final VexRVMOp VMULSS    = new VexRVMOp("VMULSS",    P_F3, M_0F,   WIG, 0x59);
1347         public static final VexRVMOp VMULSD    = new VexRVMOp("VMULSD",    P_F2, M_0F,   WIG, 0x59);
1348         public static final VexRVMOp VSUBPS    = new VexRVMOp("VSUBPS",    P_,   M_0F,   WIG, 0x5C);
1349         public static final VexRVMOp VSUBPD    = new VexRVMOp("VSUBPD",    P_66, M_0F,   WIG, 0x5C);
1350         public static final VexRVMOp VSUBSS    = new VexRVMOp("VSUBSS",    P_F3, M_0F,   WIG, 0x5C);
1351         public static final VexRVMOp VSUBSD    = new VexRVMOp("VSUBSD",    P_F2, M_0F,   WIG, 0x5C);
1352         public static final VexRVMOp VMINPS    = new VexRVMOp("VMINPS",    P_,   M_0F,   WIG, 0x5D);
1353         public static final VexRVMOp VMINPD    = new VexRVMOp("VMINPD",    P_66, M_0F,   WIG, 0x5D);
1354         public static final VexRVMOp VMINSS    = new VexRVMOp("VMINSS",    P_F3, M_0F,   WIG, 0x5D);
1355         public static final VexRVMOp VMINSD    = new VexRVMOp("VMINSD",    P_F2, M_0F,   WIG, 0x5D);
1356         public static final VexRVMOp VDIVPS    = new VexRVMOp("VDIVPS",    P_,   M_0F,   WIG, 0x5E);
1357         public static final VexRVMOp VDIVPD    = new VexRVMOp("VDIVPD",    P_66, M_0F,   WIG, 0x5E);
1358         public static final VexRVMOp VDIVSS    = new VexRVMOp("VDIVPS",    P_F3, M_0F,   WIG, 0x5E);
1359         public static final VexRVMOp VDIVSD    = new VexRVMOp("VDIVPD",    P_F2, M_0F,   WIG, 0x5E);
1360         public static final VexRVMOp VMAXPS    = new VexRVMOp("VMAXPS",    P_,   M_0F,   WIG, 0x5F);
1361         public static final VexRVMOp VMAXPD    = new VexRVMOp("VMAXPD",    P_66, M_0F,   WIG, 0x5F);
1362         public static final VexRVMOp VMAXSS    = new VexRVMOp("VMAXSS",    P_F3, M_0F,   WIG, 0x5F);
1363         public static final VexRVMOp VMAXSD    = new VexRVMOp("VMAXSD",    P_F2, M_0F,   WIG, 0x5F);
1364         public static final VexRVMOp VADDSUBPS = new VexRVMOp("VADDSUBPS", P_F2, M_0F,   WIG, 0xD0);
1365         public static final VexRVMOp VADDSUBPD = new VexRVMOp("VADDSUBPD", P_66, M_0F,   WIG, 0xD0);
1366         public static final VexRVMOp VPAND     = new VexRVMOp("VPAND",     P_66, M_0F,   WIG, 0xDB, VEXOpAssertion.AVX1_2);
1367         public static final VexRVMOp VPOR      = new VexRVMOp("VPOR",      P_66, M_0F,   WIG, 0xEB, VEXOpAssertion.AVX1_2);
1368         public static final VexRVMOp VPXOR     = new VexRVMOp("VPXOR",     P_66, M_0F,   WIG, 0xEF, VEXOpAssertion.AVX1_2);
1369         public static final VexRVMOp VPADDB    = new VexRVMOp("VPADDB",    P_66, M_0F,   WIG, 0xFC, VEXOpAssertion.AVX1_2);
1370         public static final VexRVMOp VPADDW    = new VexRVMOp("VPADDW",    P_66, M_0F,   WIG, 0xFD, VEXOpAssertion.AVX1_2);
1371         public static final VexRVMOp VPADDD    = new VexRVMOp("VPADDD",    P_66, M_0F,   WIG, 0xFE, VEXOpAssertion.AVX1_2);
1372         public static final VexRVMOp VPADDQ    = new VexRVMOp("VPADDQ",    P_66, M_0F,   WIG, 0xD4, VEXOpAssertion.AVX1_2);
1373         public static final VexRVMOp VPMULHUW  = new VexRVMOp("VPMULHUW",  P_66, M_0F,   WIG, 0xE4, VEXOpAssertion.AVX1_2);
1374         public static final VexRVMOp VPMULHW   = new VexRVMOp("VPMULHW",   P_66, M_0F,   WIG, 0xE5, VEXOpAssertion.AVX1_2);
1375         public static final VexRVMOp VPMULLW   = new VexRVMOp("VPMULLW",   P_66, M_0F,   WIG, 0xD5, VEXOpAssertion.AVX1_2);
1376         public static final VexRVMOp VPMULLD   = new VexRVMOp("VPMULLD",   P_66, M_0F38, WIG, 0x40, VEXOpAssertion.AVX1_2);
1377         public static final VexRVMOp VPSUBB    = new VexRVMOp("VPSUBB",    P_66, M_0F,   WIG, 0xF8, VEXOpAssertion.AVX1_2);
1378         public static final VexRVMOp VPSUBW    = new VexRVMOp("VPSUBW",    P_66, M_0F,   WIG, 0xF9, VEXOpAssertion.AVX1_2);
1379         public static final VexRVMOp VPSUBD    = new VexRVMOp("VPSUBD",    P_66, M_0F,   WIG, 0xFA, VEXOpAssertion.AVX1_2);
1380         public static final VexRVMOp VPSUBQ    = new VexRVMOp("VPSUBQ",    P_66, M_0F,   WIG, 0xFB, VEXOpAssertion.AVX1_2);
1381         public static final VexRVMOp VPSHUFB   = new VexRVMOp("VPSHUFB",   P_66, M_0F38, WIG, 0x00, VEXOpAssertion.AVX1_2);
1382         public static final VexRVMOp VCVTSD2SS = new VexRVMOp("VCVTSD2SS", P_F2, M_0F,   WIG, 0x5A);
1383         public static final VexRVMOp VCVTSS2SD = new VexRVMOp("VCVTSS2SD", P_F3, M_0F,   WIG, 0x5A);
1384         public static final VexRVMOp VCVTSI2SD = new VexRVMOp("VCVTSI2SD", P_F2, M_0F,   W0,  0x2A, VEXOpAssertion.XMM_XMM_CPU);
1385         public static final VexRVMOp VCVTSQ2SD = new VexRVMOp("VCVTSQ2SD", P_F2, M_0F,   W1,  0x2A, VEXOpAssertion.XMM_XMM_CPU);
1386         public static final VexRVMOp VCVTSI2SS = new VexRVMOp("VCVTSI2SS", P_F3, M_0F,   W0,  0x2A, VEXOpAssertion.XMM_XMM_CPU);
1387         public static final VexRVMOp VCVTSQ2SS = new VexRVMOp("VCVTSQ2SS", P_F3, M_0F,   W1,  0x2A, VEXOpAssertion.XMM_XMM_CPU);
1388         public static final VexRVMOp VPCMPEQB  = new VexRVMOp("VPCMPEQB",  P_66, M_0F,   WIG, 0x74, VEXOpAssertion.AVX1_2);
1389         public static final VexRVMOp VPCMPEQW  = new VexRVMOp("VPCMPEQW",  P_66, M_0F,   WIG, 0x75, VEXOpAssertion.AVX1_2);
1390         public static final VexRVMOp VPCMPEQD  = new VexRVMOp("VPCMPEQD",  P_66, M_0F,   WIG, 0x76, VEXOpAssertion.AVX1_2);
1391         public static final VexRVMOp VPCMPEQQ  = new VexRVMOp("VPCMPEQQ",  P_66, M_0F38, WIG, 0x29, VEXOpAssertion.AVX1_2);
1392         public static final VexRVMOp VPCMPGTB  = new VexRVMOp("VPCMPGTB",  P_66, M_0F,   WIG, 0x64, VEXOpAssertion.AVX1_2);
1393         public static final VexRVMOp VPCMPGTW  = new VexRVMOp("VPCMPGTW",  P_66, M_0F,   WIG, 0x65, VEXOpAssertion.AVX1_2);
1394         public static final VexRVMOp VPCMPGTD  = new VexRVMOp("VPCMPGTD",  P_66, M_0F,   WIG, 0x66, VEXOpAssertion.AVX1_2);
1395         public static final VexRVMOp VPCMPGTQ  = new VexRVMOp("VPCMPGTQ",  P_66, M_0F38, WIG, 0x37, VEXOpAssertion.AVX1_2);
1396         public static final VexRVMOp VFMADD231SS = new VexRVMOp("VFMADD231SS", P_66, M_0F38, W0, 0xB9, VEXOpAssertion.FMA);
1397         public static final VexRVMOp VFMADD231SD = new VexRVMOp("VFMADD231SD", P_66, M_0F38, W1, 0xB9, VEXOpAssertion.FMA);
1398         // @formatter:on
1399 
1400         private VexRVMOp(String opcode, int pp, int mmmmm, int w, int op) {
1401             this(opcode, pp, mmmmm, w, op, VEXOpAssertion.AVX1);
1402         }
1403 
1404         protected VexRVMOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
1405             super(opcode, pp, mmmmm, w, op, assertion);
1406         }
1407 
1408         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, Register src2) {
1409             assert assertion.check((AMD64) asm.target.arch, size, dst, src1, src2);
1410             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w, wEvex, false);
1411             asm.emitByte(op);
1412             asm.emitModRM(dst, src2);
1413         }
1414 
1415         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, AMD64Address src2) {
1416             assert assertion.check((AMD64) asm.target.arch, size, dst, src1, null);
1417             boolean useEvex = asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w, wEvex, false);
1418             asm.emitByte(op);
1419             asm.emitOperandHelper(dst, src2, 0, getDisp8Scale(useEvex, size));
1420         }
1421     }
1422 
1423     public static final class VexGeneralPurposeRVMOp extends VexRVMOp {
1424         // @formatter:off
1425         public static final VexGeneralPurposeRVMOp ANDN   = new VexGeneralPurposeRVMOp("ANDN",   P_,   M_0F38, WIG, 0xF2, VEXOpAssertion.BMI1);
1426         public static final VexGeneralPurposeRVMOp MULX   = new VexGeneralPurposeRVMOp("MULX",   P_F2, M_0F38, WIG, 0xF6, VEXOpAssertion.BMI2);
1427         public static final VexGeneralPurposeRVMOp PDEP   = new VexGeneralPurposeRVMOp("PDEP",   P_F2, M_0F38, WIG, 0xF5, VEXOpAssertion.BMI2);
1428         public static final VexGeneralPurposeRVMOp PEXT   = new VexGeneralPurposeRVMOp("PEXT",   P_F3, M_0F38, WIG, 0xF5, VEXOpAssertion.BMI2);
1429         // @formatter:on
1430 
1431         private VexGeneralPurposeRVMOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
1432             super(opcode, pp, mmmmm, w, op, assertion);
1433         }
1434 
1435         @Override
1436         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, Register src2) {
1437             assert assertion.check((AMD64) asm.target.arch, LZ, dst, src1, src2, null);
1438             assert size == AVXSize.DWORD || size == AVXSize.QWORD;
1439             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, size == AVXSize.DWORD ? W0 : W1, wEvex, false);
1440             asm.emitByte(op);
1441             asm.emitModRM(dst, src2);
1442         }
1443 
1444         @Override
1445         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, AMD64Address src2) {
1446             assert assertion.check((AMD64) asm.target.arch, LZ, dst, src1, null, null);
1447             assert size == AVXSize.DWORD || size == AVXSize.QWORD;
1448             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, size == AVXSize.DWORD ? W0 : W1, wEvex, false);
1449             asm.emitByte(op);
1450             asm.emitOperandHelper(dst, src2, 0);
1451         }
1452     }
1453 
1454     public static final class VexGeneralPurposeRMVOp extends VexOp {
1455         // @formatter:off
1456         public static final VexGeneralPurposeRMVOp BEXTR  = new VexGeneralPurposeRMVOp("BEXTR",  P_,   M_0F38, WIG, 0xF7, VEXOpAssertion.BMI1);
1457         public static final VexGeneralPurposeRMVOp BZHI   = new VexGeneralPurposeRMVOp("BZHI",   P_,   M_0F38, WIG, 0xF5, VEXOpAssertion.BMI2);
1458         public static final VexGeneralPurposeRMVOp SARX   = new VexGeneralPurposeRMVOp("SARX",   P_F3, M_0F38, WIG, 0xF7, VEXOpAssertion.BMI2);
1459         public static final VexGeneralPurposeRMVOp SHRX   = new VexGeneralPurposeRMVOp("SHRX",   P_F2, M_0F38, WIG, 0xF7, VEXOpAssertion.BMI2);
1460         public static final VexGeneralPurposeRMVOp SHLX   = new VexGeneralPurposeRMVOp("SHLX",   P_66, M_0F38, WIG, 0xF7, VEXOpAssertion.BMI2);
1461         // @formatter:on
1462 
1463         private VexGeneralPurposeRMVOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
1464             super(opcode, pp, mmmmm, w, op, assertion);
1465         }
1466 
1467         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, Register src2) {
1468             assert assertion.check((AMD64) asm.target.arch, LZ, dst, src2, src1, null);
1469             assert size == AVXSize.DWORD || size == AVXSize.QWORD;
1470             asm.vexPrefix(dst, src2, src1, size, pp, mmmmm, size == AVXSize.DWORD ? W0 : W1, wEvex, false);
1471             asm.emitByte(op);
1472             asm.emitModRM(dst, src1);
1473         }
1474 
1475         public void emit(AMD64Assembler asm, AVXSize size, Register dst, AMD64Address src1, Register src2) {
1476             assert assertion.check((AMD64) asm.target.arch, LZ, dst, src2, null, null);
1477             assert size == AVXSize.DWORD || size == AVXSize.QWORD;
1478             asm.vexPrefix(dst, src2, src1, size, pp, mmmmm, size == AVXSize.DWORD ? W0 : W1, wEvex, false);
1479             asm.emitByte(op);
1480             asm.emitOperandHelper(dst, src1, 0);
1481         }
1482     }
1483 
1484     public static final class VexGeneralPurposeRMOp extends VexRMOp {
1485         // @formatter:off
1486         public static final VexGeneralPurposeRMOp BLSI    = new VexGeneralPurposeRMOp("BLSI",   P_,    M_0F38, WIG, 0xF3, 3, VEXOpAssertion.BMI1);
1487         public static final VexGeneralPurposeRMOp BLSMSK  = new VexGeneralPurposeRMOp("BLSMSK", P_,    M_0F38, WIG, 0xF3, 2, VEXOpAssertion.BMI1);
1488         public static final VexGeneralPurposeRMOp BLSR    = new VexGeneralPurposeRMOp("BLSR",   P_,    M_0F38, WIG, 0xF3, 1, VEXOpAssertion.BMI1);
1489         // @formatter:on
1490         private final int ext;
1491 
1492         private VexGeneralPurposeRMOp(String opcode, int pp, int mmmmm, int w, int op, int ext, VEXOpAssertion assertion) {
1493             super(opcode, pp, mmmmm, w, op, assertion);
1494             this.ext = ext;
1495         }
1496 
1497         @Override
1498         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src) {
1499             assert assertion.check((AMD64) asm.target.arch, size, dst, null, null);
1500             asm.vexPrefix(AMD64.cpuRegisters[ext], dst, src, size, pp, mmmmm, size == AVXSize.DWORD ? W0 : W1, wEvex, false);
1501             asm.emitByte(op);
1502             asm.emitModRM(ext, src);
1503         }
1504 
1505         @Override
1506         public void emit(AMD64Assembler asm, AVXSize size, Register dst, AMD64Address src) {
1507             assert assertion.check((AMD64) asm.target.arch, size, dst, null, null);
1508             asm.vexPrefix(AMD64.cpuRegisters[ext], dst, src, size, pp, mmmmm, size == AVXSize.DWORD ? W0 : W1, wEvex, false);
1509             asm.emitByte(op);
1510             asm.emitOperandHelper(ext, src, 0);
1511         }
1512     }
1513 
1514     /**
1515      * VEX-encoded shift instructions with an operand order of either RVM or VMI.
1516      */
1517     public static final class VexShiftOp extends VexRVMOp implements VexRRIOp {
1518         // @formatter:off
1519         public static final VexShiftOp VPSRLW = new VexShiftOp("VPSRLW", P_66, M_0F, WIG, 0xD1, 0x71, 2);
1520         public static final VexShiftOp VPSRLD = new VexShiftOp("VPSRLD", P_66, M_0F, WIG, 0xD2, 0x72, 2);
1521         public static final VexShiftOp VPSRLQ = new VexShiftOp("VPSRLQ", P_66, M_0F, WIG, 0xD3, 0x73, 2);
1522         public static final VexShiftOp VPSRAW = new VexShiftOp("VPSRAW", P_66, M_0F, WIG, 0xE1, 0x71, 4);
1523         public static final VexShiftOp VPSRAD = new VexShiftOp("VPSRAD", P_66, M_0F, WIG, 0xE2, 0x72, 4);
1524         public static final VexShiftOp VPSLLW = new VexShiftOp("VPSLLW", P_66, M_0F, WIG, 0xF1, 0x71, 6);
1525         public static final VexShiftOp VPSLLD = new VexShiftOp("VPSLLD", P_66, M_0F, WIG, 0xF2, 0x72, 6);
1526         public static final VexShiftOp VPSLLQ = new VexShiftOp("VPSLLQ", P_66, M_0F, WIG, 0xF3, 0x73, 6);
1527         // @formatter:on
1528 
1529         private final int immOp;
1530         private final int r;
1531 
1532         private VexShiftOp(String opcode, int pp, int mmmmm, int w, int op, int immOp, int r) {
1533             super(opcode, pp, mmmmm, w, op, VEXOpAssertion.AVX1_2);
1534             this.immOp = immOp;
1535             this.r = r;
1536         }
1537 
1538         @Override
1539         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src, int imm8) {
1540             assert assertion.check((AMD64) asm.target.arch, size, null, dst, src);
1541             asm.vexPrefix(null, dst, src, size, pp, mmmmm, w, wEvex, false);
1542             asm.emitByte(immOp);
1543             asm.emitModRM(r, src);
1544             asm.emitByte(imm8);
1545         }
1546     }
1547 
1548     public static final class VexMaskMoveOp extends VexOp {
1549         // @formatter:off
1550         public static final VexMaskMoveOp VMASKMOVPS = new VexMaskMoveOp("VMASKMOVPS", P_66, M_0F38, W0, 0x2C, 0x2E);
1551         public static final VexMaskMoveOp VMASKMOVPD = new VexMaskMoveOp("VMASKMOVPD", P_66, M_0F38, W0, 0x2D, 0x2F);
1552         public static final VexMaskMoveOp VPMASKMOVD = new VexMaskMoveOp("VPMASKMOVD", P_66, M_0F38, W0, 0x8C, 0x8E, VEXOpAssertion.AVX2);
1553         public static final VexMaskMoveOp VPMASKMOVQ = new VexMaskMoveOp("VPMASKMOVQ", P_66, M_0F38, W1, 0x8C, 0x8E, VEXOpAssertion.AVX2);
1554         // @formatter:on
1555 
1556         private final int opReverse;
1557 
1558         private VexMaskMoveOp(String opcode, int pp, int mmmmm, int w, int op, int opReverse) {
1559             this(opcode, pp, mmmmm, w, op, opReverse, VEXOpAssertion.AVX1);
1560         }
1561 
1562         private VexMaskMoveOp(String opcode, int pp, int mmmmm, int w, int op, int opReverse, VEXOpAssertion assertion) {
1563             super(opcode, pp, mmmmm, w, op, assertion);
1564             this.opReverse = opReverse;
1565         }
1566 
1567         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register mask, AMD64Address src) {
1568             assert assertion.check((AMD64) asm.target.arch, size, dst, mask, null);
1569             asm.vexPrefix(dst, mask, src, size, pp, mmmmm, w, wEvex, false);
1570             asm.emitByte(op);
1571             asm.emitOperandHelper(dst, src, 0);
1572         }
1573 
1574         public void emit(AMD64Assembler asm, AVXSize size, AMD64Address dst, Register mask, Register src) {
1575             assert assertion.check((AMD64) asm.target.arch, size, src, mask, null);
1576             boolean useEvex = asm.vexPrefix(src, mask, dst, size, pp, mmmmm, w, wEvex, false);
1577             asm.emitByte(opReverse);
1578             asm.emitOperandHelper(src, dst, 0, getDisp8Scale(useEvex, size));
1579         }
1580     }
1581 
1582     /**
1583      * VEX-encoded instructions with an operand order of RVMI.
1584      */
1585     public static final class VexRVMIOp extends VexOp {
1586         // @formatter:off
1587         public static final VexRVMIOp VSHUFPS     = new VexRVMIOp("VSHUFPS",     P_,   M_0F,   WIG, 0xC6);
1588         public static final VexRVMIOp VSHUFPD     = new VexRVMIOp("VSHUFPD",     P_66, M_0F,   WIG, 0xC6);
1589         public static final VexRVMIOp VINSERTF128 = new VexRVMIOp("VINSERTF128", P_66, M_0F3A, W0,  0x18, VEXOpAssertion.AVX1_256ONLY);
1590         public static final VexRVMIOp VINSERTI128 = new VexRVMIOp("VINSERTI128", P_66, M_0F3A, W0,  0x38, VEXOpAssertion.AVX2_256ONLY);
1591         // @formatter:on
1592 
1593         private VexRVMIOp(String opcode, int pp, int mmmmm, int w, int op) {
1594             this(opcode, pp, mmmmm, w, op, VEXOpAssertion.AVX1);
1595         }
1596 
1597         private VexRVMIOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
1598             super(opcode, pp, mmmmm, w, op, assertion);
1599         }
1600 
1601         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, Register src2, int imm8) {
1602             assert assertion.check((AMD64) asm.target.arch, size, dst, src1, src2);
1603             assert (imm8 & 0xFF) == imm8;
1604             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w, wEvex, false);
1605             asm.emitByte(op);
1606             asm.emitModRM(dst, src2);
1607             asm.emitByte(imm8);
1608         }
1609 
1610         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, AMD64Address src2, int imm8) {
1611             assert assertion.check((AMD64) asm.target.arch, size, dst, src1, null);
1612             assert (imm8 & 0xFF) == imm8;
1613             boolean useEvex = asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w, wEvex, false);
1614             asm.emitByte(op);
1615             asm.emitOperandHelper(dst, src2, 1, getDisp8Scale(useEvex, size));
1616             asm.emitByte(imm8);
1617         }
1618     }
1619 
1620     /**
1621      * VEX-encoded comparison operation with an operand order of RVMI. The immediate operand is a
1622      * comparison operator.
1623      */
1624     public static final class VexFloatCompareOp extends VexOp {
1625         // @formatter:off
1626         public static final VexFloatCompareOp VCMPPS = new VexFloatCompareOp("VCMPPS", P_,   M_0F, WIG, 0xC2);
1627         public static final VexFloatCompareOp VCMPPD = new VexFloatCompareOp("VCMPPD", P_66, M_0F, WIG, 0xC2);
1628         public static final VexFloatCompareOp VCMPSS = new VexFloatCompareOp("VCMPSS", P_F2, M_0F, WIG, 0xC2);
1629         public static final VexFloatCompareOp VCMPSD = new VexFloatCompareOp("VCMPSD", P_F2, M_0F, WIG, 0xC2);
1630         // @formatter:on
1631 
1632         public enum Predicate {
1633             EQ_OQ(0x00),
1634             LT_OS(0x01),
1635             LE_OS(0x02),
1636             UNORD_Q(0x03),
1637             NEQ_UQ(0x04),
1638             NLT_US(0x05),
1639             NLE_US(0x06),
1640             ORD_Q(0x07),
1641             EQ_UQ(0x08),
1642             NGE_US(0x09),
1643             NGT_US(0x0a),
1644             FALSE_OQ(0x0b),
1645             NEQ_OQ(0x0c),
1646             GE_OS(0x0d),
1647             GT_OS(0x0e),
1648             TRUE_UQ(0x0f),
1649             EQ_OS(0x10),
1650             LT_OQ(0x11),
1651             LE_OQ(0x12),
1652             UNORD_S(0x13),
1653             NEQ_US(0x14),
1654             NLT_UQ(0x15),
1655             NLE_UQ(0x16),
1656             ORD_S(0x17),
1657             EQ_US(0x18),
1658             NGE_UQ(0x19),
1659             NGT_UQ(0x1a),
1660             FALSE_OS(0x1b),
1661             NEQ_OS(0x1c),
1662             GE_OQ(0x1d),
1663             GT_OQ(0x1e),
1664             TRUE_US(0x1f);
1665 
1666             private int imm8;
1667 
1668             Predicate(int imm8) {
1669                 this.imm8 = imm8;
1670             }
1671 
1672             public static Predicate getPredicate(Condition condition, boolean unorderedIsTrue) {
1673                 if (unorderedIsTrue) {
1674                     switch (condition) {
1675                         case EQ:
1676                             return EQ_UQ;
1677                         case NE:
1678                             return NEQ_UQ;
1679                         case LT:
1680                             return NGE_UQ;
1681                         case LE:
1682                             return NGT_UQ;
1683                         case GT:
1684                             return NLE_UQ;
1685                         case GE:
1686                             return NLT_UQ;
1687                         default:
1688                             throw GraalError.shouldNotReachHere();
1689                     }
1690                 } else {
1691                     switch (condition) {
1692                         case EQ:
1693                             return EQ_OQ;
1694                         case NE:
1695                             return NEQ_OQ;
1696                         case LT:
1697                             return LT_OQ;
1698                         case LE:
1699                             return LE_OQ;
1700                         case GT:
1701                             return GT_OQ;
1702                         case GE:
1703                             return GE_OQ;
1704                         default:
1705                             throw GraalError.shouldNotReachHere();
1706                     }
1707                 }
1708             }
1709         }
1710 
1711         private VexFloatCompareOp(String opcode, int pp, int mmmmm, int w, int op) {
1712             super(opcode, pp, mmmmm, w, op, VEXOpAssertion.AVX1);
1713         }
1714 
1715         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, Register src2, Predicate p) {
1716             assert assertion.check((AMD64) asm.target.arch, size, dst, src1, src2);
1717             asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w, wEvex, false);
1718             asm.emitByte(op);
1719             asm.emitModRM(dst, src2);
1720             asm.emitByte(p.imm8);
1721         }
1722 
1723         public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, AMD64Address src2, Predicate p) {
1724             assert assertion.check((AMD64) asm.target.arch, size, dst, src1, null);
1725             boolean useEvex = asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w, wEvex, false);
1726             asm.emitByte(op);
1727             asm.emitOperandHelper(dst, src2, 1, getDisp8Scale(useEvex, size));
1728             asm.emitByte(p.imm8);
1729         }
1730     }
1731 
1732     public final void addl(AMD64Address dst, int imm32) {
1733         ADD.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
1734     }
1735 
1736     public final void addl(Register dst, int imm32) {
1737         ADD.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
1738     }
1739 
1740     public final void addl(Register dst, Register src) {
1741         ADD.rmOp.emit(this, DWORD, dst, src);
1742     }
1743 
1744     public final void addpd(Register dst, Register src) {
1745         SSEOp.ADD.emit(this, PD, dst, src);
1746     }
1747 
1748     public final void addpd(Register dst, AMD64Address src) {
1749         SSEOp.ADD.emit(this, PD, dst, src);
1750     }
1751 
1752     public final void addsd(Register dst, Register src) {
1753         SSEOp.ADD.emit(this, SD, dst, src);
1754     }
1755 
1756     public final void addsd(Register dst, AMD64Address src) {
1757         SSEOp.ADD.emit(this, SD, dst, src);
1758     }
1759 
1760     private void addrNop4() {
1761         // 4 bytes: NOP DWORD PTR [EAX+0]
1762         emitByte(0x0F);
1763         emitByte(0x1F);
1764         emitByte(0x40); // emitRm(cbuf, 0x1, EAXEnc, EAXEnc);
1765         emitByte(0); // 8-bits offset (1 byte)
1766     }
1767 
1768     private void addrNop5() {
1769         // 5 bytes: NOP DWORD PTR [EAX+EAX*0+0] 8-bits offset
1770         emitByte(0x0F);
1771         emitByte(0x1F);
1772         emitByte(0x44); // emitRm(cbuf, 0x1, EAXEnc, 0x4);
1773         emitByte(0x00); // emitRm(cbuf, 0x0, EAXEnc, EAXEnc);
1774         emitByte(0); // 8-bits offset (1 byte)
1775     }
1776 
1777     private void addrNop7() {
1778         // 7 bytes: NOP DWORD PTR [EAX+0] 32-bits offset
1779         emitByte(0x0F);
1780         emitByte(0x1F);
1781         emitByte(0x80); // emitRm(cbuf, 0x2, EAXEnc, EAXEnc);
1782         emitInt(0); // 32-bits offset (4 bytes)
1783     }
1784 
1785     private void addrNop8() {
1786         // 8 bytes: NOP DWORD PTR [EAX+EAX*0+0] 32-bits offset
1787         emitByte(0x0F);
1788         emitByte(0x1F);
1789         emitByte(0x84); // emitRm(cbuf, 0x2, EAXEnc, 0x4);
1790         emitByte(0x00); // emitRm(cbuf, 0x0, EAXEnc, EAXEnc);
1791         emitInt(0); // 32-bits offset (4 bytes)
1792     }
1793 
1794     public final void andl(Register dst, int imm32) {
1795         AND.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
1796     }
1797 
1798     public final void andl(Register dst, Register src) {
1799         AND.rmOp.emit(this, DWORD, dst, src);
1800     }
1801 
1802     public final void andpd(Register dst, Register src) {
1803         SSEOp.AND.emit(this, PD, dst, src);
1804     }
1805 
1806     public final void andpd(Register dst, AMD64Address src) {
1807         SSEOp.AND.emit(this, PD, dst, src);
1808     }
1809 
1810     public final void bsfq(Register dst, Register src) {
1811         prefixq(dst, src);
1812         emitByte(0x0F);
1813         emitByte(0xBC);
1814         emitModRM(dst, src);
1815     }
1816 
1817     public final void bsrl(Register dst, Register src) {
1818         prefix(dst, src);
1819         emitByte(0x0F);
1820         emitByte(0xBD);
1821         emitModRM(dst, src);
1822     }
1823 
1824     public final void bswapl(Register reg) {
1825         prefix(reg);
1826         emitByte(0x0F);
1827         emitModRM(1, reg);
1828     }
1829 
1830     public final void cdql() {
1831         emitByte(0x99);
1832     }
1833 
1834     public final void cmovl(ConditionFlag cc, Register dst, Register src) {
1835         prefix(dst, src);
1836         emitByte(0x0F);
1837         emitByte(0x40 | cc.getValue());
1838         emitModRM(dst, src);
1839     }
1840 
1841     public final void cmovl(ConditionFlag cc, Register dst, AMD64Address src) {
1842         prefix(src, dst);
1843         emitByte(0x0F);
1844         emitByte(0x40 | cc.getValue());
1845         emitOperandHelper(dst, src, 0);
1846     }
1847 
1848     public final void cmpb(Register dst, Register src) {
1849         CMP.byteRmOp.emit(this, BYTE, dst, src);
1850     }
1851 
1852     public final void cmpw(Register dst, Register src) {
1853         CMP.rmOp.emit(this, WORD, dst, src);
1854     }
1855 
1856     public final void cmpl(Register dst, int imm32) {
1857         CMP.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
1858     }
1859 
1860     public final void cmpl(Register dst, Register src) {
1861         CMP.rmOp.emit(this, DWORD, dst, src);
1862     }
1863 
1864     public final void cmpl(Register dst, AMD64Address src) {
1865         CMP.rmOp.emit(this, DWORD, dst, src);
1866     }
1867 
1868     public final void cmpl(AMD64Address dst, int imm32) {
1869         CMP.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
1870     }
1871 
1872     /**
1873      * The 8-bit cmpxchg compares the value at adr with the contents of X86.rax, and stores reg into
1874      * adr if so; otherwise, the value at adr is loaded into X86.rax,. The ZF is set if the compared
1875      * values were equal, and cleared otherwise.
1876      */
1877     public final void cmpxchgb(Register reg, AMD64Address adr) { // cmpxchg
1878         prefixb(adr, reg);
1879         emitByte(0x0F);
1880         emitByte(0xB0);
1881         emitOperandHelper(reg, adr, 0);
1882     }
1883 
1884     /**
1885      * The 16-bit cmpxchg compares the value at adr with the contents of X86.rax, and stores reg
1886      * into adr if so; otherwise, the value at adr is loaded into X86.rax,. The ZF is set if the
1887      * compared values were equal, and cleared otherwise.
1888      */
1889     public final void cmpxchgw(Register reg, AMD64Address adr) { // cmpxchg
1890         emitByte(0x66); // Switch to 16-bit mode.
1891         prefix(adr, reg);
1892         emitByte(0x0F);
1893         emitByte(0xB1);
1894         emitOperandHelper(reg, adr, 0);
1895     }
1896 
1897     /**
1898      * The 32-bit cmpxchg compares the value at adr with the contents of X86.rax, and stores reg
1899      * into adr if so; otherwise, the value at adr is loaded into X86.rax,. The ZF is set if the
1900      * compared values were equal, and cleared otherwise.
1901      */
1902     public final void cmpxchgl(Register reg, AMD64Address adr) { // cmpxchg
1903         prefix(adr, reg);
1904         emitByte(0x0F);
1905         emitByte(0xB1);
1906         emitOperandHelper(reg, adr, 0);
1907     }
1908 
1909     public final void cvtsi2sdl(Register dst, Register src) {
1910         SSEOp.CVTSI2SD.emit(this, DWORD, dst, src);
1911     }
1912 
1913     public final void cvttsd2sil(Register dst, Register src) {
1914         SSEOp.CVTTSD2SI.emit(this, DWORD, dst, src);
1915     }
1916 
1917     public final void decl(AMD64Address dst) {
1918         prefix(dst);
1919         emitByte(0xFF);
1920         emitOperandHelper(1, dst, 0);
1921     }
1922 
1923     public final void divsd(Register dst, Register src) {
1924         SSEOp.DIV.emit(this, SD, dst, src);
1925     }
1926 
1927     public final void hlt() {
1928         emitByte(0xF4);
1929     }
1930 
1931     public final void imull(Register dst, Register src, int value) {
1932         if (isByte(value)) {
1933             AMD64RMIOp.IMUL_SX.emit(this, DWORD, dst, src, value);
1934         } else {
1935             AMD64RMIOp.IMUL.emit(this, DWORD, dst, src, value);
1936         }
1937     }
1938 
1939     public final void incl(AMD64Address dst) {
1940         prefix(dst);
1941         emitByte(0xFF);
1942         emitOperandHelper(0, dst, 0);
1943     }
1944 
1945     public void jcc(ConditionFlag cc, int jumpTarget, boolean forceDisp32) {
1946         int shortSize = 2;
1947         int longSize = 6;
1948         long disp = jumpTarget - position();
1949         if (!forceDisp32 && isByte(disp - shortSize)) {
1950             // 0111 tttn #8-bit disp
1951             emitByte(0x70 | cc.getValue());
1952             emitByte((int) ((disp - shortSize) & 0xFF));
1953         } else {
1954             // 0000 1111 1000 tttn #32-bit disp
1955             assert isInt(disp - longSize) : "must be 32bit offset (call4)";
1956             emitByte(0x0F);
1957             emitByte(0x80 | cc.getValue());
1958             emitInt((int) (disp - longSize));
1959         }
1960     }
1961 
1962     public final void jcc(ConditionFlag cc, Label l) {
1963         assert (0 <= cc.getValue()) && (cc.getValue() < 16) : "illegal cc";
1964         if (l.isBound()) {
1965             jcc(cc, l.position(), false);
1966         } else {
1967             // Note: could eliminate cond. jumps to this jump if condition
1968             // is the same however, seems to be rather unlikely case.
1969             // Note: use jccb() if label to be bound is very close to get
1970             // an 8-bit displacement
1971             l.addPatchAt(position(), this);
1972             emitByte(0x0F);
1973             emitByte(0x80 | cc.getValue());
1974             emitInt(0);
1975         }
1976 
1977     }
1978 
1979     public final void jccb(ConditionFlag cc, Label l) {
1980         if (l.isBound()) {
1981             int shortSize = 2;
1982             int entry = l.position();
1983             assert isByte(entry - (position() + shortSize)) : "Dispacement too large for a short jmp";
1984             long disp = entry - position();
1985             // 0111 tttn #8-bit disp
1986             emitByte(0x70 | cc.getValue());
1987             emitByte((int) ((disp - shortSize) & 0xFF));
1988         } else {
1989             l.addPatchAt(position(), this);
1990             emitByte(0x70 | cc.getValue());
1991             emitByte(0);
1992         }
1993     }
1994 
1995     public final void jmp(int jumpTarget, boolean forceDisp32) {
1996         int shortSize = 2;
1997         int longSize = 5;
1998         long disp = jumpTarget - position();
1999         if (!forceDisp32 && isByte(disp - shortSize)) {
2000             emitByte(0xEB);
2001             emitByte((int) ((disp - shortSize) & 0xFF));
2002         } else {
2003             emitByte(0xE9);
2004             emitInt((int) (disp - longSize));
2005         }
2006     }
2007 
2008     @Override
2009     public final void jmp(Label l) {
2010         if (l.isBound()) {
2011             jmp(l.position(), false);
2012         } else {
2013             // By default, forward jumps are always 32-bit displacements, since
2014             // we can't yet know where the label will be bound. If you're sure that
2015             // the forward jump will not run beyond 256 bytes, use jmpb to
2016             // force an 8-bit displacement.
2017 
2018             l.addPatchAt(position(), this);
2019             emitByte(0xE9);
2020             emitInt(0);
2021         }
2022     }
2023 
2024     public final void jmp(Register entry) {
2025         prefix(entry);
2026         emitByte(0xFF);
2027         emitModRM(4, entry);
2028     }
2029 
2030     public final void jmp(AMD64Address adr) {
2031         prefix(adr);
2032         emitByte(0xFF);
2033         emitOperandHelper(AMD64.rsp, adr, 0);
2034     }
2035 
2036     public final void jmpb(Label l) {
2037         if (l.isBound()) {
2038             int shortSize = 2;
2039             // Displacement is relative to byte just after jmpb instruction
2040             int displacement = l.position() - position() - shortSize;
2041             GraalError.guarantee(isByte(displacement), "Displacement too large to be encoded as a byte: %d", displacement);
2042             emitByte(0xEB);
2043             emitByte(displacement & 0xFF);
2044         } else {
2045             l.addPatchAt(position(), this);
2046             emitByte(0xEB);
2047             emitByte(0);
2048         }
2049     }
2050 
2051     public final void lead(Register dst, AMD64Address src) {
2052         prefix(src, dst);
2053         emitByte(0x8D);
2054         emitOperandHelper(dst, src, 0);
2055     }
2056 
2057     public final void leaq(Register dst, AMD64Address src) {
2058         prefixq(src, dst);
2059         emitByte(0x8D);
2060         emitOperandHelper(dst, src, 0);
2061     }
2062 
2063     public final void leave() {
2064         emitByte(0xC9);
2065     }
2066 
2067     public final void lock() {
2068         emitByte(0xF0);
2069     }
2070 
2071     public final void movapd(Register dst, Register src) {
2072         assert inRC(XMM, dst) && inRC(XMM, src);
2073         simdPrefix(dst, Register.None, src, PD, P_0F, false);
2074         emitByte(0x28);
2075         emitModRM(dst, src);
2076     }
2077 
2078     public final void movaps(Register dst, Register src) {
2079         assert inRC(XMM, dst) && inRC(XMM, src);
2080         simdPrefix(dst, Register.None, src, PS, P_0F, false);
2081         emitByte(0x28);
2082         emitModRM(dst, src);
2083     }
2084 
2085     public final void movb(AMD64Address dst, int imm8) {
2086         prefix(dst);
2087         emitByte(0xC6);
2088         emitOperandHelper(0, dst, 1);
2089         emitByte(imm8);
2090     }
2091 
2092     public final void movb(AMD64Address dst, Register src) {
2093         assert inRC(CPU, src) : "must have byte register";
2094         prefixb(dst, src);
2095         emitByte(0x88);
2096         emitOperandHelper(src, dst, 0);
2097     }
2098 
2099     public final void movl(Register dst, int imm32) {
2100         movl(dst, imm32, false);
2101     }
2102 
2103     public final void movl(Register dst, int imm32, boolean annotateImm) {
2104         int insnPos = position();
2105         prefix(dst);
2106         emitByte(0xB8 + encode(dst));
2107         int immPos = position();
2108         emitInt(imm32);
2109         int nextInsnPos = position();
2110         if (annotateImm && codePatchingAnnotationConsumer != null) {
2111             codePatchingAnnotationConsumer.accept(new OperandDataAnnotation(insnPos, immPos, nextInsnPos - immPos, nextInsnPos));
2112         }
2113     }
2114 
2115     public final void movl(Register dst, Register src) {
2116         prefix(dst, src);
2117         emitByte(0x8B);
2118         emitModRM(dst, src);
2119     }
2120 
2121     public final void movl(Register dst, AMD64Address src) {
2122         prefix(src, dst);
2123         emitByte(0x8B);
2124         emitOperandHelper(dst, src, 0);
2125     }
2126 
2127     /**
2128      * @param wide use 4 byte encoding for displacements that would normally fit in a byte
2129      */
2130     public final void movl(Register dst, AMD64Address src, boolean wide) {
2131         prefix(src, dst);
2132         emitByte(0x8B);
2133         emitOperandHelper(dst, src, wide, 0);
2134     }
2135 
2136     public final void movl(AMD64Address dst, int imm32) {
2137         prefix(dst);
2138         emitByte(0xC7);
2139         emitOperandHelper(0, dst, 4);
2140         emitInt(imm32);
2141     }
2142 
2143     public final void movl(AMD64Address dst, Register src) {
2144         prefix(dst, src);
2145         emitByte(0x89);
2146         emitOperandHelper(src, dst, 0);
2147     }
2148 
2149     /**
2150      * New CPUs require use of movsd and movss to avoid partial register stall when loading from
2151      * memory. But for old Opteron use movlpd instead of movsd. The selection is done in
2152      * {@link AMD64MacroAssembler#movdbl(Register, AMD64Address)} and
2153      * {@link AMD64MacroAssembler#movflt(Register, Register)}.
2154      */
2155     public final void movlpd(Register dst, AMD64Address src) {
2156         assert inRC(XMM, dst);
2157         simdPrefix(dst, dst, src, PD, P_0F, false);
2158         emitByte(0x12);
2159         emitOperandHelper(dst, src, 0);
2160     }
2161 
2162     public final void movlhps(Register dst, Register src) {
2163         assert inRC(XMM, dst) && inRC(XMM, src);
2164         simdPrefix(dst, src, src, PS, P_0F, false);
2165         emitByte(0x16);
2166         emitModRM(dst, src);
2167     }
2168 
2169     public final void movq(Register dst, AMD64Address src) {
2170         movq(dst, src, false);
2171     }
2172 
2173     public final void movq(Register dst, AMD64Address src, boolean force4BytesDisplacement) {
2174         if (inRC(XMM, dst)) {
2175             // Insn: MOVQ xmm, r/m64
2176             // Code: F3 0F 7E /r
2177             // An alternative instruction would be 66 REX.W 0F 6E /r. We prefer the REX.W free
2178             // format, because it would allow us to emit 2-bytes-prefixed vex-encoding instruction
2179             // when applicable.
2180             simdPrefix(dst, Register.None, src, SS, P_0F, false);
2181             emitByte(0x7E);
2182             emitOperandHelper(dst, src, force4BytesDisplacement, 0);
2183         } else {
2184             // gpr version of movq
2185             prefixq(src, dst);
2186             emitByte(0x8B);
2187             emitOperandHelper(dst, src, force4BytesDisplacement, 0);
2188         }
2189     }
2190 
2191     public final void movq(Register dst, Register src) {
2192         assert inRC(CPU, dst) && inRC(CPU, src);
2193         prefixq(dst, src);
2194         emitByte(0x8B);
2195         emitModRM(dst, src);
2196     }
2197 
2198     public final void movq(AMD64Address dst, Register src) {
2199         if (inRC(XMM, src)) {
2200             // Insn: MOVQ r/m64, xmm
2201             // Code: 66 0F D6 /r
2202             // An alternative instruction would be 66 REX.W 0F 7E /r. We prefer the REX.W free
2203             // format, because it would allow us to emit 2-bytes-prefixed vex-encoding instruction
2204             // when applicable.
2205             simdPrefix(src, Register.None, dst, PD, P_0F, false);
2206             emitByte(0xD6);
2207             emitOperandHelper(src, dst, 0);
2208         } else {
2209             // gpr version of movq
2210             prefixq(dst, src);
2211             emitByte(0x89);
2212             emitOperandHelper(src, dst, 0);
2213         }
2214     }
2215 
2216     public final void movsbl(Register dst, AMD64Address src) {
2217         prefix(src, dst);
2218         emitByte(0x0F);
2219         emitByte(0xBE);
2220         emitOperandHelper(dst, src, 0);
2221     }
2222 
2223     public final void movsbl(Register dst, Register src) {
2224         prefix(dst, false, src, true);
2225         emitByte(0x0F);
2226         emitByte(0xBE);
2227         emitModRM(dst, src);
2228     }
2229 
2230     public final void movsbq(Register dst, AMD64Address src) {
2231         prefixq(src, dst);
2232         emitByte(0x0F);
2233         emitByte(0xBE);
2234         emitOperandHelper(dst, src, 0);
2235     }
2236 
2237     public final void movsbq(Register dst, Register src) {
2238         prefixq(dst, src);
2239         emitByte(0x0F);
2240         emitByte(0xBE);
2241         emitModRM(dst, src);
2242     }
2243 
2244     public final void movsd(Register dst, Register src) {
2245         AMD64RMOp.MOVSD.emit(this, SD, dst, src);
2246     }
2247 
2248     public final void movsd(Register dst, AMD64Address src) {
2249         AMD64RMOp.MOVSD.emit(this, SD, dst, src);
2250     }
2251 
2252     public final void movsd(AMD64Address dst, Register src) {
2253         AMD64MROp.MOVSD.emit(this, SD, dst, src);
2254     }
2255 
2256     public final void movss(Register dst, Register src) {
2257         AMD64RMOp.MOVSS.emit(this, SS, dst, src);
2258     }
2259 
2260     public final void movss(Register dst, AMD64Address src) {
2261         AMD64RMOp.MOVSS.emit(this, SS, dst, src);
2262     }
2263 
2264     public final void movss(AMD64Address dst, Register src) {
2265         AMD64MROp.MOVSS.emit(this, SS, dst, src);
2266     }
2267 
2268     public final void mulpd(Register dst, Register src) {
2269         SSEOp.MUL.emit(this, PD, dst, src);
2270     }
2271 
2272     public final void mulpd(Register dst, AMD64Address src) {
2273         SSEOp.MUL.emit(this, PD, dst, src);
2274     }
2275 
2276     public final void mulsd(Register dst, Register src) {
2277         SSEOp.MUL.emit(this, SD, dst, src);
2278     }
2279 
2280     public final void mulsd(Register dst, AMD64Address src) {
2281         SSEOp.MUL.emit(this, SD, dst, src);
2282     }
2283 
2284     public final void mulss(Register dst, Register src) {
2285         SSEOp.MUL.emit(this, SS, dst, src);
2286     }
2287 
2288     public final void movswl(Register dst, AMD64Address src) {
2289         AMD64RMOp.MOVSX.emit(this, DWORD, dst, src);
2290     }
2291 
2292     public final void movswq(Register dst, AMD64Address src) {
2293         AMD64RMOp.MOVSX.emit(this, QWORD, dst, src);
2294     }
2295 
2296     public final void movw(AMD64Address dst, int imm16) {
2297         emitByte(0x66); // switch to 16-bit mode
2298         prefix(dst);
2299         emitByte(0xC7);
2300         emitOperandHelper(0, dst, 2);
2301         emitShort(imm16);
2302     }
2303 
2304     public final void movw(AMD64Address dst, Register src) {
2305         emitByte(0x66);
2306         prefix(dst, src);
2307         emitByte(0x89);
2308         emitOperandHelper(src, dst, 0);
2309     }
2310 
2311     public final void movw(Register dst, AMD64Address src) {
2312         emitByte(0x66);
2313         prefix(src, dst);
2314         emitByte(0x8B);
2315         emitOperandHelper(dst, src, 0);
2316     }
2317 
2318     public final void movzbl(Register dst, AMD64Address src) {
2319         prefix(src, dst);
2320         emitByte(0x0F);
2321         emitByte(0xB6);
2322         emitOperandHelper(dst, src, 0);
2323     }
2324 
2325     public final void movzbl(Register dst, Register src) {
2326         AMD64RMOp.MOVZXB.emit(this, DWORD, dst, src);
2327     }
2328 
2329     public final void movzbq(Register dst, Register src) {
2330         AMD64RMOp.MOVZXB.emit(this, QWORD, dst, src);
2331     }
2332 
2333     public final void movzbq(Register dst, AMD64Address src) {
2334         AMD64RMOp.MOVZXB.emit(this, QWORD, dst, src);
2335     }
2336 
2337     public final void movzwl(Register dst, AMD64Address src) {
2338         AMD64RMOp.MOVZX.emit(this, DWORD, dst, src);
2339     }
2340 
2341     public final void movzwq(Register dst, AMD64Address src) {
2342         AMD64RMOp.MOVZX.emit(this, QWORD, dst, src);
2343     }
2344 
2345     public final void negl(Register dst) {
2346         NEG.emit(this, DWORD, dst);
2347     }
2348 
2349     public final void notl(Register dst) {
2350         NOT.emit(this, DWORD, dst);
2351     }
2352 
2353     public final void notq(Register dst) {
2354         NOT.emit(this, QWORD, dst);
2355     }
2356 
2357     @Override
2358     public final void ensureUniquePC() {
2359         nop();
2360     }
2361 
2362     public final void nop() {
2363         nop(1);
2364     }
2365 
2366     public void nop(int count) {
2367         int i = count;
2368         if (UseNormalNop) {
2369             assert i > 0 : " ";
2370             // The fancy nops aren't currently recognized by debuggers making it a
2371             // pain to disassemble code while debugging. If assert are on clearly
2372             // speed is not an issue so simply use the single byte traditional nop
2373             // to do alignment.
2374 
2375             for (; i > 0; i--) {
2376                 emitByte(0x90);
2377             }
2378             return;
2379         }
2380 
2381         if (UseAddressNop) {
2382             if (UseIntelNops) {
2383                 intelNops(i);
2384             } else {
2385                 amdNops(i);
2386             }
2387             return;
2388         }
2389 
2390         // Using nops with size prefixes "0x66 0x90".
2391         // From AMD Optimization Guide:
2392         // 1: 0x90
2393         // 2: 0x66 0x90
2394         // 3: 0x66 0x66 0x90
2395         // 4: 0x66 0x66 0x66 0x90
2396         // 5: 0x66 0x66 0x90 0x66 0x90
2397         // 6: 0x66 0x66 0x90 0x66 0x66 0x90
2398         // 7: 0x66 0x66 0x66 0x90 0x66 0x66 0x90
2399         // 8: 0x66 0x66 0x66 0x90 0x66 0x66 0x66 0x90
2400         // 9: 0x66 0x66 0x90 0x66 0x66 0x90 0x66 0x66 0x90
2401         // 10: 0x66 0x66 0x66 0x90 0x66 0x66 0x90 0x66 0x66 0x90
2402         //
2403         while (i > 12) {
2404             i -= 4;
2405             emitByte(0x66); // size prefix
2406             emitByte(0x66);
2407             emitByte(0x66);
2408             emitByte(0x90); // nop
2409         }
2410         // 1 - 12 nops
2411         if (i > 8) {
2412             if (i > 9) {
2413                 i -= 1;
2414                 emitByte(0x66);
2415             }
2416             i -= 3;
2417             emitByte(0x66);
2418             emitByte(0x66);
2419             emitByte(0x90);
2420         }
2421         // 1 - 8 nops
2422         if (i > 4) {
2423             if (i > 6) {
2424                 i -= 1;
2425                 emitByte(0x66);
2426             }
2427             i -= 3;
2428             emitByte(0x66);
2429             emitByte(0x66);
2430             emitByte(0x90);
2431         }
2432         switch (i) {
2433             case 4:
2434                 emitByte(0x66);
2435                 emitByte(0x66);
2436                 emitByte(0x66);
2437                 emitByte(0x90);
2438                 break;
2439             case 3:
2440                 emitByte(0x66);
2441                 emitByte(0x66);
2442                 emitByte(0x90);
2443                 break;
2444             case 2:
2445                 emitByte(0x66);
2446                 emitByte(0x90);
2447                 break;
2448             case 1:
2449                 emitByte(0x90);
2450                 break;
2451             default:
2452                 assert i == 0;
2453         }
2454     }
2455 
2456     private void amdNops(int count) {
2457         int i = count;
2458         //
2459         // Using multi-bytes nops "0x0F 0x1F [Address]" for AMD.
2460         // 1: 0x90
2461         // 2: 0x66 0x90
2462         // 3: 0x66 0x66 0x90 (don't use "0x0F 0x1F 0x00" - need patching safe padding)
2463         // 4: 0x0F 0x1F 0x40 0x00
2464         // 5: 0x0F 0x1F 0x44 0x00 0x00
2465         // 6: 0x66 0x0F 0x1F 0x44 0x00 0x00
2466         // 7: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
2467         // 8: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
2468         // 9: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
2469         // 10: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
2470         // 11: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
2471 
2472         // The rest coding is AMD specific - use consecutive Address nops
2473 
2474         // 12: 0x66 0x0F 0x1F 0x44 0x00 0x00 0x66 0x0F 0x1F 0x44 0x00 0x00
2475         // 13: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00 0x66 0x0F 0x1F 0x44 0x00 0x00
2476         // 14: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
2477         // 15: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
2478         // 16: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
2479         // Size prefixes (0x66) are added for larger sizes
2480 
2481         while (i >= 22) {
2482             i -= 11;
2483             emitByte(0x66); // size prefix
2484             emitByte(0x66); // size prefix
2485             emitByte(0x66); // size prefix
2486             addrNop8();
2487         }
2488         // Generate first nop for size between 21-12
2489         switch (i) {
2490             case 21:
2491                 i -= 11;
2492                 emitByte(0x66); // size prefix
2493                 emitByte(0x66); // size prefix
2494                 emitByte(0x66); // size prefix
2495                 addrNop8();
2496                 break;
2497             case 20:
2498             case 19:
2499                 i -= 10;
2500                 emitByte(0x66); // size prefix
2501                 emitByte(0x66); // size prefix
2502                 addrNop8();
2503                 break;
2504             case 18:
2505             case 17:
2506                 i -= 9;
2507                 emitByte(0x66); // size prefix
2508                 addrNop8();
2509                 break;
2510             case 16:
2511             case 15:
2512                 i -= 8;
2513                 addrNop8();
2514                 break;
2515             case 14:
2516             case 13:
2517                 i -= 7;
2518                 addrNop7();
2519                 break;
2520             case 12:
2521                 i -= 6;
2522                 emitByte(0x66); // size prefix
2523                 addrNop5();
2524                 break;
2525             default:
2526                 assert i < 12;
2527         }
2528 
2529         // Generate second nop for size between 11-1
2530         switch (i) {
2531             case 11:
2532                 emitByte(0x66); // size prefix
2533                 emitByte(0x66); // size prefix
2534                 emitByte(0x66); // size prefix
2535                 addrNop8();
2536                 break;
2537             case 10:
2538                 emitByte(0x66); // size prefix
2539                 emitByte(0x66); // size prefix
2540                 addrNop8();
2541                 break;
2542             case 9:
2543                 emitByte(0x66); // size prefix
2544                 addrNop8();
2545                 break;
2546             case 8:
2547                 addrNop8();
2548                 break;
2549             case 7:
2550                 addrNop7();
2551                 break;
2552             case 6:
2553                 emitByte(0x66); // size prefix
2554                 addrNop5();
2555                 break;
2556             case 5:
2557                 addrNop5();
2558                 break;
2559             case 4:
2560                 addrNop4();
2561                 break;
2562             case 3:
2563                 // Don't use "0x0F 0x1F 0x00" - need patching safe padding
2564                 emitByte(0x66); // size prefix
2565                 emitByte(0x66); // size prefix
2566                 emitByte(0x90); // nop
2567                 break;
2568             case 2:
2569                 emitByte(0x66); // size prefix
2570                 emitByte(0x90); // nop
2571                 break;
2572             case 1:
2573                 emitByte(0x90); // nop
2574                 break;
2575             default:
2576                 assert i == 0;
2577         }
2578     }
2579 
2580     @SuppressWarnings("fallthrough")
2581     private void intelNops(int count) {
2582         //
2583         // Using multi-bytes nops "0x0F 0x1F [address]" for Intel
2584         // 1: 0x90
2585         // 2: 0x66 0x90
2586         // 3: 0x66 0x66 0x90 (don't use "0x0F 0x1F 0x00" - need patching safe padding)
2587         // 4: 0x0F 0x1F 0x40 0x00
2588         // 5: 0x0F 0x1F 0x44 0x00 0x00
2589         // 6: 0x66 0x0F 0x1F 0x44 0x00 0x00
2590         // 7: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
2591         // 8: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
2592         // 9: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
2593         // 10: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
2594         // 11: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
2595 
2596         // The rest coding is Intel specific - don't use consecutive address nops
2597 
2598         // 12: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
2599         // 13: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
2600         // 14: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
2601         // 15: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
2602 
2603         int i = count;
2604         while (i >= 15) {
2605             // For Intel don't generate consecutive addess nops (mix with regular nops)
2606             i -= 15;
2607             emitByte(0x66);   // size prefix
2608             emitByte(0x66);   // size prefix
2609             emitByte(0x66);   // size prefix
2610             addrNop8();
2611             emitByte(0x66);   // size prefix
2612             emitByte(0x66);   // size prefix
2613             emitByte(0x66);   // size prefix
2614             emitByte(0x90);
2615             // nop
2616         }
2617         switch (i) {
2618             case 14:
2619                 emitByte(0x66); // size prefix
2620                 // fall through
2621             case 13:
2622                 emitByte(0x66); // size prefix
2623                 // fall through
2624             case 12:
2625                 addrNop8();
2626                 emitByte(0x66); // size prefix
2627                 emitByte(0x66); // size prefix
2628                 emitByte(0x66); // size prefix
2629                 emitByte(0x90);
2630                 // nop
2631                 break;
2632             case 11:
2633                 emitByte(0x66); // size prefix
2634                 // fall through
2635             case 10:
2636                 emitByte(0x66); // size prefix
2637                 // fall through
2638             case 9:
2639                 emitByte(0x66); // size prefix
2640                 // fall through
2641             case 8:
2642                 addrNop8();
2643                 break;
2644             case 7:
2645                 addrNop7();
2646                 break;
2647             case 6:
2648                 emitByte(0x66); // size prefix
2649                 // fall through
2650             case 5:
2651                 addrNop5();
2652                 break;
2653             case 4:
2654                 addrNop4();
2655                 break;
2656             case 3:
2657                 // Don't use "0x0F 0x1F 0x00" - need patching safe padding
2658                 emitByte(0x66); // size prefix
2659                 // fall through
2660             case 2:
2661                 emitByte(0x66); // size prefix
2662                 // fall through
2663             case 1:
2664                 emitByte(0x90);
2665                 // nop
2666                 break;
2667             default:
2668                 assert i == 0;
2669         }
2670     }
2671 
2672     public final void orl(Register dst, Register src) {
2673         OR.rmOp.emit(this, DWORD, dst, src);
2674     }
2675 
2676     public final void orl(Register dst, int imm32) {
2677         OR.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
2678     }
2679 
2680     // Insn: VPACKUSWB xmm1, xmm2, xmm3/m128
2681     // -----
2682     // Insn: VPACKUSWB xmm1, xmm1, xmm2
2683 
2684     public final void packuswb(Register dst, Register src) {
2685         assert inRC(XMM, dst) && inRC(XMM, src);
2686         // Code: VEX.NDS.128.66.0F.WIG 67 /r
2687         simdPrefix(dst, dst, src, PD, P_0F, false);
2688         emitByte(0x67);
2689         emitModRM(dst, src);
2690     }
2691 
2692     public final void pop(Register dst) {
2693         prefix(dst);
2694         emitByte(0x58 + encode(dst));
2695     }
2696 
2697     public void popfq() {
2698         emitByte(0x9D);
2699     }
2700 
2701     public final void ptest(Register dst, Register src) {
2702         assert supports(CPUFeature.SSE4_1);
2703         assert inRC(XMM, dst) && inRC(XMM, src);
2704         simdPrefix(dst, Register.None, src, PD, P_0F38, false);
2705         emitByte(0x17);
2706         emitModRM(dst, src);
2707     }
2708 
2709     public final void pcmpeqb(Register dst, Register src) {
2710         assert supports(CPUFeature.SSE2);
2711         assert inRC(XMM, dst) && inRC(XMM, src);
2712         simdPrefix(dst, dst, src, PD, P_0F, false);
2713         emitByte(0x74);
2714         emitModRM(dst, src);
2715     }
2716 
2717     public final void pcmpeqw(Register dst, Register src) {
2718         assert supports(CPUFeature.SSE2);
2719         assert inRC(XMM, dst) && inRC(XMM, src);
2720         simdPrefix(dst, dst, src, PD, P_0F, false);
2721         emitByte(0x75);
2722         emitModRM(dst, src);
2723     }
2724 
2725     public final void pcmpeqd(Register dst, Register src) {
2726         assert supports(CPUFeature.SSE2);
2727         assert dst.getRegisterCategory().equals(XMM) && src.getRegisterCategory().equals(XMM);
2728         simdPrefix(dst, dst, src, PD, P_0F, false);
2729         emitByte(0x76);
2730         emitModRM(dst, src);
2731     }
2732 
2733     public final void pcmpestri(Register dst, AMD64Address src, int imm8) {
2734         assert supports(CPUFeature.SSE4_2);
2735         assert inRC(XMM, dst);
2736         simdPrefix(dst, Register.None, src, PD, P_0F3A, false);
2737         emitByte(0x61);
2738         emitOperandHelper(dst, src, 0);
2739         emitByte(imm8);
2740     }
2741 
2742     public final void pcmpestri(Register dst, Register src, int imm8) {
2743         assert supports(CPUFeature.SSE4_2);
2744         assert inRC(XMM, dst) && inRC(XMM, src);
2745         simdPrefix(dst, Register.None, src, PD, P_0F3A, false);
2746         emitByte(0x61);
2747         emitModRM(dst, src);
2748         emitByte(imm8);
2749     }
2750 
2751     public final void pmovmskb(Register dst, Register src) {
2752         assert supports(CPUFeature.SSE2);
2753         assert inRC(CPU, dst) && inRC(XMM, src);
2754         simdPrefix(dst, Register.None, src, PD, P_0F, false);
2755         emitByte(0xD7);
2756         emitModRM(dst, src);
2757     }
2758 
2759     private void pmovSZx(Register dst, AMD64Address src, int op) {
2760         assert supports(CPUFeature.SSE4_1);
2761         assert inRC(XMM, dst);
2762         simdPrefix(dst, Register.None, src, PD, P_0F38, false);
2763         emitByte(op);
2764         emitOperandHelper(dst, src, 0);
2765     }
2766 
2767     public final void pmovsxbw(Register dst, AMD64Address src) {
2768         pmovSZx(dst, src, 0x20);
2769     }
2770 
2771     public final void pmovsxbd(Register dst, AMD64Address src) {
2772         pmovSZx(dst, src, 0x21);
2773     }
2774 
2775     public final void pmovsxbq(Register dst, AMD64Address src) {
2776         pmovSZx(dst, src, 0x22);
2777     }
2778 
2779     public final void pmovsxwd(Register dst, AMD64Address src) {
2780         pmovSZx(dst, src, 0x23);
2781     }
2782 
2783     public final void pmovsxwq(Register dst, AMD64Address src) {
2784         pmovSZx(dst, src, 0x24);
2785     }
2786 
2787     public final void pmovsxdq(Register dst, AMD64Address src) {
2788         pmovSZx(dst, src, 0x25);
2789     }
2790 
2791     // Insn: VPMOVZXBW xmm1, xmm2/m64
2792     public final void pmovzxbw(Register dst, AMD64Address src) {
2793         pmovSZx(dst, src, 0x30);
2794     }
2795 
2796     public final void pmovzxbd(Register dst, AMD64Address src) {
2797         pmovSZx(dst, src, 0x31);
2798     }
2799 
2800     public final void pmovzxbq(Register dst, AMD64Address src) {
2801         pmovSZx(dst, src, 0x32);
2802     }
2803 
2804     public final void pmovzxwd(Register dst, AMD64Address src) {
2805         pmovSZx(dst, src, 0x33);
2806     }
2807 
2808     public final void pmovzxwq(Register dst, AMD64Address src) {
2809         pmovSZx(dst, src, 0x34);
2810     }
2811 
2812     public final void pmovzxdq(Register dst, AMD64Address src) {
2813         pmovSZx(dst, src, 0x35);
2814     }
2815 
2816     public final void pmovzxbw(Register dst, Register src) {
2817         assert supports(CPUFeature.SSE4_1);
2818         assert inRC(XMM, dst) && inRC(XMM, src);
2819         simdPrefix(dst, Register.None, src, PD, P_0F38, false);
2820         emitByte(0x30);
2821         emitModRM(dst, src);
2822     }
2823 
2824     public final void push(Register src) {
2825         prefix(src);
2826         emitByte(0x50 + encode(src));
2827     }
2828 
2829     public void pushfq() {
2830         emitByte(0x9c);
2831     }
2832 
2833     public final void paddd(Register dst, Register src) {
2834         assert inRC(XMM, dst) && inRC(XMM, src);
2835         simdPrefix(dst, dst, src, PD, P_0F, false);
2836         emitByte(0xFE);
2837         emitModRM(dst, src);
2838     }
2839 
2840     public final void paddq(Register dst, Register src) {
2841         assert inRC(XMM, dst) && inRC(XMM, src);
2842         simdPrefix(dst, dst, src, PD, P_0F, false);
2843         emitByte(0xD4);
2844         emitModRM(dst, src);
2845     }
2846 
2847     public final void pextrw(Register dst, Register src, int imm8) {
2848         assert inRC(CPU, dst) && inRC(XMM, src);
2849         simdPrefix(dst, Register.None, src, PD, P_0F, false);
2850         emitByte(0xC5);
2851         emitModRM(dst, src);
2852         emitByte(imm8);
2853     }
2854 
2855     public final void pinsrw(Register dst, Register src, int imm8) {
2856         assert inRC(XMM, dst) && inRC(CPU, src);
2857         simdPrefix(dst, dst, src, PD, P_0F, false);
2858         emitByte(0xC4);
2859         emitModRM(dst, src);
2860         emitByte(imm8);
2861     }
2862 
2863     public final void por(Register dst, Register src) {
2864         assert inRC(XMM, dst) && inRC(XMM, src);
2865         simdPrefix(dst, dst, src, PD, P_0F, false);
2866         emitByte(0xEB);
2867         emitModRM(dst, src);
2868     }
2869 
2870     public final void pand(Register dst, Register src) {
2871         assert inRC(XMM, dst) && inRC(XMM, src);
2872         simdPrefix(dst, dst, src, PD, P_0F, false);
2873         emitByte(0xDB);
2874         emitModRM(dst, src);
2875     }
2876 
2877     public final void pxor(Register dst, Register src) {
2878         assert inRC(XMM, dst) && inRC(XMM, src);
2879         simdPrefix(dst, dst, src, PD, P_0F, false);
2880         emitByte(0xEF);
2881         emitModRM(dst, src);
2882     }
2883 
2884     public final void pslld(Register dst, int imm8) {
2885         assert isUByte(imm8) : "invalid value";
2886         assert inRC(XMM, dst);
2887         // XMM6 is for /6 encoding: 66 0F 72 /6 ib
2888         simdPrefix(AMD64.xmm6, dst, dst, PD, P_0F, false);
2889         emitByte(0x72);
2890         emitModRM(6, dst);
2891         emitByte(imm8 & 0xFF);
2892     }
2893 
2894     public final void psllq(Register dst, Register shift) {
2895         assert inRC(XMM, dst) && inRC(XMM, shift);
2896         simdPrefix(dst, dst, shift, PD, P_0F, false);
2897         emitByte(0xF3);
2898         emitModRM(dst, shift);
2899     }
2900 
2901     public final void psllq(Register dst, int imm8) {
2902         assert isUByte(imm8) : "invalid value";
2903         assert inRC(XMM, dst);
2904         // XMM6 is for /6 encoding: 66 0F 73 /6 ib
2905         simdPrefix(AMD64.xmm6, dst, dst, PD, P_0F, false);
2906         emitByte(0x73);
2907         emitModRM(6, dst);
2908         emitByte(imm8);
2909     }
2910 
2911     public final void psrad(Register dst, int imm8) {
2912         assert isUByte(imm8) : "invalid value";
2913         assert inRC(XMM, dst);
2914         // XMM4 is for /4 encoding: 66 0F 72 /4 ib
2915         simdPrefix(AMD64.xmm4, dst, dst, PD, P_0F, false);
2916         emitByte(0x72);
2917         emitModRM(4, dst);
2918         emitByte(imm8);
2919     }
2920 
2921     public final void psrld(Register dst, int imm8) {
2922         assert isUByte(imm8) : "invalid value";
2923         assert inRC(XMM, dst);
2924         // XMM2 is for /2 encoding: 66 0F 72 /2 ib
2925         simdPrefix(AMD64.xmm2, dst, dst, PD, P_0F, false);
2926         emitByte(0x72);
2927         emitModRM(2, dst);
2928         emitByte(imm8);
2929     }
2930 
2931     public final void psrlq(Register dst, int imm8) {
2932         assert isUByte(imm8) : "invalid value";
2933         assert inRC(XMM, dst);
2934         // XMM2 is for /2 encoding: 66 0F 73 /2 ib
2935         simdPrefix(AMD64.xmm2, dst, dst, PD, P_0F, false);
2936         emitByte(0x73);
2937         emitModRM(2, dst);
2938         emitByte(imm8);
2939     }
2940 
2941     public final void psrldq(Register dst, int imm8) {
2942         assert isUByte(imm8) : "invalid value";
2943         assert inRC(XMM, dst);
2944         simdPrefix(AMD64.xmm3, dst, dst, PD, P_0F, false);
2945         emitByte(0x73);
2946         emitModRM(3, dst);
2947         emitByte(imm8);
2948     }
2949 
2950     public final void pshufb(Register dst, Register src) {
2951         assert supports(CPUFeature.SSSE3);
2952         assert inRC(XMM, dst) && inRC(XMM, src);
2953         simdPrefix(dst, dst, src, PD, P_0F38, false);
2954         emitByte(0x00);
2955         emitModRM(dst, src);
2956     }
2957 
2958     public final void pshuflw(Register dst, Register src, int imm8) {
2959         assert supports(CPUFeature.SSE2);
2960         assert isUByte(imm8) : "invalid value";
2961         assert inRC(XMM, dst) && inRC(XMM, src);
2962         simdPrefix(dst, Register.None, src, SD, P_0F, false);
2963         emitByte(0x70);
2964         emitModRM(dst, src);
2965         emitByte(imm8);
2966     }
2967 
2968     public final void pshufd(Register dst, Register src, int imm8) {
2969         assert isUByte(imm8) : "invalid value";
2970         assert inRC(XMM, dst) && inRC(XMM, src);
2971         simdPrefix(dst, Register.None, src, PD, P_0F, false);
2972         emitByte(0x70);
2973         emitModRM(dst, src);
2974         emitByte(imm8);
2975     }
2976 
2977     public final void psubd(Register dst, Register src) {
2978         assert inRC(XMM, dst) && inRC(XMM, src);
2979         simdPrefix(dst, dst, src, PD, P_0F, false);
2980         emitByte(0xFA);
2981         emitModRM(dst, src);
2982     }
2983 
2984     public final void punpcklbw(Register dst, Register src) {
2985         assert supports(CPUFeature.SSE2);
2986         assert inRC(XMM, dst) && inRC(XMM, src);
2987         simdPrefix(dst, dst, src, PD, P_0F, false);
2988         emitByte(0x60);
2989         emitModRM(dst, src);
2990     }
2991 
2992     public final void rcpps(Register dst, Register src) {
2993         assert inRC(XMM, dst) && inRC(XMM, src);
2994         simdPrefix(dst, Register.None, src, PS, P_0F, false);
2995         emitByte(0x53);
2996         emitModRM(dst, src);
2997     }
2998 
2999     public final void ret(int imm16) {
3000         if (imm16 == 0) {
3001             emitByte(0xC3);
3002         } else {
3003             emitByte(0xC2);
3004             emitShort(imm16);
3005         }
3006     }
3007 
3008     public final void sarl(Register dst, int imm8) {
3009         prefix(dst);
3010         assert isShiftCount(imm8 >> 1) : "illegal shift count";
3011         if (imm8 == 1) {
3012             emitByte(0xD1);
3013             emitModRM(7, dst);
3014         } else {
3015             emitByte(0xC1);
3016             emitModRM(7, dst);
3017             emitByte(imm8);
3018         }
3019     }
3020 
3021     public final void shll(Register dst, int imm8) {
3022         assert isShiftCount(imm8 >> 1) : "illegal shift count";
3023         prefix(dst);
3024         if (imm8 == 1) {
3025             emitByte(0xD1);
3026             emitModRM(4, dst);
3027         } else {
3028             emitByte(0xC1);
3029             emitModRM(4, dst);
3030             emitByte(imm8);
3031         }
3032     }
3033 
3034     public final void shll(Register dst) {
3035         // Multiply dst by 2, CL times.
3036         prefix(dst);
3037         emitByte(0xD3);
3038         emitModRM(4, dst);
3039     }
3040 
3041     // Insn: SHLX r32a, r/m32, r32b
3042 
3043     public final void shlxl(Register dst, Register src1, Register src2) {
3044         VexGeneralPurposeRMVOp.SHLX.emit(this, AVXSize.DWORD, dst, src1, src2);
3045     }
3046 
3047     public final void shrl(Register dst, int imm8) {
3048         assert isShiftCount(imm8 >> 1) : "illegal shift count";
3049         prefix(dst);
3050         emitByte(0xC1);
3051         emitModRM(5, dst);
3052         emitByte(imm8);
3053     }
3054 
3055     public final void shrl(Register dst) {
3056         // Unsigned divide dst by 2, CL times.
3057         prefix(dst);
3058         emitByte(0xD3);
3059         emitModRM(5, dst);
3060     }
3061 
3062     public final void subl(AMD64Address dst, int imm32) {
3063         SUB.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
3064     }
3065 
3066     public final void subl(Register dst, int imm32) {
3067         SUB.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
3068     }
3069 
3070     public final void subl(Register dst, Register src) {
3071         SUB.rmOp.emit(this, DWORD, dst, src);
3072     }
3073 
3074     public final void subpd(Register dst, Register src) {
3075         SSEOp.SUB.emit(this, PD, dst, src);
3076     }
3077 
3078     public final void subsd(Register dst, Register src) {
3079         SSEOp.SUB.emit(this, SD, dst, src);
3080     }
3081 
3082     public final void subsd(Register dst, AMD64Address src) {
3083         SSEOp.SUB.emit(this, SD, dst, src);
3084     }
3085 
3086     public final void testl(Register dst, int imm32) {
3087         // not using emitArith because test
3088         // doesn't support sign-extension of
3089         // 8bit operands
3090         if (dst.encoding == 0) {
3091             emitByte(0xA9);
3092         } else {
3093             prefix(dst);
3094             emitByte(0xF7);
3095             emitModRM(0, dst);
3096         }
3097         emitInt(imm32);
3098     }
3099 
3100     public final void testl(Register dst, Register src) {
3101         prefix(dst, src);
3102         emitByte(0x85);
3103         emitModRM(dst, src);
3104     }
3105 
3106     public final void testl(Register dst, AMD64Address src) {
3107         prefix(src, dst);
3108         emitByte(0x85);
3109         emitOperandHelper(dst, src, 0);
3110     }
3111 
3112     public final void unpckhpd(Register dst, Register src) {
3113         assert inRC(XMM, dst) && inRC(XMM, src);
3114         simdPrefix(dst, dst, src, PD, P_0F, false);
3115         emitByte(0x15);
3116         emitModRM(dst, src);
3117     }
3118 
3119     public final void unpcklpd(Register dst, Register src) {
3120         assert inRC(XMM, dst) && inRC(XMM, src);
3121         simdPrefix(dst, dst, src, PD, P_0F, false);
3122         emitByte(0x14);
3123         emitModRM(dst, src);
3124     }
3125 
3126     public final void xorl(Register dst, Register src) {
3127         XOR.rmOp.emit(this, DWORD, dst, src);
3128     }
3129 
3130     public final void xorq(Register dst, Register src) {
3131         XOR.rmOp.emit(this, QWORD, dst, src);
3132     }
3133 
3134     public final void xorpd(Register dst, Register src) {
3135         SSEOp.XOR.emit(this, PD, dst, src);
3136     }
3137 
3138     public final void xorps(Register dst, Register src) {
3139         SSEOp.XOR.emit(this, PS, dst, src);
3140     }
3141 
3142     protected final void decl(Register dst) {
3143         // Use two-byte form (one-byte form is a REX prefix in 64-bit mode)
3144         prefix(dst);
3145         emitByte(0xFF);
3146         emitModRM(1, dst);
3147     }
3148 
3149     protected final void incl(Register dst) {
3150         // Use two-byte form (one-byte from is a REX prefix in 64-bit mode)
3151         prefix(dst);
3152         emitByte(0xFF);
3153         emitModRM(0, dst);
3154     }
3155 
3156     public final void addq(Register dst, int imm32) {
3157         ADD.getMIOpcode(QWORD, isByte(imm32)).emit(this, QWORD, dst, imm32);
3158     }
3159 
3160     public final void addq(AMD64Address dst, int imm32) {
3161         ADD.getMIOpcode(QWORD, isByte(imm32)).emit(this, QWORD, dst, imm32);
3162     }
3163 
3164     public final void addq(Register dst, Register src) {
3165         ADD.rmOp.emit(this, QWORD, dst, src);
3166     }
3167 
3168     public final void addq(AMD64Address dst, Register src) {
3169         ADD.mrOp.emit(this, QWORD, dst, src);
3170     }
3171 
3172     public final void andq(Register dst, int imm32) {
3173         AND.getMIOpcode(QWORD, isByte(imm32)).emit(this, QWORD, dst, imm32);
3174     }
3175 
3176     public final void bsrq(Register dst, Register src) {
3177         prefixq(dst, src);
3178         emitByte(0x0F);
3179         emitByte(0xBD);
3180         emitModRM(dst, src);
3181     }
3182 
3183     public final void bswapq(Register reg) {
3184         prefixq(reg);
3185         emitByte(0x0F);
3186         emitByte(0xC8 + encode(reg));
3187     }
3188 
3189     public final void cdqq() {
3190         rexw();
3191         emitByte(0x99);
3192     }
3193 
3194     public final void repStosb() {
3195         emitByte(0xf3);
3196         rexw();
3197         emitByte(0xaa);
3198     }
3199 
3200     public final void repStosq() {
3201         emitByte(0xf3);
3202         rexw();
3203         emitByte(0xab);
3204     }
3205 
3206     public final void cmovq(ConditionFlag cc, Register dst, Register src) {
3207         prefixq(dst, src);
3208         emitByte(0x0F);
3209         emitByte(0x40 | cc.getValue());
3210         emitModRM(dst, src);
3211     }
3212 
3213     public final void setb(ConditionFlag cc, Register dst) {
3214         prefix(dst, true);
3215         emitByte(0x0F);
3216         emitByte(0x90 | cc.getValue());
3217         emitModRM(0, dst);
3218     }
3219 
3220     public final void cmovq(ConditionFlag cc, Register dst, AMD64Address src) {
3221         prefixq(src, dst);
3222         emitByte(0x0F);
3223         emitByte(0x40 | cc.getValue());
3224         emitOperandHelper(dst, src, 0);
3225     }
3226 
3227     public final void cmpq(Register dst, int imm32) {
3228         CMP.getMIOpcode(QWORD, isByte(imm32)).emit(this, QWORD, dst, imm32);
3229     }
3230 
3231     public final void cmpq(Register dst, Register src) {
3232         CMP.rmOp.emit(this, QWORD, dst, src);
3233     }
3234 
3235     public final void cmpq(Register dst, AMD64Address src) {
3236         CMP.rmOp.emit(this, QWORD, dst, src);
3237     }
3238 
3239     public final void cmpxchgq(Register reg, AMD64Address adr) {
3240         prefixq(adr, reg);
3241         emitByte(0x0F);
3242         emitByte(0xB1);
3243         emitOperandHelper(reg, adr, 0);
3244     }
3245 
3246     public final void cvtdq2pd(Register dst, Register src) {
3247         assert inRC(XMM, dst) && inRC(XMM, src);
3248         simdPrefix(dst, Register.None, src, SS, P_0F, false);
3249         emitByte(0xE6);
3250         emitModRM(dst, src);
3251     }
3252 
3253     public final void cvtsi2sdq(Register dst, Register src) {
3254         SSEOp.CVTSI2SD.emit(this, QWORD, dst, src);
3255     }
3256 
3257     public final void cvttsd2siq(Register dst, Register src) {
3258         SSEOp.CVTTSD2SI.emit(this, QWORD, dst, src);
3259     }
3260 
3261     public final void cvttpd2dq(Register dst, Register src) {
3262         assert inRC(XMM, dst) && inRC(XMM, src);
3263         simdPrefix(dst, Register.None, src, PD, P_0F, false);
3264         emitByte(0xE6);
3265         emitModRM(dst, src);
3266     }
3267 
3268     public final void decq(Register dst) {
3269         // Use two-byte form (one-byte form is a REX prefix in 64-bit mode)
3270         prefixq(dst);
3271         emitByte(0xFF);
3272         emitModRM(1, dst);
3273     }
3274 
3275     public final void decq(AMD64Address dst) {
3276         DEC.emit(this, QWORD, dst);
3277     }
3278 
3279     public final void imulq(Register dst, Register src) {
3280         prefixq(dst, src);
3281         emitByte(0x0F);
3282         emitByte(0xAF);
3283         emitModRM(dst, src);
3284     }
3285 
3286     public final void incq(Register dst) {
3287         // Don't use it directly. Use Macroincrementq() instead.
3288         // Use two-byte form (one-byte from is a REX prefix in 64-bit mode)
3289         prefixq(dst);
3290         emitByte(0xFF);
3291         emitModRM(0, dst);
3292     }
3293 
3294     public final void incq(AMD64Address dst) {
3295         INC.emit(this, QWORD, dst);
3296     }
3297 
3298     public final void movq(Register dst, long imm64) {
3299         movq(dst, imm64, false);
3300     }
3301 
3302     public final void movq(Register dst, long imm64, boolean annotateImm) {
3303         int insnPos = position();
3304         prefixq(dst);
3305         emitByte(0xB8 + encode(dst));
3306         int immPos = position();
3307         emitLong(imm64);
3308         int nextInsnPos = position();
3309         if (annotateImm && codePatchingAnnotationConsumer != null) {
3310             codePatchingAnnotationConsumer.accept(new OperandDataAnnotation(insnPos, immPos, nextInsnPos - immPos, nextInsnPos));
3311         }
3312     }
3313 
3314     public final void movslq(Register dst, int imm32) {
3315         prefixq(dst);
3316         emitByte(0xC7);
3317         emitModRM(0, dst);
3318         emitInt(imm32);
3319     }
3320 
3321     public final void movdq(Register dst, AMD64Address src) {
3322         AMD64RMOp.MOVQ.emit(this, QWORD, dst, src);
3323     }
3324 
3325     public final void movdq(AMD64Address dst, Register src) {
3326         AMD64MROp.MOVQ.emit(this, QWORD, dst, src);
3327     }
3328 
3329     public final void movdq(Register dst, Register src) {
3330         if (inRC(XMM, dst) && inRC(CPU, src)) {
3331             AMD64RMOp.MOVQ.emit(this, QWORD, dst, src);
3332         } else if (inRC(XMM, src) && inRC(CPU, dst)) {
3333             AMD64MROp.MOVQ.emit(this, QWORD, dst, src);
3334         } else {
3335             throw new InternalError("should not reach here");
3336         }
3337     }
3338 
3339     public final void movdl(Register dst, Register src) {
3340         if (inRC(XMM, dst) && inRC(CPU, src)) {
3341             AMD64RMOp.MOVD.emit(this, DWORD, dst, src);
3342         } else if (inRC(XMM, src) && inRC(CPU, dst)) {
3343             AMD64MROp.MOVD.emit(this, DWORD, dst, src);
3344         } else {
3345             throw new InternalError("should not reach here");
3346         }
3347     }
3348 
3349     public final void movdl(Register dst, AMD64Address src) {
3350         AMD64RMOp.MOVD.emit(this, DWORD, dst, src);
3351     }
3352 
3353     public final void movddup(Register dst, Register src) {
3354         assert supports(CPUFeature.SSE3);
3355         assert inRC(XMM, dst) && inRC(XMM, src);
3356         simdPrefix(dst, Register.None, src, SD, P_0F, false);
3357         emitByte(0x12);
3358         emitModRM(dst, src);
3359     }
3360 
3361     public final void movdqu(Register dst, AMD64Address src) {
3362         assert inRC(XMM, dst);
3363         simdPrefix(dst, Register.None, src, SS, P_0F, false);
3364         emitByte(0x6F);
3365         emitOperandHelper(dst, src, 0);
3366     }
3367 
3368     public final void movdqu(Register dst, Register src) {
3369         assert inRC(XMM, dst) && inRC(XMM, src);
3370         simdPrefix(dst, Register.None, src, SS, P_0F, false);
3371         emitByte(0x6F);
3372         emitModRM(dst, src);
3373     }
3374 
3375     // Insn: VMOVDQU xmm2/m128, xmm1
3376 
3377     public final void movdqu(AMD64Address dst, Register src) {
3378         assert inRC(XMM, src);
3379         // Code: VEX.128.F3.0F.WIG 7F /r
3380         simdPrefix(src, Register.None, dst, SS, P_0F, false);
3381         emitByte(0x7F);
3382         emitOperandHelper(src, dst, 0);
3383     }
3384 
3385     public final void movslq(AMD64Address dst, int imm32) {
3386         prefixq(dst);
3387         emitByte(0xC7);
3388         emitOperandHelper(0, dst, 4);
3389         emitInt(imm32);
3390     }
3391 
3392     public final void movslq(Register dst, AMD64Address src) {
3393         prefixq(src, dst);
3394         emitByte(0x63);
3395         emitOperandHelper(dst, src, 0);
3396     }
3397 
3398     public final void movslq(Register dst, Register src) {
3399         prefixq(dst, src);
3400         emitByte(0x63);
3401         emitModRM(dst, src);
3402     }
3403 
3404     public final void negq(Register dst) {
3405         prefixq(dst);
3406         emitByte(0xF7);
3407         emitModRM(3, dst);
3408     }
3409 
3410     public final void orq(Register dst, Register src) {
3411         OR.rmOp.emit(this, QWORD, dst, src);
3412     }
3413 
3414     public final void shlq(Register dst, int imm8) {
3415         assert isShiftCount(imm8 >> 1) : "illegal shift count";
3416         prefixq(dst);
3417         if (imm8 == 1) {
3418             emitByte(0xD1);
3419             emitModRM(4, dst);
3420         } else {
3421             emitByte(0xC1);
3422             emitModRM(4, dst);
3423             emitByte(imm8);
3424         }
3425     }
3426 
3427     public final void shlq(Register dst) {
3428         // Multiply dst by 2, CL times.
3429         prefixq(dst);
3430         emitByte(0xD3);
3431         emitModRM(4, dst);
3432     }
3433 
3434     public final void shrq(Register dst, int imm8) {
3435         assert isShiftCount(imm8 >> 1) : "illegal shift count";
3436         prefixq(dst);
3437         if (imm8 == 1) {
3438             emitByte(0xD1);
3439             emitModRM(5, dst);
3440         } else {
3441             emitByte(0xC1);
3442             emitModRM(5, dst);
3443             emitByte(imm8);
3444         }
3445     }
3446 
3447     public final void shrq(Register dst) {
3448         prefixq(dst);
3449         emitByte(0xD3);
3450         // Unsigned divide dst by 2, CL times.
3451         emitModRM(5, dst);
3452     }
3453 
3454     public final void sarq(Register dst, int imm8) {
3455         assert isShiftCount(imm8 >> 1) : "illegal shift count";
3456         prefixq(dst);
3457         if (imm8 == 1) {
3458             emitByte(0xD1);
3459             emitModRM(7, dst);
3460         } else {
3461             emitByte(0xC1);
3462             emitModRM(7, dst);
3463             emitByte(imm8);
3464         }
3465     }
3466 
3467     public final void sbbq(Register dst, Register src) {
3468         SBB.rmOp.emit(this, QWORD, dst, src);
3469     }
3470 
3471     public final void subq(Register dst, int imm32) {
3472         SUB.getMIOpcode(QWORD, isByte(imm32)).emit(this, QWORD, dst, imm32);
3473     }
3474 
3475     public final void subq(AMD64Address dst, int imm32) {
3476         SUB.getMIOpcode(QWORD, isByte(imm32)).emit(this, QWORD, dst, imm32);
3477     }
3478 
3479     public final void subqWide(Register dst, int imm32) {
3480         // don't use the sign-extending version, forcing a 32-bit immediate
3481         SUB.getMIOpcode(QWORD, false).emit(this, QWORD, dst, imm32);
3482     }
3483 
3484     public final void subq(Register dst, Register src) {
3485         SUB.rmOp.emit(this, QWORD, dst, src);
3486     }
3487 
3488     public final void testq(Register dst, Register src) {
3489         prefixq(dst, src);
3490         emitByte(0x85);
3491         emitModRM(dst, src);
3492     }
3493 
3494     public final void btrq(Register src, int imm8) {
3495         prefixq(src);
3496         emitByte(0x0F);
3497         emitByte(0xBA);
3498         emitModRM(6, src);
3499         emitByte(imm8);
3500     }
3501 
3502     public final void xaddb(AMD64Address dst, Register src) {
3503         prefixb(dst, src);
3504         emitByte(0x0F);
3505         emitByte(0xC0);
3506         emitOperandHelper(src, dst, 0);
3507     }
3508 
3509     public final void xaddw(AMD64Address dst, Register src) {
3510         emitByte(0x66); // Switch to 16-bit mode.
3511         prefix(dst, src);
3512         emitByte(0x0F);
3513         emitByte(0xC1);
3514         emitOperandHelper(src, dst, 0);
3515     }
3516 
3517     public final void xaddl(AMD64Address dst, Register src) {
3518         prefix(dst, src);
3519         emitByte(0x0F);
3520         emitByte(0xC1);
3521         emitOperandHelper(src, dst, 0);
3522     }
3523 
3524     public final void xaddq(AMD64Address dst, Register src) {
3525         prefixq(dst, src);
3526         emitByte(0x0F);
3527         emitByte(0xC1);
3528         emitOperandHelper(src, dst, 0);
3529     }
3530 
3531     public final void xchgb(Register dst, AMD64Address src) {
3532         prefixb(src, dst);
3533         emitByte(0x86);
3534         emitOperandHelper(dst, src, 0);
3535     }
3536 
3537     public final void xchgw(Register dst, AMD64Address src) {
3538         emitByte(0x66);
3539         prefix(src, dst);
3540         emitByte(0x87);
3541         emitOperandHelper(dst, src, 0);
3542     }
3543 
3544     public final void xchgl(Register dst, AMD64Address src) {
3545         prefix(src, dst);
3546         emitByte(0x87);
3547         emitOperandHelper(dst, src, 0);
3548     }
3549 
3550     public final void xchgq(Register dst, AMD64Address src) {
3551         prefixq(src, dst);
3552         emitByte(0x87);
3553         emitOperandHelper(dst, src, 0);
3554     }
3555 
3556     public final void membar(int barriers) {
3557         if (target.isMP) {
3558             // We only have to handle StoreLoad
3559             if ((barriers & STORE_LOAD) != 0) {
3560                 // All usable chips support "locked" instructions which suffice
3561                 // as barriers, and are much faster than the alternative of
3562                 // using cpuid instruction. We use here a locked add [rsp],0.
3563                 // This is conveniently otherwise a no-op except for blowing
3564                 // flags.
3565                 // Any change to this code may need to revisit other places in
3566                 // the code where this idiom is used, in particular the
3567                 // orderAccess code.
3568                 lock();
3569                 addl(new AMD64Address(AMD64.rsp, 0), 0); // Assert the lock# signal here
3570             }
3571         }
3572     }
3573 
3574     @Override
3575     protected final void patchJumpTarget(int branch, int branchTarget) {
3576         int op = getByte(branch);
3577         assert op == 0xE8 // call
3578                         || op == 0x00 // jump table entry
3579                         || op == 0xE9 // jmp
3580                         || op == 0xEB // short jmp
3581                         || (op & 0xF0) == 0x70 // short jcc
3582                         || op == 0x0F && (getByte(branch + 1) & 0xF0) == 0x80 // jcc
3583         : "Invalid opcode at patch point branch=" + branch + ", branchTarget=" + branchTarget + ", op=" + op;
3584 
3585         if (op == 0x00) {
3586             int offsetToJumpTableBase = getShort(branch + 1);
3587             int jumpTableBase = branch - offsetToJumpTableBase;
3588             int imm32 = branchTarget - jumpTableBase;
3589             emitInt(imm32, branch);
3590         } else if (op == 0xEB || (op & 0xF0) == 0x70) {
3591 
3592             // short offset operators (jmp and jcc)
3593             final int imm8 = branchTarget - (branch + 2);
3594             /*
3595              * Since a wrongly patched short branch can potentially lead to working but really bad
3596              * behaving code we should always fail with an exception instead of having an assert.
3597              */
3598             GraalError.guarantee(isByte(imm8), "Displacement too large to be encoded as a byte: %d", imm8);
3599             emitByte(imm8, branch + 1);
3600 
3601         } else {
3602 
3603             int off = 1;
3604             if (op == 0x0F) {
3605                 off = 2;
3606             }
3607 
3608             int imm32 = branchTarget - (branch + 4 + off);
3609             emitInt(imm32, branch + off);
3610         }
3611     }
3612 
3613     public void nullCheck(AMD64Address address) {
3614         testl(AMD64.rax, address);
3615     }
3616 
3617     @Override
3618     public void align(int modulus) {
3619         if (position() % modulus != 0) {
3620             nop(modulus - (position() % modulus));
3621         }
3622     }
3623 
3624     /**
3625      * Emits a direct call instruction. Note that the actual call target is not specified, because
3626      * all calls need patching anyway. Therefore, 0 is emitted as the call target, and the user is
3627      * responsible to add the call address to the appropriate patching tables.
3628      */
3629     public final void call() {
3630         annotatePatchingImmediate(1, 4);
3631         emitByte(0xE8);
3632         emitInt(0);
3633     }
3634 
3635     public final void call(Register src) {
3636         prefix(src);
3637         emitByte(0xFF);
3638         emitModRM(2, src);
3639     }
3640 
3641     public final void int3() {
3642         emitByte(0xCC);
3643     }
3644 
3645     public final void pause() {
3646         emitByte(0xF3);
3647         emitByte(0x90);
3648     }
3649 
3650     private void emitx87(int b1, int b2, int i) {
3651         assert 0 <= i && i < 8 : "illegal stack offset";
3652         emitByte(b1);
3653         emitByte(b2 + i);
3654     }
3655 
3656     public final void fldd(AMD64Address src) {
3657         emitByte(0xDD);
3658         emitOperandHelper(0, src, 0);
3659     }
3660 
3661     public final void flds(AMD64Address src) {
3662         emitByte(0xD9);
3663         emitOperandHelper(0, src, 0);
3664     }
3665 
3666     public final void fldln2() {
3667         emitByte(0xD9);
3668         emitByte(0xED);
3669     }
3670 
3671     public final void fldlg2() {
3672         emitByte(0xD9);
3673         emitByte(0xEC);
3674     }
3675 
3676     public final void fyl2x() {
3677         emitByte(0xD9);
3678         emitByte(0xF1);
3679     }
3680 
3681     public final void fstps(AMD64Address src) {
3682         emitByte(0xD9);
3683         emitOperandHelper(3, src, 0);
3684     }
3685 
3686     public final void fstpd(AMD64Address src) {
3687         emitByte(0xDD);
3688         emitOperandHelper(3, src, 0);
3689     }
3690 
3691     private void emitFPUArith(int b1, int b2, int i) {
3692         assert 0 <= i && i < 8 : "illegal FPU register: " + i;
3693         emitByte(b1);
3694         emitByte(b2 + i);
3695     }
3696 
3697     public void ffree(int i) {
3698         emitFPUArith(0xDD, 0xC0, i);
3699     }
3700 
3701     public void fincstp() {
3702         emitByte(0xD9);
3703         emitByte(0xF7);
3704     }
3705 
3706     public void fxch(int i) {
3707         emitFPUArith(0xD9, 0xC8, i);
3708     }
3709 
3710     public void fnstswAX() {
3711         emitByte(0xDF);
3712         emitByte(0xE0);
3713     }
3714 
3715     public void fwait() {
3716         emitByte(0x9B);
3717     }
3718 
3719     public void fprem() {
3720         emitByte(0xD9);
3721         emitByte(0xF8);
3722     }
3723 
3724     public final void fsin() {
3725         emitByte(0xD9);
3726         emitByte(0xFE);
3727     }
3728 
3729     public final void fcos() {
3730         emitByte(0xD9);
3731         emitByte(0xFF);
3732     }
3733 
3734     public final void fptan() {
3735         emitByte(0xD9);
3736         emitByte(0xF2);
3737     }
3738 
3739     public final void fstp(int i) {
3740         emitx87(0xDD, 0xD8, i);
3741     }
3742 
3743     @Override
3744     public AMD64Address makeAddress(Register base, int displacement) {
3745         return new AMD64Address(base, displacement);
3746     }
3747 
3748     @Override
3749     public AMD64Address getPlaceholder(int instructionStartPosition) {
3750         return new AMD64Address(AMD64.rip, Register.None, Scale.Times1, 0, instructionStartPosition);
3751     }
3752 
3753     private void prefetchPrefix(AMD64Address src) {
3754         prefix(src);
3755         emitByte(0x0F);
3756     }
3757 
3758     public void prefetchnta(AMD64Address src) {
3759         prefetchPrefix(src);
3760         emitByte(0x18);
3761         emitOperandHelper(0, src, 0);
3762     }
3763 
3764     void prefetchr(AMD64Address src) {
3765         assert supports(CPUFeature.AMD_3DNOW_PREFETCH);
3766         prefetchPrefix(src);
3767         emitByte(0x0D);
3768         emitOperandHelper(0, src, 0);
3769     }
3770 
3771     public void prefetcht0(AMD64Address src) {
3772         assert supports(CPUFeature.SSE);
3773         prefetchPrefix(src);
3774         emitByte(0x18);
3775         emitOperandHelper(1, src, 0);
3776     }
3777 
3778     public void prefetcht1(AMD64Address src) {
3779         assert supports(CPUFeature.SSE);
3780         prefetchPrefix(src);
3781         emitByte(0x18);
3782         emitOperandHelper(2, src, 0);
3783     }
3784 
3785     public void prefetcht2(AMD64Address src) {
3786         assert supports(CPUFeature.SSE);
3787         prefix(src);
3788         emitByte(0x0f);
3789         emitByte(0x18);
3790         emitOperandHelper(3, src, 0);
3791     }
3792 
3793     public void prefetchw(AMD64Address src) {
3794         assert supports(CPUFeature.AMD_3DNOW_PREFETCH);
3795         prefix(src);
3796         emitByte(0x0f);
3797         emitByte(0x0D);
3798         emitOperandHelper(1, src, 0);
3799     }
3800 
3801     public void rdtsc() {
3802         emitByte(0x0F);
3803         emitByte(0x31);
3804     }
3805 
3806     /**
3807      * Emits an instruction which is considered to be illegal. This is used if we deliberately want
3808      * to crash the program (debugging etc.).
3809      */
3810     public void illegal() {
3811         emitByte(0x0f);
3812         emitByte(0x0b);
3813     }
3814 
3815     public void lfence() {
3816         emitByte(0x0f);
3817         emitByte(0xae);
3818         emitByte(0xe8);
3819     }
3820 
3821     public final void vptest(Register dst, Register src) {
3822         VexRMOp.VPTEST.emit(this, AVXSize.YMM, dst, src);
3823     }
3824 
3825     public final void vpxor(Register dst, Register nds, Register src) {
3826         VexRVMOp.VPXOR.emit(this, AVXSize.YMM, dst, nds, src);
3827     }
3828 
3829     public final void vpxor(Register dst, Register nds, AMD64Address src) {
3830         VexRVMOp.VPXOR.emit(this, AVXSize.YMM, dst, nds, src);
3831     }
3832 
3833     public final void vmovdqu(Register dst, AMD64Address src) {
3834         VexMoveOp.VMOVDQU32.emit(this, AVXSize.YMM, dst, src);
3835     }
3836 
3837     public final void vmovdqu(AMD64Address dst, Register src) {
3838         assert inRC(XMM, src);
3839         VexMoveOp.VMOVDQU32.emit(this, AVXSize.YMM, dst, src);
3840     }
3841 
3842     public final void vpmovzxbw(Register dst, AMD64Address src) {
3843         assert supports(CPUFeature.AVX2);
3844         VexRMOp.VPMOVZXBW.emit(this, AVXSize.YMM, dst, src);
3845     }
3846 
3847     public final void vzeroupper() {
3848         emitVEX(L128, P_, M_0F, W0, 0, 0, true);
3849         emitByte(0x77);
3850     }
3851 
3852     // Insn: KORTESTD k1, k2
3853 
3854     // This instruction produces ZF or CF flags
3855     public final void kortestd(Register src1, Register src2) {
3856         assert supports(CPUFeature.AVX512BW);
3857         assert inRC(MASK, src1) && inRC(MASK, src2);
3858         // Code: VEX.L0.66.0F.W1 98 /r
3859         vexPrefix(src1, Register.None, src2, AVXSize.XMM, P_66, M_0F, W1, W1, true);
3860         emitByte(0x98);
3861         emitModRM(src1, src2);
3862     }
3863 
3864     // Insn: KORTESTQ k1, k2
3865 
3866     // This instruction produces ZF or CF flags
3867     public final void kortestq(Register src1, Register src2) {
3868         assert supports(CPUFeature.AVX512BW);
3869         assert inRC(MASK, src1) && inRC(MASK, src2);
3870         // Code: VEX.L0.0F.W1 98 /r
3871         vexPrefix(src1, Register.None, src2, AVXSize.XMM, P_, M_0F, W1, W1, true);
3872         emitByte(0x98);
3873         emitModRM(src1, src2);
3874     }
3875 
3876     public final void kmovd(Register dst, Register src) {
3877         assert supports(CPUFeature.AVX512BW);
3878         assert inRC(MASK, dst) || inRC(CPU, dst);
3879         assert inRC(MASK, src) || inRC(CPU, src);
3880         assert !(inRC(CPU, dst) && inRC(CPU, src));
3881 
3882         if (inRC(MASK, dst)) {
3883             if (inRC(MASK, src)) {
3884                 // kmovd(KRegister dst, KRegister src):
3885                 // Insn: KMOVD k1, k2/m32
3886                 // Code: VEX.L0.66.0F.W1 90 /r
3887                 vexPrefix(dst, Register.None, src, AVXSize.XMM, P_66, M_0F, W1, W1, true);
3888                 emitByte(0x90);
3889                 emitModRM(dst, src);
3890             } else {
3891                 // kmovd(KRegister dst, Register src)
3892                 // Insn: KMOVD k1, r32
3893                 // Code: VEX.L0.F2.0F.W0 92 /r
3894                 vexPrefix(dst, Register.None, src, AVXSize.XMM, P_F2, M_0F, W0, W0, true);
3895                 emitByte(0x92);
3896                 emitModRM(dst, src);
3897             }
3898         } else {
3899             if (inRC(MASK, src)) {
3900                 // kmovd(Register dst, KRegister src)
3901                 // Insn: KMOVD r32, k1
3902                 // Code: VEX.L0.F2.0F.W0 93 /r
3903                 vexPrefix(dst, Register.None, src, AVXSize.XMM, P_F2, M_0F, W0, W0, true);
3904                 emitByte(0x93);
3905                 emitModRM(dst, src);
3906             } else {
3907                 throw GraalError.shouldNotReachHere();
3908             }
3909         }
3910     }
3911 
3912     public final void kmovq(Register dst, Register src) {
3913         assert supports(CPUFeature.AVX512BW);
3914         assert inRC(MASK, dst) || inRC(CPU, dst);
3915         assert inRC(MASK, src) || inRC(CPU, src);
3916         assert !(inRC(CPU, dst) && inRC(CPU, src));
3917 
3918         if (inRC(MASK, dst)) {
3919             if (inRC(MASK, src)) {
3920                 // kmovq(KRegister dst, KRegister src):
3921                 // Insn: KMOVQ k1, k2/m64
3922                 // Code: VEX.L0.0F.W1 90 /r
3923                 vexPrefix(dst, Register.None, src, AVXSize.XMM, P_, M_0F, W1, W1, true);
3924                 emitByte(0x90);
3925                 emitModRM(dst, src);
3926             } else {
3927                 // kmovq(KRegister dst, Register src)
3928                 // Insn: KMOVQ k1, r64
3929                 // Code: VEX.L0.F2.0F.W1 92 /r
3930                 vexPrefix(dst, Register.None, src, AVXSize.XMM, P_F2, M_0F, W1, W1, true);
3931                 emitByte(0x92);
3932                 emitModRM(dst, src);
3933             }
3934         } else {
3935             if (inRC(MASK, src)) {
3936                 // kmovq(Register dst, KRegister src)
3937                 // Insn: KMOVQ r64, k1
3938                 // Code: VEX.L0.F2.0F.W1 93 /r
3939                 vexPrefix(dst, Register.None, src, AVXSize.XMM, P_F2, M_0F, W1, W1, true);
3940                 emitByte(0x93);
3941                 emitModRM(dst, src);
3942             } else {
3943                 throw GraalError.shouldNotReachHere();
3944             }
3945         }
3946     }
3947 
3948     // Insn: KTESTD k1, k2
3949 
3950     public final void ktestd(Register src1, Register src2) {
3951         assert supports(CPUFeature.AVX512BW);
3952         assert inRC(MASK, src1) && inRC(MASK, src2);
3953         // Code: VEX.L0.66.0F.W1 99 /r
3954         vexPrefix(src1, Register.None, src2, AVXSize.XMM, P_66, M_0F, W1, W1, true);
3955         emitByte(0x99);
3956         emitModRM(src1, src2);
3957     }
3958 
3959     public final void evmovdqu64(Register dst, AMD64Address src) {
3960         assert supports(CPUFeature.AVX512F);
3961         assert inRC(XMM, dst);
3962         evexPrefix(dst, Register.None, Register.None, src, AVXSize.ZMM, P_F3, M_0F, W1, Z0, B0);
3963         emitByte(0x6F);
3964         emitOperandHelper(dst, src, 0, EVEXTuple.FVM.getDisp8ScalingFactor(AVXSize.ZMM));
3965     }
3966 
3967     // Insn: VPMOVZXBW zmm1, m256
3968 
3969     public final void evpmovzxbw(Register dst, AMD64Address src) {
3970         assert supports(CPUFeature.AVX512BW);
3971         assert inRC(XMM, dst);
3972         // Code: EVEX.512.66.0F38.WIG 30 /r
3973         evexPrefix(dst, Register.None, Register.None, src, AVXSize.ZMM, P_66, M_0F38, WIG, Z0, B0);
3974         emitByte(0x30);
3975         emitOperandHelper(dst, src, 0, EVEXTuple.HVM.getDisp8ScalingFactor(AVXSize.ZMM));
3976     }
3977 
3978     public final void evpcmpeqb(Register kdst, Register nds, AMD64Address src) {
3979         assert supports(CPUFeature.AVX512BW);
3980         assert inRC(MASK, kdst) && inRC(XMM, nds);
3981         evexPrefix(kdst, Register.None, nds, src, AVXSize.ZMM, P_66, M_0F, WIG, Z0, B0);
3982         emitByte(0x74);
3983         emitOperandHelper(kdst, src, 0, EVEXTuple.FVM.getDisp8ScalingFactor(AVXSize.ZMM));
3984     }
3985 
3986     // Insn: VMOVDQU16 zmm1 {k1}{z}, zmm2/m512
3987     // -----
3988     // Insn: VMOVDQU16 zmm1, m512
3989 
3990     public final void evmovdqu16(Register dst, AMD64Address src) {
3991         assert supports(CPUFeature.AVX512BW);
3992         assert inRC(XMM, dst);
3993         // Code: EVEX.512.F2.0F.W1 6F /r
3994         evexPrefix(dst, Register.None, Register.None, src, AVXSize.ZMM, P_F2, M_0F, W1, Z0, B0);
3995         emitByte(0x6F);
3996         emitOperandHelper(dst, src, 0, EVEXTuple.FVM.getDisp8ScalingFactor(AVXSize.ZMM));
3997     }
3998 
3999     // Insn: VMOVDQU16 zmm1, k1:z, m512
4000 
4001     public final void evmovdqu16(Register dst, Register mask, AMD64Address src) {
4002         assert supports(CPUFeature.AVX512BW);
4003         assert inRC(XMM, dst) && inRC(MASK, mask);
4004         // Code: EVEX.512.F2.0F.W1 6F /r
4005         evexPrefix(dst, mask, Register.None, src, AVXSize.ZMM, P_F2, M_0F, W1, Z1, B0);
4006         emitByte(0x6F);
4007         emitOperandHelper(dst, src, 0, EVEXTuple.FVM.getDisp8ScalingFactor(AVXSize.ZMM));
4008     }
4009 
4010     // Insn: VMOVDQU16 zmm2/m512 {k1}{z}, zmm1
4011     // -----
4012     // Insn: VMOVDQU16 m512, zmm1
4013 
4014     public final void evmovdqu16(AMD64Address dst, Register src) {
4015         assert supports(CPUFeature.AVX512BW);
4016         assert inRC(XMM, src);
4017         // Code: EVEX.512.F2.0F.W1 7F /r
4018         evexPrefix(src, Register.None, Register.None, dst, AVXSize.ZMM, P_F2, M_0F, W1, Z0, B0);
4019         emitByte(0x7F);
4020         emitOperandHelper(src, dst, 0, EVEXTuple.FVM.getDisp8ScalingFactor(AVXSize.ZMM));
4021     }
4022 
4023     // Insn: VMOVDQU16 m512, k1, zmm1
4024 
4025     public final void evmovdqu16(AMD64Address dst, Register mask, Register src) {
4026         assert supports(CPUFeature.AVX512BW);
4027         assert inRC(MASK, mask) && inRC(XMM, src);
4028         // Code: EVEX.512.F2.0F.W1 7F /r
4029         evexPrefix(src, mask, Register.None, dst, AVXSize.ZMM, P_F2, M_0F, W1, Z0, B0);
4030         emitByte(0x7F);
4031         emitOperandHelper(src, dst, 0, EVEXTuple.FVM.getDisp8ScalingFactor(AVXSize.ZMM));
4032     }
4033 
4034     // Insn: VPBROADCASTW zmm1 {k1}{z}, reg
4035     // -----
4036     // Insn: VPBROADCASTW zmm1, reg
4037 
4038     public final void evpbroadcastw(Register dst, Register src) {
4039         assert supports(CPUFeature.AVX512BW);
4040         assert inRC(XMM, dst) && inRC(CPU, src);
4041         // Code: EVEX.512.66.0F38.W0 7B /r
4042         evexPrefix(dst, Register.None, Register.None, src, AVXSize.ZMM, P_66, M_0F38, W0, Z0, B0);
4043         emitByte(0x7B);
4044         emitModRM(dst, src);
4045     }
4046 
4047     // Insn: VPCMPUW k1 {k2}, zmm2, zmm3/m512, imm8
4048     // -----
4049     // Insn: VPCMPUW k1, zmm2, zmm3, imm8
4050 
4051     public final void evpcmpuw(Register kdst, Register nds, Register src, int vcc) {
4052         assert supports(CPUFeature.AVX512BW);
4053         assert inRC(MASK, kdst) && inRC(XMM, nds) && inRC(XMM, src);
4054         // Code: EVEX.NDS.512.66.0F3A.W1 3E /r ib
4055         evexPrefix(kdst, Register.None, nds, src, AVXSize.ZMM, P_66, M_0F3A, W1, Z0, B0);
4056         emitByte(0x3E);
4057         emitModRM(kdst, src);
4058         emitByte(vcc);
4059     }
4060 
4061     // Insn: VPCMPUW k1 {k2}, zmm2, zmm3/m512, imm8
4062     // -----
4063     // Insn: VPCMPUW k1, k2, zmm2, zmm3, imm8
4064 
4065     public final void evpcmpuw(Register kdst, Register mask, Register nds, Register src, int vcc) {
4066         assert supports(CPUFeature.AVX512BW);
4067         assert inRC(MASK, kdst) && inRC(MASK, mask);
4068         assert inRC(XMM, nds) && inRC(XMM, src);
4069         // Code: EVEX.NDS.512.66.0F3A.W1 3E /r ib
4070         evexPrefix(kdst, mask, nds, src, AVXSize.ZMM, P_66, M_0F3A, W1, Z0, B0);
4071         emitByte(0x3E);
4072         emitModRM(kdst, src);
4073         emitByte(vcc);
4074     }
4075 
4076     // Insn: VPMOVWB ymm1/m256 {k1}{z}, zmm2
4077     // -----
4078     // Insn: VPMOVWB m256, zmm2
4079 
4080     public final void evpmovwb(AMD64Address dst, Register src) {
4081         assert supports(CPUFeature.AVX512BW);
4082         assert inRC(XMM, src);
4083         // Code: EVEX.512.F3.0F38.W0 30 /r
4084         evexPrefix(src, Register.None, Register.None, dst, AVXSize.ZMM, P_F3, M_0F38, W0, Z0, B0);
4085         emitByte(0x30);
4086         emitOperandHelper(src, dst, 0, EVEXTuple.HVM.getDisp8ScalingFactor(AVXSize.ZMM));
4087     }
4088 
4089     // Insn: VPMOVWB m256, k1, zmm2
4090 
4091     public final void evpmovwb(AMD64Address dst, Register mask, Register src) {
4092         assert supports(CPUFeature.AVX512BW);
4093         assert inRC(MASK, mask) && inRC(XMM, src);
4094         // Code: EVEX.512.F3.0F38.W0 30 /r
4095         evexPrefix(src, mask, Register.None, dst, AVXSize.ZMM, P_F3, M_0F38, W0, Z0, B0);
4096         emitByte(0x30);
4097         emitOperandHelper(src, dst, 0, EVEXTuple.HVM.getDisp8ScalingFactor(AVXSize.ZMM));
4098     }
4099 
4100     // Insn: VPMOVZXBW zmm1 {k1}{z}, ymm2/m256
4101     // -----
4102     // Insn: VPMOVZXBW zmm1, k1, m256
4103 
4104     public final void evpmovzxbw(Register dst, Register mask, AMD64Address src) {
4105         assert supports(CPUFeature.AVX512BW);
4106         assert inRC(MASK, mask) && inRC(XMM, dst);
4107         // Code: EVEX.512.66.0F38.WIG 30 /r
4108         evexPrefix(dst, mask, Register.None, src, AVXSize.ZMM, P_66, M_0F38, WIG, Z0, B0);
4109         emitByte(0x30);
4110         emitOperandHelper(dst, src, 0, EVEXTuple.HVM.getDisp8ScalingFactor(AVXSize.ZMM));
4111     }
4112 
4113 }