/*
 * Copyright (c) 2009, 2019, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */


package org.graalvm.compiler.asm.amd64;

import static jdk.vm.ci.amd64.AMD64.CPU;
import static jdk.vm.ci.amd64.AMD64.MASK;
import static jdk.vm.ci.amd64.AMD64.XMM;
import static jdk.vm.ci.code.MemoryBarriers.STORE_LOAD;
import static org.graalvm.compiler.asm.amd64.AMD64AsmOptions.UseAddressNop;
import static org.graalvm.compiler.asm.amd64.AMD64AsmOptions.UseNormalNop;
import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.ADD;
import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.AND;
import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.CMP;
import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.OR;
import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.SBB;
import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.SUB;
import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.XOR;
import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64MOp.DEC;
import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64MOp.INC;
import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64MOp.NEG;
import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64MOp.NOT;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.EVEXPrefixConfig.B0;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.EVEXPrefixConfig.Z0;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.EVEXPrefixConfig.Z1;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.BYTE;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.DWORD;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.PD;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.PS;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.QWORD;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.SD;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.SS;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize.WORD;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.L128;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.L256;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.LZ;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.M_0F;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.M_0F38;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.M_0F3A;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.P_;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.P_66;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.P_F2;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.P_F3;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.W0;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.W1;
import static org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.VEXPrefixConfig.WIG;
import static org.graalvm.compiler.core.common.NumUtil.isByte;
import static org.graalvm.compiler.core.common.NumUtil.isInt;
import static org.graalvm.compiler.core.common.NumUtil.isShiftCount;
import static org.graalvm.compiler.core.common.NumUtil.isUByte;

import java.util.EnumSet;

import org.graalvm.compiler.asm.Label;
import org.graalvm.compiler.asm.amd64.AMD64Address.Scale;
import org.graalvm.compiler.asm.amd64.AVXKind.AVXSize;
import org.graalvm.compiler.core.common.NumUtil;
import org.graalvm.compiler.core.common.calc.Condition;
import org.graalvm.compiler.debug.GraalError;

import jdk.vm.ci.amd64.AMD64;
import jdk.vm.ci.amd64.AMD64.CPUFeature;
import jdk.vm.ci.code.Register;
import jdk.vm.ci.code.Register.RegisterCategory;
import jdk.vm.ci.code.TargetDescription;

/**
 * This class implements an assembler that can encode most X86 instructions.
 */
public class AMD64Assembler extends AMD64BaseAssembler {

    /**
     * Constructs an assembler for the AMD64 architecture.
     */
    public AMD64Assembler(TargetDescription target) {
        super(target);
    }

    /**
     * The x86 condition codes used for conditional jumps/moves.
     */
    public enum ConditionFlag {
        Zero(0x4, "|zero|"),
        NotZero(0x5, "|nzero|"),
        Equal(0x4, "="),
        NotEqual(0x5, "!="),
        Less(0xc, "<"),
        LessEqual(0xe, "<="),
        Greater(0xf, ">"),
        GreaterEqual(0xd, ">="),
        Below(0x2, "|<|"),
        BelowEqual(0x6, "|<=|"),
        Above(0x7, "|>|"),
        AboveEqual(0x3, "|>=|"),
        Overflow(0x0, "|of|"),
        NoOverflow(0x1, "|nof|"),
        CarrySet(0x2, "|carry|"),
        CarryClear(0x3, "|ncarry|"),
        Negative(0x8, "|neg|"),
        Positive(0x9, "|pos|"),
        Parity(0xa, "|par|"),
        NoParity(0xb, "|npar|");

        private final int value;
        private final String operator;

        ConditionFlag(int value, String operator) {
            this.value = value;
            this.operator = operator;
        }

        public ConditionFlag negate() {
            switch (this) {
                case Zero:
                    return NotZero;
                case NotZero:
                    return Zero;
                case Equal:
                    return NotEqual;
                case NotEqual:
                    return Equal;
                case Less:
                    return GreaterEqual;
                case LessEqual:
                    return Greater;
                case Greater:
                    return LessEqual;
                case GreaterEqual:
                    return Less;
                case Below:
                    return AboveEqual;
                case BelowEqual:
                    return Above;
                case Above:
                    return BelowEqual;
                case AboveEqual:
                    return Below;
                case Overflow:
                    return NoOverflow;
                case NoOverflow:
                    return Overflow;
                case CarrySet:
                    return CarryClear;
                case CarryClear:
                    return CarrySet;
                case Negative:
                    return Positive;
                case Positive:
                    return Negative;
                case Parity:
                    return NoParity;
                case NoParity:
                    return Parity;
            }
            throw new IllegalArgumentException();
        }

        public int getValue() {
            return value;
        }

        @Override
        public String toString() {
            return operator;
        }
    }

    /**
     * Operand size and register type constraints.
     */
    private enum OpAssertion {
        ByteAssertion(CPU, CPU, BYTE),
        ByteOrLargerAssertion(CPU, CPU, BYTE, WORD, DWORD, QWORD),
        WordOrLargerAssertion(CPU, CPU, WORD, DWORD, QWORD),
        DwordOrLargerAssertion(CPU, CPU, DWORD, QWORD),
        WordOrDwordAssertion(CPU, CPU, WORD, QWORD),
        QwordAssertion(CPU, CPU, QWORD),
        FloatAssertion(XMM, XMM, SS, SD, PS, PD),
        PackedFloatAssertion(XMM, XMM, PS, PD),
        SingleAssertion(XMM, XMM, SS),
        DoubleAssertion(XMM, XMM, SD),
        PackedDoubleAssertion(XMM, XMM, PD),
        IntToFloatAssertion(XMM, CPU, DWORD, QWORD),
        FloatToIntAssertion(CPU, XMM, DWORD, QWORD);

        private final RegisterCategory resultCategory;
        private final RegisterCategory inputCategory;
        private final OperandSize[] allowedSizes;

        OpAssertion(RegisterCategory resultCategory, RegisterCategory inputCategory, OperandSize... allowedSizes) {
            this.resultCategory = resultCategory;
            this.inputCategory = inputCategory;
            this.allowedSizes = allowedSizes;
        }

        protected boolean checkOperands(AMD64Op op, OperandSize size, Register resultReg, Register inputReg) {
            assert resultReg == null || resultCategory.equals(resultReg.getRegisterCategory()) : "invalid result register " + resultReg + " used in " + op;
            assert inputReg == null || inputCategory.equals(inputReg.getRegisterCategory()) : "invalid input register " + inputReg + " used in " + op;

            for (OperandSize s : allowedSizes) {
                if (size == s) {
                    return true;
                }
            }

            assert false : "invalid operand size " + size + " used in " + op;
            return false;
        }

    }

    protected static final int P_0F = 0x0F;
    protected static final int P_0F38 = 0x380F;
    protected static final int P_0F3A = 0x3A0F;
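
    // Note: the two-byte opcode escapes are stored byte-swapped (0x380F for 0F 38, 0x3A0F for
    // 0F 3A) so that the little-endian emitShort in AMD64Op.emitOpcode writes them in
    // instruction order.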

    /**
     * Base class for AMD64 opcodes.
     */
    public static class AMD64Op {

        private final String opcode;

        protected final int prefix1;
        protected final int prefix2;
        protected final int op;

        private final boolean dstIsByte;
        private final boolean srcIsByte;

        private final OpAssertion assertion;
        private final CPUFeature feature;

        protected AMD64Op(String opcode, int prefix1, int prefix2, int op, OpAssertion assertion, CPUFeature feature) {
            this(opcode, prefix1, prefix2, op, assertion == OpAssertion.ByteAssertion, assertion == OpAssertion.ByteAssertion, assertion, feature);
        }

        protected AMD64Op(String opcode, int prefix1, int prefix2, int op, boolean dstIsByte, boolean srcIsByte, OpAssertion assertion, CPUFeature feature) {
            this.opcode = opcode;
            this.prefix1 = prefix1;
            this.prefix2 = prefix2;
            this.op = op;

            this.dstIsByte = dstIsByte;
            this.srcIsByte = srcIsByte;

            this.assertion = assertion;
            this.feature = feature;
        }

        protected final void emitOpcode(AMD64Assembler asm, OperandSize size, int rxb, int dstEnc, int srcEnc) {
            if (prefix1 != 0) {
                asm.emitByte(prefix1);
            }
            if (size.getSizePrefix() != 0) {
                asm.emitByte(size.getSizePrefix());
            }
            int rexPrefix = 0x40 | rxb;
            if (size == QWORD) {
                rexPrefix |= 0x08;
            }
            if (rexPrefix != 0x40 || (dstIsByte && dstEnc >= 4) || (srcIsByte && srcEnc >= 4)) {
                asm.emitByte(rexPrefix);
            }
            if (prefix2 > 0xFF) {
                asm.emitShort(prefix2);
            } else if (prefix2 > 0) {
                asm.emitByte(prefix2);
            }
            asm.emitByte(op);
        }
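
        // Worked example (a sketch, not normative): emitting MOV (RM form, opcode 0x8B) with size
        // QWORD, dst = rax and src = rbx writes the REX.W prefix 0x48 followed by the opcode 0x8B;
        // the caller then appends the ModRM byte 0xC3, giving the encoding 48 8B C3 ("mov rax, rbx").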

        protected final boolean verify(AMD64Assembler asm, OperandSize size, Register resultReg, Register inputReg) {
            assert feature == null || asm.supports(feature) : String.format("unsupported feature %s required for %s", feature, opcode);
            assert assertion.checkOperands(this, size, resultReg, inputReg);
            return true;
        }

        public OperandSize[] getAllowedSizes() {
            return assertion.allowedSizes;
        }

        protected final boolean isSSEInstruction() {
            if (feature == null) {
                return false;
            }
            switch (feature) {
                case SSE:
                case SSE2:
                case SSE3:
                case SSSE3:
                case SSE4A:
                case SSE4_1:
                case SSE4_2:
                    return true;
                default:
                    return false;
            }
        }

        public final OpAssertion getAssertion() {
            return assertion;
        }

        @Override
        public String toString() {
            return opcode;
        }
    }

    /**
     * Base class for AMD64 opcodes with immediate operands.
     */
    public static class AMD64ImmOp extends AMD64Op {

        private final boolean immIsByte;

        protected AMD64ImmOp(String opcode, boolean immIsByte, int prefix, int op, OpAssertion assertion) {
            this(opcode, immIsByte, prefix, op, assertion, null);
        }

        protected AMD64ImmOp(String opcode, boolean immIsByte, int prefix, int op, OpAssertion assertion, CPUFeature feature) {
            super(opcode, 0, prefix, op, assertion, feature);
            this.immIsByte = immIsByte;
        }

        protected final void emitImmediate(AMD64Assembler asm, OperandSize size, int imm) {
            if (immIsByte) {
                assert imm == (byte) imm;
                asm.emitByte(imm);
            } else {
                size.emitImmediate(asm, imm);
            }
        }

        protected final int immediateSize(OperandSize size) {
            if (immIsByte) {
                return 1;
            } else {
                return size.getBytes();
            }
        }
    }

    /**
     * Opcode with an operand order of either RM or MR, for two-address forms.
     */
    public abstract static class AMD64RROp extends AMD64Op {

        protected AMD64RROp(String opcode, int prefix1, int prefix2, int op, OpAssertion assertion, CPUFeature feature) {
            super(opcode, prefix1, prefix2, op, assertion, feature);
        }

        protected AMD64RROp(String opcode, int prefix1, int prefix2, int op, boolean dstIsByte, boolean srcIsByte, OpAssertion assertion, CPUFeature feature) {
            super(opcode, prefix1, prefix2, op, dstIsByte, srcIsByte, assertion, feature);
        }

        public abstract void emit(AMD64Assembler asm, OperandSize size, Register dst, Register src);
    }

    /**
     * Opcode with operand order of RM.
     */
    public static class AMD64RMOp extends AMD64RROp {
        // @formatter:off
        public static final AMD64RMOp IMUL   = new AMD64RMOp("IMUL",         P_0F, 0xAF, OpAssertion.ByteOrLargerAssertion);
        public static final AMD64RMOp BSF    = new AMD64RMOp("BSF",          P_0F, 0xBC);
        public static final AMD64RMOp BSR    = new AMD64RMOp("BSR",          P_0F, 0xBD);
        // POPCNT, TZCNT, and LZCNT support the word operand size, but the legacy operand-size
        // prefix would have to be emitted before the mandatory prefix 0xF3. Since we never emit
        // bit counts for 16-bit operands, we simply use DwordOrLargerAssertion here.
        public static final AMD64RMOp POPCNT = new AMD64RMOp("POPCNT", 0xF3, P_0F, 0xB8, OpAssertion.DwordOrLargerAssertion, CPUFeature.POPCNT);
        public static final AMD64RMOp TZCNT  = new AMD64RMOp("TZCNT",  0xF3, P_0F, 0xBC, OpAssertion.DwordOrLargerAssertion, CPUFeature.BMI1);
        public static final AMD64RMOp LZCNT  = new AMD64RMOp("LZCNT",  0xF3, P_0F, 0xBD, OpAssertion.DwordOrLargerAssertion, CPUFeature.LZCNT);
        public static final AMD64RMOp MOVZXB = new AMD64RMOp("MOVZXB",       P_0F, 0xB6, false, true, OpAssertion.WordOrLargerAssertion);
        public static final AMD64RMOp MOVZX  = new AMD64RMOp("MOVZX",        P_0F, 0xB7, OpAssertion.DwordOrLargerAssertion);
        public static final AMD64RMOp MOVSXB = new AMD64RMOp("MOVSXB",       P_0F, 0xBE, false, true, OpAssertion.WordOrLargerAssertion);
        public static final AMD64RMOp MOVSX  = new AMD64RMOp("MOVSX",        P_0F, 0xBF, OpAssertion.DwordOrLargerAssertion);
        public static final AMD64RMOp MOVSXD = new AMD64RMOp("MOVSXD",             0x63, OpAssertion.QwordAssertion);
        public static final AMD64RMOp MOVB   = new AMD64RMOp("MOVB",               0x8A, OpAssertion.ByteAssertion);
        public static final AMD64RMOp MOV    = new AMD64RMOp("MOV",                0x8B);
        public static final AMD64RMOp CMP    = new AMD64RMOp("CMP",                0x3B);

        // MOVD/MOVQ and MOVSS/MOVSD are each the same opcode, distinguished only by the operand size prefix.
        public static final AMD64RMOp MOVD   = new AMD64RMOp("MOVD",   0x66, P_0F, 0x6E, OpAssertion.IntToFloatAssertion, CPUFeature.SSE2);
        public static final AMD64RMOp MOVQ   = new AMD64RMOp("MOVQ",   0x66, P_0F, 0x6E, OpAssertion.IntToFloatAssertion, CPUFeature.SSE2);
        public static final AMD64RMOp MOVSS  = new AMD64RMOp("MOVSS",        P_0F, 0x10, OpAssertion.FloatAssertion, CPUFeature.SSE);
        public static final AMD64RMOp MOVSD  = new AMD64RMOp("MOVSD",        P_0F, 0x10, OpAssertion.FloatAssertion, CPUFeature.SSE);

        // TEST is documented as an MR operation, but it is symmetric, so using it as an RM operation is more convenient.
        public static final AMD64RMOp TESTB  = new AMD64RMOp("TEST",               0x84, OpAssertion.ByteAssertion);
        public static final AMD64RMOp TEST   = new AMD64RMOp("TEST",               0x85);
        // @formatter:on

        protected AMD64RMOp(String opcode, int op) {
            this(opcode, 0, op);
        }

        protected AMD64RMOp(String opcode, int op, OpAssertion assertion) {
            this(opcode, 0, op, assertion);
        }

        protected AMD64RMOp(String opcode, int prefix, int op) {
            this(opcode, 0, prefix, op, null);
        }

        protected AMD64RMOp(String opcode, int prefix, int op, OpAssertion assertion) {
            this(opcode, 0, prefix, op, assertion, null);
        }

        protected AMD64RMOp(String opcode, int prefix, int op, OpAssertion assertion, CPUFeature feature) {
            this(opcode, 0, prefix, op, assertion, feature);
        }

        protected AMD64RMOp(String opcode, int prefix, int op, boolean dstIsByte, boolean srcIsByte, OpAssertion assertion) {
            super(opcode, 0, prefix, op, dstIsByte, srcIsByte, assertion, null);
        }

        protected AMD64RMOp(String opcode, int prefix1, int prefix2, int op, CPUFeature feature) {
            this(opcode, prefix1, prefix2, op, OpAssertion.WordOrLargerAssertion, feature);
        }

        protected AMD64RMOp(String opcode, int prefix1, int prefix2, int op, OpAssertion assertion, CPUFeature feature) {
            super(opcode, prefix1, prefix2, op, assertion, feature);
        }

        @Override
        public final void emit(AMD64Assembler asm, OperandSize size, Register dst, Register src) {
            assert verify(asm, size, dst, src);
            if (isSSEInstruction()) {
                Register nds = Register.None;
                switch (op) {
                    case 0x10:
                    case 0x51:
                        if ((size == SS) || (size == SD)) {
                            nds = dst;
                        }
                        break;
                    case 0x2A:
                    case 0x54:
                    case 0x55:
                    case 0x56:
                    case 0x57:
                    case 0x58:
                    case 0x59:
                    case 0x5A:
                    case 0x5C:
                    case 0x5D:
                    case 0x5E:
                    case 0x5F:
                        nds = dst;
                        break;
                    default:
                        break;
                }
                asm.simdPrefix(dst, nds, src, size, prefix1, prefix2, size == QWORD);
                asm.emitByte(op);
                asm.emitModRM(dst, src);
            } else {
                emitOpcode(asm, size, getRXB(dst, src), dst.encoding, src.encoding);
                asm.emitModRM(dst, src);
            }
        }

        public final void emit(AMD64Assembler asm, OperandSize size, Register dst, AMD64Address src) {
            assert verify(asm, size, dst, null);
            if (isSSEInstruction()) {
                Register nds = Register.None;
                switch (op) {
                    case 0x51:
                        if ((size == SS) || (size == SD)) {
                            nds = dst;
                        }
                        break;
                    case 0x2A:
                    case 0x54:
                    case 0x55:
                    case 0x56:
                    case 0x57:
                    case 0x58:
                    case 0x59:
                    case 0x5A:
                    case 0x5C:
                    case 0x5D:
                    case 0x5E:
                    case 0x5F:
                        nds = dst;
                        break;
                    default:
                        break;
                }
                asm.simdPrefix(dst, nds, src, size, prefix1, prefix2, size == QWORD);
                asm.emitByte(op);
                asm.emitOperandHelper(dst, src, 0);
            } else {
                emitOpcode(asm, size, getRXB(dst, src), dst.encoding, 0);
                asm.emitOperandHelper(dst, src, 0);
            }
        }
    }
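
    // Usage sketch (illustrative only, assuming the JVMCI register constants such as
    // jdk.vm.ci.amd64.AMD64.rax are in scope):
    //   AMD64RMOp.MOV.emit(asm, OperandSize.QWORD, rax, rbx);                          // mov rax, rbx (48 8B C3)
    //   AMD64RMOp.MOVZXB.emit(asm, OperandSize.DWORD, rax, new AMD64Address(rsp, 8));  // movzx eax, byte ptr [rsp + 8]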

    /**
     * Opcode with operand order of MR.
     */
    public static class AMD64MROp extends AMD64RROp {
        // @formatter:off
        public static final AMD64MROp MOVB   = new AMD64MROp("MOVB",               0x88, OpAssertion.ByteAssertion);
        public static final AMD64MROp MOV    = new AMD64MROp("MOV",                0x89);

        // MOVD and MOVQ are the same opcode, distinguished only by the operand size prefix.
        // Note that as MR opcodes, they have the reverse operand order, so IntToFloatAssertion must be used.
        public static final AMD64MROp MOVD   = new AMD64MROp("MOVD",   0x66, P_0F, 0x7E, OpAssertion.IntToFloatAssertion, CPUFeature.SSE2);
        public static final AMD64MROp MOVQ   = new AMD64MROp("MOVQ",   0x66, P_0F, 0x7E, OpAssertion.IntToFloatAssertion, CPUFeature.SSE2);

        // MOVSS and MOVSD are the same opcode, distinguished only by the operand size prefix.
        public static final AMD64MROp MOVSS  = new AMD64MROp("MOVSS",        P_0F, 0x11, OpAssertion.FloatAssertion, CPUFeature.SSE);
        public static final AMD64MROp MOVSD  = new AMD64MROp("MOVSD",        P_0F, 0x11, OpAssertion.FloatAssertion, CPUFeature.SSE);
        // @formatter:on

        protected AMD64MROp(String opcode, int op) {
            this(opcode, 0, op);
        }

        protected AMD64MROp(String opcode, int op, OpAssertion assertion) {
            this(opcode, 0, op, assertion);
        }

        protected AMD64MROp(String opcode, int prefix, int op) {
            this(opcode, prefix, op, OpAssertion.WordOrLargerAssertion);
        }

        protected AMD64MROp(String opcode, int prefix, int op, OpAssertion assertion) {
            this(opcode, prefix, op, assertion, null);
        }

        protected AMD64MROp(String opcode, int prefix, int op, OpAssertion assertion, CPUFeature feature) {
            this(opcode, 0, prefix, op, assertion, feature);
        }

        protected AMD64MROp(String opcode, int prefix1, int prefix2, int op, OpAssertion assertion, CPUFeature feature) {
            super(opcode, prefix1, prefix2, op, assertion, feature);
        }

        @Override
        public final void emit(AMD64Assembler asm, OperandSize size, Register dst, Register src) {
            assert verify(asm, size, src, dst);
            if (isSSEInstruction()) {
                Register nds = Register.None;
                switch (op) {
                    case 0x11:
                        if ((size == SS) || (size == SD)) {
                            nds = src;
                        }
                        break;
                    default:
                        break;
                }
                asm.simdPrefix(src, nds, dst, size, prefix1, prefix2, size == QWORD);
                asm.emitByte(op);
                asm.emitModRM(src, dst);
            } else {
                emitOpcode(asm, size, getRXB(src, dst), src.encoding, dst.encoding);
                asm.emitModRM(src, dst);
            }
        }

        public final void emit(AMD64Assembler asm, OperandSize size, AMD64Address dst, Register src) {
            assert verify(asm, size, src, null);
            if (isSSEInstruction()) {
                asm.simdPrefix(src, Register.None, dst, size, prefix1, prefix2, size == QWORD);
                asm.emitByte(op);
            } else {
                emitOpcode(asm, size, getRXB(src, dst), src.encoding, 0);
            }
            asm.emitOperandHelper(src, dst, 0);
        }
    }
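
    // Usage sketch (illustrative only): the MR form is the natural choice for stores, e.g.
    //   AMD64MROp.MOV.emit(asm, OperandSize.DWORD, new AMD64Address(rsp, 16), rax); // mov dword ptr [rsp + 16], eax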

    /**
     * Opcodes with operand order of M.
     */
    public static class AMD64MOp extends AMD64Op {
        // @formatter:off
        public static final AMD64MOp NOT  = new AMD64MOp("NOT",  0xF7, 2);
        public static final AMD64MOp NEG  = new AMD64MOp("NEG",  0xF7, 3);
        public static final AMD64MOp MUL  = new AMD64MOp("MUL",  0xF7, 4);
        public static final AMD64MOp IMUL = new AMD64MOp("IMUL", 0xF7, 5);
        public static final AMD64MOp DIV  = new AMD64MOp("DIV",  0xF7, 6);
        public static final AMD64MOp IDIV = new AMD64MOp("IDIV", 0xF7, 7);
        public static final AMD64MOp INC  = new AMD64MOp("INC",  0xFF, 0);
        public static final AMD64MOp DEC  = new AMD64MOp("DEC",  0xFF, 1);
        public static final AMD64MOp PUSH = new AMD64MOp("PUSH", 0xFF, 6);
        public static final AMD64MOp POP  = new AMD64MOp("POP",  0x8F, 0, OpAssertion.WordOrDwordAssertion);
        // @formatter:on

        private final int ext;

        protected AMD64MOp(String opcode, int op, int ext) {
            this(opcode, 0, op, ext);
        }

        protected AMD64MOp(String opcode, int prefix, int op, int ext) {
            this(opcode, prefix, op, ext, OpAssertion.WordOrLargerAssertion);
        }

        protected AMD64MOp(String opcode, int op, int ext, OpAssertion assertion) {
            this(opcode, 0, op, ext, assertion);
        }

        protected AMD64MOp(String opcode, int prefix, int op, int ext, OpAssertion assertion) {
            super(opcode, 0, prefix, op, assertion, null);
            this.ext = ext;
        }

        public final void emit(AMD64Assembler asm, OperandSize size, Register dst) {
            assert verify(asm, size, dst, null);
            emitOpcode(asm, size, getRXB(null, dst), 0, dst.encoding);
            asm.emitModRM(ext, dst);
        }

        public final void emit(AMD64Assembler asm, OperandSize size, AMD64Address dst) {
            assert verify(asm, size, null, null);
            emitOpcode(asm, size, getRXB(null, dst), 0, 0);
            asm.emitOperandHelper(ext, dst, 0);
        }
    }
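
    // Usage sketch (illustrative only): the ext field is encoded in ModRM.reg, e.g.
    //   AMD64MOp.INC.emit(asm, OperandSize.DWORD, rax); // inc eax (FF C0, i.e. FF /0)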

    /**
     * Opcodes with operand order of MI.
     */
    public static class AMD64MIOp extends AMD64ImmOp {
        // @formatter:off
        public static final AMD64MIOp MOVB = new AMD64MIOp("MOVB", true,  0xC6, 0, OpAssertion.ByteAssertion);
        public static final AMD64MIOp MOV  = new AMD64MIOp("MOV",  false, 0xC7, 0);
        public static final AMD64MIOp TEST = new AMD64MIOp("TEST", false, 0xF7, 0);
        // @formatter:on

        private final int ext;

        protected AMD64MIOp(String opcode, boolean immIsByte, int op, int ext) {
            this(opcode, immIsByte, op, ext, OpAssertion.WordOrLargerAssertion);
        }

        protected AMD64MIOp(String opcode, boolean immIsByte, int op, int ext, OpAssertion assertion) {
            this(opcode, immIsByte, 0, op, ext, assertion);
        }

        protected AMD64MIOp(String opcode, boolean immIsByte, int prefix, int op, int ext, OpAssertion assertion) {
            super(opcode, immIsByte, prefix, op, assertion);
            this.ext = ext;
        }

        public final void emit(AMD64Assembler asm, OperandSize size, Register dst, int imm) {
            emit(asm, size, dst, imm, false);
        }

        public final void emit(AMD64Assembler asm, OperandSize size, Register dst, int imm, boolean annotateImm) {
            assert verify(asm, size, dst, null);
            int insnPos = asm.position();
            emitOpcode(asm, size, getRXB(null, dst), 0, dst.encoding);
            asm.emitModRM(ext, dst);
            int immPos = asm.position();
            emitImmediate(asm, size, imm);
            int nextInsnPos = asm.position();
            if (annotateImm && asm.codePatchingAnnotationConsumer != null) {
                asm.codePatchingAnnotationConsumer.accept(new OperandDataAnnotation(insnPos, immPos, nextInsnPos - immPos, nextInsnPos));
            }
        }

        public final void emit(AMD64Assembler asm, OperandSize size, AMD64Address dst, int imm) {
            emit(asm, size, dst, imm, false);
        }

        public final void emit(AMD64Assembler asm, OperandSize size, AMD64Address dst, int imm, boolean annotateImm) {
            assert verify(asm, size, null, null);
            int insnPos = asm.position();
            emitOpcode(asm, size, getRXB(null, dst), 0, 0);
            asm.emitOperandHelper(ext, dst, immediateSize(size));
            int immPos = asm.position();
            emitImmediate(asm, size, imm);
            int nextInsnPos = asm.position();
            if (annotateImm && asm.codePatchingAnnotationConsumer != null) {
                asm.codePatchingAnnotationConsumer.accept(new OperandDataAnnotation(insnPos, immPos, nextInsnPos - immPos, nextInsnPos));
            }
        }
    }
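
    // Usage sketch (illustrative only):
    //   AMD64MIOp.MOV.emit(asm, OperandSize.DWORD, rax, 0x2A); // mov eax, 42 (C7 C0 2A 00 00 00)
    // Passing annotateImm = true additionally reports the immediate's position and size to the
    // codePatchingAnnotationConsumer so the immediate can be patched later.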

    /**
     * Opcodes with operand order of RMI.
     *
     * We only have one form of ROUND, since the operation is always treated as having a single
     * variant input, making an extension to three-address forms redundant.
     */
    public static class AMD64RMIOp extends AMD64ImmOp {
        // @formatter:off
        public static final AMD64RMIOp IMUL    = new AMD64RMIOp("IMUL", false, 0x69);
        public static final AMD64RMIOp IMUL_SX = new AMD64RMIOp("IMUL", true,  0x6B);
        public static final AMD64RMIOp ROUNDSS = new AMD64RMIOp("ROUNDSS", true, P_0F3A, 0x0A, OpAssertion.PackedDoubleAssertion, CPUFeature.SSE4_1);
        public static final AMD64RMIOp ROUNDSD = new AMD64RMIOp("ROUNDSD", true, P_0F3A, 0x0B, OpAssertion.PackedDoubleAssertion, CPUFeature.SSE4_1);
        // @formatter:on

        protected AMD64RMIOp(String opcode, boolean immIsByte, int op) {
            this(opcode, immIsByte, 0, op, OpAssertion.WordOrLargerAssertion, null);
        }

        protected AMD64RMIOp(String opcode, boolean immIsByte, int prefix, int op, OpAssertion assertion, CPUFeature feature) {
            super(opcode, immIsByte, prefix, op, assertion, feature);
        }

        public final void emit(AMD64Assembler asm, OperandSize size, Register dst, Register src, int imm) {
            assert verify(asm, size, dst, src);
            if (isSSEInstruction()) {
                Register nds = Register.None;
                switch (op) {
                    case 0x0A:
                    case 0x0B:
                        nds = dst;
                        break;
                    default:
                        break;
                }
                asm.simdPrefix(dst, nds, src, size, prefix1, prefix2, false);
                asm.emitByte(op);
                asm.emitModRM(dst, src);
            } else {
                emitOpcode(asm, size, getRXB(dst, src), dst.encoding, src.encoding);
                asm.emitModRM(dst, src);
            }
            emitImmediate(asm, size, imm);
        }

        public final void emit(AMD64Assembler asm, OperandSize size, Register dst, AMD64Address src, int imm) {
            assert verify(asm, size, dst, null);
            if (isSSEInstruction()) {
                Register nds = Register.None;
                switch (op) {
                    case 0x0A:
                    case 0x0B:
                        nds = dst;
                        break;
                    default:
                        break;
                }
                asm.simdPrefix(dst, nds, src, size, prefix1, prefix2, false);
                asm.emitByte(op);
            } else {
                emitOpcode(asm, size, getRXB(dst, src), dst.encoding, 0);
            }
            asm.emitOperandHelper(dst, src, immediateSize(size));
            emitImmediate(asm, size, imm);
        }
    }
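
    // Usage sketch (illustrative only):
    //   AMD64RMIOp.IMUL_SX.emit(asm, OperandSize.DWORD, rax, rbx, 7); // imul eax, ebx, 7 (6B C3 07)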

    public static class SSEOp extends AMD64RMOp {
        // @formatter:off
        public static final SSEOp CVTSI2SS  = new SSEOp("CVTSI2SS",  0xF3, P_0F, 0x2A, OpAssertion.IntToFloatAssertion);
        public static final SSEOp CVTSI2SD  = new SSEOp("CVTSI2SD",  0xF2, P_0F, 0x2A, OpAssertion.IntToFloatAssertion);
        public static final SSEOp CVTTSS2SI = new SSEOp("CVTTSS2SI", 0xF3, P_0F, 0x2C, OpAssertion.FloatToIntAssertion);
        public static final SSEOp CVTTSD2SI = new SSEOp("CVTTSD2SI", 0xF2, P_0F, 0x2C, OpAssertion.FloatToIntAssertion);
        public static final SSEOp UCOMIS    = new SSEOp("UCOMIS",          P_0F, 0x2E, OpAssertion.PackedFloatAssertion);
        public static final SSEOp SQRT      = new SSEOp("SQRT",            P_0F, 0x51);
        public static final SSEOp AND       = new SSEOp("AND",             P_0F, 0x54, OpAssertion.PackedFloatAssertion);
        public static final SSEOp ANDN      = new SSEOp("ANDN",            P_0F, 0x55, OpAssertion.PackedFloatAssertion);
        public static final SSEOp OR        = new SSEOp("OR",              P_0F, 0x56, OpAssertion.PackedFloatAssertion);
        public static final SSEOp XOR       = new SSEOp("XOR",             P_0F, 0x57, OpAssertion.PackedFloatAssertion);
        public static final SSEOp ADD       = new SSEOp("ADD",             P_0F, 0x58);
        public static final SSEOp MUL       = new SSEOp("MUL",             P_0F, 0x59);
        public static final SSEOp CVTSS2SD  = new SSEOp("CVTSS2SD",        P_0F, 0x5A, OpAssertion.SingleAssertion);
        public static final SSEOp CVTSD2SS  = new SSEOp("CVTSD2SS",        P_0F, 0x5A, OpAssertion.DoubleAssertion);
        public static final SSEOp SUB       = new SSEOp("SUB",             P_0F, 0x5C);
        public static final SSEOp MIN       = new SSEOp("MIN",             P_0F, 0x5D);
        public static final SSEOp DIV       = new SSEOp("DIV",             P_0F, 0x5E);
        public static final SSEOp MAX       = new SSEOp("MAX",             P_0F, 0x5F);
        // @formatter:on

        protected SSEOp(String opcode, int prefix, int op) {
            this(opcode, prefix, op, OpAssertion.FloatAssertion);
        }

        protected SSEOp(String opcode, int prefix, int op, OpAssertion assertion) {
            this(opcode, 0, prefix, op, assertion);
        }

        protected SSEOp(String opcode, int mandatoryPrefix, int prefix, int op, OpAssertion assertion) {
            super(opcode, mandatoryPrefix, prefix, op, assertion, CPUFeature.SSE2);
        }
    }
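
    // Usage sketch (illustrative only): for SSE ops the OperandSize selects the mandatory SIMD
    // prefix (SS -> F3, SD -> F2, PS -> none, PD -> 66), e.g.
    //   SSEOp.ADD.emit(asm, OperandSize.SS, xmm0, xmm1); // addss xmm0, xmm1 (F3 0F 58 C1)
    //   SSEOp.ADD.emit(asm, OperandSize.PD, xmm0, xmm1); // addpd xmm0, xmm1 (66 0F 58 C1)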

    /**
     * Arithmetic operation with operand order of RM, MR or MI.
     */
    public static final class AMD64BinaryArithmetic {
        // @formatter:off
        public static final AMD64BinaryArithmetic ADD = new AMD64BinaryArithmetic("ADD", 0);
        public static final AMD64BinaryArithmetic OR  = new AMD64BinaryArithmetic("OR",  1);
        public static final AMD64BinaryArithmetic ADC = new AMD64BinaryArithmetic("ADC", 2);
        public static final AMD64BinaryArithmetic SBB = new AMD64BinaryArithmetic("SBB", 3);
        public static final AMD64BinaryArithmetic AND = new AMD64BinaryArithmetic("AND", 4);
        public static final AMD64BinaryArithmetic SUB = new AMD64BinaryArithmetic("SUB", 5);
        public static final AMD64BinaryArithmetic XOR = new AMD64BinaryArithmetic("XOR", 6);
        public static final AMD64BinaryArithmetic CMP = new AMD64BinaryArithmetic("CMP", 7);
        // @formatter:on

        private final AMD64MIOp byteImmOp;
        private final AMD64MROp byteMrOp;
        private final AMD64RMOp byteRmOp;

        private final AMD64MIOp immOp;
        private final AMD64MIOp immSxOp;
        private final AMD64MROp mrOp;
        private final AMD64RMOp rmOp;

        private AMD64BinaryArithmetic(String opcode, int code) {
            int baseOp = code << 3;

            byteImmOp = new AMD64MIOp(opcode, true, 0, 0x80, code, OpAssertion.ByteAssertion);
            byteMrOp = new AMD64MROp(opcode, 0, baseOp, OpAssertion.ByteAssertion);
            byteRmOp = new AMD64RMOp(opcode, 0, baseOp | 0x02, OpAssertion.ByteAssertion);

            immOp = new AMD64MIOp(opcode, false, 0, 0x81, code, OpAssertion.WordOrLargerAssertion);
            immSxOp = new AMD64MIOp(opcode, true, 0, 0x83, code, OpAssertion.WordOrLargerAssertion);
            mrOp = new AMD64MROp(opcode, 0, baseOp | 0x01, OpAssertion.WordOrLargerAssertion);
            rmOp = new AMD64RMOp(opcode, 0, baseOp | 0x03, OpAssertion.WordOrLargerAssertion);
        }

        public AMD64MIOp getMIOpcode(OperandSize size, boolean sx) {
            if (size == BYTE) {
                return byteImmOp;
            } else if (sx) {
                return immSxOp;
            } else {
                return immOp;
            }
        }

        public AMD64MROp getMROpcode(OperandSize size) {
            if (size == BYTE) {
                return byteMrOp;
            } else {
                return mrOp;
            }
        }

        public AMD64RMOp getRMOpcode(OperandSize size) {
            if (size == BYTE) {
                return byteRmOp;
            } else {
                return rmOp;
            }
        }
    }
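
    // Usage sketch (illustrative only): getMIOpcode picks the sign-extended imm8 form (0x83)
    // whenever the immediate fits in a byte, e.g.
    //   AMD64BinaryArithmetic.ADD.getMIOpcode(OperandSize.DWORD, NumUtil.isByte(1))
    //                   .emit(asm, OperandSize.DWORD, rax, 1); // add eax, 1 (83 C0 01)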

    /**
     * Shift operation with operand order of M1, MC or MI.
     */
    public static final class AMD64Shift {
        // @formatter:off
        public static final AMD64Shift ROL = new AMD64Shift("ROL", 0);
        public static final AMD64Shift ROR = new AMD64Shift("ROR", 1);
        public static final AMD64Shift RCL = new AMD64Shift("RCL", 2);
        public static final AMD64Shift RCR = new AMD64Shift("RCR", 3);
        public static final AMD64Shift SHL = new AMD64Shift("SHL", 4);
        public static final AMD64Shift SHR = new AMD64Shift("SHR", 5);
        public static final AMD64Shift SAR = new AMD64Shift("SAR", 7);
        // @formatter:on

        public final AMD64MOp m1Op;
        public final AMD64MOp mcOp;
        public final AMD64MIOp miOp;

        private AMD64Shift(String opcode, int code) {
            m1Op = new AMD64MOp(opcode, 0, 0xD1, code, OpAssertion.WordOrLargerAssertion);
            mcOp = new AMD64MOp(opcode, 0, 0xD3, code, OpAssertion.WordOrLargerAssertion);
            miOp = new AMD64MIOp(opcode, true, 0, 0xC1, code, OpAssertion.WordOrLargerAssertion);
        }
    }
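
    // Usage sketch (illustrative only):
    //   AMD64Shift.SHL.miOp.emit(asm, OperandSize.DWORD, rax, 4); // shl eax, 4 (C1 E0 04)
    //   AMD64Shift.SAR.mcOp.emit(asm, OperandSize.QWORD, rax);    // sar rax, cl (48 D3 F8)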

    private enum VEXOpAssertion {
        AVX1(CPUFeature.AVX, CPUFeature.AVX),
        AVX1_2(CPUFeature.AVX, CPUFeature.AVX2),
        AVX2(CPUFeature.AVX2, CPUFeature.AVX2),
        AVX1_128ONLY(CPUFeature.AVX, null),
        AVX1_256ONLY(null, CPUFeature.AVX),
        AVX2_256ONLY(null, CPUFeature.AVX2),
        XMM_CPU(CPUFeature.AVX, null, XMM, null, CPU, null),
        XMM_XMM_CPU(CPUFeature.AVX, null, XMM, XMM, CPU, null),
        CPU_XMM(CPUFeature.AVX, null, CPU, null, XMM, null),
        AVX1_2_CPU_XMM(CPUFeature.AVX, CPUFeature.AVX2, CPU, null, XMM, null),
        BMI1(CPUFeature.BMI1, null, CPU, CPU, CPU, null),
        BMI2(CPUFeature.BMI2, null, CPU, CPU, CPU, null);

        private final CPUFeature l128feature;
        private final CPUFeature l256feature;

        private final RegisterCategory rCategory;
        private final RegisterCategory vCategory;
        private final RegisterCategory mCategory;
        private final RegisterCategory imm8Category;

        VEXOpAssertion(CPUFeature l128feature, CPUFeature l256feature) {
            this(l128feature, l256feature, XMM, XMM, XMM, XMM);
        }

        VEXOpAssertion(CPUFeature l128feature, CPUFeature l256feature, RegisterCategory rCategory, RegisterCategory vCategory, RegisterCategory mCategory, RegisterCategory imm8Category) {
            this.l128feature = l128feature;
            this.l256feature = l256feature;
            this.rCategory = rCategory;
            this.vCategory = vCategory;
            this.mCategory = mCategory;
            this.imm8Category = imm8Category;
        }

        public boolean check(AMD64 arch, AVXSize size, Register r, Register v, Register m) {
            return check(arch, getLFlag(size), r, v, m, null);
        }

        public boolean check(AMD64 arch, AVXSize size, Register r, Register v, Register m, Register imm8) {
            return check(arch, getLFlag(size), r, v, m, imm8);
        }

        public boolean check(AMD64 arch, int l, Register r, Register v, Register m, Register imm8) {
            switch (l) {
                case L128:
                    assert l128feature != null && arch.getFeatures().contains(l128feature) : "emitting illegal 128 bit instruction";
                    break;
                case L256:
                    assert l256feature != null && arch.getFeatures().contains(l256feature) : "emitting illegal 256 bit instruction";
                    break;
            }
            if (r != null) {
                assert r.getRegisterCategory().equals(rCategory);
            }
            if (v != null) {
                assert v.getRegisterCategory().equals(vCategory);
            }
            if (m != null) {
                assert m.getRegisterCategory().equals(mCategory);
            }
            if (imm8 != null) {
                assert imm8.getRegisterCategory().equals(imm8Category);
            }
            return true;
        }

        public boolean supports(EnumSet<CPUFeature> features, AVXSize avxSize) {
            switch (avxSize) {
                case XMM:
                    return l128feature != null && features.contains(l128feature);
                case YMM:
                    return l256feature != null && features.contains(l256feature);
                default:
                    throw GraalError.shouldNotReachHere();
            }
        }
    }

    /**
     * Base class for VEX-encoded instructions.
     */
    public static class VexOp {
        protected final int pp;
        protected final int mmmmm;
        protected final int w;
        protected final int op;

        private final String opcode;
        protected final VEXOpAssertion assertion;

        protected VexOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
            this.pp = pp;
            this.mmmmm = mmmmm;
            this.w = w;
            this.op = op;
            this.opcode = opcode;
            this.assertion = assertion;
        }

        public final boolean isSupported(AMD64Assembler vasm, AVXSize size) {
            return assertion.supports(((AMD64) vasm.target.arch).getFeatures(), size);
        }

        @Override
        public String toString() {
            return opcode;
        }
    }

    /**
     * VEX-encoded instructions with an operand order of RM, but the M operand must be a register.
     */
    public static class VexRROp extends VexOp {
        // @formatter:off
        public static final VexRROp VMASKMOVDQU = new VexRROp("VMASKMOVDQU", P_66, M_0F, WIG, 0xF7, VEXOpAssertion.AVX1_128ONLY);
        // @formatter:on

        protected VexRROp(String opcode, int pp, int mmmmm, int w, int op) {
            this(opcode, pp, mmmmm, w, op, VEXOpAssertion.AVX1);
        }

        protected VexRROp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
            super(opcode, pp, mmmmm, w, op, assertion);
        }

        public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src) {
            assert assertion.check((AMD64) asm.target.arch, size, dst, null, src);
            // 0x1A (VBROADCASTF128) and 0x5A (VPBROADCASTI128) exist only with memory sources.
            assert op != 0x1A && op != 0x5A;
            asm.vexPrefix(dst, Register.None, src, size, pp, mmmmm, w, false);
            asm.emitByte(op);
            asm.emitModRM(dst, src);
        }
    }

    /**
     * VEX-encoded instructions with an operand order of RM.
     */
    public static class VexRMOp extends VexRROp {
        // @formatter:off
        public static final VexRMOp VCVTTSS2SI      = new VexRMOp("VCVTTSS2SI",      P_F3, M_0F,   W0,  0x2C, VEXOpAssertion.CPU_XMM);
        public static final VexRMOp VCVTTSS2SQ      = new VexRMOp("VCVTTSS2SQ",      P_F3, M_0F,   W1,  0x2C, VEXOpAssertion.CPU_XMM);
        public static final VexRMOp VCVTTSD2SI      = new VexRMOp("VCVTTSD2SI",      P_F2, M_0F,   W0,  0x2C, VEXOpAssertion.CPU_XMM);
        public static final VexRMOp VCVTTSD2SQ      = new VexRMOp("VCVTTSD2SQ",      P_F2, M_0F,   W1,  0x2C, VEXOpAssertion.CPU_XMM);
        public static final VexRMOp VCVTPS2PD       = new VexRMOp("VCVTPS2PD",       P_,   M_0F,   WIG, 0x5A);
        public static final VexRMOp VCVTPD2PS       = new VexRMOp("VCVTPD2PS",       P_66, M_0F,   WIG, 0x5A);
        public static final VexRMOp VCVTDQ2PS       = new VexRMOp("VCVTDQ2PS",       P_,   M_0F,   WIG, 0x5B);
        public static final VexRMOp VCVTTPS2DQ      = new VexRMOp("VCVTTPS2DQ",      P_F3, M_0F,   WIG, 0x5B);
        public static final VexRMOp VCVTTPD2DQ      = new VexRMOp("VCVTTPD2DQ",      P_66, M_0F,   WIG, 0xE6);
        public static final VexRMOp VCVTDQ2PD       = new VexRMOp("VCVTDQ2PD",       P_F3, M_0F,   WIG, 0xE6);
        public static final VexRMOp VBROADCASTSS    = new VexRMOp("VBROADCASTSS",    P_66, M_0F38, W0,  0x18);
        public static final VexRMOp VBROADCASTSD    = new VexRMOp("VBROADCASTSD",    P_66, M_0F38, W0,  0x19, VEXOpAssertion.AVX1_256ONLY);
        public static final VexRMOp VBROADCASTF128  = new VexRMOp("VBROADCASTF128",  P_66, M_0F38, W0,  0x1A, VEXOpAssertion.AVX1_256ONLY);
        public static final VexRMOp VPBROADCASTI128 = new VexRMOp("VPBROADCASTI128", P_66, M_0F38, W0,  0x5A, VEXOpAssertion.AVX2_256ONLY);
        public static final VexRMOp VPBROADCASTB    = new VexRMOp("VPBROADCASTB",    P_66, M_0F38, W0,  0x78, VEXOpAssertion.AVX2);
        public static final VexRMOp VPBROADCASTW    = new VexRMOp("VPBROADCASTW",    P_66, M_0F38, W0,  0x79, VEXOpAssertion.AVX2);
        public static final VexRMOp VPBROADCASTD    = new VexRMOp("VPBROADCASTD",    P_66, M_0F38, W0,  0x58, VEXOpAssertion.AVX2);
        public static final VexRMOp VPBROADCASTQ    = new VexRMOp("VPBROADCASTQ",    P_66, M_0F38, W0,  0x59, VEXOpAssertion.AVX2);
        public static final VexRMOp VPMOVMSKB       = new VexRMOp("VPMOVMSKB",       P_66, M_0F,   WIG, 0xD7, VEXOpAssertion.AVX1_2_CPU_XMM);
        public static final VexRMOp VPMOVSXBW       = new VexRMOp("VPMOVSXBW",       P_66, M_0F38, WIG, 0x20);
        public static final VexRMOp VPMOVSXBD       = new VexRMOp("VPMOVSXBD",       P_66, M_0F38, WIG, 0x21);
        public static final VexRMOp VPMOVSXBQ       = new VexRMOp("VPMOVSXBQ",       P_66, M_0F38, WIG, 0x22);
        public static final VexRMOp VPMOVSXWD       = new VexRMOp("VPMOVSXWD",       P_66, M_0F38, WIG, 0x23);
        public static final VexRMOp VPMOVSXWQ       = new VexRMOp("VPMOVSXWQ",       P_66, M_0F38, WIG, 0x24);
        public static final VexRMOp VPMOVSXDQ       = new VexRMOp("VPMOVSXDQ",       P_66, M_0F38, WIG, 0x25);
        public static final VexRMOp VPMOVZXBW       = new VexRMOp("VPMOVZXBW",       P_66, M_0F38, WIG, 0x30);
        public static final VexRMOp VPMOVZXBD       = new VexRMOp("VPMOVZXBD",       P_66, M_0F38, WIG, 0x31);
        public static final VexRMOp VPMOVZXBQ       = new VexRMOp("VPMOVZXBQ",       P_66, M_0F38, WIG, 0x32);
        public static final VexRMOp VPMOVZXWD       = new VexRMOp("VPMOVZXWD",       P_66, M_0F38, WIG, 0x33);
        public static final VexRMOp VPMOVZXWQ       = new VexRMOp("VPMOVZXWQ",       P_66, M_0F38, WIG, 0x34);
        public static final VexRMOp VPMOVZXDQ       = new VexRMOp("VPMOVZXDQ",       P_66, M_0F38, WIG, 0x35);
        public static final VexRMOp VPTEST          = new VexRMOp("VPTEST",          P_66, M_0F38, WIG, 0x17);
        public static final VexRMOp VSQRTPD         = new VexRMOp("VSQRTPD",         P_66, M_0F,   WIG, 0x51);
        public static final VexRMOp VSQRTPS         = new VexRMOp("VSQRTPS",         P_,   M_0F,   WIG, 0x51);
        public static final VexRMOp VSQRTSD         = new VexRMOp("VSQRTSD",         P_F2, M_0F,   WIG, 0x51);
        public static final VexRMOp VSQRTSS         = new VexRMOp("VSQRTSS",         P_F3, M_0F,   WIG, 0x51);
        public static final VexRMOp VUCOMISS        = new VexRMOp("VUCOMISS",        P_,   M_0F,   WIG, 0x2E);
        public static final VexRMOp VUCOMISD        = new VexRMOp("VUCOMISD",        P_66, M_0F,   WIG, 0x2E);
        // @formatter:on

        protected VexRMOp(String opcode, int pp, int mmmmm, int w, int op) {
            this(opcode, pp, mmmmm, w, op, VEXOpAssertion.AVX1);
        }

        protected VexRMOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
            super(opcode, pp, mmmmm, w, op, assertion);
        }

        public void emit(AMD64Assembler asm, AVXSize size, Register dst, AMD64Address src) {
            assert assertion.check((AMD64) asm.target.arch, size, dst, null, null);
            asm.vexPrefix(dst, Register.None, src, size, pp, mmmmm, w, false);
            asm.emitByte(op);
            asm.emitOperandHelper(dst, src, 0);
        }
    }
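
    // Usage sketch (illustrative only, assuming AVX is available and ymm0/ymm1 are in scope):
    //   VexRMOp.VPTEST.emit(asm, AVXSize.YMM, ymm0, ymm1); // vptest ymm0, ymm1 (C4 E2 7D 17 C1)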

    /**
     * VEX-encoded move instructions.
     * <p>
     * These instructions have two opcodes: op is the forward move instruction with an operand order
     * of RM, and opReverse is the reverse move instruction with an operand order of MR.
     */
    public static final class VexMoveOp extends VexRMOp {
        // @formatter:off
        public static final VexMoveOp VMOVDQA = new VexMoveOp("VMOVDQA", P_66, M_0F, WIG, 0x6F, 0x7F);
        public static final VexMoveOp VMOVDQU = new VexMoveOp("VMOVDQU", P_F3, M_0F, WIG, 0x6F, 0x7F);
        public static final VexMoveOp VMOVAPS = new VexMoveOp("VMOVAPS", P_,   M_0F, WIG, 0x28, 0x29);
        public static final VexMoveOp VMOVAPD = new VexMoveOp("VMOVAPD", P_66, M_0F, WIG, 0x28, 0x29);
        public static final VexMoveOp VMOVUPS = new VexMoveOp("VMOVUPS", P_,   M_0F, WIG, 0x10, 0x11);
        public static final VexMoveOp VMOVUPD = new VexMoveOp("VMOVUPD", P_66, M_0F, WIG, 0x10, 0x11);
        public static final VexMoveOp VMOVSS  = new VexMoveOp("VMOVSS",  P_F3, M_0F, WIG, 0x10, 0x11);
        public static final VexMoveOp VMOVSD  = new VexMoveOp("VMOVSD",  P_F2, M_0F, WIG, 0x10, 0x11);
        public static final VexMoveOp VMOVD   = new VexMoveOp("VMOVD",   P_66, M_0F, W0,  0x6E, 0x7E, VEXOpAssertion.XMM_CPU);
        public static final VexMoveOp VMOVQ   = new VexMoveOp("VMOVQ",   P_66, M_0F, W1,  0x6E, 0x7E, VEXOpAssertion.XMM_CPU);
        // @formatter:on

        private final int opReverse;

        private VexMoveOp(String opcode, int pp, int mmmmm, int w, int op, int opReverse) {
            this(opcode, pp, mmmmm, w, op, opReverse, VEXOpAssertion.AVX1);
        }

        private VexMoveOp(String opcode, int pp, int mmmmm, int w, int op, int opReverse, VEXOpAssertion assertion) {
            super(opcode, pp, mmmmm, w, op, assertion);
            this.opReverse = opReverse;
        }

        public void emit(AMD64Assembler asm, AVXSize size, AMD64Address dst, Register src) {
            assert assertion.check((AMD64) asm.target.arch, size, src, null, null);
            asm.vexPrefix(src, Register.None, dst, size, pp, mmmmm, w, false);
            asm.emitByte(opReverse);
            asm.emitOperandHelper(src, dst, 0);
        }

        public void emitReverse(AMD64Assembler asm, AVXSize size, Register dst, Register src) {
            assert assertion.check((AMD64) asm.target.arch, size, src, null, dst);
            asm.vexPrefix(src, Register.None, dst, size, pp, mmmmm, w, false);
            asm.emitByte(opReverse);
            asm.emitModRM(src, dst);
        }
    }
1139 
1140     public interface VexRRIOp {
1141         void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src, int imm8);
1142     }
1143 
1144     /**
1145      * VEX-encoded instructions with an operand order of RMI.
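     * <p>
     * For example (a sketch; operands are placeholders), reversing the four dwords of an XMM
     * register:
     *
     * <pre>
     * VexRMIOp.VPSHUFD.emit(asm, AVXSize.XMM, xmm0, xmm1, 0x1B); // vpshufd xmm0, xmm1, 0x1B
     * </pre>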
     */
    public static final class VexRMIOp extends VexOp implements VexRRIOp {
        // @formatter:off
        public static final VexRMIOp VPERMQ   = new VexRMIOp("VPERMQ",   P_66, M_0F3A, W1,  0x00, VEXOpAssertion.AVX2_256ONLY);
        public static final VexRMIOp VPSHUFLW = new VexRMIOp("VPSHUFLW", P_F2, M_0F,   WIG, 0x70, VEXOpAssertion.AVX1_2);
        public static final VexRMIOp VPSHUFHW = new VexRMIOp("VPSHUFHW", P_F3, M_0F,   WIG, 0x70, VEXOpAssertion.AVX1_2);
        public static final VexRMIOp VPSHUFD  = new VexRMIOp("VPSHUFD",  P_66, M_0F,   WIG, 0x70, VEXOpAssertion.AVX1_2);
        // @formatter:on

        private VexRMIOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
            super(opcode, pp, mmmmm, w, op, assertion);
        }

        @Override
        public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src, int imm8) {
            assert assertion.check((AMD64) asm.target.arch, size, dst, null, src);
            asm.vexPrefix(dst, Register.None, src, size, pp, mmmmm, w, false);
            asm.emitByte(op);
            asm.emitModRM(dst, src);
            asm.emitByte(imm8);
        }

        public void emit(AMD64Assembler asm, AVXSize size, Register dst, AMD64Address src, int imm8) {
            assert assertion.check((AMD64) asm.target.arch, size, dst, null, null);
            asm.vexPrefix(dst, Register.None, src, size, pp, mmmmm, w, false);
            asm.emitByte(op);
            asm.emitOperandHelper(dst, src, 1);
            asm.emitByte(imm8);
        }
    }

    /**
     * VEX-encoded instructions with an operand order of MRI.
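     * <p>
     * For example (a sketch; operands are placeholders), extracting the upper 128-bit lane of a
     * 256-bit register:
     *
     * <pre>
     * VexMRIOp.VEXTRACTF128.emit(asm, AVXSize.YMM, xmm0, xmm1, 1); // vextractf128 xmm0, ymm1, 1
     * </pre>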
     */
    public static final class VexMRIOp extends VexOp implements VexRRIOp {
        // @formatter:off
        public static final VexMRIOp VEXTRACTF128 = new VexMRIOp("VEXTRACTF128", P_66, M_0F3A, W0, 0x19, VEXOpAssertion.AVX1_256ONLY);
        public static final VexMRIOp VEXTRACTI128 = new VexMRIOp("VEXTRACTI128", P_66, M_0F3A, W0, 0x39, VEXOpAssertion.AVX2_256ONLY);
        public static final VexMRIOp VPEXTRB      = new VexMRIOp("VPEXTRB",      P_66, M_0F3A, W0, 0x14, VEXOpAssertion.XMM_CPU);
        public static final VexMRIOp VPEXTRW      = new VexMRIOp("VPEXTRW",      P_66, M_0F3A, W0, 0x15, VEXOpAssertion.XMM_CPU);
        public static final VexMRIOp VPEXTRD      = new VexMRIOp("VPEXTRD",      P_66, M_0F3A, W0, 0x16, VEXOpAssertion.XMM_CPU);
        public static final VexMRIOp VPEXTRQ      = new VexMRIOp("VPEXTRQ",      P_66, M_0F3A, W1, 0x16, VEXOpAssertion.XMM_CPU);
        // @formatter:on

        private VexMRIOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
            super(opcode, pp, mmmmm, w, op, assertion);
        }

        @Override
        public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src, int imm8) {
            assert assertion.check((AMD64) asm.target.arch, size, src, null, dst);
            asm.vexPrefix(src, Register.None, dst, size, pp, mmmmm, w, false);
            asm.emitByte(op);
            asm.emitModRM(src, dst);
            asm.emitByte(imm8);
        }

        public void emit(AMD64Assembler asm, AVXSize size, AMD64Address dst, Register src, int imm8) {
            assert assertion.check((AMD64) asm.target.arch, size, src, null, null);
            asm.vexPrefix(src, Register.None, dst, size, pp, mmmmm, w, false);
            asm.emitByte(op);
            asm.emitOperandHelper(src, dst, 1);
            asm.emitByte(imm8);
        }
    }

    /**
     * VEX-encoded instructions with an operand order of RVMR.
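     * <p>
     * The trailing register operand (the blend mask) has no field of its own in the VEX prefix;
     * it is encoded in bits 7:4 of an immediate byte appended to the instruction (the /is4
     * encoding), which is why the emit methods below write {@code mask.encoding() << 4} last.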
     */
    public static class VexRVMROp extends VexOp {
        // @formatter:off
        public static final VexRVMROp VPBLENDVB  = new VexRVMROp("VPBLENDVB",  P_66, M_0F3A, W0, 0x4C, VEXOpAssertion.AVX1_2);
        public static final VexRVMROp VPBLENDVPS = new VexRVMROp("VPBLENDVPS", P_66, M_0F3A, W0, 0x4A, VEXOpAssertion.AVX1);
        public static final VexRVMROp VPBLENDVPD = new VexRVMROp("VPBLENDVPD", P_66, M_0F3A, W0, 0x4B, VEXOpAssertion.AVX1);
        // @formatter:on

        protected VexRVMROp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
            super(opcode, pp, mmmmm, w, op, assertion);
        }

        public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register mask, Register src1, Register src2) {
            assert assertion.check((AMD64) asm.target.arch, size, dst, mask, src1, src2);
            asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w, false);
            asm.emitByte(op);
            asm.emitModRM(dst, src2);
            asm.emitByte(mask.encoding() << 4);
        }

        public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register mask, Register src1, AMD64Address src2) {
            assert assertion.check((AMD64) asm.target.arch, size, dst, mask, src1, null);
            asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w, false);
            asm.emitByte(op);
            asm.emitOperandHelper(dst, src2, 0);
            asm.emitByte(mask.encoding() << 4);
        }
    }

    /**
     * VEX-encoded instructions with an operand order of RVM.
     */
    public static class VexRVMOp extends VexOp {
        // @formatter:off
        public static final VexRVMOp VANDPS    = new VexRVMOp("VANDPS",    P_,   M_0F,   WIG, 0x54);
        public static final VexRVMOp VANDPD    = new VexRVMOp("VANDPD",    P_66, M_0F,   WIG, 0x54);
        public static final VexRVMOp VANDNPS   = new VexRVMOp("VANDNPS",   P_,   M_0F,   WIG, 0x55);
        public static final VexRVMOp VANDNPD   = new VexRVMOp("VANDNPD",   P_66, M_0F,   WIG, 0x55);
        public static final VexRVMOp VORPS     = new VexRVMOp("VORPS",     P_,   M_0F,   WIG, 0x56);
        public static final VexRVMOp VORPD     = new VexRVMOp("VORPD",     P_66, M_0F,   WIG, 0x56);
        public static final VexRVMOp VXORPS    = new VexRVMOp("VXORPS",    P_,   M_0F,   WIG, 0x57);
        public static final VexRVMOp VXORPD    = new VexRVMOp("VXORPD",    P_66, M_0F,   WIG, 0x57);
        public static final VexRVMOp VADDPS    = new VexRVMOp("VADDPS",    P_,   M_0F,   WIG, 0x58);
        public static final VexRVMOp VADDPD    = new VexRVMOp("VADDPD",    P_66, M_0F,   WIG, 0x58);
        public static final VexRVMOp VADDSS    = new VexRVMOp("VADDSS",    P_F3, M_0F,   WIG, 0x58);
        public static final VexRVMOp VADDSD    = new VexRVMOp("VADDSD",    P_F2, M_0F,   WIG, 0x58);
        public static final VexRVMOp VMULPS    = new VexRVMOp("VMULPS",    P_,   M_0F,   WIG, 0x59);
        public static final VexRVMOp VMULPD    = new VexRVMOp("VMULPD",    P_66, M_0F,   WIG, 0x59);
        public static final VexRVMOp VMULSS    = new VexRVMOp("VMULSS",    P_F3, M_0F,   WIG, 0x59);
        public static final VexRVMOp VMULSD    = new VexRVMOp("VMULSD",    P_F2, M_0F,   WIG, 0x59);
        public static final VexRVMOp VSUBPS    = new VexRVMOp("VSUBPS",    P_,   M_0F,   WIG, 0x5C);
        public static final VexRVMOp VSUBPD    = new VexRVMOp("VSUBPD",    P_66, M_0F,   WIG, 0x5C);
        public static final VexRVMOp VSUBSS    = new VexRVMOp("VSUBSS",    P_F3, M_0F,   WIG, 0x5C);
        public static final VexRVMOp VSUBSD    = new VexRVMOp("VSUBSD",    P_F2, M_0F,   WIG, 0x5C);
        public static final VexRVMOp VMINPS    = new VexRVMOp("VMINPS",    P_,   M_0F,   WIG, 0x5D);
        public static final VexRVMOp VMINPD    = new VexRVMOp("VMINPD",    P_66, M_0F,   WIG, 0x5D);
        public static final VexRVMOp VMINSS    = new VexRVMOp("VMINSS",    P_F3, M_0F,   WIG, 0x5D);
        public static final VexRVMOp VMINSD    = new VexRVMOp("VMINSD",    P_F2, M_0F,   WIG, 0x5D);
        public static final VexRVMOp VDIVPS    = new VexRVMOp("VDIVPS",    P_,   M_0F,   WIG, 0x5E);
        public static final VexRVMOp VDIVPD    = new VexRVMOp("VDIVPD",    P_66, M_0F,   WIG, 0x5E);
        public static final VexRVMOp VDIVSS    = new VexRVMOp("VDIVSS",    P_F3, M_0F,   WIG, 0x5E);
        public static final VexRVMOp VDIVSD    = new VexRVMOp("VDIVSD",    P_F2, M_0F,   WIG, 0x5E);
        public static final VexRVMOp VMAXPS    = new VexRVMOp("VMAXPS",    P_,   M_0F,   WIG, 0x5F);
        public static final VexRVMOp VMAXPD    = new VexRVMOp("VMAXPD",    P_66, M_0F,   WIG, 0x5F);
        public static final VexRVMOp VMAXSS    = new VexRVMOp("VMAXSS",    P_F3, M_0F,   WIG, 0x5F);
        public static final VexRVMOp VMAXSD    = new VexRVMOp("VMAXSD",    P_F2, M_0F,   WIG, 0x5F);
        public static final VexRVMOp VADDSUBPS = new VexRVMOp("VADDSUBPS", P_F2, M_0F,   WIG, 0xD0);
        public static final VexRVMOp VADDSUBPD = new VexRVMOp("VADDSUBPD", P_66, M_0F,   WIG, 0xD0);
        public static final VexRVMOp VPAND     = new VexRVMOp("VPAND",     P_66, M_0F,   WIG, 0xDB, VEXOpAssertion.AVX1_2);
        public static final VexRVMOp VPOR      = new VexRVMOp("VPOR",      P_66, M_0F,   WIG, 0xEB, VEXOpAssertion.AVX1_2);
        public static final VexRVMOp VPXOR     = new VexRVMOp("VPXOR",     P_66, M_0F,   WIG, 0xEF, VEXOpAssertion.AVX1_2);
        public static final VexRVMOp VPADDB    = new VexRVMOp("VPADDB",    P_66, M_0F,   WIG, 0xFC, VEXOpAssertion.AVX1_2);
        public static final VexRVMOp VPADDW    = new VexRVMOp("VPADDW",    P_66, M_0F,   WIG, 0xFD, VEXOpAssertion.AVX1_2);
        public static final VexRVMOp VPADDD    = new VexRVMOp("VPADDD",    P_66, M_0F,   WIG, 0xFE, VEXOpAssertion.AVX1_2);
        public static final VexRVMOp VPADDQ    = new VexRVMOp("VPADDQ",    P_66, M_0F,   WIG, 0xD4, VEXOpAssertion.AVX1_2);
        public static final VexRVMOp VPMULHUW  = new VexRVMOp("VPMULHUW",  P_66, M_0F,   WIG, 0xE4, VEXOpAssertion.AVX1_2);
        public static final VexRVMOp VPMULHW   = new VexRVMOp("VPMULHW",   P_66, M_0F,   WIG, 0xE5, VEXOpAssertion.AVX1_2);
        public static final VexRVMOp VPMULLW   = new VexRVMOp("VPMULLW",   P_66, M_0F,   WIG, 0xD5, VEXOpAssertion.AVX1_2);
        public static final VexRVMOp VPMULLD   = new VexRVMOp("VPMULLD",   P_66, M_0F38, WIG, 0x40, VEXOpAssertion.AVX1_2);
        public static final VexRVMOp VPSUBB    = new VexRVMOp("VPSUBB",    P_66, M_0F,   WIG, 0xF8, VEXOpAssertion.AVX1_2);
        public static final VexRVMOp VPSUBW    = new VexRVMOp("VPSUBW",    P_66, M_0F,   WIG, 0xF9, VEXOpAssertion.AVX1_2);
        public static final VexRVMOp VPSUBD    = new VexRVMOp("VPSUBD",    P_66, M_0F,   WIG, 0xFA, VEXOpAssertion.AVX1_2);
        public static final VexRVMOp VPSUBQ    = new VexRVMOp("VPSUBQ",    P_66, M_0F,   WIG, 0xFB, VEXOpAssertion.AVX1_2);
        public static final VexRVMOp VPSHUFB   = new VexRVMOp("VPSHUFB",   P_66, M_0F38, WIG, 0x00, VEXOpAssertion.AVX1_2);
        public static final VexRVMOp VCVTSD2SS = new VexRVMOp("VCVTSD2SS", P_F2, M_0F,   WIG, 0x5A);
        public static final VexRVMOp VCVTSS2SD = new VexRVMOp("VCVTSS2SD", P_F3, M_0F,   WIG, 0x5A);
        public static final VexRVMOp VCVTSI2SD = new VexRVMOp("VCVTSI2SD", P_F2, M_0F,   W0,  0x2A, VEXOpAssertion.XMM_XMM_CPU);
        public static final VexRVMOp VCVTSQ2SD = new VexRVMOp("VCVTSQ2SD", P_F2, M_0F,   W1,  0x2A, VEXOpAssertion.XMM_XMM_CPU);
        public static final VexRVMOp VCVTSI2SS = new VexRVMOp("VCVTSI2SS", P_F3, M_0F,   W0,  0x2A, VEXOpAssertion.XMM_XMM_CPU);
        public static final VexRVMOp VCVTSQ2SS = new VexRVMOp("VCVTSQ2SS", P_F3, M_0F,   W1,  0x2A, VEXOpAssertion.XMM_XMM_CPU);
        public static final VexRVMOp VPCMPEQB  = new VexRVMOp("VPCMPEQB",  P_66, M_0F,   WIG, 0x74, VEXOpAssertion.AVX1_2);
        public static final VexRVMOp VPCMPEQW  = new VexRVMOp("VPCMPEQW",  P_66, M_0F,   WIG, 0x75, VEXOpAssertion.AVX1_2);
        public static final VexRVMOp VPCMPEQD  = new VexRVMOp("VPCMPEQD",  P_66, M_0F,   WIG, 0x76, VEXOpAssertion.AVX1_2);
        public static final VexRVMOp VPCMPEQQ  = new VexRVMOp("VPCMPEQQ",  P_66, M_0F38, WIG, 0x29, VEXOpAssertion.AVX1_2);
        public static final VexRVMOp VPCMPGTB  = new VexRVMOp("VPCMPGTB",  P_66, M_0F,   WIG, 0x64, VEXOpAssertion.AVX1_2);
        public static final VexRVMOp VPCMPGTW  = new VexRVMOp("VPCMPGTW",  P_66, M_0F,   WIG, 0x65, VEXOpAssertion.AVX1_2);
        public static final VexRVMOp VPCMPGTD  = new VexRVMOp("VPCMPGTD",  P_66, M_0F,   WIG, 0x66, VEXOpAssertion.AVX1_2);
        public static final VexRVMOp VPCMPGTQ  = new VexRVMOp("VPCMPGTQ",  P_66, M_0F38, WIG, 0x37, VEXOpAssertion.AVX1_2);
        // @formatter:on

        private VexRVMOp(String opcode, int pp, int mmmmm, int w, int op) {
            this(opcode, pp, mmmmm, w, op, VEXOpAssertion.AVX1);
        }

        protected VexRVMOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
            super(opcode, pp, mmmmm, w, op, assertion);
        }

        public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, Register src2) {
            assert assertion.check((AMD64) asm.target.arch, size, dst, src1, src2);
            asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w, false);
            asm.emitByte(op);
            asm.emitModRM(dst, src2);
        }

        public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, AMD64Address src2) {
            assert assertion.check((AMD64) asm.target.arch, size, dst, src1, null);
            asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w, false);
            asm.emitByte(op);
            asm.emitOperandHelper(dst, src2, 0);
        }
    }

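    /**
     * VEX-encoded general purpose instructions with an operand order of RVM, e.g.
     * {@code andn dst, src1, src2}. The requested operand size (DWORD or QWORD) selects the
     * VEX.W bit.
     */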
    public static final class VexGeneralPurposeRVMOp extends VexRVMOp {
        // @formatter:off
        public static final VexGeneralPurposeRVMOp ANDN   = new VexGeneralPurposeRVMOp("ANDN",   P_,   M_0F38, WIG, 0xF2, VEXOpAssertion.BMI1);
        public static final VexGeneralPurposeRVMOp MULX   = new VexGeneralPurposeRVMOp("MULX",   P_F2, M_0F38, WIG, 0xF6, VEXOpAssertion.BMI2);
        public static final VexGeneralPurposeRVMOp PDEP   = new VexGeneralPurposeRVMOp("PDEP",   P_F2, M_0F38, WIG, 0xF5, VEXOpAssertion.BMI2);
        public static final VexGeneralPurposeRVMOp PEXT   = new VexGeneralPurposeRVMOp("PEXT",   P_F3, M_0F38, WIG, 0xF5, VEXOpAssertion.BMI2);
        // @formatter:on

        private VexGeneralPurposeRVMOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
            super(opcode, pp, mmmmm, w, op, assertion);
        }

        @Override
        public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, Register src2) {
            assert assertion.check((AMD64) asm.target.arch, LZ, dst, src1, src2, null);
            assert size == AVXSize.DWORD || size == AVXSize.QWORD;
            asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, size == AVXSize.DWORD ? W0 : W1, false);
            asm.emitByte(op);
            asm.emitModRM(dst, src2);
        }

        @Override
        public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, AMD64Address src2) {
            assert assertion.check((AMD64) asm.target.arch, LZ, dst, src1, null, null);
            assert size == AVXSize.DWORD || size == AVXSize.QWORD;
            asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, size == AVXSize.DWORD ? W0 : W1, false);
            asm.emitByte(op);
            asm.emitOperandHelper(dst, src2, 0);
        }
    }

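    /**
     * VEX-encoded general purpose instructions with an operand order of RMV, i.e., the VEX.vvvv
     * field encodes the last operand (e.g. the shift count of {@code shlx dst, src, count}).
     */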
    public static final class VexGeneralPurposeRMVOp extends VexOp {
        // @formatter:off
        public static final VexGeneralPurposeRMVOp BEXTR  = new VexGeneralPurposeRMVOp("BEXTR",  P_,   M_0F38, WIG, 0xF7, VEXOpAssertion.BMI1);
        public static final VexGeneralPurposeRMVOp BZHI   = new VexGeneralPurposeRMVOp("BZHI",   P_,   M_0F38, WIG, 0xF5, VEXOpAssertion.BMI2);
        public static final VexGeneralPurposeRMVOp SARX   = new VexGeneralPurposeRMVOp("SARX",   P_F3, M_0F38, WIG, 0xF7, VEXOpAssertion.BMI2);
        public static final VexGeneralPurposeRMVOp SHRX   = new VexGeneralPurposeRMVOp("SHRX",   P_F2, M_0F38, WIG, 0xF7, VEXOpAssertion.BMI2);
        public static final VexGeneralPurposeRMVOp SHLX   = new VexGeneralPurposeRMVOp("SHLX",   P_66, M_0F38, WIG, 0xF7, VEXOpAssertion.BMI2);
        // @formatter:on

        private VexGeneralPurposeRMVOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
            super(opcode, pp, mmmmm, w, op, assertion);
        }

        public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, Register src2) {
            assert assertion.check((AMD64) asm.target.arch, LZ, dst, src2, src1, null);
            assert size == AVXSize.DWORD || size == AVXSize.QWORD;
            asm.vexPrefix(dst, src2, src1, size, pp, mmmmm, size == AVXSize.DWORD ? W0 : W1, false);
            asm.emitByte(op);
            asm.emitModRM(dst, src1);
        }

        public void emit(AMD64Assembler asm, AVXSize size, Register dst, AMD64Address src1, Register src2) {
            assert assertion.check((AMD64) asm.target.arch, LZ, dst, src2, null, null);
            assert size == AVXSize.DWORD || size == AVXSize.QWORD;
            asm.vexPrefix(dst, src2, src1, size, pp, mmmmm, size == AVXSize.DWORD ? W0 : W1, false);
            asm.emitByte(op);
            asm.emitOperandHelper(dst, src1, 0);
        }
    }

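    /**
     * VEX-encoded general purpose unary instructions such as {@code blsi dst, src}. The opcode is
     * shared and disambiguated by an extension value in the ModRM reg field, while the
     * destination register is encoded in VEX.vvvv.
     */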
    public static final class VexGeneralPurposeRMOp extends VexRMOp {
        // @formatter:off
        public static final VexGeneralPurposeRMOp BLSI    = new VexGeneralPurposeRMOp("BLSI",   P_,    M_0F38, WIG, 0xF3, 3, VEXOpAssertion.BMI1);
        public static final VexGeneralPurposeRMOp BLSMSK  = new VexGeneralPurposeRMOp("BLSMSK", P_,    M_0F38, WIG, 0xF3, 2, VEXOpAssertion.BMI1);
        public static final VexGeneralPurposeRMOp BLSR    = new VexGeneralPurposeRMOp("BLSR",   P_,    M_0F38, WIG, 0xF3, 1, VEXOpAssertion.BMI1);
        // @formatter:on
        private final int ext;

        private VexGeneralPurposeRMOp(String opcode, int pp, int mmmmm, int w, int op, int ext, VEXOpAssertion assertion) {
            super(opcode, pp, mmmmm, w, op, assertion);
            this.ext = ext;
        }

        @Override
        public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src) {
            assert assertion.check((AMD64) asm.target.arch, size, dst, null, null);
            asm.vexPrefix(AMD64.cpuRegisters[ext], dst, src, size, pp, mmmmm, size == AVXSize.DWORD ? W0 : W1, false);
            asm.emitByte(op);
            asm.emitModRM(ext, src);
        }

        @Override
        public void emit(AMD64Assembler asm, AVXSize size, Register dst, AMD64Address src) {
            assert assertion.check((AMD64) asm.target.arch, size, dst, null, null);
            asm.vexPrefix(AMD64.cpuRegisters[ext], dst, src, size, pp, mmmmm, size == AVXSize.DWORD ? W0 : W1, false);
            asm.emitByte(op);
            asm.emitOperandHelper(ext, src, 0);
        }
    }

    /**
     * VEX-encoded shift instructions with an operand order of either RVM or VMI.
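     * <p>
     * For example (a sketch; operands are placeholders), a variable shift uses the RVM form and
     * an immediate shift the VMI form:
     *
     * <pre>
     * VexShiftOp.VPSLLD.emit(asm, AVXSize.XMM, xmm0, xmm1, xmm2); // vpslld xmm0, xmm1, xmm2
     * VexShiftOp.VPSLLD.emit(asm, AVXSize.XMM, xmm0, xmm1, 4);    // vpslld xmm0, xmm1, 4
     * </pre>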
     */
    public static final class VexShiftOp extends VexRVMOp implements VexRRIOp {
        // @formatter:off
        public static final VexShiftOp VPSRLW = new VexShiftOp("VPSRLW", P_66, M_0F, WIG, 0xD1, 0x71, 2);
        public static final VexShiftOp VPSRLD = new VexShiftOp("VPSRLD", P_66, M_0F, WIG, 0xD2, 0x72, 2);
        public static final VexShiftOp VPSRLQ = new VexShiftOp("VPSRLQ", P_66, M_0F, WIG, 0xD3, 0x73, 2);
        public static final VexShiftOp VPSRAW = new VexShiftOp("VPSRAW", P_66, M_0F, WIG, 0xE1, 0x71, 4);
        public static final VexShiftOp VPSRAD = new VexShiftOp("VPSRAD", P_66, M_0F, WIG, 0xE2, 0x72, 4);
        public static final VexShiftOp VPSLLW = new VexShiftOp("VPSLLW", P_66, M_0F, WIG, 0xF1, 0x71, 6);
        public static final VexShiftOp VPSLLD = new VexShiftOp("VPSLLD", P_66, M_0F, WIG, 0xF2, 0x72, 6);
        public static final VexShiftOp VPSLLQ = new VexShiftOp("VPSLLQ", P_66, M_0F, WIG, 0xF3, 0x73, 6);
        // @formatter:on

        private final int immOp;
        private final int r;

        private VexShiftOp(String opcode, int pp, int mmmmm, int w, int op, int immOp, int r) {
            super(opcode, pp, mmmmm, w, op, VEXOpAssertion.AVX1_2);
            this.immOp = immOp;
            this.r = r;
        }

        @Override
        public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src, int imm8) {
            assert assertion.check((AMD64) asm.target.arch, size, null, dst, src);
            asm.vexPrefix(null, dst, src, size, pp, mmmmm, w, false);
            asm.emitByte(immOp);
            asm.emitModRM(r, src);
            asm.emitByte(imm8);
        }
    }

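    /**
     * VEX-encoded masked move instructions, with the mask register encoded in VEX.vvvv. Like
     * {@link VexMoveOp}, each op has a forward (load) opcode and a reverse (store) opcode.
     */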
    public static final class VexMaskMoveOp extends VexOp {
        // @formatter:off
        public static final VexMaskMoveOp VMASKMOVPS = new VexMaskMoveOp("VMASKMOVPS", P_66, M_0F38, W0, 0x2C, 0x2E);
        public static final VexMaskMoveOp VMASKMOVPD = new VexMaskMoveOp("VMASKMOVPD", P_66, M_0F38, W0, 0x2D, 0x2F);
        public static final VexMaskMoveOp VPMASKMOVD = new VexMaskMoveOp("VPMASKMOVD", P_66, M_0F38, W0, 0x8C, 0x8E, VEXOpAssertion.AVX2);
        public static final VexMaskMoveOp VPMASKMOVQ = new VexMaskMoveOp("VPMASKMOVQ", P_66, M_0F38, W1, 0x8C, 0x8E, VEXOpAssertion.AVX2);
        // @formatter:on

        private final int opReverse;

        private VexMaskMoveOp(String opcode, int pp, int mmmmm, int w, int op, int opReverse) {
            this(opcode, pp, mmmmm, w, op, opReverse, VEXOpAssertion.AVX1);
        }

        private VexMaskMoveOp(String opcode, int pp, int mmmmm, int w, int op, int opReverse, VEXOpAssertion assertion) {
            super(opcode, pp, mmmmm, w, op, assertion);
            this.opReverse = opReverse;
        }

        public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register mask, AMD64Address src) {
            assert assertion.check((AMD64) asm.target.arch, size, dst, mask, null);
            asm.vexPrefix(dst, mask, src, size, pp, mmmmm, w, false);
            asm.emitByte(op);
            asm.emitOperandHelper(dst, src, 0);
        }

        public void emit(AMD64Assembler asm, AVXSize size, AMD64Address dst, Register mask, Register src) {
            assert assertion.check((AMD64) asm.target.arch, size, src, mask, null);
            asm.vexPrefix(src, mask, dst, size, pp, mmmmm, w, false);
            asm.emitByte(opReverse);
            asm.emitOperandHelper(src, dst, 0);
        }
    }

    /**
     * VEX-encoded instructions with an operand order of RVMI.
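     * <p>
     * For example (a sketch; operands are placeholders), inserting a 128-bit lane into the upper
     * half of a 256-bit register:
     *
     * <pre>
     * VexRVMIOp.VINSERTF128.emit(asm, AVXSize.YMM, xmm0, xmm1, xmm2, 1); // vinsertf128 ymm0, ymm1, xmm2, 1
     * </pre>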
     */
    public static final class VexRVMIOp extends VexOp {
        // @formatter:off
        public static final VexRVMIOp VSHUFPS     = new VexRVMIOp("VSHUFPS",     P_,   M_0F,   WIG, 0xC6);
        public static final VexRVMIOp VSHUFPD     = new VexRVMIOp("VSHUFPD",     P_66, M_0F,   WIG, 0xC6);
        public static final VexRVMIOp VINSERTF128 = new VexRVMIOp("VINSERTF128", P_66, M_0F3A, W0,  0x18, VEXOpAssertion.AVX1_256ONLY);
        public static final VexRVMIOp VINSERTI128 = new VexRVMIOp("VINSERTI128", P_66, M_0F3A, W0,  0x38, VEXOpAssertion.AVX2_256ONLY);
        // @formatter:on

        private VexRVMIOp(String opcode, int pp, int mmmmm, int w, int op) {
            this(opcode, pp, mmmmm, w, op, VEXOpAssertion.AVX1);
        }

        private VexRVMIOp(String opcode, int pp, int mmmmm, int w, int op, VEXOpAssertion assertion) {
            super(opcode, pp, mmmmm, w, op, assertion);
        }

        public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, Register src2, int imm8) {
            assert assertion.check((AMD64) asm.target.arch, size, dst, src1, src2);
            assert (imm8 & 0xFF) == imm8;
            asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w, false);
            asm.emitByte(op);
            asm.emitModRM(dst, src2);
            asm.emitByte(imm8);
        }

        public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, AMD64Address src2, int imm8) {
            assert assertion.check((AMD64) asm.target.arch, size, dst, src1, null);
            assert (imm8 & 0xFF) == imm8;
            asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w, false);
            asm.emitByte(op);
            asm.emitOperandHelper(dst, src2, 1);
            asm.emitByte(imm8);
        }
    }

    /**
     * VEX-encoded comparison operation with an operand order of RVMI. The immediate operand is a
     * comparison operator.
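     * <p>
     * For example (a sketch; operands are placeholders), an ordered less-than comparison of two
     * vectors of doubles:
     *
     * <pre>
     * VexFloatCompareOp.VCMPPD.emit(asm, AVXSize.YMM, xmm0, xmm1, xmm2, Predicate.LT_OQ);
     * </pre>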
     */
    public static final class VexFloatCompareOp extends VexOp {
        // @formatter:off
        public static final VexFloatCompareOp VCMPPS = new VexFloatCompareOp("VCMPPS", P_,   M_0F, WIG, 0xC2);
        public static final VexFloatCompareOp VCMPPD = new VexFloatCompareOp("VCMPPD", P_66, M_0F, WIG, 0xC2);
        public static final VexFloatCompareOp VCMPSS = new VexFloatCompareOp("VCMPSS", P_F3, M_0F, WIG, 0xC2);
        public static final VexFloatCompareOp VCMPSD = new VexFloatCompareOp("VCMPSD", P_F2, M_0F, WIG, 0xC2);
        // @formatter:on

        public enum Predicate {
            EQ_OQ(0x00),
            LT_OS(0x01),
            LE_OS(0x02),
            UNORD_Q(0x03),
            NEQ_UQ(0x04),
            NLT_US(0x05),
            NLE_US(0x06),
            ORD_Q(0x07),
            EQ_UQ(0x08),
            NGE_US(0x09),
            NGT_US(0x0a),
            FALSE_OQ(0x0b),
            NEQ_OQ(0x0c),
            GE_OS(0x0d),
            GT_OS(0x0e),
            TRUE_UQ(0x0f),
            EQ_OS(0x10),
            LT_OQ(0x11),
            LE_OQ(0x12),
            UNORD_S(0x13),
            NEQ_US(0x14),
            NLT_UQ(0x15),
            NLE_UQ(0x16),
            ORD_S(0x17),
            EQ_US(0x18),
            NGE_UQ(0x19),
            NGT_UQ(0x1a),
            FALSE_OS(0x1b),
            NEQ_OS(0x1c),
            GE_OQ(0x1d),
            GT_OQ(0x1e),
            TRUE_US(0x1f);

            private final int imm8;

            Predicate(int imm8) {
                this.imm8 = imm8;
            }

            public static Predicate getPredicate(Condition condition, boolean unorderedIsTrue) {
                if (unorderedIsTrue) {
                    switch (condition) {
                        case EQ:
                            return EQ_UQ;
                        case NE:
                            return NEQ_UQ;
                        case LT:
                            return NGE_UQ;
                        case LE:
                            return NGT_UQ;
                        case GT:
                            return NLE_UQ;
                        case GE:
                            return NLT_UQ;
                        default:
                            throw GraalError.shouldNotReachHere();
                    }
                } else {
                    switch (condition) {
                        case EQ:
                            return EQ_OQ;
                        case NE:
                            return NEQ_OQ;
                        case LT:
                            return LT_OQ;
                        case LE:
                            return LE_OQ;
                        case GT:
                            return GT_OQ;
                        case GE:
                            return GE_OQ;
                        default:
                            throw GraalError.shouldNotReachHere();
                    }
                }
            }
        }

        private VexFloatCompareOp(String opcode, int pp, int mmmmm, int w, int op) {
            super(opcode, pp, mmmmm, w, op, VEXOpAssertion.AVX1);
        }

        public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, Register src2, Predicate p) {
            assert assertion.check((AMD64) asm.target.arch, size, dst, src1, src2);
            asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w, false);
            asm.emitByte(op);
            asm.emitModRM(dst, src2);
            asm.emitByte(p.imm8);
        }

        public void emit(AMD64Assembler asm, AVXSize size, Register dst, Register src1, AMD64Address src2, Predicate p) {
            assert assertion.check((AMD64) asm.target.arch, size, dst, src1, null);
            asm.vexPrefix(dst, src1, src2, size, pp, mmmmm, w, false);
            asm.emitByte(op);
            asm.emitOperandHelper(dst, src2, 1);
            asm.emitByte(p.imm8);
        }
    }

    public final void addl(AMD64Address dst, int imm32) {
        ADD.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
    }

    public final void addl(Register dst, int imm32) {
        ADD.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
    }

    public final void addl(Register dst, Register src) {
        ADD.rmOp.emit(this, DWORD, dst, src);
    }

    public final void addpd(Register dst, Register src) {
        SSEOp.ADD.emit(this, PD, dst, src);
    }

    public final void addpd(Register dst, AMD64Address src) {
        SSEOp.ADD.emit(this, PD, dst, src);
    }

    public final void addsd(Register dst, Register src) {
        SSEOp.ADD.emit(this, SD, dst, src);
    }

    public final void addsd(Register dst, AMD64Address src) {
        SSEOp.ADD.emit(this, SD, dst, src);
    }

    private void addrNop4() {
        // 4 bytes: NOP DWORD PTR [EAX+0]
        emitByte(0x0F);
        emitByte(0x1F);
        emitByte(0x40); // emitRm(cbuf, 0x1, EAXEnc, EAXEnc);
        emitByte(0); // 8-bits offset (1 byte)
    }

    private void addrNop5() {
        // 5 bytes: NOP DWORD PTR [EAX+EAX*0+0] 8-bits offset
        emitByte(0x0F);
        emitByte(0x1F);
        emitByte(0x44); // emitRm(cbuf, 0x1, EAXEnc, 0x4);
        emitByte(0x00); // emitRm(cbuf, 0x0, EAXEnc, EAXEnc);
        emitByte(0); // 8-bits offset (1 byte)
    }

    private void addrNop7() {
        // 7 bytes: NOP DWORD PTR [EAX+0] 32-bits offset
        emitByte(0x0F);
        emitByte(0x1F);
        emitByte(0x80); // emitRm(cbuf, 0x2, EAXEnc, EAXEnc);
        emitInt(0); // 32-bits offset (4 bytes)
    }

    private void addrNop8() {
        // 8 bytes: NOP DWORD PTR [EAX+EAX*0+0] 32-bits offset
        emitByte(0x0F);
        emitByte(0x1F);
        emitByte(0x84); // emitRm(cbuf, 0x2, EAXEnc, 0x4);
        emitByte(0x00); // emitRm(cbuf, 0x0, EAXEnc, EAXEnc);
        emitInt(0); // 32-bits offset (4 bytes)
    }

    public final void andl(Register dst, int imm32) {
        AND.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
    }

    public final void andl(Register dst, Register src) {
        AND.rmOp.emit(this, DWORD, dst, src);
    }

    public final void andpd(Register dst, Register src) {
        SSEOp.AND.emit(this, PD, dst, src);
    }

    public final void andpd(Register dst, AMD64Address src) {
        SSEOp.AND.emit(this, PD, dst, src);
    }

    public final void bsfq(Register dst, Register src) {
        prefixq(dst, src);
        emitByte(0x0F);
        emitByte(0xBC);
        emitModRM(dst, src);
    }

    public final void bsrl(Register dst, Register src) {
        prefix(dst, src);
        emitByte(0x0F);
        emitByte(0xBD);
        emitModRM(dst, src);
    }

    public final void bswapl(Register reg) {
        prefix(reg);
        emitByte(0x0F);
        emitModRM(1, reg);
    }

    public final void cdql() {
        emitByte(0x99);
    }

    public final void cmovl(ConditionFlag cc, Register dst, Register src) {
        prefix(dst, src);
        emitByte(0x0F);
        emitByte(0x40 | cc.getValue());
        emitModRM(dst, src);
    }

    public final void cmovl(ConditionFlag cc, Register dst, AMD64Address src) {
        prefix(src, dst);
        emitByte(0x0F);
        emitByte(0x40 | cc.getValue());
        emitOperandHelper(dst, src, 0);
    }

    public final void cmpb(Register dst, Register src) {
        CMP.byteRmOp.emit(this, BYTE, dst, src);
    }

    public final void cmpw(Register dst, Register src) {
        CMP.rmOp.emit(this, WORD, dst, src);
    }

    public final void cmpl(Register dst, int imm32) {
        CMP.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
    }

    public final void cmpl(Register dst, Register src) {
        CMP.rmOp.emit(this, DWORD, dst, src);
    }

    public final void cmpl(Register dst, AMD64Address src) {
        CMP.rmOp.emit(this, DWORD, dst, src);
    }

    public final void cmpl(AMD64Address dst, int imm32) {
        CMP.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
    }

    /**
     * The 8-bit cmpxchg compares the value at adr with the contents of rax. If they are equal,
     * reg is stored to adr; otherwise, the value at adr is loaded into rax. The ZF flag is set if
     * the compared values were equal, and cleared otherwise.
     */
    public final void cmpxchgb(Register reg, AMD64Address adr) { // cmpxchg
        prefixb(adr, reg);
        emitByte(0x0F);
        emitByte(0xB0);
        emitOperandHelper(reg, adr, 0);
    }

    /**
     * The 16-bit cmpxchg compares the value at adr with the contents of rax. If they are equal,
     * reg is stored to adr; otherwise, the value at adr is loaded into rax. The ZF flag is set if
     * the compared values were equal, and cleared otherwise.
     */
    public final void cmpxchgw(Register reg, AMD64Address adr) { // cmpxchg
        emitByte(0x66); // Switch to 16-bit mode.
        prefix(adr, reg);
        emitByte(0x0F);
        emitByte(0xB1);
        emitOperandHelper(reg, adr, 0);
    }

    /**
     * The 32-bit cmpxchg compares the value at adr with the contents of rax. If they are equal,
     * reg is stored to adr; otherwise, the value at adr is loaded into rax. The ZF flag is set if
     * the compared values were equal, and cleared otherwise.
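     * <p>
     * A typical atomic compare-and-swap sequence would look like the following sketch
     * ({@code expectedValue}, {@code newValue} and {@code addr} are hypothetical placeholders):
     *
     * <pre>
     * movl(rax, expectedValue); // cmpxchg implicitly compares against rax
     * lock();                   // make the exchange atomic
     * cmpxchgl(newValue, addr); // ZF is set iff the swap succeeded
     * </pre>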
     */
    public final void cmpxchgl(Register reg, AMD64Address adr) { // cmpxchg
        prefix(adr, reg);
        emitByte(0x0F);
        emitByte(0xB1);
        emitOperandHelper(reg, adr, 0);
    }

    public final void cvtsi2sdl(Register dst, Register src) {
        SSEOp.CVTSI2SD.emit(this, DWORD, dst, src);
    }

    public final void cvttsd2sil(Register dst, Register src) {
        SSEOp.CVTTSD2SI.emit(this, DWORD, dst, src);
    }

    public final void decl(AMD64Address dst) {
        prefix(dst);
        emitByte(0xFF);
        emitOperandHelper(1, dst, 0);
    }

    public final void divsd(Register dst, Register src) {
        SSEOp.DIV.emit(this, SD, dst, src);
    }

    public final void hlt() {
        emitByte(0xF4);
    }

    public final void imull(Register dst, Register src, int value) {
        if (isByte(value)) {
            AMD64RMIOp.IMUL_SX.emit(this, DWORD, dst, src, value);
        } else {
            AMD64RMIOp.IMUL.emit(this, DWORD, dst, src, value);
        }
    }

    public final void incl(AMD64Address dst) {
        prefix(dst);
        emitByte(0xFF);
        emitOperandHelper(0, dst, 0);
    }

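    /**
     * Emits a conditional jump to the given absolute code position. The 2-byte short form is used
     * if the displacement fits into a byte (unless {@code forceDisp32} is set), otherwise the
     * 6-byte near form is used. The displacement is relative to the end of the emitted
     * instruction, hence the {@code disp - shortSize} and {@code disp - longSize} adjustments
     * below.
     */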
    public void jcc(ConditionFlag cc, int jumpTarget, boolean forceDisp32) {
        int shortSize = 2;
        int longSize = 6;
        long disp = jumpTarget - position();
        if (!forceDisp32 && isByte(disp - shortSize)) {
            // 0111 tttn #8-bit disp
            emitByte(0x70 | cc.getValue());
            emitByte((int) ((disp - shortSize) & 0xFF));
        } else {
            // 0000 1111 1000 tttn #32-bit disp
            assert isInt(disp - longSize) : "must be 32bit offset (call4)";
            emitByte(0x0F);
            emitByte(0x80 | cc.getValue());
            emitInt((int) (disp - longSize));
        }
    }

    public final void jcc(ConditionFlag cc, Label l) {
        assert (0 <= cc.getValue()) && (cc.getValue() < 16) : "illegal cc";
        if (l.isBound()) {
            jcc(cc, l.position(), false);
        } else {
            // Note: we could eliminate conditional jumps to this jump if the condition
            // is the same; however, that seems to be a rather unlikely case.
            // Note: use jccb() if the label to be bound is very close, to get
            // an 8-bit displacement.
            l.addPatchAt(position());
            emitByte(0x0F);
            emitByte(0x80 | cc.getValue());
            emitInt(0);
        }
    }

    public final void jccb(ConditionFlag cc, Label l) {
        if (l.isBound()) {
            int shortSize = 2;
            int entry = l.position();
            assert isByte(entry - (position() + shortSize)) : "Displacement too large for a short jmp";
            long disp = entry - position();
            // 0111 tttn #8-bit disp
            emitByte(0x70 | cc.getValue());
            emitByte((int) ((disp - shortSize) & 0xFF));
        } else {
            l.addPatchAt(position());
            emitByte(0x70 | cc.getValue());
            emitByte(0);
        }
    }

    public final void jmp(int jumpTarget, boolean forceDisp32) {
        int shortSize = 2;
        int longSize = 5;
        long disp = jumpTarget - position();
        if (!forceDisp32 && isByte(disp - shortSize)) {
            emitByte(0xEB);
            emitByte((int) ((disp - shortSize) & 0xFF));
        } else {
            emitByte(0xE9);
            emitInt((int) (disp - longSize));
        }
    }

    @Override
    public final void jmp(Label l) {
        if (l.isBound()) {
            jmp(l.position(), false);
        } else {
            // By default, forward jumps are always 32-bit displacements, since
            // we can't yet know where the label will be bound. If you're sure that
            // the forward jump will stay within the +127 byte reach of an 8-bit
            // displacement, use jmpb to force the short form.

            l.addPatchAt(position());
            emitByte(0xE9);
            emitInt(0);
        }
    }

    public final void jmp(Register entry) {
        prefix(entry);
        emitByte(0xFF);
        emitModRM(4, entry);
    }

    public final void jmp(AMD64Address adr) {
        prefix(adr);
        emitByte(0xFF);
        emitOperandHelper(AMD64.rsp, adr, 0);
    }

    public final void jmpb(Label l) {
        if (l.isBound()) {
            int shortSize = 2;
            int entry = l.position();
            assert isByte((entry - position()) + shortSize) : "Displacement too large for a short jmp";
            long offs = entry - position();
            emitByte(0xEB);
            emitByte((int) ((offs - shortSize) & 0xFF));
        } else {
            l.addPatchAt(position());
            emitByte(0xEB);
            emitByte(0);
        }
    }

    public final void lead(Register dst, AMD64Address src) {
        prefix(src, dst);
        emitByte(0x8D);
        emitOperandHelper(dst, src, 0);
    }

    public final void leaq(Register dst, AMD64Address src) {
        prefixq(src, dst);
        emitByte(0x8D);
        emitOperandHelper(dst, src, 0);
    }

    public final void leave() {
        emitByte(0xC9);
    }

    public final void lock() {
        emitByte(0xF0);
    }

    public final void movapd(Register dst, Register src) {
        assert inRC(XMM, dst) && inRC(XMM, src);
        simdPrefix(dst, Register.None, src, PD, P_0F, false);
        emitByte(0x28);
        emitModRM(dst, src);
    }

    public final void movaps(Register dst, Register src) {
        assert inRC(XMM, dst) && inRC(XMM, src);
        simdPrefix(dst, Register.None, src, PS, P_0F, false);
        emitByte(0x28);
        emitModRM(dst, src);
    }

    public final void movb(AMD64Address dst, int imm8) {
        prefix(dst);
        emitByte(0xC6);
        emitOperandHelper(0, dst, 1);
        emitByte(imm8);
    }

    public final void movb(AMD64Address dst, Register src) {
        assert inRC(CPU, src) : "must have byte register";
        prefixb(dst, src);
        emitByte(0x88);
        emitOperandHelper(src, dst, 0);
    }

    public final void movl(Register dst, int imm32) {
        movl(dst, imm32, false);
    }

    public final void movl(Register dst, int imm32, boolean annotateImm) {
        int insnPos = position();
        prefix(dst);
        emitByte(0xB8 + encode(dst));
        int immPos = position();
        emitInt(imm32);
        int nextInsnPos = position();
        if (annotateImm && codePatchingAnnotationConsumer != null) {
            codePatchingAnnotationConsumer.accept(new OperandDataAnnotation(insnPos, immPos, nextInsnPos - immPos, nextInsnPos));
        }
    }

    public final void movl(Register dst, Register src) {
        prefix(dst, src);
        emitByte(0x8B);
        emitModRM(dst, src);
    }

    public final void movl(Register dst, AMD64Address src) {
        prefix(src, dst);
        emitByte(0x8B);
        emitOperandHelper(dst, src, 0);
    }

    /**
     * @param wide use a 4-byte encoding for displacements that would normally fit in a byte
     */
    public final void movl(Register dst, AMD64Address src, boolean wide) {
        prefix(src, dst);
        emitByte(0x8B);
        emitOperandHelper(dst, src, wide, 0);
    }

    public final void movl(AMD64Address dst, int imm32) {
        prefix(dst);
        emitByte(0xC7);
        emitOperandHelper(0, dst, 4);
        emitInt(imm32);
    }

    public final void movl(AMD64Address dst, Register src) {
        prefix(dst, src);
        emitByte(0x89);
        emitOperandHelper(src, dst, 0);
    }

    /**
     * Newer CPUs require the use of movsd and movss to avoid partial register stalls when loading
     * from memory. On old Opterons, however, movlpd is used instead of movsd. The selection is
     * done in {@link AMD64MacroAssembler#movdbl(Register, AMD64Address)} and
     * {@link AMD64MacroAssembler#movflt(Register, Register)}.
     */
    public final void movlpd(Register dst, AMD64Address src) {
        assert inRC(XMM, dst);
        simdPrefix(dst, dst, src, PD, P_0F, false);
        emitByte(0x12);
        emitOperandHelper(dst, src, 0);
    }

    public final void movlhps(Register dst, Register src) {
        assert inRC(XMM, dst) && inRC(XMM, src);
        simdPrefix(dst, src, src, PS, P_0F, false);
        emitByte(0x16);
        emitModRM(dst, src);
    }

    public final void movq(Register dst, AMD64Address src) {
        movq(dst, src, false);
    }

    public final void movq(Register dst, AMD64Address src, boolean force4BytesDisplacement) {
        if (inRC(XMM, dst)) {
            // Insn: MOVQ xmm, r/m64
            // Code: F3 0F 7E /r
            // An alternative instruction would be 66 REX.W 0F 6E /r. We prefer the REX.W-free
            // format because it allows us to emit the instruction with a 2-byte VEX prefix when
            // applicable.
            simdPrefix(dst, Register.None, src, SS, P_0F, false);
            emitByte(0x7E);
            emitOperandHelper(dst, src, force4BytesDisplacement, 0);
        } else {
            // gpr version of movq
            prefixq(src, dst);
            emitByte(0x8B);
            emitOperandHelper(dst, src, force4BytesDisplacement, 0);
        }
    }

    public final void movq(Register dst, Register src) {
        assert inRC(CPU, dst) && inRC(CPU, src);
        prefixq(dst, src);
        emitByte(0x8B);
        emitModRM(dst, src);
    }

    public final void movq(AMD64Address dst, Register src) {
        if (inRC(XMM, src)) {
            // Insn: MOVQ r/m64, xmm
            // Code: 66 0F D6 /r
            // An alternative instruction would be 66 REX.W 0F 7E /r. We prefer the REX.W-free
            // format because it allows us to emit the instruction with a 2-byte VEX prefix when
            // applicable.
            simdPrefix(src, Register.None, dst, PD, P_0F, false);
            emitByte(0xD6);
            emitOperandHelper(src, dst, 0);
        } else {
            // gpr version of movq
            prefixq(dst, src);
            emitByte(0x89);
            emitOperandHelper(src, dst, 0);
        }
    }

    public final void movsbl(Register dst, AMD64Address src) {
        prefix(src, dst);
        emitByte(0x0F);
        emitByte(0xBE);
        emitOperandHelper(dst, src, 0);
    }

    public final void movsbl(Register dst, Register src) {
        prefix(dst, false, src, true);
        emitByte(0x0F);
        emitByte(0xBE);
        emitModRM(dst, src);
    }

    public final void movsbq(Register dst, AMD64Address src) {
        prefixq(src, dst);
        emitByte(0x0F);
        emitByte(0xBE);
        emitOperandHelper(dst, src, 0);
    }

    public final void movsbq(Register dst, Register src) {
        prefixq(dst, src);
        emitByte(0x0F);
        emitByte(0xBE);
        emitModRM(dst, src);
    }

    public final void movsd(Register dst, Register src) {
        AMD64RMOp.MOVSD.emit(this, SD, dst, src);
    }

    public final void movsd(Register dst, AMD64Address src) {
        AMD64RMOp.MOVSD.emit(this, SD, dst, src);
    }

    public final void movsd(AMD64Address dst, Register src) {
        AMD64MROp.MOVSD.emit(this, SD, dst, src);
    }

    public final void movss(Register dst, Register src) {
        AMD64RMOp.MOVSS.emit(this, SS, dst, src);
    }

    public final void movss(Register dst, AMD64Address src) {
        AMD64RMOp.MOVSS.emit(this, SS, dst, src);
    }

    public final void movss(AMD64Address dst, Register src) {
        AMD64MROp.MOVSS.emit(this, SS, dst, src);
    }

    public final void mulpd(Register dst, Register src) {
        SSEOp.MUL.emit(this, PD, dst, src);
    }

    public final void mulpd(Register dst, AMD64Address src) {
        SSEOp.MUL.emit(this, PD, dst, src);
    }

    public final void mulsd(Register dst, Register src) {
        SSEOp.MUL.emit(this, SD, dst, src);
    }

    public final void mulsd(Register dst, AMD64Address src) {
        SSEOp.MUL.emit(this, SD, dst, src);
    }

    public final void mulss(Register dst, Register src) {
        SSEOp.MUL.emit(this, SS, dst, src);
    }

    public final void movswl(Register dst, AMD64Address src) {
        AMD64RMOp.MOVSX.emit(this, DWORD, dst, src);
    }

    public final void movswq(Register dst, AMD64Address src) {
        AMD64RMOp.MOVSX.emit(this, QWORD, dst, src);
    }

    public final void movw(AMD64Address dst, int imm16) {
        emitByte(0x66); // switch to 16-bit mode
        prefix(dst);
        emitByte(0xC7);
        emitOperandHelper(0, dst, 2);
        emitShort(imm16);
    }

    public final void movw(AMD64Address dst, Register src) {
        emitByte(0x66);
        prefix(dst, src);
        emitByte(0x89);
        emitOperandHelper(src, dst, 0);
    }

    public final void movw(Register dst, AMD64Address src) {
        emitByte(0x66);
        prefix(src, dst);
        emitByte(0x8B);
        emitOperandHelper(dst, src, 0);
    }

    public final void movzbl(Register dst, AMD64Address src) {
        prefix(src, dst);
        emitByte(0x0F);
        emitByte(0xB6);
        emitOperandHelper(dst, src, 0);
    }

    public final void movzbl(Register dst, Register src) {
        AMD64RMOp.MOVZXB.emit(this, DWORD, dst, src);
    }

    public final void movzbq(Register dst, Register src) {
        AMD64RMOp.MOVZXB.emit(this, QWORD, dst, src);
    }

    public final void movzbq(Register dst, AMD64Address src) {
        AMD64RMOp.MOVZXB.emit(this, QWORD, dst, src);
    }

    public final void movzwl(Register dst, AMD64Address src) {
        AMD64RMOp.MOVZX.emit(this, DWORD, dst, src);
    }

    public final void movzwq(Register dst, AMD64Address src) {
        AMD64RMOp.MOVZX.emit(this, QWORD, dst, src);
    }

    public final void negl(Register dst) {
        NEG.emit(this, DWORD, dst);
    }

    public final void notl(Register dst) {
        NOT.emit(this, DWORD, dst);
    }

    public final void notq(Register dst) {
        NOT.emit(this, QWORD, dst);
    }

    @Override
    public final void ensureUniquePC() {
        nop();
    }

    public final void nop() {
        nop(1);
    }

    public void nop(int count) {
        int i = count;
        if (UseNormalNop) {
            assert i > 0 : " ";
            // The fancy nops aren't currently recognized by debuggers, making it a
            // pain to disassemble code while debugging. If asserts are on, clearly
            // speed is not an issue, so simply use the traditional single-byte nop
            // to do alignment.

            for (; i > 0; i--) {
                emitByte(0x90);
            }
            return;
        }

        if (UseAddressNop) {
            //
            // Using multi-byte nops "0x0F 0x1F [Address]" for AMD.
            // 1: 0x90
            // 2: 0x66 0x90
            // 3: 0x66 0x66 0x90 (don't use "0x0F 0x1F 0x00" - need patching safe padding)
            // 4: 0x0F 0x1F 0x40 0x00
            // 5: 0x0F 0x1F 0x44 0x00 0x00
            // 6: 0x66 0x0F 0x1F 0x44 0x00 0x00
            // 7: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
            // 8: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
            // 9: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
            // 10: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
            // 11: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00

            // The rest of the encoding is AMD-specific: use consecutive address nops.

            // 12: 0x66 0x0F 0x1F 0x44 0x00 0x00 0x66 0x0F 0x1F 0x44 0x00 0x00
            // 13: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00 0x66 0x0F 0x1F 0x44 0x00 0x00
            // 14: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
            // 15: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
            // 16: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
            // Size prefixes (0x66) are added for larger sizes

2320             while (i >= 22) {
2321                 i -= 11;
2322                 emitByte(0x66); // size prefix
2323                 emitByte(0x66); // size prefix
2324                 emitByte(0x66); // size prefix
2325                 addrNop8();
2326             }
            // Generate the first nop for sizes between 12 and 21
2328             switch (i) {
2329                 case 21:
2330                     i -= 11;
2331                     emitByte(0x66); // size prefix
2332                     emitByte(0x66); // size prefix
2333                     emitByte(0x66); // size prefix
2334                     addrNop8();
2335                     break;
2336                 case 20:
2337                 case 19:
2338                     i -= 10;
2339                     emitByte(0x66); // size prefix
2340                     emitByte(0x66); // size prefix
2341                     addrNop8();
2342                     break;
2343                 case 18:
2344                 case 17:
2345                     i -= 9;
2346                     emitByte(0x66); // size prefix
2347                     addrNop8();
2348                     break;
2349                 case 16:
2350                 case 15:
2351                     i -= 8;
2352                     addrNop8();
2353                     break;
2354                 case 14:
2355                 case 13:
2356                     i -= 7;
2357                     addrNop7();
2358                     break;
2359                 case 12:
2360                     i -= 6;
2361                     emitByte(0x66); // size prefix
2362                     addrNop5();
2363                     break;
2364                 default:
2365                     assert i < 12;
2366             }
2367 
            // Generate the second nop for sizes between 1 and 11
2369             switch (i) {
2370                 case 11:
2371                     emitByte(0x66); // size prefix
2372                     emitByte(0x66); // size prefix
2373                     emitByte(0x66); // size prefix
2374                     addrNop8();
2375                     break;
2376                 case 10:
2377                     emitByte(0x66); // size prefix
2378                     emitByte(0x66); // size prefix
2379                     addrNop8();
2380                     break;
2381                 case 9:
2382                     emitByte(0x66); // size prefix
2383                     addrNop8();
2384                     break;
2385                 case 8:
2386                     addrNop8();
2387                     break;
2388                 case 7:
2389                     addrNop7();
2390                     break;
2391                 case 6:
2392                     emitByte(0x66); // size prefix
2393                     addrNop5();
2394                     break;
2395                 case 5:
2396                     addrNop5();
2397                     break;
2398                 case 4:
2399                     addrNop4();
2400                     break;
2401                 case 3:
                    // Don't use "0x0F 0x1F 0x00" - needs patching-safe padding
2403                     emitByte(0x66); // size prefix
2404                     emitByte(0x66); // size prefix
2405                     emitByte(0x90); // nop
2406                     break;
2407                 case 2:
2408                     emitByte(0x66); // size prefix
2409                     emitByte(0x90); // nop
2410                     break;
2411                 case 1:
2412                     emitByte(0x90); // nop
2413                     break;
2414                 default:
2415                     assert i == 0;
2416             }
2417             return;
2418         }
2419 
2420         // Using nops with size prefixes "0x66 0x90".
        // From the AMD Optimization Guide:
2422         // 1: 0x90
2423         // 2: 0x66 0x90
2424         // 3: 0x66 0x66 0x90
2425         // 4: 0x66 0x66 0x66 0x90
2426         // 5: 0x66 0x66 0x90 0x66 0x90
2427         // 6: 0x66 0x66 0x90 0x66 0x66 0x90
2428         // 7: 0x66 0x66 0x66 0x90 0x66 0x66 0x90
2429         // 8: 0x66 0x66 0x66 0x90 0x66 0x66 0x66 0x90
2430         // 9: 0x66 0x66 0x90 0x66 0x66 0x90 0x66 0x66 0x90
2431         // 10: 0x66 0x66 0x66 0x90 0x66 0x66 0x90 0x66 0x66 0x90
2432         //
2433         while (i > 12) {
2434             i -= 4;
2435             emitByte(0x66); // size prefix
2436             emitByte(0x66);
2437             emitByte(0x66);
2438             emitByte(0x90); // nop
2439         }
2440         // 1 - 12 nops
2441         if (i > 8) {
2442             if (i > 9) {
2443                 i -= 1;
2444                 emitByte(0x66);
2445             }
2446             i -= 3;
2447             emitByte(0x66);
2448             emitByte(0x66);
2449             emitByte(0x90);
2450         }
2451         // 1 - 8 nops
2452         if (i > 4) {
2453             if (i > 6) {
2454                 i -= 1;
2455                 emitByte(0x66);
2456             }
2457             i -= 3;
2458             emitByte(0x66);
2459             emitByte(0x66);
2460             emitByte(0x90);
2461         }
2462         switch (i) {
2463             case 4:
2464                 emitByte(0x66);
2465                 emitByte(0x66);
2466                 emitByte(0x66);
2467                 emitByte(0x90);
2468                 break;
2469             case 3:
2470                 emitByte(0x66);
2471                 emitByte(0x66);
2472                 emitByte(0x90);
2473                 break;
2474             case 2:
2475                 emitByte(0x66);
2476                 emitByte(0x90);
2477                 break;
2478             case 1:
2479                 emitByte(0x90);
2480                 break;
2481             default:
2482                 assert i == 0;
2483         }
2484     }
2485 
2486     public final void orl(Register dst, Register src) {
2487         OR.rmOp.emit(this, DWORD, dst, src);
2488     }
2489 
2490     public final void orl(Register dst, int imm32) {
2491         OR.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
2492     }
2493 
2494     // Insn: VPACKUSWB xmm1, xmm2, xmm3/m128
2495     // -----
2496     // Insn: VPACKUSWB xmm1, xmm1, xmm2
2497 
2498     public final void packuswb(Register dst, Register src) {
2499         assert inRC(XMM, dst) && inRC(XMM, src);
2500         // Code: VEX.NDS.128.66.0F.WIG 67 /r
2501         simdPrefix(dst, dst, src, PD, P_0F, false);
2502         emitByte(0x67);
2503         emitModRM(dst, src);
2504     }
2505 
2506     public final void pop(Register dst) {
2507         prefix(dst);
2508         emitByte(0x58 + encode(dst));
2509     }
2510 
2511     public void popfq() {
2512         emitByte(0x9D);
2513     }
2514 
2515     public final void ptest(Register dst, Register src) {
2516         assert supports(CPUFeature.SSE4_1);
2517         assert inRC(XMM, dst) && inRC(XMM, src);
2518         simdPrefix(dst, Register.None, src, PD, P_0F38, false);
2519         emitByte(0x17);
2520         emitModRM(dst, src);
2521     }
2522 
2523     public final void pcmpeqb(Register dst, Register src) {
2524         assert supports(CPUFeature.SSE2);
2525         assert inRC(XMM, dst) && inRC(XMM, src);
2526         simdPrefix(dst, dst, src, PD, P_0F, false);
2527         emitByte(0x74);
2528         emitModRM(dst, src);
2529     }
2530 
2531     public final void pcmpeqw(Register dst, Register src) {
2532         assert supports(CPUFeature.SSE2);
2533         assert inRC(XMM, dst) && inRC(XMM, src);
2534         simdPrefix(dst, dst, src, PD, P_0F, false);
2535         emitByte(0x75);
2536         emitModRM(dst, src);
2537     }
2538 
2539     public final void pcmpeqd(Register dst, Register src) {
2540         assert supports(CPUFeature.SSE2);
2541         assert dst.getRegisterCategory().equals(XMM) && src.getRegisterCategory().equals(XMM);
2542         simdPrefix(dst, dst, src, PD, P_0F, false);
2543         emitByte(0x76);
2544         emitModRM(dst, src);
2545     }
2546 
2547     public final void pcmpestri(Register dst, AMD64Address src, int imm8) {
2548         assert supports(CPUFeature.SSE4_2);
2549         assert inRC(XMM, dst);
2550         simdPrefix(dst, Register.None, src, PD, P_0F3A, false);
2551         emitByte(0x61);
2552         emitOperandHelper(dst, src, 0);
2553         emitByte(imm8);
2554     }
2555 
2556     public final void pcmpestri(Register dst, Register src, int imm8) {
2557         assert supports(CPUFeature.SSE4_2);
2558         assert inRC(XMM, dst) && inRC(XMM, src);
2559         simdPrefix(dst, Register.None, src, PD, P_0F3A, false);
2560         emitByte(0x61);
2561         emitModRM(dst, src);
2562         emitByte(imm8);
2563     }
2564 
2565     public final void pmovmskb(Register dst, Register src) {
2566         assert supports(CPUFeature.SSE2);
2567         assert inRC(CPU, dst) && inRC(XMM, src);
2568         simdPrefix(dst, Register.None, src, PD, P_0F, false);
2569         emitByte(0xD7);
2570         emitModRM(dst, src);
2571     }
2572 
2573     private void pmovSZx(Register dst, AMD64Address src, int op) {
2574         assert supports(CPUFeature.SSE4_1);
2575         assert inRC(XMM, dst);
2576         simdPrefix(dst, Register.None, src, PD, P_0F38, false);
2577         emitByte(op);
2578         emitOperandHelper(dst, src, 0);
2579     }
2580 
2581     public final void pmovsxbw(Register dst, AMD64Address src) {
2582         pmovSZx(dst, src, 0x20);
2583     }
2584 
2585     public final void pmovsxbd(Register dst, AMD64Address src) {
2586         pmovSZx(dst, src, 0x21);
2587     }
2588 
2589     public final void pmovsxbq(Register dst, AMD64Address src) {
2590         pmovSZx(dst, src, 0x22);
2591     }
2592 
2593     public final void pmovsxwd(Register dst, AMD64Address src) {
2594         pmovSZx(dst, src, 0x23);
2595     }
2596 
2597     public final void pmovsxwq(Register dst, AMD64Address src) {
2598         pmovSZx(dst, src, 0x24);
2599     }
2600 
2601     public final void pmovsxdq(Register dst, AMD64Address src) {
2602         pmovSZx(dst, src, 0x25);
2603     }
2604 
2605     // Insn: VPMOVZXBW xmm1, xmm2/m64
2606     public final void pmovzxbw(Register dst, AMD64Address src) {
2607         pmovSZx(dst, src, 0x30);
2608     }
2609 
2610     public final void pmovzxbd(Register dst, AMD64Address src) {
2611         pmovSZx(dst, src, 0x31);
2612     }
2613 
2614     public final void pmovzxbq(Register dst, AMD64Address src) {
2615         pmovSZx(dst, src, 0x32);
2616     }
2617 
2618     public final void pmovzxwd(Register dst, AMD64Address src) {
2619         pmovSZx(dst, src, 0x33);
2620     }
2621 
2622     public final void pmovzxwq(Register dst, AMD64Address src) {
2623         pmovSZx(dst, src, 0x34);
2624     }
2625 
2626     public final void pmovzxdq(Register dst, AMD64Address src) {
2627         pmovSZx(dst, src, 0x35);
2628     }
2629 
2630     public final void pmovzxbw(Register dst, Register src) {
2631         assert supports(CPUFeature.SSE4_1);
2632         assert inRC(XMM, dst) && inRC(XMM, src);
2633         simdPrefix(dst, Register.None, src, PD, P_0F38, false);
2634         emitByte(0x30);
2635         emitModRM(dst, src);
2636     }
2637 
2638     public final void push(Register src) {
2639         prefix(src);
2640         emitByte(0x50 + encode(src));
2641     }
2642 
2643     public void pushfq() {
        emitByte(0x9C);
2645     }
2646 
2647     public final void paddd(Register dst, Register src) {
2648         assert inRC(XMM, dst) && inRC(XMM, src);
2649         simdPrefix(dst, dst, src, PD, P_0F, false);
2650         emitByte(0xFE);
2651         emitModRM(dst, src);
2652     }
2653 
2654     public final void paddq(Register dst, Register src) {
2655         assert inRC(XMM, dst) && inRC(XMM, src);
2656         simdPrefix(dst, dst, src, PD, P_0F, false);
2657         emitByte(0xD4);
2658         emitModRM(dst, src);
2659     }
2660 
2661     public final void pextrw(Register dst, Register src, int imm8) {
2662         assert inRC(CPU, dst) && inRC(XMM, src);
2663         simdPrefix(dst, Register.None, src, PD, P_0F, false);
2664         emitByte(0xC5);
2665         emitModRM(dst, src);
2666         emitByte(imm8);
2667     }
2668 
2669     public final void pinsrw(Register dst, Register src, int imm8) {
2670         assert inRC(XMM, dst) && inRC(CPU, src);
2671         simdPrefix(dst, dst, src, PD, P_0F, false);
2672         emitByte(0xC4);
2673         emitModRM(dst, src);
2674         emitByte(imm8);
2675     }
2676 
2677     public final void por(Register dst, Register src) {
2678         assert inRC(XMM, dst) && inRC(XMM, src);
2679         simdPrefix(dst, dst, src, PD, P_0F, false);
2680         emitByte(0xEB);
2681         emitModRM(dst, src);
2682     }
2683 
2684     public final void pand(Register dst, Register src) {
2685         assert inRC(XMM, dst) && inRC(XMM, src);
2686         simdPrefix(dst, dst, src, PD, P_0F, false);
2687         emitByte(0xDB);
2688         emitModRM(dst, src);
2689     }
2690 
2691     public final void pxor(Register dst, Register src) {
2692         assert inRC(XMM, dst) && inRC(XMM, src);
2693         simdPrefix(dst, dst, src, PD, P_0F, false);
2694         emitByte(0xEF);
2695         emitModRM(dst, src);
2696     }
2697 
2698     public final void pslld(Register dst, int imm8) {
2699         assert isUByte(imm8) : "invalid value";
2700         assert inRC(XMM, dst);
2701         // XMM6 is for /6 encoding: 66 0F 72 /6 ib
2702         simdPrefix(AMD64.xmm6, dst, dst, PD, P_0F, false);
2703         emitByte(0x72);
2704         emitModRM(6, dst);
2705         emitByte(imm8 & 0xFF);
2706     }
2707 
2708     public final void psllq(Register dst, Register shift) {
2709         assert inRC(XMM, dst) && inRC(XMM, shift);
2710         simdPrefix(dst, dst, shift, PD, P_0F, false);
2711         emitByte(0xF3);
2712         emitModRM(dst, shift);
2713     }
2714 
2715     public final void psllq(Register dst, int imm8) {
2716         assert isUByte(imm8) : "invalid value";
2717         assert inRC(XMM, dst);
2718         // XMM6 is for /6 encoding: 66 0F 73 /6 ib
2719         simdPrefix(AMD64.xmm6, dst, dst, PD, P_0F, false);
2720         emitByte(0x73);
2721         emitModRM(6, dst);
2722         emitByte(imm8);
2723     }
2724 
2725     public final void psrad(Register dst, int imm8) {
2726         assert isUByte(imm8) : "invalid value";
2727         assert inRC(XMM, dst);
2728         // XMM4 is for /4 encoding: 66 0F 72 /4 ib
2729         simdPrefix(AMD64.xmm4, dst, dst, PD, P_0F, false);
2730         emitByte(0x72);
2731         emitModRM(4, dst);
2732         emitByte(imm8);
2733     }
2734 
2735     public final void psrld(Register dst, int imm8) {
2736         assert isUByte(imm8) : "invalid value";
2737         assert inRC(XMM, dst);
2738         // XMM2 is for /2 encoding: 66 0F 72 /2 ib
2739         simdPrefix(AMD64.xmm2, dst, dst, PD, P_0F, false);
2740         emitByte(0x72);
2741         emitModRM(2, dst);
2742         emitByte(imm8);
2743     }
2744 
2745     public final void psrlq(Register dst, int imm8) {
2746         assert isUByte(imm8) : "invalid value";
2747         assert inRC(XMM, dst);
2748         // XMM2 is for /2 encoding: 66 0F 73 /2 ib
2749         simdPrefix(AMD64.xmm2, dst, dst, PD, P_0F, false);
2750         emitByte(0x73);
2751         emitModRM(2, dst);
2752         emitByte(imm8);
2753     }
2754 
2755     public final void psrldq(Register dst, int imm8) {
2756         assert isUByte(imm8) : "invalid value";
2757         assert inRC(XMM, dst);
        // XMM3 is for /3 encoding: 66 0F 73 /3 ib
        simdPrefix(AMD64.xmm3, dst, dst, PD, P_0F, false);
2759         emitByte(0x73);
2760         emitModRM(3, dst);
2761         emitByte(imm8);
2762     }
2763 
2764     public final void pshufb(Register dst, Register src) {
2765         assert supports(CPUFeature.SSSE3);
2766         assert inRC(XMM, dst) && inRC(XMM, src);
2767         simdPrefix(dst, dst, src, PD, P_0F38, false);
2768         emitByte(0x00);
2769         emitModRM(dst, src);
2770     }
2771 
2772     public final void pshuflw(Register dst, Register src, int imm8) {
2773         assert supports(CPUFeature.SSE2);
2774         assert isUByte(imm8) : "invalid value";
2775         assert inRC(XMM, dst) && inRC(XMM, src);
2776         simdPrefix(dst, Register.None, src, SD, P_0F, false);
2777         emitByte(0x70);
2778         emitModRM(dst, src);
2779         emitByte(imm8);
2780     }
2781 
2782     public final void pshufd(Register dst, Register src, int imm8) {
2783         assert isUByte(imm8) : "invalid value";
2784         assert inRC(XMM, dst) && inRC(XMM, src);
2785         simdPrefix(dst, Register.None, src, PD, P_0F, false);
2786         emitByte(0x70);
2787         emitModRM(dst, src);
2788         emitByte(imm8);
2789     }
2790 
2791     public final void psubd(Register dst, Register src) {
2792         assert inRC(XMM, dst) && inRC(XMM, src);
2793         simdPrefix(dst, dst, src, PD, P_0F, false);
2794         emitByte(0xFA);
2795         emitModRM(dst, src);
2796     }
2797 
2798     public final void punpcklbw(Register dst, Register src) {
2799         assert supports(CPUFeature.SSE2);
2800         assert inRC(XMM, dst) && inRC(XMM, src);
2801         simdPrefix(dst, dst, src, PD, P_0F, false);
2802         emitByte(0x60);
2803         emitModRM(dst, src);
2804     }
2805 
2806     public final void rcpps(Register dst, Register src) {
2807         assert inRC(XMM, dst) && inRC(XMM, src);
2808         simdPrefix(dst, Register.None, src, PS, P_0F, false);
2809         emitByte(0x53);
2810         emitModRM(dst, src);
2811     }
2812 
2813     public final void ret(int imm16) {
2814         if (imm16 == 0) {
2815             emitByte(0xC3);
2816         } else {
2817             emitByte(0xC2);
2818             emitShort(imm16);
2819         }
2820     }
2821 
2822     public final void sarl(Register dst, int imm8) {
        assert isShiftCount(imm8 >> 1) : "illegal shift count";
        prefix(dst);
2825         if (imm8 == 1) {
2826             emitByte(0xD1);
2827             emitModRM(7, dst);
2828         } else {
2829             emitByte(0xC1);
2830             emitModRM(7, dst);
2831             emitByte(imm8);
2832         }
2833     }
2834 
2835     public final void shll(Register dst, int imm8) {
2836         assert isShiftCount(imm8 >> 1) : "illegal shift count";
2837         prefix(dst);
2838         if (imm8 == 1) {
2839             emitByte(0xD1);
2840             emitModRM(4, dst);
2841         } else {
2842             emitByte(0xC1);
2843             emitModRM(4, dst);
2844             emitByte(imm8);
2845         }
2846     }
2847 
2848     public final void shll(Register dst) {
2849         // Multiply dst by 2, CL times.
2850         prefix(dst);
2851         emitByte(0xD3);
2852         emitModRM(4, dst);
2853     }
2854 
2855     // Insn: SHLX r32a, r/m32, r32b
2856 
2857     public final void shlxl(Register dst, Register src1, Register src2) {
2858         VexGeneralPurposeRMVOp.SHLX.emit(this, AVXSize.DWORD, dst, src1, src2);
2859     }
2860 
2861     public final void shrl(Register dst, int imm8) {
2862         assert isShiftCount(imm8 >> 1) : "illegal shift count";
2863         prefix(dst);
2864         emitByte(0xC1);
2865         emitModRM(5, dst);
2866         emitByte(imm8);
2867     }
2868 
2869     public final void shrl(Register dst) {
2870         // Unsigned divide dst by 2, CL times.
2871         prefix(dst);
2872         emitByte(0xD3);
2873         emitModRM(5, dst);
2874     }
2875 
2876     public final void subl(AMD64Address dst, int imm32) {
2877         SUB.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
2878     }
2879 
2880     public final void subl(Register dst, int imm32) {
2881         SUB.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32);
2882     }
2883 
2884     public final void subl(Register dst, Register src) {
2885         SUB.rmOp.emit(this, DWORD, dst, src);
2886     }
2887 
2888     public final void subpd(Register dst, Register src) {
2889         SSEOp.SUB.emit(this, PD, dst, src);
2890     }
2891 
2892     public final void subsd(Register dst, Register src) {
2893         SSEOp.SUB.emit(this, SD, dst, src);
2894     }
2895 
2896     public final void subsd(Register dst, AMD64Address src) {
2897         SSEOp.SUB.emit(this, SD, dst, src);
2898     }
2899 
2900     public final void testl(Register dst, int imm32) {
        // not using emitArith because test does not support
        // sign-extension of 8-bit operands
        if (dst.encoding == 0) {
            // rax/eax has a short form without a ModRM byte
            emitByte(0xA9);
2906         } else {
2907             prefix(dst);
2908             emitByte(0xF7);
2909             emitModRM(0, dst);
2910         }
2911         emitInt(imm32);
2912     }
2913 
2914     public final void testl(Register dst, Register src) {
2915         prefix(dst, src);
2916         emitByte(0x85);
2917         emitModRM(dst, src);
2918     }
2919 
2920     public final void testl(Register dst, AMD64Address src) {
2921         prefix(src, dst);
2922         emitByte(0x85);
2923         emitOperandHelper(dst, src, 0);
2924     }
2925 
2926     public final void unpckhpd(Register dst, Register src) {
2927         assert inRC(XMM, dst) && inRC(XMM, src);
2928         simdPrefix(dst, dst, src, PD, P_0F, false);
2929         emitByte(0x15);
2930         emitModRM(dst, src);
2931     }
2932 
2933     public final void unpcklpd(Register dst, Register src) {
2934         assert inRC(XMM, dst) && inRC(XMM, src);
2935         simdPrefix(dst, dst, src, PD, P_0F, false);
2936         emitByte(0x14);
2937         emitModRM(dst, src);
2938     }
2939 
2940     public final void xorl(Register dst, Register src) {
2941         XOR.rmOp.emit(this, DWORD, dst, src);
2942     }
2943 
2944     public final void xorq(Register dst, Register src) {
2945         XOR.rmOp.emit(this, QWORD, dst, src);
2946     }
2947 
2948     public final void xorpd(Register dst, Register src) {
2949         SSEOp.XOR.emit(this, PD, dst, src);
2950     }
2951 
2952     public final void xorps(Register dst, Register src) {
2953         SSEOp.XOR.emit(this, PS, dst, src);
2954     }
2955 
2956     protected final void decl(Register dst) {
2957         // Use two-byte form (one-byte form is a REX prefix in 64-bit mode)
2958         prefix(dst);
2959         emitByte(0xFF);
2960         emitModRM(1, dst);
2961     }
2962 
2963     protected final void incl(Register dst) {
        // Use two-byte form (one-byte form is a REX prefix in 64-bit mode)
2965         prefix(dst);
2966         emitByte(0xFF);
2967         emitModRM(0, dst);
2968     }
2969 
2970     public final void addq(Register dst, int imm32) {
2971         ADD.getMIOpcode(QWORD, isByte(imm32)).emit(this, QWORD, dst, imm32);
2972     }
2973 
2974     public final void addq(AMD64Address dst, int imm32) {
2975         ADD.getMIOpcode(QWORD, isByte(imm32)).emit(this, QWORD, dst, imm32);
2976     }
2977 
2978     public final void addq(Register dst, Register src) {
2979         ADD.rmOp.emit(this, QWORD, dst, src);
2980     }
2981 
2982     public final void addq(AMD64Address dst, Register src) {
2983         ADD.mrOp.emit(this, QWORD, dst, src);
2984     }
2985 
2986     public final void andq(Register dst, int imm32) {
2987         AND.getMIOpcode(QWORD, isByte(imm32)).emit(this, QWORD, dst, imm32);
2988     }
2989 
2990     public final void bsrq(Register dst, Register src) {
2991         prefixq(dst, src);
2992         emitByte(0x0F);
2993         emitByte(0xBD);
2994         emitModRM(dst, src);
2995     }
2996 
2997     public final void bswapq(Register reg) {
2998         prefixq(reg);
2999         emitByte(0x0F);
3000         emitByte(0xC8 + encode(reg));
3001     }
3002 
3003     public final void cdqq() {
3004         rexw();
3005         emitByte(0x99);
3006     }
3007 
3008     public final void cmovq(ConditionFlag cc, Register dst, Register src) {
3009         prefixq(dst, src);
3010         emitByte(0x0F);
3011         emitByte(0x40 | cc.getValue());
3012         emitModRM(dst, src);
3013     }
3014 
3015     public final void setb(ConditionFlag cc, Register dst) {
3016         prefix(dst, true);
3017         emitByte(0x0F);
3018         emitByte(0x90 | cc.getValue());
3019         emitModRM(0, dst);
3020     }
3021 
3022     public final void cmovq(ConditionFlag cc, Register dst, AMD64Address src) {
3023         prefixq(src, dst);
3024         emitByte(0x0F);
3025         emitByte(0x40 | cc.getValue());
3026         emitOperandHelper(dst, src, 0);
3027     }
3028 
3029     public final void cmpq(Register dst, int imm32) {
3030         CMP.getMIOpcode(QWORD, isByte(imm32)).emit(this, QWORD, dst, imm32);
3031     }
3032 
3033     public final void cmpq(Register dst, Register src) {
3034         CMP.rmOp.emit(this, QWORD, dst, src);
3035     }
3036 
3037     public final void cmpq(Register dst, AMD64Address src) {
3038         CMP.rmOp.emit(this, QWORD, dst, src);
3039     }
3040 
3041     public final void cmpxchgq(Register reg, AMD64Address adr) {
3042         prefixq(adr, reg);
3043         emitByte(0x0F);
3044         emitByte(0xB1);
3045         emitOperandHelper(reg, adr, 0);
3046     }
3047 
3048     public final void cvtdq2pd(Register dst, Register src) {
3049         assert inRC(XMM, dst) && inRC(XMM, src);
3050         simdPrefix(dst, Register.None, src, SS, P_0F, false);
3051         emitByte(0xE6);
3052         emitModRM(dst, src);
3053     }
3054 
3055     public final void cvtsi2sdq(Register dst, Register src) {
3056         SSEOp.CVTSI2SD.emit(this, QWORD, dst, src);
3057     }
3058 
3059     public final void cvttsd2siq(Register dst, Register src) {
3060         SSEOp.CVTTSD2SI.emit(this, QWORD, dst, src);
3061     }
3062 
3063     public final void cvttpd2dq(Register dst, Register src) {
3064         assert inRC(XMM, dst) && inRC(XMM, src);
3065         simdPrefix(dst, Register.None, src, PD, P_0F, false);
3066         emitByte(0xE6);
3067         emitModRM(dst, src);
3068     }
3069 
3070     public final void decq(Register dst) {
3071         // Use two-byte form (one-byte form is a REX prefix in 64-bit mode)
3072         prefixq(dst);
3073         emitByte(0xFF);
3074         emitModRM(1, dst);
3075     }
3076 
3077     public final void decq(AMD64Address dst) {
3078         DEC.emit(this, QWORD, dst);
3079     }
3080 
3081     public final void imulq(Register dst, Register src) {
3082         prefixq(dst, src);
3083         emitByte(0x0F);
3084         emitByte(0xAF);
3085         emitModRM(dst, src);
3086     }
3087 
3088     public final void incq(Register dst) {
        // Don't use this directly; use AMD64MacroAssembler.incrementq() instead.
        // Use two-byte form (one-byte form is a REX prefix in 64-bit mode)
3091         prefixq(dst);
3092         emitByte(0xFF);
3093         emitModRM(0, dst);
3094     }
3095 
3096     public final void incq(AMD64Address dst) {
3097         INC.emit(this, QWORD, dst);
3098     }
3099 
3100     public final void movq(Register dst, long imm64) {
3101         movq(dst, imm64, false);
3102     }
3103 
3104     public final void movq(Register dst, long imm64, boolean annotateImm) {
3105         int insnPos = position();
3106         prefixq(dst);
3107         emitByte(0xB8 + encode(dst));
3108         int immPos = position();
3109         emitLong(imm64);
3110         int nextInsnPos = position();
3111         if (annotateImm && codePatchingAnnotationConsumer != null) {
3112             codePatchingAnnotationConsumer.accept(new OperandDataAnnotation(insnPos, immPos, nextInsnPos - immPos, nextInsnPos));
3113         }
3114     }
3115 
3116     public final void movslq(Register dst, int imm32) {
3117         prefixq(dst);
3118         emitByte(0xC7);
3119         emitModRM(0, dst);
3120         emitInt(imm32);
3121     }
3122 
3123     public final void movdq(Register dst, AMD64Address src) {
3124         AMD64RMOp.MOVQ.emit(this, QWORD, dst, src);
3125     }
3126 
3127     public final void movdq(AMD64Address dst, Register src) {
3128         AMD64MROp.MOVQ.emit(this, QWORD, dst, src);
3129     }
3130 
3131     public final void movdq(Register dst, Register src) {
3132         if (inRC(XMM, dst) && inRC(CPU, src)) {
3133             AMD64RMOp.MOVQ.emit(this, QWORD, dst, src);
3134         } else if (inRC(XMM, src) && inRC(CPU, dst)) {
3135             AMD64MROp.MOVQ.emit(this, QWORD, dst, src);
3136         } else {
3137             throw new InternalError("should not reach here");
3138         }
3139     }
3140 
3141     public final void movdl(Register dst, Register src) {
3142         if (inRC(XMM, dst) && inRC(CPU, src)) {
3143             AMD64RMOp.MOVD.emit(this, DWORD, dst, src);
3144         } else if (inRC(XMM, src) && inRC(CPU, dst)) {
3145             AMD64MROp.MOVD.emit(this, DWORD, dst, src);
3146         } else {
3147             throw new InternalError("should not reach here");
3148         }
3149     }
3150 
3151     public final void movdl(Register dst, AMD64Address src) {
3152         AMD64RMOp.MOVD.emit(this, DWORD, dst, src);
3153     }
3154 
3155     public final void movddup(Register dst, Register src) {
3156         assert supports(CPUFeature.SSE3);
3157         assert inRC(XMM, dst) && inRC(XMM, src);
3158         simdPrefix(dst, Register.None, src, SD, P_0F, false);
3159         emitByte(0x12);
3160         emitModRM(dst, src);
3161     }
3162 
3163     public final void movdqu(Register dst, AMD64Address src) {
3164         assert inRC(XMM, dst);
3165         simdPrefix(dst, Register.None, src, SS, P_0F, false);
3166         emitByte(0x6F);
3167         emitOperandHelper(dst, src, 0);
3168     }
3169 
3170     public final void movdqu(Register dst, Register src) {
3171         assert inRC(XMM, dst) && inRC(XMM, src);
3172         simdPrefix(dst, Register.None, src, SS, P_0F, false);
3173         emitByte(0x6F);
3174         emitModRM(dst, src);
3175     }
3176 
3177     // Insn: VMOVDQU xmm2/m128, xmm1
3178 
3179     public final void movdqu(AMD64Address dst, Register src) {
3180         assert inRC(XMM, src);
3181         // Code: VEX.128.F3.0F.WIG 7F /r
3182         simdPrefix(src, Register.None, dst, SS, P_0F, false);
3183         emitByte(0x7F);
3184         emitOperandHelper(src, dst, 0);
3185     }
3186 
3187     public final void movslq(AMD64Address dst, int imm32) {
3188         prefixq(dst);
3189         emitByte(0xC7);
3190         emitOperandHelper(0, dst, 4);
3191         emitInt(imm32);
3192     }
3193 
3194     public final void movslq(Register dst, AMD64Address src) {
3195         prefixq(src, dst);
3196         emitByte(0x63);
3197         emitOperandHelper(dst, src, 0);
3198     }
3199 
3200     public final void movslq(Register dst, Register src) {
3201         prefixq(dst, src);
3202         emitByte(0x63);
3203         emitModRM(dst, src);
3204     }
3205 
3206     public final void negq(Register dst) {
3207         prefixq(dst);
3208         emitByte(0xF7);
3209         emitModRM(3, dst);
3210     }
3211 
3212     public final void orq(Register dst, Register src) {
3213         OR.rmOp.emit(this, QWORD, dst, src);
3214     }
3215 
3216     public final void shlq(Register dst, int imm8) {
3217         assert isShiftCount(imm8 >> 1) : "illegal shift count";
3218         prefixq(dst);
3219         if (imm8 == 1) {
3220             emitByte(0xD1);
3221             emitModRM(4, dst);
3222         } else {
3223             emitByte(0xC1);
3224             emitModRM(4, dst);
3225             emitByte(imm8);
3226         }
3227     }
3228 
3229     public final void shlq(Register dst) {
3230         // Multiply dst by 2, CL times.
3231         prefixq(dst);
3232         emitByte(0xD3);
3233         emitModRM(4, dst);
3234     }
3235 
3236     public final void shrq(Register dst, int imm8) {
3237         assert isShiftCount(imm8 >> 1) : "illegal shift count";
3238         prefixq(dst);
3239         if (imm8 == 1) {
3240             emitByte(0xD1);
3241             emitModRM(5, dst);
3242         } else {
3243             emitByte(0xC1);
3244             emitModRM(5, dst);
3245             emitByte(imm8);
3246         }
3247     }
3248 
3249     public final void shrq(Register dst) {
        // Unsigned divide dst by 2, CL times.
        prefixq(dst);
        emitByte(0xD3);
        emitModRM(5, dst);
3254     }
3255 
3256     public final void sarq(Register dst, int imm8) {
3257         assert isShiftCount(imm8 >> 1) : "illegal shift count";
3258         prefixq(dst);
3259         if (imm8 == 1) {
3260             emitByte(0xD1);
3261             emitModRM(7, dst);
3262         } else {
3263             emitByte(0xC1);
3264             emitModRM(7, dst);
3265             emitByte(imm8);
3266         }
3267     }
3268 
3269     public final void sbbq(Register dst, Register src) {
3270         SBB.rmOp.emit(this, QWORD, dst, src);
3271     }
3272 
3273     public final void subq(Register dst, int imm32) {
3274         SUB.getMIOpcode(QWORD, isByte(imm32)).emit(this, QWORD, dst, imm32);
3275     }
3276 
3277     public final void subq(AMD64Address dst, int imm32) {
3278         SUB.getMIOpcode(QWORD, isByte(imm32)).emit(this, QWORD, dst, imm32);
3279     }
3280 
3281     public final void subqWide(Register dst, int imm32) {
        // Don't use the sign-extending version; force a full 32-bit immediate.
3283         SUB.getMIOpcode(QWORD, false).emit(this, QWORD, dst, imm32);
3284     }
3285 
3286     public final void subq(Register dst, Register src) {
3287         SUB.rmOp.emit(this, QWORD, dst, src);
3288     }
3289 
3290     public final void testq(Register dst, Register src) {
3291         prefixq(dst, src);
3292         emitByte(0x85);
3293         emitModRM(dst, src);
3294     }
3295 
3296     public final void btrq(Register src, int imm8) {
3297         prefixq(src);
3298         emitByte(0x0F);
3299         emitByte(0xBA);
3300         emitModRM(6, src);
3301         emitByte(imm8);
3302     }
3303 
3304     public final void xaddb(AMD64Address dst, Register src) {
3305         prefixb(dst, src);
3306         emitByte(0x0F);
3307         emitByte(0xC0);
3308         emitOperandHelper(src, dst, 0);
3309     }
3310 
3311     public final void xaddw(AMD64Address dst, Register src) {
        emitByte(0x66); // Switch to 16-bit operand size.
3313         prefix(dst, src);
3314         emitByte(0x0F);
3315         emitByte(0xC1);
3316         emitOperandHelper(src, dst, 0);
3317     }
3318 
3319     public final void xaddl(AMD64Address dst, Register src) {
3320         prefix(dst, src);
3321         emitByte(0x0F);
3322         emitByte(0xC1);
3323         emitOperandHelper(src, dst, 0);
3324     }
3325 
3326     public final void xaddq(AMD64Address dst, Register src) {
3327         prefixq(dst, src);
3328         emitByte(0x0F);
3329         emitByte(0xC1);
3330         emitOperandHelper(src, dst, 0);
3331     }
3332 
3333     public final void xchgb(Register dst, AMD64Address src) {
3334         prefixb(src, dst);
3335         emitByte(0x86);
3336         emitOperandHelper(dst, src, 0);
3337     }
3338 
3339     public final void xchgw(Register dst, AMD64Address src) {
3340         emitByte(0x66);
3341         prefix(src, dst);
3342         emitByte(0x87);
3343         emitOperandHelper(dst, src, 0);
3344     }
3345 
3346     public final void xchgl(Register dst, AMD64Address src) {
3347         prefix(src, dst);
3348         emitByte(0x87);
3349         emitOperandHelper(dst, src, 0);
3350     }
3351 
3352     public final void xchgq(Register dst, AMD64Address src) {
3353         prefixq(src, dst);
3354         emitByte(0x87);
3355         emitOperandHelper(dst, src, 0);
3356     }
3357 
3358     public final void membar(int barriers) {
3359         if (target.isMP) {
3360             // We only have to handle StoreLoad
3361             if ((barriers & STORE_LOAD) != 0) {
                // All usable chips support "locked" instructions, which suffice
                // as barriers and are much faster than the alternative of using
                // the cpuid instruction. We use a locked add [rsp], 0 here; it is
                // conveniently otherwise a no-op except for clobbering the flags.
3367                 // Any change to this code may need to revisit other places in
3368                 // the code where this idiom is used, in particular the
3369                 // orderAccess code.
3370                 lock();
3371                 addl(new AMD64Address(AMD64.rsp, 0), 0); // Assert the lock# signal here
3372             }
3373         }
3374     }
3375 
3376     @Override
3377     protected final void patchJumpTarget(int branch, int branchTarget) {
3378         int op = getByte(branch);
3379         assert op == 0xE8 // call
3380                         || op == 0x00 // jump table entry
3381                         || op == 0xE9 // jmp
3382                         || op == 0xEB // short jmp
3383                         || (op & 0xF0) == 0x70 // short jcc
3384                         || op == 0x0F && (getByte(branch + 1) & 0xF0) == 0x80 // jcc
3385         : "Invalid opcode at patch point branch=" + branch + ", branchTarget=" + branchTarget + ", op=" + op;
3386 
3387         if (op == 0x00) {
3388             int offsetToJumpTableBase = getShort(branch + 1);
3389             int jumpTableBase = branch - offsetToJumpTableBase;
3390             int imm32 = branchTarget - jumpTableBase;
3391             emitInt(imm32, branch);
3392         } else if (op == 0xEB || (op & 0xF0) == 0x70) {
3393 
3394             // short offset operators (jmp and jcc)
3395             final int imm8 = branchTarget - (branch + 2);
3396             /*
3397              * Since a wrongly patched short branch can potentially lead to working but really bad
3398              * behaving code we should always fail with an exception instead of having an assert.
3399              */
3400             if (!NumUtil.isByte(imm8)) {
3401                 throw new InternalError("branch displacement out of range: " + imm8);
3402             }
3403             emitByte(imm8, branch + 1);
3404 
3405         } else {
3406 
3407             int off = 1;
3408             if (op == 0x0F) {
3409                 off = 2;
3410             }
3411 
3412             int imm32 = branchTarget - (branch + 4 + off);
3413             emitInt(imm32, branch + off);
3414         }
3415     }
3416 
3417     public void nullCheck(AMD64Address address) {
3418         testl(AMD64.rax, address);
3419     }
3420 
3421     @Override
3422     public void align(int modulus) {
3423         if (position() % modulus != 0) {
3424             nop(modulus - (position() % modulus));
3425         }
3426     }
3427 
3428     /**
3429      * Emits a direct call instruction. Note that the actual call target is not specified, because
3430      * all calls need patching anyway. Therefore, 0 is emitted as the call target, and the user is
     * responsible for adding the call address to the appropriate patching tables.
3432      */
3433     public final void call() {
3434         annotatePatchingImmediate(1, 4);
3435         emitByte(0xE8);
3436         emitInt(0);
3437     }
3438 
3439     public final void call(Register src) {
3440         prefix(src);
3441         emitByte(0xFF);
3442         emitModRM(2, src);
3443     }
3444 
3445     public final void int3() {
3446         emitByte(0xCC);
3447     }
3448 
3449     public final void pause() {
3450         emitByte(0xF3);
3451         emitByte(0x90);
3452     }
3453 
3454     private void emitx87(int b1, int b2, int i) {
3455         assert 0 <= i && i < 8 : "illegal stack offset";
3456         emitByte(b1);
3457         emitByte(b2 + i);
3458     }
3459 
3460     public final void fldd(AMD64Address src) {
3461         emitByte(0xDD);
3462         emitOperandHelper(0, src, 0);
3463     }
3464 
3465     public final void flds(AMD64Address src) {
3466         emitByte(0xD9);
3467         emitOperandHelper(0, src, 0);
3468     }
3469 
3470     public final void fldln2() {
3471         emitByte(0xD9);
3472         emitByte(0xED);
3473     }
3474 
3475     public final void fldlg2() {
3476         emitByte(0xD9);
3477         emitByte(0xEC);
3478     }
3479 
3480     public final void fyl2x() {
3481         emitByte(0xD9);
3482         emitByte(0xF1);
3483     }
3484 
3485     public final void fstps(AMD64Address src) {
3486         emitByte(0xD9);
3487         emitOperandHelper(3, src, 0);
3488     }
3489 
3490     public final void fstpd(AMD64Address src) {
3491         emitByte(0xDD);
3492         emitOperandHelper(3, src, 0);
3493     }
3494 
3495     private void emitFPUArith(int b1, int b2, int i) {
3496         assert 0 <= i && i < 8 : "illegal FPU register: " + i;
3497         emitByte(b1);
3498         emitByte(b2 + i);
3499     }
3500 
3501     public void ffree(int i) {
3502         emitFPUArith(0xDD, 0xC0, i);
3503     }
3504 
3505     public void fincstp() {
3506         emitByte(0xD9);
3507         emitByte(0xF7);
3508     }
3509 
3510     public void fxch(int i) {
3511         emitFPUArith(0xD9, 0xC8, i);
3512     }
3513 
3514     public void fnstswAX() {
3515         emitByte(0xDF);
3516         emitByte(0xE0);
3517     }
3518 
3519     public void fwait() {
3520         emitByte(0x9B);
3521     }
3522 
3523     public void fprem() {
3524         emitByte(0xD9);
3525         emitByte(0xF8);
3526     }
3527 
3528     public final void fsin() {
3529         emitByte(0xD9);
3530         emitByte(0xFE);
3531     }
3532 
3533     public final void fcos() {
3534         emitByte(0xD9);
3535         emitByte(0xFF);
3536     }
3537 
3538     public final void fptan() {
3539         emitByte(0xD9);
3540         emitByte(0xF2);
3541     }
3542 
3543     public final void fstp(int i) {
3544         emitx87(0xDD, 0xD8, i);
3545     }
3546 
3547     @Override
3548     public AMD64Address makeAddress(Register base, int displacement) {
3549         return new AMD64Address(base, displacement);
3550     }
3551 
3552     @Override
3553     public AMD64Address getPlaceholder(int instructionStartPosition) {
3554         return new AMD64Address(AMD64.rip, Register.None, Scale.Times1, 0, instructionStartPosition);
3555     }
3556 
3557     private void prefetchPrefix(AMD64Address src) {
3558         prefix(src);
3559         emitByte(0x0F);
3560     }
3561 
3562     public void prefetchnta(AMD64Address src) {
3563         prefetchPrefix(src);
3564         emitByte(0x18);
3565         emitOperandHelper(0, src, 0);
3566     }
3567 
3568     void prefetchr(AMD64Address src) {
3569         assert supports(CPUFeature.AMD_3DNOW_PREFETCH);
3570         prefetchPrefix(src);
3571         emitByte(0x0D);
3572         emitOperandHelper(0, src, 0);
3573     }
3574 
3575     public void prefetcht0(AMD64Address src) {
3576         assert supports(CPUFeature.SSE);
3577         prefetchPrefix(src);
3578         emitByte(0x18);
3579         emitOperandHelper(1, src, 0);
3580     }
3581 
3582     public void prefetcht1(AMD64Address src) {
3583         assert supports(CPUFeature.SSE);
3584         prefetchPrefix(src);
3585         emitByte(0x18);
3586         emitOperandHelper(2, src, 0);
3587     }
3588 
3589     public void prefetcht2(AMD64Address src) {
3590         assert supports(CPUFeature.SSE);
        prefetchPrefix(src);
        emitByte(0x18);
3594         emitOperandHelper(3, src, 0);
3595     }
3596 
3597     public void prefetchw(AMD64Address src) {
3598         assert supports(CPUFeature.AMD_3DNOW_PREFETCH);
        prefetchPrefix(src);
        emitByte(0x0D);
3602         emitOperandHelper(1, src, 0);
3603     }
3604 
3605     public void rdtsc() {
3606         emitByte(0x0F);
3607         emitByte(0x31);
3608     }
3609 
3610     /**
3611      * Emits an instruction which is considered to be illegal. This is used if we deliberately want
3612      * to crash the program (debugging etc.).
3613      */
3614     public void illegal() {
        emitByte(0x0F);
        emitByte(0x0B); // UD2
3617     }
3618 
3619     public void lfence() {
        emitByte(0x0F);
        emitByte(0xAE);
        emitByte(0xE8);
3623     }
3624 
3625     public final void vptest(Register dst, Register src) {
3626         VexRMOp.VPTEST.emit(this, AVXSize.YMM, dst, src);
3627     }
3628 
3629     public final void vpxor(Register dst, Register nds, Register src) {
3630         VexRVMOp.VPXOR.emit(this, AVXSize.YMM, dst, nds, src);
3631     }
3632 
3633     public final void vpxor(Register dst, Register nds, AMD64Address src) {
3634         VexRVMOp.VPXOR.emit(this, AVXSize.YMM, dst, nds, src);
3635     }
3636 
3637     public final void vmovdqu(Register dst, AMD64Address src) {
3638         VexMoveOp.VMOVDQU.emit(this, AVXSize.YMM, dst, src);
3639     }
3640 
3641     public final void vmovdqu(AMD64Address dst, Register src) {
3642         assert inRC(XMM, src);
3643         VexMoveOp.VMOVDQU.emit(this, AVXSize.YMM, dst, src);
3644     }
3645 
3646     public final void vpmovzxbw(Register dst, AMD64Address src) {
3647         assert supports(CPUFeature.AVX2);
3648         VexRMOp.VPMOVZXBW.emit(this, AVXSize.YMM, dst, src);
3649     }
3650 
3651     public final void vzeroupper() {
3652         emitVEX(L128, P_, M_0F, W0, 0, 0, true);
3653         emitByte(0x77);
3654     }
3655 
3656     // Insn: KORTESTD k1, k2
3657 
3658     // This instruction produces ZF or CF flags
3659     public final void kortestd(Register src1, Register src2) {
3660         assert supports(CPUFeature.AVX512BW);
3661         assert inRC(MASK, src1) && inRC(MASK, src2);
3662         // Code: VEX.L0.66.0F.W1 98 /r
3663         vexPrefix(src1, Register.None, src2, AVXSize.XMM, P_66, M_0F, W1, true);
3664         emitByte(0x98);
3665         emitModRM(src1, src2);
3666     }
3667 
3668     // Insn: KORTESTQ k1, k2
3669 
3670     // This instruction produces ZF or CF flags
3671     public final void kortestq(Register src1, Register src2) {
3672         assert supports(CPUFeature.AVX512BW);
3673         assert inRC(MASK, src1) && inRC(MASK, src2);
3674         // Code: VEX.L0.0F.W1 98 /r
3675         vexPrefix(src1, Register.None, src2, AVXSize.XMM, P_, M_0F, W1, true);
3676         emitByte(0x98);
3677         emitModRM(src1, src2);
3678     }
3679 
3680     public final void kmovd(Register dst, Register src) {
3681         assert supports(CPUFeature.AVX512BW);
3682         assert inRC(MASK, dst) || inRC(CPU, dst);
3683         assert inRC(MASK, src) || inRC(CPU, src);
3684         assert !(inRC(CPU, dst) && inRC(CPU, src));
3685 
3686         if (inRC(MASK, dst)) {
3687             if (inRC(MASK, src)) {
3688                 // kmovd(KRegister dst, KRegister src):
3689                 // Insn: KMOVD k1, k2/m32
3690                 // Code: VEX.L0.66.0F.W1 90 /r
3691                 vexPrefix(dst, Register.None, src, AVXSize.XMM, P_66, M_0F, W1, true);
3692                 emitByte(0x90);
3693                 emitModRM(dst, src);
3694             } else {
3695                 // kmovd(KRegister dst, Register src)
3696                 // Insn: KMOVD k1, r32
3697                 // Code: VEX.L0.F2.0F.W0 92 /r
3698                 vexPrefix(dst, Register.None, src, AVXSize.XMM, P_F2, M_0F, W0, true);
3699                 emitByte(0x92);
3700                 emitModRM(dst, src);
3701             }
3702         } else {
3703             if (inRC(MASK, src)) {
3704                 // kmovd(Register dst, KRegister src)
3705                 // Insn: KMOVD r32, k1
3706                 // Code: VEX.L0.F2.0F.W0 93 /r
3707                 vexPrefix(dst, Register.None, src, AVXSize.XMM, P_F2, M_0F, W0, true);
3708                 emitByte(0x93);
3709                 emitModRM(dst, src);
3710             } else {
3711                 throw GraalError.shouldNotReachHere();
3712             }
3713         }
3714     }
3715 
3716     public final void kmovq(Register dst, Register src) {
3717         assert supports(CPUFeature.AVX512BW);
3718         assert inRC(MASK, dst) || inRC(CPU, dst);
3719         assert inRC(MASK, src) || inRC(CPU, src);
3720         assert !(inRC(CPU, dst) && inRC(CPU, src));
3721 
3722         if (inRC(MASK, dst)) {
3723             if (inRC(MASK, src)) {
3724                 // kmovq(KRegister dst, KRegister src):
3725                 // Insn: KMOVQ k1, k2/m64
3726                 // Code: VEX.L0.0F.W1 90 /r
3727                 vexPrefix(dst, Register.None, src, AVXSize.XMM, P_, M_0F, W1, true);
3728                 emitByte(0x90);
3729                 emitModRM(dst, src);
3730             } else {
3731                 // kmovq(KRegister dst, Register src)
3732                 // Insn: KMOVQ k1, r64
3733                 // Code: VEX.L0.F2.0F.W1 92 /r
3734                 vexPrefix(dst, Register.None, src, AVXSize.XMM, P_F2, M_0F, W1, true);
3735                 emitByte(0x92);
3736                 emitModRM(dst, src);
3737             }
3738         } else {
3739             if (inRC(MASK, src)) {
3740                 // kmovq(Register dst, KRegister src)
3741                 // Insn: KMOVQ r64, k1
3742                 // Code: VEX.L0.F2.0F.W1 93 /r
3743                 vexPrefix(dst, Register.None, src, AVXSize.XMM, P_F2, M_0F, W1, true);
3744                 emitByte(0x93);
3745                 emitModRM(dst, src);
3746             } else {
3747                 throw GraalError.shouldNotReachHere();
3748             }
3749         }
3750     }
3751 
3752     // Insn: KTESTD k1, k2
3753 
3754     public final void ktestd(Register src1, Register src2) {
3755         assert supports(CPUFeature.AVX512BW);
3756         assert inRC(MASK, src1) && inRC(MASK, src2);
3757         // Code: VEX.L0.66.0F.W1 99 /r
3758         vexPrefix(src1, Register.None, src2, AVXSize.XMM, P_66, M_0F, W1, true);
3759         emitByte(0x99);
3760         emitModRM(src1, src2);
3761     }
3762 
3763     public final void evmovdqu64(Register dst, AMD64Address src) {
3764         assert supports(CPUFeature.AVX512F);
3765         assert inRC(XMM, dst);
3766         evexPrefix(dst, Register.None, Register.None, src, AVXSize.ZMM, P_F3, M_0F, W1, Z0, B0);
3767         emitByte(0x6F);
3768         emitEVEXOperandHelper(dst, src, 0, EVEXTuple.FVM.getDisp8ScalingFactor(AVXSize.ZMM));
3769     }
3770 
3771     // Insn: VPMOVZXBW zmm1, m256
3772 
3773     public final void evpmovzxbw(Register dst, AMD64Address src) {
3774         assert supports(CPUFeature.AVX512BW);
3775         assert inRC(XMM, dst);
3776         // Code: EVEX.512.66.0F38.WIG 30 /r
3777         evexPrefix(dst, Register.None, Register.None, src, AVXSize.ZMM, P_66, M_0F38, WIG, Z0, B0);
3778         emitByte(0x30);
3779         emitEVEXOperandHelper(dst, src, 0, EVEXTuple.HVM.getDisp8ScalingFactor(AVXSize.ZMM));
3780     }
3781 
3782     public final void evpcmpeqb(Register kdst, Register nds, AMD64Address src) {
3783         assert supports(CPUFeature.AVX512BW);
3784         assert inRC(MASK, kdst) && inRC(XMM, nds);
3785         evexPrefix(kdst, Register.None, nds, src, AVXSize.ZMM, P_66, M_0F, WIG, Z0, B0);
3786         emitByte(0x74);
3787         emitEVEXOperandHelper(kdst, src, 0, EVEXTuple.FVM.getDisp8ScalingFactor(AVXSize.ZMM));
3788     }
3789 
3790     // Insn: VMOVDQU16 zmm1 {k1}{z}, zmm2/m512
3791     // -----
3792     // Insn: VMOVDQU16 zmm1, m512
3793 
3794     public final void evmovdqu16(Register dst, AMD64Address src) {
3795         assert supports(CPUFeature.AVX512BW);
3796         assert inRC(XMM, dst);
3797         // Code: EVEX.512.F2.0F.W1 6F /r
3798         evexPrefix(dst, Register.None, Register.None, src, AVXSize.ZMM, P_F2, M_0F, W1, Z0, B0);
3799         emitByte(0x6F);
3800         emitEVEXOperandHelper(dst, src, 0, EVEXTuple.FVM.getDisp8ScalingFactor(AVXSize.ZMM));
3801     }
3802 
3803     // Insn: VMOVDQU16 zmm1, k1:z, m512
3804 
3805     public final void evmovdqu16(Register dst, Register mask, AMD64Address src) {
3806         assert supports(CPUFeature.AVX512BW);
3807         assert inRC(XMM, dst) && inRC(MASK, mask);
3808         // Code: EVEX.512.F2.0F.W1 6F /r
3809         evexPrefix(dst, mask, Register.None, src, AVXSize.ZMM, P_F2, M_0F, W1, Z1, B0);
3810         emitByte(0x6F);
3811         emitEVEXOperandHelper(dst, src, 0, EVEXTuple.FVM.getDisp8ScalingFactor(AVXSize.ZMM));
3812     }
3813 
3814     // Insn: VMOVDQU16 zmm2/m512 {k1}{z}, zmm1
3815     // -----
3816     // Insn: VMOVDQU16 m512, zmm1
3817 
3818     public final void evmovdqu16(AMD64Address dst, Register src) {
3819         assert supports(CPUFeature.AVX512BW);
3820         assert inRC(XMM, src);
3821         // Code: EVEX.512.F2.0F.W1 7F /r
3822         evexPrefix(src, Register.None, Register.None, dst, AVXSize.ZMM, P_F2, M_0F, W1, Z0, B0);
3823         emitByte(0x7F);
3824         emitEVEXOperandHelper(src, dst, 0, EVEXTuple.FVM.getDisp8ScalingFactor(AVXSize.ZMM));
3825     }
3826 
3827     // Insn: VMOVDQU16 m512, k1, zmm1
3828 
3829     public final void evmovdqu16(AMD64Address dst, Register mask, Register src) {
3830         assert supports(CPUFeature.AVX512BW);
3831         assert inRC(MASK, mask) && inRC(XMM, src);
3832         // Code: EVEX.512.F2.0F.W1 7F /r
3833         evexPrefix(src, mask, Register.None, dst, AVXSize.ZMM, P_F2, M_0F, W1, Z0, B0);
3834         emitByte(0x7F);
3835         emitEVEXOperandHelper(src, dst, 0, EVEXTuple.FVM.getDisp8ScalingFactor(AVXSize.ZMM));
3836     }
3837 
3838     // Insn: VPBROADCASTW zmm1 {k1}{z}, reg
3839     // -----
3840     // Insn: VPBROADCASTW zmm1, reg
3841 
3842     public final void evpbroadcastw(Register dst, Register src) {
3843         assert supports(CPUFeature.AVX512BW);
3844         assert inRC(XMM, dst) && inRC(CPU, src);
3845         // Code: EVEX.512.66.0F38.W0 7B /r
3846         evexPrefix(dst, Register.None, Register.None, src, AVXSize.ZMM, P_66, M_0F38, W0, Z0, B0);
3847         emitByte(0x7B);
3848         emitModRM(dst, src);
3849     }
3850 
3851     // Insn: VPCMPUW k1 {k2}, zmm2, zmm3/m512, imm8
3852     // -----
3853     // Insn: VPCMPUW k1, zmm2, zmm3, imm8
3854 
3855     public final void evpcmpuw(Register kdst, Register nds, Register src, int vcc) {
3856         assert supports(CPUFeature.AVX512BW);
3857         assert inRC(MASK, kdst) && inRC(XMM, nds) && inRC(XMM, src);
3858         // Code: EVEX.NDS.512.66.0F3A.W1 3E /r ib
3859         evexPrefix(kdst, Register.None, nds, src, AVXSize.ZMM, P_66, M_0F3A, W1, Z0, B0);
3860         emitByte(0x3E);
3861         emitModRM(kdst, src);
3862         emitByte(vcc);
3863     }
3864 
3865     // Insn: VPCMPUW k1 {k2}, zmm2, zmm3/m512, imm8
3866     // -----
3867     // Insn: VPCMPUW k1, k2, zmm2, zmm3, imm8
3868 
3869     public final void evpcmpuw(Register kdst, Register mask, Register nds, Register src, int vcc) {
3870         assert supports(CPUFeature.AVX512BW);
3871         assert inRC(MASK, kdst) && inRC(MASK, mask);
3872         assert inRC(XMM, nds) && inRC(XMM, src);
3873         // Code: EVEX.NDS.512.66.0F3A.W1 3E /r ib
3874         evexPrefix(kdst, mask, nds, src, AVXSize.ZMM, P_66, M_0F3A, W1, Z0, B0);
3875         emitByte(0x3E);
3876         emitModRM(kdst, src);
3877         emitByte(vcc);
3878     }
3879 
3880     // Insn: VPMOVWB ymm1/m256 {k1}{z}, zmm2
3881     // -----
3882     // Insn: VPMOVWB m256, zmm2
3883 
3884     public final void evpmovwb(AMD64Address dst, Register src) {
3885         assert supports(CPUFeature.AVX512BW);
3886         assert inRC(XMM, src);
3887         // Code: EVEX.512.F3.0F38.W0 30 /r
3888         evexPrefix(src, Register.None, Register.None, dst, AVXSize.ZMM, P_F3, M_0F38, W0, Z0, B0);
3889         emitByte(0x30);
3890         emitEVEXOperandHelper(src, dst, 0, EVEXTuple.HVM.getDisp8ScalingFactor(AVXSize.ZMM));
3891     }
3892 
3893     // Insn: VPMOVWB m256, k1, zmm2
3894 
3895     public final void evpmovwb(AMD64Address dst, Register mask, Register src) {
3896         assert supports(CPUFeature.AVX512BW);
3897         assert inRC(MASK, mask) && inRC(XMM, src);
3898         // Code: EVEX.512.F3.0F38.W0 30 /r
3899         evexPrefix(src, mask, Register.None, dst, AVXSize.ZMM, P_F3, M_0F38, W0, Z0, B0);
3900         emitByte(0x30);
3901         emitEVEXOperandHelper(src, dst, 0, EVEXTuple.HVM.getDisp8ScalingFactor(AVXSize.ZMM));
3902     }
3903 
3904     // Insn: VPMOVZXBW zmm1 {k1}{z}, ymm2/m256
3905     // -----
3906     // Insn: VPMOVZXBW zmm1, k1, m256
3907 
3908     public final void evpmovzxbw(Register dst, Register mask, AMD64Address src) {
3909         assert supports(CPUFeature.AVX512BW);
3910         assert inRC(MASK, mask) && inRC(XMM, dst);
3911         // Code: EVEX.512.66.0F38.WIG 30 /r
3912         evexPrefix(dst, mask, Register.None, src, AVXSize.ZMM, P_66, M_0F38, WIG, Z0, B0);
3913         emitByte(0x30);
3914         emitEVEXOperandHelper(dst, src, 0, EVEXTuple.HVM.getDisp8ScalingFactor(AVXSize.ZMM));
3915     }
3916 
3917 }