--- /dev/null 2017-01-22 10:16:57.869617664 -0800 +++ new/src/jdk.internal.vm.compiler/share/classes/org.graalvm.compiler.asm.amd64/src/org/graalvm/compiler/asm/amd64/AMD64Assembler.java 2017-02-15 16:56:36.473389113 -0800 @@ -0,0 +1,3750 @@ +/* + * Copyright (c) 2009, 2016, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ +package org.graalvm.compiler.asm.amd64; + +import static org.graalvm.compiler.asm.NumUtil.isByte; +import static org.graalvm.compiler.asm.NumUtil.isInt; +import static org.graalvm.compiler.asm.NumUtil.isShiftCount; +import static org.graalvm.compiler.asm.NumUtil.isUByte; +import static org.graalvm.compiler.asm.amd64.AMD64AsmOptions.UseAddressNop; +import static org.graalvm.compiler.asm.amd64.AMD64AsmOptions.UseNormalNop; +import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.ADD; +import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.AND; +import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.CMP; +import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.OR; +import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.SBB; +import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.SUB; +import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.XOR; +import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64MOp.DEC; +import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64MOp.INC; +import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64MOp.NEG; +import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64MOp.NOT; +import static org.graalvm.compiler.asm.amd64.AMD64Assembler.OperandSize.BYTE; +import static org.graalvm.compiler.asm.amd64.AMD64Assembler.OperandSize.DWORD; +import static org.graalvm.compiler.asm.amd64.AMD64Assembler.OperandSize.PD; +import static org.graalvm.compiler.asm.amd64.AMD64Assembler.OperandSize.PS; +import static org.graalvm.compiler.asm.amd64.AMD64Assembler.OperandSize.QWORD; +import static org.graalvm.compiler.asm.amd64.AMD64Assembler.OperandSize.SD; +import static org.graalvm.compiler.asm.amd64.AMD64Assembler.OperandSize.SS; +import static org.graalvm.compiler.asm.amd64.AMD64Assembler.OperandSize.WORD; +import static jdk.vm.ci.amd64.AMD64.CPU; +import static jdk.vm.ci.amd64.AMD64.XMM; +import static jdk.vm.ci.amd64.AMD64.r12; +import static jdk.vm.ci.amd64.AMD64.r13; +import static jdk.vm.ci.amd64.AMD64.rbp; +import static jdk.vm.ci.amd64.AMD64.rip; +import static 
jdk.vm.ci.amd64.AMD64.rsp; +import static jdk.vm.ci.code.MemoryBarriers.STORE_LOAD; + +import org.graalvm.compiler.asm.Assembler; +import org.graalvm.compiler.asm.Label; +import org.graalvm.compiler.asm.NumUtil; +import org.graalvm.compiler.asm.amd64.AMD64Address.Scale; + +import jdk.vm.ci.amd64.AMD64; +import jdk.vm.ci.amd64.AMD64.CPUFeature; +import jdk.vm.ci.code.Register; +import jdk.vm.ci.code.Register.RegisterCategory; +import jdk.vm.ci.code.TargetDescription; + +/** + * This class implements an assembler that can encode most X86 instructions. + */ +public class AMD64Assembler extends Assembler { + + private static final int MinEncodingNeedsRex = 8; + + /** + * The x86 condition codes used for conditional jumps/moves. + */ + public enum ConditionFlag { + Zero(0x4, "|zero|"), + NotZero(0x5, "|nzero|"), + Equal(0x4, "="), + NotEqual(0x5, "!="), + Less(0xc, "<"), + LessEqual(0xe, "<="), + Greater(0xf, ">"), + GreaterEqual(0xd, ">="), + Below(0x2, "|<|"), + BelowEqual(0x6, "|<=|"), + Above(0x7, "|>|"), + AboveEqual(0x3, "|>=|"), + Overflow(0x0, "|of|"), + NoOverflow(0x1, "|nof|"), + CarrySet(0x2, "|carry|"), + CarryClear(0x3, "|ncarry|"), + Negative(0x8, "|neg|"), + Positive(0x9, "|pos|"), + Parity(0xa, "|par|"), + NoParity(0xb, "|npar|"); + + private final int value; + private final String operator; + + ConditionFlag(int value, String operator) { + this.value = value; + this.operator = operator; + } + + public ConditionFlag negate() { + switch (this) { + case Zero: + return NotZero; + case NotZero: + return Zero; + case Equal: + return NotEqual; + case NotEqual: + return Equal; + case Less: + return GreaterEqual; + case LessEqual: + return Greater; + case Greater: + return LessEqual; + case GreaterEqual: + return Less; + case Below: + return AboveEqual; + case BelowEqual: + return Above; + case Above: + return BelowEqual; + case AboveEqual: + return Below; + case Overflow: + return NoOverflow; + case NoOverflow: + return Overflow; + case CarrySet: + return CarryClear; + case CarryClear: + return CarrySet; + case Negative: + return Positive; + case Positive: + return Negative; + case Parity: + return NoParity; + case NoParity: + return Parity; + } + throw new IllegalArgumentException(); + } + + public int getValue() { + return value; + } + + @Override + public String toString() { + return operator; + } + } + + /** + * Constants for X86 prefix bytes. 
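+ * These constants are assumed to follow the standard REX layout: 0x40 is the base value, with the W, R, X and B bits in bits 3 to 0 (e.g. REXWRB = 0x40 | 0x08 | 0x04 | 0x01 = 0x4D).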
+ */ + private static class Prefix { + private static final int REX = 0x40; + private static final int REXB = 0x41; + private static final int REXX = 0x42; + private static final int REXXB = 0x43; + private static final int REXR = 0x44; + private static final int REXRB = 0x45; + private static final int REXRX = 0x46; + private static final int REXRXB = 0x47; + private static final int REXW = 0x48; + private static final int REXWB = 0x49; + private static final int REXWX = 0x4A; + private static final int REXWXB = 0x4B; + private static final int REXWR = 0x4C; + private static final int REXWRB = 0x4D; + private static final int REXWRX = 0x4E; + private static final int REXWRXB = 0x4F; + private static final int VEX_3BYTES = 0xC4; + private static final int VEX_2BYTES = 0xC5; + } + + private static class VexPrefix { + private static final int VEX_R = 0x80; + private static final int VEX_W = 0x80; + } + + private static class AvxVectorLen { + private static final int AVX_128bit = 0x0; + private static final int AVX_256bit = 0x1; + } + + private static class VexSimdPrefix { + private static final int VEX_SIMD_NONE = 0x0; + private static final int VEX_SIMD_66 = 0x1; + private static final int VEX_SIMD_F3 = 0x2; + private static final int VEX_SIMD_F2 = 0x3; + } + + private static class VexOpcode { + private static final int VEX_OPCODE_NONE = 0x0; + private static final int VEX_OPCODE_0F = 0x1; + private static final int VEX_OPCODE_0F_38 = 0x2; + private static final int VEX_OPCODE_0F_3A = 0x3; + } + + private AMD64InstructionAttr curAttributes; + + AMD64InstructionAttr getCurAttributes() { + return curAttributes; + } + + void setCurAttributes(AMD64InstructionAttr attributes) { + curAttributes = attributes; + } + + /** + * The x86 operand sizes. + */ + public enum OperandSize { + BYTE(1) { + @Override + protected void emitImmediate(AMD64Assembler asm, int imm) { + assert imm == (byte) imm; + asm.emitByte(imm); + } + + @Override + protected int immediateSize() { + return 1; + } + }, + + WORD(2, 0x66) { + @Override + protected void emitImmediate(AMD64Assembler asm, int imm) { + assert imm == (short) imm; + asm.emitShort(imm); + } + + @Override + protected int immediateSize() { + return 2; + } + }, + + DWORD(4) { + @Override + protected void emitImmediate(AMD64Assembler asm, int imm) { + asm.emitInt(imm); + } + + @Override + protected int immediateSize() { + return 4; + } + }, + + QWORD(8) { + @Override + protected void emitImmediate(AMD64Assembler asm, int imm) { + asm.emitInt(imm); + } + + @Override + protected int immediateSize() { + return 4; + } + }, + + SS(4, 0xF3, true), + + SD(8, 0xF2, true), + + PS(16, true), + + PD(16, 0x66, true); + + private final int sizePrefix; + + private final int bytes; + private final boolean xmm; + + OperandSize(int bytes) { + this(bytes, 0); + } + + OperandSize(int bytes, int sizePrefix) { + this(bytes, sizePrefix, false); + } + + OperandSize(int bytes, boolean xmm) { + this(bytes, 0, xmm); + } + + OperandSize(int bytes, int sizePrefix, boolean xmm) { + this.sizePrefix = sizePrefix; + this.bytes = bytes; + this.xmm = xmm; + } + + public int getBytes() { + return bytes; + } + + public boolean isXmmType() { + return xmm; + } + + /** + * Emit an immediate of this size. Note that immediate {@link #QWORD} operands are encoded + * as sign-extended 32-bit values. 
+ * + * @param asm + * @param imm + */ + protected void emitImmediate(AMD64Assembler asm, int imm) { + throw new UnsupportedOperationException(); + } + + protected int immediateSize() { + throw new UnsupportedOperationException(); + } + } + + /** + * Operand size and register type constraints. + */ + private enum OpAssertion { + ByteAssertion(CPU, CPU, BYTE), + IntegerAssertion(CPU, CPU, WORD, DWORD, QWORD), + No16BitAssertion(CPU, CPU, DWORD, QWORD), + No32BitAssertion(CPU, CPU, WORD, QWORD), + QwordOnlyAssertion(CPU, CPU, QWORD), + FloatingAssertion(XMM, XMM, SS, SD, PS, PD), + PackedFloatingAssertion(XMM, XMM, PS, PD), + SingleAssertion(XMM, XMM, SS), + DoubleAssertion(XMM, XMM, SD), + PackedDoubleAssertion(XMM, XMM, PD), + IntToFloatingAssertion(XMM, CPU, DWORD, QWORD), + FloatingToIntAssertion(CPU, XMM, DWORD, QWORD); + + private final RegisterCategory resultCategory; + private final RegisterCategory inputCategory; + private final OperandSize[] allowedSizes; + + OpAssertion(RegisterCategory resultCategory, RegisterCategory inputCategory, OperandSize... allowedSizes) { + this.resultCategory = resultCategory; + this.inputCategory = inputCategory; + this.allowedSizes = allowedSizes; + } + + protected boolean checkOperands(AMD64Op op, OperandSize size, Register resultReg, Register inputReg) { + assert resultReg == null || resultCategory.equals(resultReg.getRegisterCategory()) : "invalid result register " + resultReg + " used in " + op; + assert inputReg == null || inputCategory.equals(inputReg.getRegisterCategory()) : "invalid input register " + inputReg + " used in " + op; + + for (OperandSize s : allowedSizes) { + if (size == s) { + return true; + } + } + + assert false : "invalid operand size " + size + " used in " + op; + return false; + } + } + + public abstract static class OperandDataAnnotation extends CodeAnnotation { + /** + * The position (bytes from the beginning of the method) of the operand. + */ + public final int operandPosition; + /** + * The size of the operand, in bytes. + */ + public final int operandSize; + /** + * The position (bytes from the beginning of the method) of the next instruction. On AMD64, + * RIP-relative operands are relative to this position. + */ + public final int nextInstructionPosition; + + OperandDataAnnotation(int instructionPosition, int operandPosition, int operandSize, int nextInstructionPosition) { + super(instructionPosition); + + this.operandPosition = operandPosition; + this.operandSize = operandSize; + this.nextInstructionPosition = nextInstructionPosition; + } + + @Override + public String toString() { + return getClass().getSimpleName() + " instruction [" + instructionPosition + ", " + nextInstructionPosition + "[ operand at " + operandPosition + " size " + operandSize; + } + } + + /** + * Annotation that stores additional information about the displacement of a + * {@link Assembler#getPlaceholder placeholder address} that needs patching. + */ + public static class AddressDisplacementAnnotation extends OperandDataAnnotation { + AddressDisplacementAnnotation(int instructionPosition, int operandPosition, int operndSize, int nextInstructionPosition) { + super(instructionPosition, operandPosition, operndSize, nextInstructionPosition); + } + } + + /** + * Annotation that stores additional information about the immediate operand, e.g., of a call + * instruction, that needs patching. 
+ */ + public static class ImmediateOperandAnnotation extends OperandDataAnnotation { + ImmediateOperandAnnotation(int instructionPosition, int operandPosition, int operndSize, int nextInstructionPosition) { + super(instructionPosition, operandPosition, operndSize, nextInstructionPosition); + } + } + + /** + * Constructs an assembler for the AMD64 architecture. + */ + public AMD64Assembler(TargetDescription target) { + super(target); + } + + public boolean supports(CPUFeature feature) { + return ((AMD64) target.arch).getFeatures().contains(feature); + } + + private static int encode(Register r) { + assert r.encoding < 16 && r.encoding >= 0 : "encoding out of range: " + r.encoding; + return r.encoding & 0x7; + } + + /** + * Get RXB bits for register-register instruction. In that encoding, ModRM.rm contains a + * register index. The R bit extends the ModRM.reg field and the B bit extends the ModRM.rm + * field. The X bit must be 0. + */ + protected static int getRXB(Register reg, Register rm) { + int rxb = (reg == null ? 0 : reg.encoding & 0x08) >> 1; + rxb |= (rm == null ? 0 : rm.encoding & 0x08) >> 3; + return rxb; + } + + /** + * Get RXB bits for register-memory instruction. The R bit extends the ModRM.reg field. There + * are two cases for the memory operand:
+ * ModRM.rm contains the base register: In that case, B extends the ModRM.rm field and X = 0. + *
+ * There is an SIB byte: In that case, X extends SIB.index and B extends SIB.base. + */ + protected static int getRXB(Register reg, AMD64Address rm) { + int rxb = (reg == null ? 0 : reg.encoding & 0x08) >> 1; + if (!rm.getIndex().equals(Register.None)) { + rxb |= (rm.getIndex().encoding & 0x08) >> 2; + } + if (!rm.getBase().equals(Register.None)) { + rxb |= (rm.getBase().encoding & 0x08) >> 3; + } + return rxb; + } + + /** + * Emit the ModR/M byte for one register operand and an opcode extension in the R field. + *

+ * Format: [ 11 reg r/m ] + */ + protected void emitModRM(int reg, Register rm) { + assert (reg & 0x07) == reg; + emitByte(0xC0 | (reg << 3) | (rm.encoding & 0x07)); + } + + /** + * Emit the ModR/M byte for two register operands. + *

+ * Format: [ 11 reg r/m ] + */ + protected void emitModRM(Register reg, Register rm) { + emitModRM(reg.encoding & 0x07, rm); + } + + protected void emitOperandHelper(Register reg, AMD64Address addr, int additionalInstructionSize) { + assert !reg.equals(Register.None); + emitOperandHelper(encode(reg), addr, false, additionalInstructionSize); + } + + /** + * Emits the ModR/M byte and optionally the SIB byte for one register and one memory operand. + * + * @param force4Byte use 4 byte encoding for displacements that would normally fit in a byte + */ + protected void emitOperandHelper(Register reg, AMD64Address addr, boolean force4Byte, int additionalInstructionSize) { + assert !reg.equals(Register.None); + emitOperandHelper(encode(reg), addr, force4Byte, additionalInstructionSize); + } + + protected void emitOperandHelper(int reg, AMD64Address addr, int additionalInstructionSize) { + emitOperandHelper(reg, addr, false, additionalInstructionSize); + } + + /** + * Emits the ModR/M byte and optionally the SIB byte for one memory operand and an opcode + * extension in the R field. + * + * @param force4Byte use 4 byte encoding for displacements that would normally fit in a byte + * @param additionalInstructionSize the number of bytes that will be emitted after the operand, + * so that the start position of the next instruction can be computed even though + * this instruction has not been completely emitted yet. + */ + protected void emitOperandHelper(int reg, AMD64Address addr, boolean force4Byte, int additionalInstructionSize) { + assert (reg & 0x07) == reg; + int regenc = reg << 3; + + Register base = addr.getBase(); + Register index = addr.getIndex(); + + AMD64Address.Scale scale = addr.getScale(); + int disp = addr.getDisplacement(); + + if (base.equals(AMD64.rip)) { // also matches addresses returned by getPlaceholder() + // [00 000 101] disp32 + assert index.equals(Register.None) : "cannot use RIP relative addressing with index register"; + emitByte(0x05 | regenc); + if (codePatchingAnnotationConsumer != null && addr.instructionStartPosition >= 0) { + codePatchingAnnotationConsumer.accept(new AddressDisplacementAnnotation(addr.instructionStartPosition, position(), 4, position() + 4 + additionalInstructionSize)); + } + emitInt(disp); + } else if (base.isValid()) { + int baseenc = base.isValid() ? 
encode(base) : 0; + if (index.isValid()) { + int indexenc = encode(index) << 3; + // [base + indexscale + disp] + if (disp == 0 && !base.equals(rbp) && !base.equals(r13)) { + // [base + indexscale] + // [00 reg 100][ss index base] + assert !index.equals(rsp) : "illegal addressing mode"; + emitByte(0x04 | regenc); + emitByte(scale.log2 << 6 | indexenc | baseenc); + } else if (isByte(disp) && !force4Byte) { + // [base + indexscale + imm8] + // [01 reg 100][ss index base] imm8 + assert !index.equals(rsp) : "illegal addressing mode"; + emitByte(0x44 | regenc); + emitByte(scale.log2 << 6 | indexenc | baseenc); + emitByte(disp & 0xFF); + } else { + // [base + indexscale + disp32] + // [10 reg 100][ss index base] disp32 + assert !index.equals(rsp) : "illegal addressing mode"; + emitByte(0x84 | regenc); + emitByte(scale.log2 << 6 | indexenc | baseenc); + emitInt(disp); + } + } else if (base.equals(rsp) || base.equals(r12)) { + // [rsp + disp] + if (disp == 0) { + // [rsp] + // [00 reg 100][00 100 100] + emitByte(0x04 | regenc); + emitByte(0x24); + } else if (isByte(disp) && !force4Byte) { + // [rsp + imm8] + // [01 reg 100][00 100 100] disp8 + emitByte(0x44 | regenc); + emitByte(0x24); + emitByte(disp & 0xFF); + } else { + // [rsp + imm32] + // [10 reg 100][00 100 100] disp32 + emitByte(0x84 | regenc); + emitByte(0x24); + emitInt(disp); + } + } else { + // [base + disp] + assert !base.equals(rsp) && !base.equals(r12) : "illegal addressing mode"; + if (disp == 0 && !base.equals(rbp) && !base.equals(r13)) { + // [base] + // [00 reg base] + emitByte(0x00 | regenc | baseenc); + } else if (isByte(disp) && !force4Byte) { + // [base + disp8] + // [01 reg base] disp8 + emitByte(0x40 | regenc | baseenc); + emitByte(disp & 0xFF); + } else { + // [base + disp32] + // [10 reg base] disp32 + emitByte(0x80 | regenc | baseenc); + emitInt(disp); + } + } + } else { + if (index.isValid()) { + int indexenc = encode(index) << 3; + // [indexscale + disp] + // [00 reg 100][ss index 101] disp32 + assert !index.equals(rsp) : "illegal addressing mode"; + emitByte(0x04 | regenc); + emitByte(scale.log2 << 6 | indexenc | 0x05); + emitInt(disp); + } else { + // [disp] ABSOLUTE + // [00 reg 100][00 100 101] disp32 + emitByte(0x04 | regenc); + emitByte(0x25); + emitInt(disp); + } + } + setCurAttributes(null); + } + + /** + * Base class for AMD64 opcodes. 
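+ * Subclasses emit an instruction as an optional mandatory prefix ({@code prefix1}), the operand size prefix, an optional REX prefix, the opcode map escape ({@code prefix2}) and the opcode byte itself (see {@code emitOpcode}).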
+ */ + public static class AMD64Op { + + protected static final int P_0F = 0x0F; + protected static final int P_0F38 = 0x380F; + protected static final int P_0F3A = 0x3A0F; + + private final String opcode; + + protected final int prefix1; + protected final int prefix2; + protected final int op; + + private final boolean dstIsByte; + private final boolean srcIsByte; + + private final OpAssertion assertion; + private final CPUFeature feature; + + protected AMD64Op(String opcode, int prefix1, int prefix2, int op, OpAssertion assertion, CPUFeature feature) { + this(opcode, prefix1, prefix2, op, assertion == OpAssertion.ByteAssertion, assertion == OpAssertion.ByteAssertion, assertion, feature); + } + + protected AMD64Op(String opcode, int prefix1, int prefix2, int op, boolean dstIsByte, boolean srcIsByte, OpAssertion assertion, CPUFeature feature) { + this.opcode = opcode; + this.prefix1 = prefix1; + this.prefix2 = prefix2; + this.op = op; + + this.dstIsByte = dstIsByte; + this.srcIsByte = srcIsByte; + + this.assertion = assertion; + this.feature = feature; + } + + protected final void emitOpcode(AMD64Assembler asm, OperandSize size, int rxb, int dstEnc, int srcEnc) { + if (prefix1 != 0) { + asm.emitByte(prefix1); + } + if (size.sizePrefix != 0) { + asm.emitByte(size.sizePrefix); + } + int rexPrefix = 0x40 | rxb; + if (size == QWORD) { + rexPrefix |= 0x08; + } + if (rexPrefix != 0x40 || (dstIsByte && dstEnc >= 4) || (srcIsByte && srcEnc >= 4)) { + asm.emitByte(rexPrefix); + } + if (prefix2 > 0xFF) { + asm.emitShort(prefix2); + } else if (prefix2 > 0) { + asm.emitByte(prefix2); + } + asm.emitByte(op); + } + + protected final boolean verify(AMD64Assembler asm, OperandSize size, Register resultReg, Register inputReg) { + assert feature == null || asm.supports(feature) : String.format("unsupported feature %s required for %s", feature, opcode); + assert assertion.checkOperands(this, size, resultReg, inputReg); + return true; + } + + @Override + public String toString() { + return opcode; + } + } + + /** + * Base class for AMD64 opcodes with immediate operands. + */ + public static class AMD64ImmOp extends AMD64Op { + + private final boolean immIsByte; + + protected AMD64ImmOp(String opcode, boolean immIsByte, int prefix, int op, OpAssertion assertion) { + super(opcode, 0, prefix, op, assertion, null); + this.immIsByte = immIsByte; + } + + protected final void emitImmediate(AMD64Assembler asm, OperandSize size, int imm) { + if (immIsByte) { + assert imm == (byte) imm; + asm.emitByte(imm); + } else { + size.emitImmediate(asm, imm); + } + } + + protected final int immediateSize(OperandSize size) { + if (immIsByte) { + return 1; + } else { + return size.bytes; + } + } + } + + /** + * Opcode with operand order of either RM or MR for 2 address forms. + */ + public abstract static class AMD64RROp extends AMD64Op { + + protected AMD64RROp(String opcode, int prefix1, int prefix2, int op, OpAssertion assertion, CPUFeature feature) { + super(opcode, prefix1, prefix2, op, assertion, feature); + } + + protected AMD64RROp(String opcode, int prefix1, int prefix2, int op, boolean dstIsByte, boolean srcIsByte, OpAssertion assertion, CPUFeature feature) { + super(opcode, prefix1, prefix2, op, dstIsByte, srcIsByte, assertion, feature); + } + + public abstract void emit(AMD64Assembler asm, OperandSize size, Register dst, Register src); + } + + /** + * Opcode with operand order of either RM or MR for 3 address forms. 
+ */ + public abstract static class AMD64RRROp extends AMD64Op { + + protected AMD64RRROp(String opcode, int prefix1, int prefix2, int op, OpAssertion assertion, CPUFeature feature) { + super(opcode, prefix1, prefix2, op, assertion, feature); + } + + protected AMD64RRROp(String opcode, int prefix1, int prefix2, int op, boolean dstIsByte, boolean srcIsByte, OpAssertion assertion, CPUFeature feature) { + super(opcode, prefix1, prefix2, op, dstIsByte, srcIsByte, assertion, feature); + } + + public abstract void emit(AMD64Assembler asm, OperandSize size, Register dst, Register nds, Register src); + } + + /** + * Opcode with operand order of RM. + */ + public static class AMD64RMOp extends AMD64RROp { + // @formatter:off + public static final AMD64RMOp IMUL = new AMD64RMOp("IMUL", P_0F, 0xAF); + public static final AMD64RMOp BSF = new AMD64RMOp("BSF", P_0F, 0xBC); + public static final AMD64RMOp BSR = new AMD64RMOp("BSR", P_0F, 0xBD); + public static final AMD64RMOp POPCNT = new AMD64RMOp("POPCNT", 0xF3, P_0F, 0xB8, CPUFeature.POPCNT); + public static final AMD64RMOp TZCNT = new AMD64RMOp("TZCNT", 0xF3, P_0F, 0xBC, CPUFeature.BMI1); + public static final AMD64RMOp LZCNT = new AMD64RMOp("LZCNT", 0xF3, P_0F, 0xBD, CPUFeature.LZCNT); + public static final AMD64RMOp MOVZXB = new AMD64RMOp("MOVZXB", P_0F, 0xB6, false, true, OpAssertion.IntegerAssertion); + public static final AMD64RMOp MOVZX = new AMD64RMOp("MOVZX", P_0F, 0xB7, OpAssertion.No16BitAssertion); + public static final AMD64RMOp MOVSXB = new AMD64RMOp("MOVSXB", P_0F, 0xBE, false, true, OpAssertion.IntegerAssertion); + public static final AMD64RMOp MOVSX = new AMD64RMOp("MOVSX", P_0F, 0xBF, OpAssertion.No16BitAssertion); + public static final AMD64RMOp MOVSXD = new AMD64RMOp("MOVSXD", 0x63, OpAssertion.QwordOnlyAssertion); + public static final AMD64RMOp MOVB = new AMD64RMOp("MOVB", 0x8A, OpAssertion.ByteAssertion); + public static final AMD64RMOp MOV = new AMD64RMOp("MOV", 0x8B); + + // MOVD/MOVQ and MOVSS/MOVSD are the same opcode, just with different operand size prefix + public static final AMD64RMOp MOVD = new AMD64RMOp("MOVD", 0x66, P_0F, 0x6E, OpAssertion.IntToFloatingAssertion, CPUFeature.SSE2); + public static final AMD64RMOp MOVQ = new AMD64RMOp("MOVQ", 0x66, P_0F, 0x6E, OpAssertion.IntToFloatingAssertion, CPUFeature.SSE2); + public static final AMD64RMOp MOVSS = new AMD64RMOp("MOVSS", P_0F, 0x10, OpAssertion.FloatingAssertion, CPUFeature.SSE); + public static final AMD64RMOp MOVSD = new AMD64RMOp("MOVSD", P_0F, 0x10, OpAssertion.FloatingAssertion, CPUFeature.SSE); + + // TEST is documented as MR operation, but it's symmetric, and using it as RM operation is more convenient. 
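+ // Swapping the operands does not change the semantics: TEST only ands the two operands to set flags and writes no result.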
+ public static final AMD64RMOp TESTB = new AMD64RMOp("TEST", 0x84, OpAssertion.ByteAssertion); + public static final AMD64RMOp TEST = new AMD64RMOp("TEST", 0x85); + // @formatter:on + + protected AMD64RMOp(String opcode, int op) { + this(opcode, 0, op); + } + + protected AMD64RMOp(String opcode, int op, OpAssertion assertion) { + this(opcode, 0, op, assertion); + } + + protected AMD64RMOp(String opcode, int prefix, int op) { + this(opcode, 0, prefix, op, null); + } + + protected AMD64RMOp(String opcode, int prefix, int op, OpAssertion assertion) { + this(opcode, 0, prefix, op, assertion, null); + } + + protected AMD64RMOp(String opcode, int prefix, int op, OpAssertion assertion, CPUFeature feature) { + this(opcode, 0, prefix, op, assertion, feature); + } + + protected AMD64RMOp(String opcode, int prefix, int op, boolean dstIsByte, boolean srcIsByte, OpAssertion assertion) { + super(opcode, 0, prefix, op, dstIsByte, srcIsByte, assertion, null); + } + + protected AMD64RMOp(String opcode, int prefix1, int prefix2, int op, CPUFeature feature) { + this(opcode, prefix1, prefix2, op, OpAssertion.IntegerAssertion, feature); + } + + protected AMD64RMOp(String opcode, int prefix1, int prefix2, int op, OpAssertion assertion, CPUFeature feature) { + super(opcode, prefix1, prefix2, op, assertion, feature); + } + + @Override + public final void emit(AMD64Assembler asm, OperandSize size, Register dst, Register src) { + assert verify(asm, size, dst, src); + boolean isSimd = false; + boolean noNds = false; + + switch (op) { + case 0x2A: + case 0x2C: + case 0x2E: + case 0x5A: + case 0x6E: + isSimd = true; + noNds = true; + break; + case 0x10: + case 0x51: + case 0x54: + case 0x55: + case 0x56: + case 0x57: + case 0x58: + case 0x59: + case 0x5C: + case 0x5D: + case 0x5E: + case 0x5F: + isSimd = true; + break; + } + + if (isSimd) { + int pre; + int opc; + boolean rexVexW = (size == QWORD) ? 
true : false; + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, rexVexW, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, asm.target); + int curPrefix = size.sizePrefix | prefix1; + switch (curPrefix) { + case 0x66: + pre = VexSimdPrefix.VEX_SIMD_66; + break; + case 0xF2: + pre = VexSimdPrefix.VEX_SIMD_F2; + break; + case 0xF3: + pre = VexSimdPrefix.VEX_SIMD_F3; + break; + default: + pre = VexSimdPrefix.VEX_SIMD_NONE; + break; + } + switch (prefix2) { + case P_0F: + opc = VexOpcode.VEX_OPCODE_0F; + break; + case P_0F38: + opc = VexOpcode.VEX_OPCODE_0F_38; + break; + case P_0F3A: + opc = VexOpcode.VEX_OPCODE_0F_3A; + break; + default: + opc = VexOpcode.VEX_OPCODE_NONE; + break; + } + int encode; + if (noNds) { + encode = asm.simdPrefixAndEncode(dst, Register.None, src, pre, opc, attributes); + } else { + encode = asm.simdPrefixAndEncode(dst, dst, src, pre, opc, attributes); + } + asm.emitByte(op); + asm.emitByte(0xC0 | encode); + } else { + emitOpcode(asm, size, getRXB(dst, src), dst.encoding, src.encoding); + asm.emitModRM(dst, src); + } + } + + public final void emit(AMD64Assembler asm, OperandSize size, Register dst, AMD64Address src) { + assert verify(asm, size, dst, null); + boolean isSimd = false; + boolean noNds = false; + + switch (op) { + case 0x10: + case 0x2A: + case 0x2C: + case 0x2E: + case 0x6E: + isSimd = true; + noNds = true; + break; + case 0x51: + case 0x54: + case 0x55: + case 0x56: + case 0x57: + case 0x58: + case 0x59: + case 0x5C: + case 0x5D: + case 0x5E: + case 0x5F: + isSimd = true; + break; + } + + if (isSimd) { + int pre; + int opc; + boolean rexVexW = (size == QWORD) ? true : false; + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, rexVexW, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, asm.target); + int curPrefix = size.sizePrefix | prefix1; + switch (curPrefix) { + case 0x66: + pre = VexSimdPrefix.VEX_SIMD_66; + break; + case 0xF2: + pre = VexSimdPrefix.VEX_SIMD_F2; + break; + case 0xF3: + pre = VexSimdPrefix.VEX_SIMD_F3; + break; + default: + pre = VexSimdPrefix.VEX_SIMD_NONE; + break; + } + switch (prefix2) { + case P_0F: + opc = VexOpcode.VEX_OPCODE_0F; + break; + case P_0F38: + opc = VexOpcode.VEX_OPCODE_0F_38; + break; + case P_0F3A: + opc = VexOpcode.VEX_OPCODE_0F_3A; + break; + default: + opc = VexOpcode.VEX_OPCODE_NONE; + break; + } + if (noNds) { + asm.simdPrefix(dst, Register.None, src, pre, opc, attributes); + } else { + asm.simdPrefix(dst, dst, src, pre, opc, attributes); + } + asm.emitByte(op); + asm.emitOperandHelper(dst, src, 0); + } else { + emitOpcode(asm, size, getRXB(dst, src), dst.encoding, 0); + asm.emitOperandHelper(dst, src, 0); + } + } + } + + /** + * Opcode with operand order of RM. 
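+ * The second source operand ({@code nds}) is carried in the VEX prefix (VEX.vvvv), so these opcodes always use VEX encoding.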
+ */ + public static class AMD64RRMOp extends AMD64RRROp { + protected AMD64RRMOp(String opcode, int op) { + this(opcode, 0, op); + } + + protected AMD64RRMOp(String opcode, int op, OpAssertion assertion) { + this(opcode, 0, op, assertion); + } + + protected AMD64RRMOp(String opcode, int prefix, int op) { + this(opcode, 0, prefix, op, null); + } + + protected AMD64RRMOp(String opcode, int prefix, int op, OpAssertion assertion) { + this(opcode, 0, prefix, op, assertion, null); + } + + protected AMD64RRMOp(String opcode, int prefix, int op, OpAssertion assertion, CPUFeature feature) { + this(opcode, 0, prefix, op, assertion, feature); + } + + protected AMD64RRMOp(String opcode, int prefix, int op, boolean dstIsByte, boolean srcIsByte, OpAssertion assertion) { + super(opcode, 0, prefix, op, dstIsByte, srcIsByte, assertion, null); + } + + protected AMD64RRMOp(String opcode, int prefix1, int prefix2, int op, CPUFeature feature) { + this(opcode, prefix1, prefix2, op, OpAssertion.IntegerAssertion, feature); + } + + protected AMD64RRMOp(String opcode, int prefix1, int prefix2, int op, OpAssertion assertion, CPUFeature feature) { + super(opcode, prefix1, prefix2, op, assertion, feature); + } + + @Override + public final void emit(AMD64Assembler asm, OperandSize size, Register dst, Register nds, Register src) { + assert verify(asm, size, dst, src); + int pre; + int opc; + boolean rexVexW = (size == QWORD) ? true : false; + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, rexVexW, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, asm.target); + int curPrefix = size.sizePrefix | prefix1; + switch (curPrefix) { + case 0x66: + pre = VexSimdPrefix.VEX_SIMD_66; + break; + case 0xF2: + pre = VexSimdPrefix.VEX_SIMD_F2; + break; + case 0xF3: + pre = VexSimdPrefix.VEX_SIMD_F3; + break; + default: + pre = VexSimdPrefix.VEX_SIMD_NONE; + break; + } + switch (prefix2) { + case P_0F: + opc = VexOpcode.VEX_OPCODE_0F; + break; + case P_0F38: + opc = VexOpcode.VEX_OPCODE_0F_38; + break; + case P_0F3A: + opc = VexOpcode.VEX_OPCODE_0F_3A; + break; + default: + opc = VexOpcode.VEX_OPCODE_NONE; + break; + } + int encode; + encode = asm.simdPrefixAndEncode(dst, nds, src, pre, opc, attributes); + asm.emitByte(op); + asm.emitByte(0xC0 | encode); + } + + public final void emit(AMD64Assembler asm, OperandSize size, Register dst, Register nds, AMD64Address src) { + assert verify(asm, size, dst, null); + int pre; + int opc; + boolean rexVexW = (size == QWORD) ? true : false; + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, rexVexW, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, asm.target); + int curPrefix = size.sizePrefix | prefix1; + switch (curPrefix) { + case 0x66: + pre = VexSimdPrefix.VEX_SIMD_66; + break; + case 0xF2: + pre = VexSimdPrefix.VEX_SIMD_F2; + break; + case 0xF3: + pre = VexSimdPrefix.VEX_SIMD_F3; + break; + default: + pre = VexSimdPrefix.VEX_SIMD_NONE; + break; + } + switch (prefix2) { + case P_0F: + opc = VexOpcode.VEX_OPCODE_0F; + break; + case P_0F38: + opc = VexOpcode.VEX_OPCODE_0F_38; + break; + case P_0F3A: + opc = VexOpcode.VEX_OPCODE_0F_3A; + break; + default: + opc = VexOpcode.VEX_OPCODE_NONE; + break; + } + asm.simdPrefix(dst, nds, src, pre, opc, attributes); + asm.emitByte(op); + asm.emitOperandHelper(dst, src, 0); + } + } + + /** + * Opcode with operand order of MR. 
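+ * The source register is encoded in ModRM.reg; the destination (register or memory operand) is encoded in ModRM.rm.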
+ */ + public static class AMD64MROp extends AMD64RROp { + // @formatter:off + public static final AMD64MROp MOVB = new AMD64MROp("MOVB", 0x88, OpAssertion.ByteAssertion); + public static final AMD64MROp MOV = new AMD64MROp("MOV", 0x89); + + // MOVD and MOVQ are the same opcode, just with different operand size prefix + // Note that as MR opcodes, they have reverse operand order, so the IntToFloatingAssertion must be used. + public static final AMD64MROp MOVD = new AMD64MROp("MOVD", 0x66, P_0F, 0x7E, OpAssertion.IntToFloatingAssertion, CPUFeature.SSE2); + public static final AMD64MROp MOVQ = new AMD64MROp("MOVQ", 0x66, P_0F, 0x7E, OpAssertion.IntToFloatingAssertion, CPUFeature.SSE2); + + // MOVSS and MOVSD are the same opcode, just with different operand size prefix + public static final AMD64MROp MOVSS = new AMD64MROp("MOVSS", P_0F, 0x11, OpAssertion.FloatingAssertion, CPUFeature.SSE); + public static final AMD64MROp MOVSD = new AMD64MROp("MOVSD", P_0F, 0x11, OpAssertion.FloatingAssertion, CPUFeature.SSE); + // @formatter:on + + protected AMD64MROp(String opcode, int op) { + this(opcode, 0, op); + } + + protected AMD64MROp(String opcode, int op, OpAssertion assertion) { + this(opcode, 0, op, assertion); + } + + protected AMD64MROp(String opcode, int prefix, int op) { + this(opcode, prefix, op, OpAssertion.IntegerAssertion); + } + + protected AMD64MROp(String opcode, int prefix, int op, OpAssertion assertion) { + this(opcode, prefix, op, assertion, null); + } + + protected AMD64MROp(String opcode, int prefix, int op, OpAssertion assertion, CPUFeature feature) { + this(opcode, 0, prefix, op, assertion, feature); + } + + protected AMD64MROp(String opcode, int prefix1, int prefix2, int op, OpAssertion assertion, CPUFeature feature) { + super(opcode, prefix1, prefix2, op, assertion, feature); + } + + @Override + public final void emit(AMD64Assembler asm, OperandSize size, Register dst, Register src) { + assert verify(asm, size, src, dst); + boolean isSimd = false; + boolean noNds = false; + + switch (op) { + case 0x7E: + isSimd = true; + noNds = true; + break; + case 0x11: + isSimd = true; + break; + } + + if (isSimd) { + int pre; + int opc; + boolean rexVexW = (size == QWORD) ? 
true : false; + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, rexVexW, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, asm.target); + int curPrefix = size.sizePrefix | prefix1; + switch (curPrefix) { + case 0x66: + pre = VexSimdPrefix.VEX_SIMD_66; + break; + case 0xF2: + pre = VexSimdPrefix.VEX_SIMD_F2; + break; + case 0xF3: + pre = VexSimdPrefix.VEX_SIMD_F3; + break; + default: + pre = VexSimdPrefix.VEX_SIMD_NONE; + break; + } + switch (prefix2) { + case P_0F: + opc = VexOpcode.VEX_OPCODE_0F; + break; + case P_0F38: + opc = VexOpcode.VEX_OPCODE_0F_38; + break; + case P_0F3A: + opc = VexOpcode.VEX_OPCODE_0F_3A; + break; + default: + opc = VexOpcode.VEX_OPCODE_NONE; + break; + } + int encode; + if (noNds) { + encode = asm.simdPrefixAndEncode(src, Register.None, dst, pre, opc, attributes); + } else { + encode = asm.simdPrefixAndEncode(src, src, dst, pre, opc, attributes); + } + asm.emitByte(op); + asm.emitByte(0xC0 | encode); + } else { + emitOpcode(asm, size, getRXB(src, dst), src.encoding, dst.encoding); + asm.emitModRM(src, dst); + } + } + + public final void emit(AMD64Assembler asm, OperandSize size, AMD64Address dst, Register src) { + assert verify(asm, size, null, src); + boolean isSimd = false; + + switch (op) { + case 0x7E: + case 0x11: + isSimd = true; + break; + } + + if (isSimd) { + int pre; + int opc; + boolean rexVexW = (size == QWORD) ? true : false; + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, rexVexW, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, asm.target); + int curPrefix = size.sizePrefix | prefix1; + switch (curPrefix) { + case 0x66: + pre = VexSimdPrefix.VEX_SIMD_66; + break; + case 0xF2: + pre = VexSimdPrefix.VEX_SIMD_F2; + break; + case 0xF3: + pre = VexSimdPrefix.VEX_SIMD_F3; + break; + default: + pre = VexSimdPrefix.VEX_SIMD_NONE; + break; + } + switch (prefix2) { + case P_0F: + opc = VexOpcode.VEX_OPCODE_0F; + break; + case P_0F38: + opc = VexOpcode.VEX_OPCODE_0F_38; + break; + case P_0F3A: + opc = VexOpcode.VEX_OPCODE_0F_3A; + break; + default: + opc = VexOpcode.VEX_OPCODE_NONE; + break; + } + asm.simdPrefix(src, Register.None, dst, pre, opc, attributes); + asm.emitByte(op); + asm.emitOperandHelper(src, dst, 0); + } else { + emitOpcode(asm, size, getRXB(src, dst), src.encoding, 0); + asm.emitOperandHelper(src, dst, 0); + } + } + } + + /** + * Opcodes with operand order of M. 
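+ * The single operand is encoded in ModRM.rm; the {@code ext} opcode extension is emitted in the ModRM.reg field.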
+ */ + public static class AMD64MOp extends AMD64Op { + // @formatter:off + public static final AMD64MOp NOT = new AMD64MOp("NOT", 0xF7, 2); + public static final AMD64MOp NEG = new AMD64MOp("NEG", 0xF7, 3); + public static final AMD64MOp MUL = new AMD64MOp("MUL", 0xF7, 4); + public static final AMD64MOp IMUL = new AMD64MOp("IMUL", 0xF7, 5); + public static final AMD64MOp DIV = new AMD64MOp("DIV", 0xF7, 6); + public static final AMD64MOp IDIV = new AMD64MOp("IDIV", 0xF7, 7); + public static final AMD64MOp INC = new AMD64MOp("INC", 0xFF, 0); + public static final AMD64MOp DEC = new AMD64MOp("DEC", 0xFF, 1); + public static final AMD64MOp PUSH = new AMD64MOp("PUSH", 0xFF, 6); + public static final AMD64MOp POP = new AMD64MOp("POP", 0x8F, 0, OpAssertion.No32BitAssertion); + // @formatter:on + + private final int ext; + + protected AMD64MOp(String opcode, int op, int ext) { + this(opcode, 0, op, ext); + } + + protected AMD64MOp(String opcode, int prefix, int op, int ext) { + this(opcode, prefix, op, ext, OpAssertion.IntegerAssertion); + } + + protected AMD64MOp(String opcode, int op, int ext, OpAssertion assertion) { + this(opcode, 0, op, ext, assertion); + } + + protected AMD64MOp(String opcode, int prefix, int op, int ext, OpAssertion assertion) { + super(opcode, 0, prefix, op, assertion, null); + this.ext = ext; + } + + public final void emit(AMD64Assembler asm, OperandSize size, Register dst) { + assert verify(asm, size, dst, null); + emitOpcode(asm, size, getRXB(null, dst), 0, dst.encoding); + asm.emitModRM(ext, dst); + } + + public final void emit(AMD64Assembler asm, OperandSize size, AMD64Address dst) { + assert verify(asm, size, null, null); + emitOpcode(asm, size, getRXB(null, dst), 0, 0); + asm.emitOperandHelper(ext, dst, 0); + } + } + + /** + * Opcodes with operand order of MI. + */ + public static class AMD64MIOp extends AMD64ImmOp { + // @formatter:off + public static final AMD64MIOp MOVB = new AMD64MIOp("MOVB", true, 0xC6, 0, OpAssertion.ByteAssertion); + public static final AMD64MIOp MOV = new AMD64MIOp("MOV", false, 0xC7, 0); + public static final AMD64MIOp TEST = new AMD64MIOp("TEST", false, 0xF7, 0); + // @formatter:on + + private final int ext; + + protected AMD64MIOp(String opcode, boolean immIsByte, int op, int ext) { + this(opcode, immIsByte, op, ext, OpAssertion.IntegerAssertion); + } + + protected AMD64MIOp(String opcode, boolean immIsByte, int op, int ext, OpAssertion assertion) { + this(opcode, immIsByte, 0, op, ext, assertion); + } + + protected AMD64MIOp(String opcode, boolean immIsByte, int prefix, int op, int ext, OpAssertion assertion) { + super(opcode, immIsByte, prefix, op, assertion); + this.ext = ext; + } + + public final void emit(AMD64Assembler asm, OperandSize size, Register dst, int imm) { + assert verify(asm, size, dst, null); + emitOpcode(asm, size, getRXB(null, dst), 0, dst.encoding); + asm.emitModRM(ext, dst); + emitImmediate(asm, size, imm); + } + + public final void emit(AMD64Assembler asm, OperandSize size, AMD64Address dst, int imm) { + assert verify(asm, size, null, null); + emitOpcode(asm, size, getRXB(null, dst), 0, 0); + asm.emitOperandHelper(ext, dst, immediateSize(size)); + emitImmediate(asm, size, imm); + } + } + + /** + * Opcodes with operand order of RMI. + * + * We only have one form of round as the operation is always treated with single variant input, + * making its extension to 3 address forms redundant. 
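+ * For ROUNDSS/ROUNDSD the immediate selects the rounding mode.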
+ */ + public static class AMD64RMIOp extends AMD64ImmOp { + // @formatter:off + public static final AMD64RMIOp IMUL = new AMD64RMIOp("IMUL", false, 0x69); + public static final AMD64RMIOp IMUL_SX = new AMD64RMIOp("IMUL", true, 0x6B); + public static final AMD64RMIOp ROUNDSS = new AMD64RMIOp("ROUNDSS", true, P_0F3A, 0x0A, OpAssertion.PackedDoubleAssertion); + public static final AMD64RMIOp ROUNDSD = new AMD64RMIOp("ROUNDSD", true, P_0F3A, 0x0B, OpAssertion.PackedDoubleAssertion); + // @formatter:on + + protected AMD64RMIOp(String opcode, boolean immIsByte, int op) { + this(opcode, immIsByte, 0, op, OpAssertion.IntegerAssertion); + } + + protected AMD64RMIOp(String opcode, boolean immIsByte, int prefix, int op, OpAssertion assertion) { + super(opcode, immIsByte, prefix, op, assertion); + } + + public final void emit(AMD64Assembler asm, OperandSize size, Register dst, Register src, int imm) { + assert verify(asm, size, dst, src); + boolean isSimd = false; + boolean noNds = false; + + switch (op) { + case 0x0A: + case 0x0B: + isSimd = true; + noNds = true; + break; + } + + if (isSimd) { + int pre; + int opc; + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, /* rexVexW */ false, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, asm.target); + int curPrefix = size.sizePrefix | prefix1; + switch (curPrefix) { + case 0x66: + pre = VexSimdPrefix.VEX_SIMD_66; + break; + case 0xF2: + pre = VexSimdPrefix.VEX_SIMD_F2; + break; + case 0xF3: + pre = VexSimdPrefix.VEX_SIMD_F3; + break; + default: + pre = VexSimdPrefix.VEX_SIMD_NONE; + break; + } + switch (prefix2) { + case P_0F: + opc = VexOpcode.VEX_OPCODE_0F; + break; + case P_0F38: + opc = VexOpcode.VEX_OPCODE_0F_38; + break; + case P_0F3A: + opc = VexOpcode.VEX_OPCODE_0F_3A; + break; + default: + opc = VexOpcode.VEX_OPCODE_NONE; + break; + } + int encode; + if (noNds) { + encode = asm.simdPrefixAndEncode(dst, Register.None, src, pre, opc, attributes); + } else { + encode = asm.simdPrefixAndEncode(dst, dst, src, pre, opc, attributes); + } + asm.emitByte(op); + asm.emitByte(0xC0 | encode); + emitImmediate(asm, size, imm); + } else { + emitOpcode(asm, size, getRXB(dst, src), dst.encoding, src.encoding); + asm.emitModRM(dst, src); + emitImmediate(asm, size, imm); + } + } + + public final void emit(AMD64Assembler asm, OperandSize size, Register dst, AMD64Address src, int imm) { + assert verify(asm, size, dst, null); + + boolean isSimd = false; + boolean noNds = false; + + switch (op) { + case 0x0A: + case 0x0B: + isSimd = true; + noNds = true; + break; + } + + if (isSimd) { + int pre; + int opc; + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, /* rexVexW */ false, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, asm.target); + int curPrefix = size.sizePrefix | prefix1; + switch (curPrefix) { + case 0x66: + pre = VexSimdPrefix.VEX_SIMD_66; + break; + case 0xF2: + pre = VexSimdPrefix.VEX_SIMD_F2; + break; + case 0xF3: + pre = VexSimdPrefix.VEX_SIMD_F3; + break; + default: + pre = VexSimdPrefix.VEX_SIMD_NONE; + break; + } + switch (prefix2) { + case P_0F: + opc = VexOpcode.VEX_OPCODE_0F; + break; + case P_0F38: + opc = VexOpcode.VEX_OPCODE_0F_38; + break; + case P_0F3A: + opc = VexOpcode.VEX_OPCODE_0F_3A; + break; + default: + opc = VexOpcode.VEX_OPCODE_NONE; + break; + } + + if (noNds) { + asm.simdPrefix(dst, Register.None, src, pre, opc, attributes); + } else { + asm.simdPrefix(dst, dst, src, pre, opc, attributes); + } + asm.emitByte(op); + 
asm.emitOperandHelper(dst, src, immediateSize(size)); + emitImmediate(asm, size, imm); + } else { + emitOpcode(asm, size, getRXB(dst, src), dst.encoding, 0); + asm.emitOperandHelper(dst, src, immediateSize(size)); + emitImmediate(asm, size, imm); + } + } + } + + public static class SSEOp extends AMD64RMOp { + // @formatter:off + public static final SSEOp CVTSI2SS = new SSEOp("CVTSI2SS", 0xF3, P_0F, 0x2A, OpAssertion.IntToFloatingAssertion); + public static final SSEOp CVTSI2SD = new SSEOp("CVTSI2SD", 0xF2, P_0F, 0x2A, OpAssertion.IntToFloatingAssertion); + public static final SSEOp CVTTSS2SI = new SSEOp("CVTTSS2SI", 0xF3, P_0F, 0x2C, OpAssertion.FloatingToIntAssertion); + public static final SSEOp CVTTSD2SI = new SSEOp("CVTTSD2SI", 0xF2, P_0F, 0x2C, OpAssertion.FloatingToIntAssertion); + public static final SSEOp UCOMIS = new SSEOp("UCOMIS", P_0F, 0x2E, OpAssertion.PackedFloatingAssertion); + public static final SSEOp SQRT = new SSEOp("SQRT", P_0F, 0x51); + public static final SSEOp AND = new SSEOp("AND", P_0F, 0x54, OpAssertion.PackedFloatingAssertion); + public static final SSEOp ANDN = new SSEOp("ANDN", P_0F, 0x55, OpAssertion.PackedFloatingAssertion); + public static final SSEOp OR = new SSEOp("OR", P_0F, 0x56, OpAssertion.PackedFloatingAssertion); + public static final SSEOp XOR = new SSEOp("XOR", P_0F, 0x57, OpAssertion.PackedFloatingAssertion); + public static final SSEOp ADD = new SSEOp("ADD", P_0F, 0x58); + public static final SSEOp MUL = new SSEOp("MUL", P_0F, 0x59); + public static final SSEOp CVTSS2SD = new SSEOp("CVTSS2SD", P_0F, 0x5A, OpAssertion.SingleAssertion); + public static final SSEOp CVTSD2SS = new SSEOp("CVTSD2SS", P_0F, 0x5A, OpAssertion.DoubleAssertion); + public static final SSEOp SUB = new SSEOp("SUB", P_0F, 0x5C); + public static final SSEOp MIN = new SSEOp("MIN", P_0F, 0x5D); + public static final SSEOp DIV = new SSEOp("DIV", P_0F, 0x5E); + public static final SSEOp MAX = new SSEOp("MAX", P_0F, 0x5F); + // @formatter:on + + protected SSEOp(String opcode, int prefix, int op) { + this(opcode, prefix, op, OpAssertion.FloatingAssertion); + } + + protected SSEOp(String opcode, int prefix, int op, OpAssertion assertion) { + this(opcode, 0, prefix, op, assertion); + } + + protected SSEOp(String opcode, int mandatoryPrefix, int prefix, int op, OpAssertion assertion) { + super(opcode, mandatoryPrefix, prefix, op, assertion, CPUFeature.SSE2); + } + } + + public static class AVXOp extends AMD64RRMOp { + // @formatter:off + public static final AVXOp AND = new AVXOp("AND", P_0F, 0x54, OpAssertion.PackedFloatingAssertion); + public static final AVXOp ANDN = new AVXOp("ANDN", P_0F, 0x55, OpAssertion.PackedFloatingAssertion); + public static final AVXOp OR = new AVXOp("OR", P_0F, 0x56, OpAssertion.PackedFloatingAssertion); + public static final AVXOp XOR = new AVXOp("XOR", P_0F, 0x57, OpAssertion.PackedFloatingAssertion); + public static final AVXOp ADD = new AVXOp("ADD", P_0F, 0x58); + public static final AVXOp MUL = new AVXOp("MUL", P_0F, 0x59); + public static final AVXOp SUB = new AVXOp("SUB", P_0F, 0x5C); + public static final AVXOp MIN = new AVXOp("MIN", P_0F, 0x5D); + public static final AVXOp DIV = new AVXOp("DIV", P_0F, 0x5E); + public static final AVXOp MAX = new AVXOp("MAX", P_0F, 0x5F); + // @formatter:on + + protected AVXOp(String opcode, int prefix, int op) { + this(opcode, prefix, op, OpAssertion.FloatingAssertion); + } + + protected AVXOp(String opcode, int prefix, int op, OpAssertion assertion) { + this(opcode, 0, prefix, op, assertion); + } + + protected 
AVXOp(String opcode, int mandatoryPrefix, int prefix, int op, OpAssertion assertion) { + super(opcode, mandatoryPrefix, prefix, op, assertion, CPUFeature.AVX); + } + } + + /** + * Arithmetic operation with operand order of RM, MR or MI. + */ + public static final class AMD64BinaryArithmetic { + // @formatter:off + public static final AMD64BinaryArithmetic ADD = new AMD64BinaryArithmetic("ADD", 0); + public static final AMD64BinaryArithmetic OR = new AMD64BinaryArithmetic("OR", 1); + public static final AMD64BinaryArithmetic ADC = new AMD64BinaryArithmetic("ADC", 2); + public static final AMD64BinaryArithmetic SBB = new AMD64BinaryArithmetic("SBB", 3); + public static final AMD64BinaryArithmetic AND = new AMD64BinaryArithmetic("AND", 4); + public static final AMD64BinaryArithmetic SUB = new AMD64BinaryArithmetic("SUB", 5); + public static final AMD64BinaryArithmetic XOR = new AMD64BinaryArithmetic("XOR", 6); + public static final AMD64BinaryArithmetic CMP = new AMD64BinaryArithmetic("CMP", 7); + // @formatter:on + + private final AMD64MIOp byteImmOp; + private final AMD64MROp byteMrOp; + private final AMD64RMOp byteRmOp; + + private final AMD64MIOp immOp; + private final AMD64MIOp immSxOp; + private final AMD64MROp mrOp; + private final AMD64RMOp rmOp; + + private AMD64BinaryArithmetic(String opcode, int code) { + int baseOp = code << 3; + + byteImmOp = new AMD64MIOp(opcode, true, 0, 0x80, code, OpAssertion.ByteAssertion); + byteMrOp = new AMD64MROp(opcode, 0, baseOp, OpAssertion.ByteAssertion); + byteRmOp = new AMD64RMOp(opcode, 0, baseOp | 0x02, OpAssertion.ByteAssertion); + + immOp = new AMD64MIOp(opcode, false, 0, 0x81, code, OpAssertion.IntegerAssertion); + immSxOp = new AMD64MIOp(opcode, true, 0, 0x83, code, OpAssertion.IntegerAssertion); + mrOp = new AMD64MROp(opcode, 0, baseOp | 0x01, OpAssertion.IntegerAssertion); + rmOp = new AMD64RMOp(opcode, 0, baseOp | 0x03, OpAssertion.IntegerAssertion); + } + + public AMD64MIOp getMIOpcode(OperandSize size, boolean sx) { + if (size == BYTE) { + return byteImmOp; + } else if (sx) { + return immSxOp; + } else { + return immOp; + } + } + + public AMD64MROp getMROpcode(OperandSize size) { + if (size == BYTE) { + return byteMrOp; + } else { + return mrOp; + } + } + + public AMD64RMOp getRMOpcode(OperandSize size) { + if (size == BYTE) { + return byteRmOp; + } else { + return rmOp; + } + } + } + + /** + * Shift operation with operand order of M1, MC or MI. 
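+ * M1 shifts by one (0xD1), MC shifts by the count in CL (0xD3) and MI shifts by an immediate (0xC1).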
+ */ + public static final class AMD64Shift { + // @formatter:off + public static final AMD64Shift ROL = new AMD64Shift("ROL", 0); + public static final AMD64Shift ROR = new AMD64Shift("ROR", 1); + public static final AMD64Shift RCL = new AMD64Shift("RCL", 2); + public static final AMD64Shift RCR = new AMD64Shift("RCR", 3); + public static final AMD64Shift SHL = new AMD64Shift("SHL", 4); + public static final AMD64Shift SHR = new AMD64Shift("SHR", 5); + public static final AMD64Shift SAR = new AMD64Shift("SAR", 7); + // @formatter:on + + public final AMD64MOp m1Op; + public final AMD64MOp mcOp; + public final AMD64MIOp miOp; + + private AMD64Shift(String opcode, int code) { + m1Op = new AMD64MOp(opcode, 0, 0xD1, code, OpAssertion.IntegerAssertion); + mcOp = new AMD64MOp(opcode, 0, 0xD3, code, OpAssertion.IntegerAssertion); + miOp = new AMD64MIOp(opcode, true, 0, 0xC1, code, OpAssertion.IntegerAssertion); + } + } + + public final void addl(AMD64Address dst, int imm32) { + ADD.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32); + } + + public final void addl(Register dst, int imm32) { + ADD.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32); + } + + public final void addl(Register dst, Register src) { + ADD.rmOp.emit(this, DWORD, dst, src); + } + + public final void addpd(Register dst, Register src) { + assert dst.getRegisterCategory().equals(AMD64.XMM) && src.getRegisterCategory().equals(AMD64.XMM); + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, /* rexVexW */ false, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, target); + int encode = simdPrefixAndEncode(dst, dst, src, VexSimdPrefix.VEX_SIMD_66, VexOpcode.VEX_OPCODE_0F, attributes); + emitByte(0x58); + emitByte(0xC0 | encode); + } + + public final void addpd(Register dst, AMD64Address src) { + assert dst.getRegisterCategory().equals(AMD64.XMM); + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, /* rexVexW */ false, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, target); + simdPrefix(dst, dst, src, VexSimdPrefix.VEX_SIMD_66, VexOpcode.VEX_OPCODE_0F, attributes); + emitByte(0x58); + emitOperandHelper(dst, src, 0); + } + + public final void addsd(Register dst, Register src) { + assert dst.getRegisterCategory().equals(AMD64.XMM) && src.getRegisterCategory().equals(AMD64.XMM); + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, /* rexVexW */ false, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, target); + int encode = simdPrefixAndEncode(dst, dst, src, VexSimdPrefix.VEX_SIMD_F2, VexOpcode.VEX_OPCODE_0F, attributes); + emitByte(0x58); + emitByte(0xC0 | encode); + } + + public final void addsd(Register dst, AMD64Address src) { + assert dst.getRegisterCategory().equals(AMD64.XMM); + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, /* rexVexW */ false, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, target); + simdPrefix(dst, dst, src, VexSimdPrefix.VEX_SIMD_F2, VexOpcode.VEX_OPCODE_0F, attributes); + emitByte(0x58); + emitOperandHelper(dst, src, 0); + } + + private void addrNop4() { + // 4 bytes: NOP DWORD PTR [EAX+0] + emitByte(0x0F); + emitByte(0x1F); + emitByte(0x40); // emitRm(cbuf, 0x1, EAXEnc, EAXEnc); + emitByte(0); // 8-bits offset (1 byte) + } + + private void addrNop5() { + // 5 bytes: NOP DWORD PTR [EAX+EAX*0+0] 8-bits offset + emitByte(0x0F); + emitByte(0x1F); + emitByte(0x44); // 
emitRm(cbuf, 0x1, EAXEnc, 0x4); + emitByte(0x00); // emitRm(cbuf, 0x0, EAXEnc, EAXEnc); + emitByte(0); // 8-bits offset (1 byte) + } + + private void addrNop7() { + // 7 bytes: NOP DWORD PTR [EAX+0] 32-bits offset + emitByte(0x0F); + emitByte(0x1F); + emitByte(0x80); // emitRm(cbuf, 0x2, EAXEnc, EAXEnc); + emitInt(0); // 32-bits offset (4 bytes) + } + + private void addrNop8() { + // 8 bytes: NOP DWORD PTR [EAX+EAX*0+0] 32-bits offset + emitByte(0x0F); + emitByte(0x1F); + emitByte(0x84); // emitRm(cbuf, 0x2, EAXEnc, 0x4); + emitByte(0x00); // emitRm(cbuf, 0x0, EAXEnc, EAXEnc); + emitInt(0); // 32-bits offset (4 bytes) + } + + public final void andl(Register dst, int imm32) { + AND.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32); + } + + public final void andl(Register dst, Register src) { + AND.rmOp.emit(this, DWORD, dst, src); + } + + public final void andpd(Register dst, Register src) { + assert dst.getRegisterCategory().equals(AMD64.XMM) && src.getRegisterCategory().equals(AMD64.XMM); + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, /* rexVexW */ false, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, target); + int encode = simdPrefixAndEncode(dst, dst, src, VexSimdPrefix.VEX_SIMD_66, VexOpcode.VEX_OPCODE_0F, attributes); + emitByte(0x54); + emitByte(0xC0 | encode); + } + + public final void andpd(Register dst, AMD64Address src) { + assert dst.getRegisterCategory().equals(AMD64.XMM); + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, /* rexVexW */ false, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, target); + simdPrefix(dst, dst, src, VexSimdPrefix.VEX_SIMD_66, VexOpcode.VEX_OPCODE_0F, attributes); + emitByte(0x54); + emitOperandHelper(dst, src, 0); + } + + public final void bsrl(Register dst, Register src) { + int encode = prefixAndEncode(dst.encoding(), src.encoding()); + emitByte(0x0F); + emitByte(0xBD); + emitByte(0xC0 | encode); + } + + public final void bswapl(Register reg) { + int encode = prefixAndEncode(reg.encoding); + emitByte(0x0F); + emitByte(0xC8 | encode); + } + + public final void cdql() { + emitByte(0x99); + } + + public final void cmovl(ConditionFlag cc, Register dst, Register src) { + int encode = prefixAndEncode(dst.encoding, src.encoding); + emitByte(0x0F); + emitByte(0x40 | cc.getValue()); + emitByte(0xC0 | encode); + } + + public final void cmovl(ConditionFlag cc, Register dst, AMD64Address src) { + prefix(src, dst); + emitByte(0x0F); + emitByte(0x40 | cc.getValue()); + emitOperandHelper(dst, src, 0); + } + + public final void cmpl(Register dst, int imm32) { + CMP.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32); + } + + public final void cmpl(Register dst, Register src) { + CMP.rmOp.emit(this, DWORD, dst, src); + } + + public final void cmpl(Register dst, AMD64Address src) { + CMP.rmOp.emit(this, DWORD, dst, src); + } + + public final void cmpl(AMD64Address dst, int imm32) { + CMP.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32); + } + + // The 32-bit cmpxchg compares the value at adr with the contents of X86.rax, + // and stores reg into adr if so; otherwise, the value at adr is loaded into X86.rax,. + // The ZF is set if the compared values were equal, and cleared otherwise. 
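+ // To make the compare-and-swap atomic, callers normally emit the lock() prefix immediately before cmpxchgl.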
+ public final void cmpxchgl(Register reg, AMD64Address adr) { // cmpxchg + prefix(adr, reg); + emitByte(0x0F); + emitByte(0xB1); + emitOperandHelper(reg, adr, 0); + } + + public final void cvtsi2sdl(Register dst, Register src) { + assert dst.getRegisterCategory().equals(AMD64.XMM) && src.getRegisterCategory().equals(AMD64.CPU); + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, /* rexVexW */ false, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, target); + int encode = simdPrefixAndEncode(dst, dst, src, VexSimdPrefix.VEX_SIMD_F2, VexOpcode.VEX_OPCODE_0F, attributes); + emitByte(0x2A); + emitByte(0xC0 | encode); + } + + public final void cvttsd2sil(Register dst, Register src) { + assert dst.getRegisterCategory().equals(AMD64.CPU) && src.getRegisterCategory().equals(AMD64.XMM); + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, /* rexVexW */ false, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, target); + int encode = simdPrefixAndEncode(dst, Register.None, src, VexSimdPrefix.VEX_SIMD_F2, VexOpcode.VEX_OPCODE_0F, attributes); + emitByte(0x2C); + emitByte(0xC0 | encode); + } + + protected final void decl(AMD64Address dst) { + prefix(dst); + emitByte(0xFF); + emitOperandHelper(1, dst, 0); + } + + public final void divsd(Register dst, Register src) { + assert dst.getRegisterCategory().equals(AMD64.XMM) && src.getRegisterCategory().equals(AMD64.XMM); + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, /* rexVexW */ false, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, target); + int encode = simdPrefixAndEncode(dst, dst, src, VexSimdPrefix.VEX_SIMD_F2, VexOpcode.VEX_OPCODE_0F, attributes); + emitByte(0x5E); + emitByte(0xC0 | encode); + } + + public final void hlt() { + emitByte(0xF4); + } + + public final void imull(Register dst, Register src, int value) { + if (isByte(value)) { + AMD64RMIOp.IMUL_SX.emit(this, DWORD, dst, src, value); + } else { + AMD64RMIOp.IMUL.emit(this, DWORD, dst, src, value); + } + } + + protected final void incl(AMD64Address dst) { + prefix(dst); + emitByte(0xFF); + emitOperandHelper(0, dst, 0); + } + + public void jcc(ConditionFlag cc, int jumpTarget, boolean forceDisp32) { + int shortSize = 2; + int longSize = 6; + long disp = jumpTarget - position(); + if (!forceDisp32 && isByte(disp - shortSize)) { + // 0111 tttn #8-bit disp + emitByte(0x70 | cc.getValue()); + emitByte((int) ((disp - shortSize) & 0xFF)); + } else { + // 0000 1111 1000 tttn #32-bit disp + assert isInt(disp - longSize) : "must be 32bit offset (call4)"; + emitByte(0x0F); + emitByte(0x80 | cc.getValue()); + emitInt((int) (disp - longSize)); + } + } + + public final void jcc(ConditionFlag cc, Label l) { + assert (0 <= cc.getValue()) && (cc.getValue() < 16) : "illegal cc"; + if (l.isBound()) { + jcc(cc, l.position(), false); + } else { + // Note: could eliminate cond. jumps to this jump if condition + // is the same however, seems to be rather unlikely case. 
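+ // For an unbound label, addPatchAt(position()) below records the patch site and emitInt(0) leaves a 32-bit placeholder displacement that patchJumpTarget(int, int) fills in once the label is bound.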
+ // Note: use jccb() if label to be bound is very close to get + // an 8-bit displacement + l.addPatchAt(position()); + emitByte(0x0F); + emitByte(0x80 | cc.getValue()); + emitInt(0); + } + + } + + public final void jccb(ConditionFlag cc, Label l) { + if (l.isBound()) { + int shortSize = 2; + int entry = l.position(); + assert isByte(entry - (position() + shortSize)) : "Displacement too large for a short jmp"; + long disp = entry - position(); + // 0111 tttn #8-bit disp + emitByte(0x70 | cc.getValue()); + emitByte((int) ((disp - shortSize) & 0xFF)); + } else { + l.addPatchAt(position()); + emitByte(0x70 | cc.getValue()); + emitByte(0); + } + } + + public final void jmp(int jumpTarget, boolean forceDisp32) { + int shortSize = 2; + int longSize = 5; + long disp = jumpTarget - position(); + if (!forceDisp32 && isByte(disp - shortSize)) { + emitByte(0xEB); + emitByte((int) ((disp - shortSize) & 0xFF)); + } else { + emitByte(0xE9); + emitInt((int) (disp - longSize)); + } + } + + @Override + public final void jmp(Label l) { + if (l.isBound()) { + jmp(l.position(), false); + } else { + // By default, forward jumps are always 32-bit displacements, since + // we can't yet know where the label will be bound. If you're sure that + // the forward jump will not run beyond 256 bytes, use jmpb to + // force an 8-bit displacement. + + l.addPatchAt(position()); + emitByte(0xE9); + emitInt(0); + } + } + + public final void jmp(Register entry) { + int encode = prefixAndEncode(entry.encoding); + emitByte(0xFF); + emitByte(0xE0 | encode); + } + + public final void jmp(AMD64Address adr) { + prefix(adr); + emitByte(0xFF); + emitOperandHelper(rsp, adr, 0); + } + + public final void jmpb(Label l) { + if (l.isBound()) { + int shortSize = 2; + int entry = l.position(); + assert isByte((entry - position()) + shortSize) : "Displacement too large for a short jmp"; + long offs = entry - position(); + emitByte(0xEB); + emitByte((int) ((offs - shortSize) & 0xFF)); + } else { + + l.addPatchAt(position()); + emitByte(0xEB); + emitByte(0); + } + } + + public final void leaq(Register dst, AMD64Address src) { + prefixq(src, dst); + emitByte(0x8D); + emitOperandHelper(dst, src, 0); + } + + public final void leave() { + emitByte(0xC9); + } + + public final void lock() { + emitByte(0xF0); + } + + public final void movapd(Register dst, Register src) { + assert dst.getRegisterCategory().equals(AMD64.XMM) && src.getRegisterCategory().equals(AMD64.XMM); + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, /* rexVexW */ false, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, target); + int encode = simdPrefixAndEncode(dst, Register.None, src, VexSimdPrefix.VEX_SIMD_66, VexOpcode.VEX_OPCODE_0F, attributes); + emitByte(0x28); + emitByte(0xC0 | encode); + } + + public final void movaps(Register dst, Register src) { + assert dst.getRegisterCategory().equals(AMD64.XMM) && src.getRegisterCategory().equals(AMD64.XMM); + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, /* rexVexW */ false, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, target); + int encode = simdPrefixAndEncode(dst, Register.None, src, VexSimdPrefix.VEX_SIMD_NONE, VexOpcode.VEX_OPCODE_0F, attributes); + emitByte(0x28); + emitByte(0xC0 | encode); + } + + public final void movb(AMD64Address dst, int imm8) { + prefix(dst); + emitByte(0xC6); + emitOperandHelper(0, dst, 1); + emitByte(imm8); + } + + public final void movb(AMD64Address dst, Register src) { + assert 
src.getRegisterCategory().equals(AMD64.CPU) : "must have byte register"; + prefix(dst, src, true); + emitByte(0x88); + emitOperandHelper(src, dst, 0); + } + + public final void movl(Register dst, int imm32) { + int encode = prefixAndEncode(dst.encoding); + emitByte(0xB8 | encode); + emitInt(imm32); + } + + public final void movl(Register dst, Register src) { + int encode = prefixAndEncode(dst.encoding, src.encoding); + emitByte(0x8B); + emitByte(0xC0 | encode); + } + + public final void movl(Register dst, AMD64Address src) { + prefix(src, dst); + emitByte(0x8B); + emitOperandHelper(dst, src, 0); + } + + public final void movl(AMD64Address dst, int imm32) { + prefix(dst); + emitByte(0xC7); + emitOperandHelper(0, dst, 4); + emitInt(imm32); + } + + public final void movl(AMD64Address dst, Register src) { + prefix(dst, src); + emitByte(0x89); + emitOperandHelper(src, dst, 0); + } + + /** + * New CPUs require use of movsd and movss to avoid partial register stall when loading from + * memory. But for old Opteron use movlpd instead of movsd. The selection is done in + * {@link AMD64MacroAssembler#movdbl(Register, AMD64Address)} and + * {@link AMD64MacroAssembler#movflt(Register, Register)}. + */ + public final void movlpd(Register dst, AMD64Address src) { + assert dst.getRegisterCategory().equals(AMD64.XMM); + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, /* rexVexW */ false, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, target); + simdPrefix(dst, dst, src, VexSimdPrefix.VEX_SIMD_66, VexOpcode.VEX_OPCODE_0F, attributes); + emitByte(0x12); + emitOperandHelper(dst, src, 0); + } + + public final void movlhps(Register dst, Register src) { + assert dst.getRegisterCategory().equals(AMD64.XMM) && src.getRegisterCategory().equals(AMD64.XMM); + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, /* rexVexW */ false, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, target); + int encode = simdPrefixAndEncode(dst, src, src, VexSimdPrefix.VEX_SIMD_NONE, VexOpcode.VEX_OPCODE_0F, attributes); + emitByte(0x16); + emitByte(0xC0 | encode); + } + + public final void movq(Register dst, AMD64Address src) { + movq(dst, src, false); + } + + public final void movq(Register dst, AMD64Address src, boolean wide) { + if (dst.getRegisterCategory().equals(AMD64.XMM)) { + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, /* rexVexW */ wide, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, target); + simdPrefix(dst, Register.None, src, VexSimdPrefix.VEX_SIMD_F3, VexOpcode.VEX_OPCODE_0F, attributes); + emitByte(0x7E); + emitOperandHelper(dst, src, wide, 0); + } else { + // gpr version of movq + prefixq(src, dst); + emitByte(0x8B); + emitOperandHelper(dst, src, wide, 0); + } + } + + public final void movq(Register dst, Register src) { + int encode = prefixqAndEncode(dst.encoding, src.encoding); + emitByte(0x8B); + emitByte(0xC0 | encode); + } + + public final void movq(AMD64Address dst, Register src) { + if (src.getRegisterCategory().equals(AMD64.XMM)) { + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, /* rexVexW */ true, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, target); + simdPrefix(src, Register.None, dst, VexSimdPrefix.VEX_SIMD_66, VexOpcode.VEX_OPCODE_0F, attributes); + emitByte(0xD6); + emitOperandHelper(src, dst, 0); + } else { + // gpr version of movq + prefixq(dst, src); + emitByte(0x89); + 
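// 0x89 is the MOV r/m64, r64 store opcode (REX.W comes from prefixq above); emitOperandHelper then encodes the ModRM/SIB/displacement for dst with src in the reg field. + 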
emitOperandHelper(src, dst, 0); + } + } + + public final void movsbl(Register dst, AMD64Address src) { + prefix(src, dst); + emitByte(0x0F); + emitByte(0xBE); + emitOperandHelper(dst, src, 0); + } + + public final void movsbl(Register dst, Register src) { + int encode = prefixAndEncode(dst.encoding, false, src.encoding, true); + emitByte(0x0F); + emitByte(0xBE); + emitByte(0xC0 | encode); + } + + public final void movsbq(Register dst, AMD64Address src) { + prefixq(src, dst); + emitByte(0x0F); + emitByte(0xBE); + emitOperandHelper(dst, src, 0); + } + + public final void movsbq(Register dst, Register src) { + int encode = prefixqAndEncode(dst.encoding, src.encoding); + emitByte(0x0F); + emitByte(0xBE); + emitByte(0xC0 | encode); + } + + public final void movsd(Register dst, Register src) { + assert dst.getRegisterCategory().equals(AMD64.XMM) && src.getRegisterCategory().equals(AMD64.XMM); + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, /* rexVexW */ false, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, target); + int encode = simdPrefixAndEncode(dst, dst, src, VexSimdPrefix.VEX_SIMD_F2, VexOpcode.VEX_OPCODE_0F, attributes); + emitByte(0x10); + emitByte(0xC0 | encode); + } + + public final void movsd(Register dst, AMD64Address src) { + assert dst.getRegisterCategory().equals(AMD64.XMM); + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, /* rexVexW */ false, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, target); + simdPrefix(dst, Register.None, src, VexSimdPrefix.VEX_SIMD_F2, VexOpcode.VEX_OPCODE_0F, attributes); + emitByte(0x10); + emitOperandHelper(dst, src, 0); + } + + public final void movsd(AMD64Address dst, Register src) { + assert src.getRegisterCategory().equals(AMD64.XMM); + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, /* rexVexW */ false, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, target); + simdPrefix(src, Register.None, dst, VexSimdPrefix.VEX_SIMD_F2, VexOpcode.VEX_OPCODE_0F, attributes); + emitByte(0x11); + emitOperandHelper(src, dst, 0); + } + + public final void movss(Register dst, Register src) { + assert dst.getRegisterCategory().equals(AMD64.XMM) && src.getRegisterCategory().equals(AMD64.XMM); + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, /* rexVexW */ false, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, target); + int encode = simdPrefixAndEncode(dst, dst, src, VexSimdPrefix.VEX_SIMD_F3, VexOpcode.VEX_OPCODE_0F, attributes); + emitByte(0x10); + emitByte(0xC0 | encode); + } + + public final void movss(Register dst, AMD64Address src) { + assert dst.getRegisterCategory().equals(AMD64.XMM); + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, /* rexVexW */ false, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, target); + simdPrefix(dst, Register.None, src, VexSimdPrefix.VEX_SIMD_F3, VexOpcode.VEX_OPCODE_0F, attributes); + emitByte(0x10); + emitOperandHelper(dst, src, 0); + } + + public final void movss(AMD64Address dst, Register src) { + assert src.getRegisterCategory().equals(AMD64.XMM); + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, /* rexVexW */ false, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, target); + simdPrefix(src, Register.None, dst, VexSimdPrefix.VEX_SIMD_F3, VexOpcode.VEX_OPCODE_0F, attributes); + emitByte(0x11); + 
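// 0x11 selects the memory-store form of MOVSS (the loads above use 0x10); emitOperandHelper encodes the destination address. + 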
emitOperandHelper(src, dst, 0); + } + + public final void mulpd(Register dst, Register src) { + assert dst.getRegisterCategory().equals(AMD64.XMM) && src.getRegisterCategory().equals(AMD64.XMM); + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, /* rexVexW */ false, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, target); + int encode = simdPrefixAndEncode(dst, dst, src, VexSimdPrefix.VEX_SIMD_66, VexOpcode.VEX_OPCODE_0F, attributes); + emitByte(0x59); + emitByte(0xC0 | encode); + } + + public final void mulpd(Register dst, AMD64Address src) { + assert dst.getRegisterCategory().equals(AMD64.XMM); + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, /* rexVexW */ false, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, target); + simdPrefix(dst, dst, src, VexSimdPrefix.VEX_SIMD_66, VexOpcode.VEX_OPCODE_0F, attributes); + emitByte(0x59); + emitOperandHelper(dst, src, 0); + } + + public final void mulsd(Register dst, Register src) { + assert dst.getRegisterCategory().equals(AMD64.XMM) && src.getRegisterCategory().equals(AMD64.XMM); + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, /* rexVexW */ false, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, target); + int encode = simdPrefixAndEncode(dst, dst, src, VexSimdPrefix.VEX_SIMD_F2, VexOpcode.VEX_OPCODE_0F, attributes); + emitByte(0x59); + emitByte(0xC0 | encode); + } + + public final void mulsd(Register dst, AMD64Address src) { + assert dst.getRegisterCategory().equals(AMD64.XMM); + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, /* rexVexW */ false, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, target); + simdPrefix(dst, dst, src, VexSimdPrefix.VEX_SIMD_F2, VexOpcode.VEX_OPCODE_0F, attributes); + emitByte(0x59); + emitOperandHelper(dst, src, 0); + } + + public final void mulss(Register dst, Register src) { + assert dst.getRegisterCategory().equals(AMD64.XMM) && src.getRegisterCategory().equals(AMD64.XMM); + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, /* rexVexW */ false, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, target); + int encode = simdPrefixAndEncode(dst, dst, src, VexSimdPrefix.VEX_SIMD_F3, VexOpcode.VEX_OPCODE_0F, attributes); + emitByte(0x59); + emitByte(0xC0 | encode); + } + + public final void movswl(Register dst, AMD64Address src) { + prefix(src, dst); + emitByte(0x0F); + emitByte(0xBF); + emitOperandHelper(dst, src, 0); + } + + public final void movw(AMD64Address dst, int imm16) { + emitByte(0x66); // switch to 16-bit mode + prefix(dst); + emitByte(0xC7); + emitOperandHelper(0, dst, 2); + emitShort(imm16); + } + + public final void movw(AMD64Address dst, Register src) { + emitByte(0x66); + prefix(dst, src); + emitByte(0x89); + emitOperandHelper(src, dst, 0); + } + + public final void movzbl(Register dst, AMD64Address src) { + prefix(src, dst); + emitByte(0x0F); + emitByte(0xB6); + emitOperandHelper(dst, src, 0); + } + + public final void movzwl(Register dst, AMD64Address src) { + prefix(src, dst); + emitByte(0x0F); + emitByte(0xB7); + emitOperandHelper(dst, src, 0); + } + + public final void negl(Register dst) { + NEG.emit(this, DWORD, dst); + } + + public final void notl(Register dst) { + NOT.emit(this, DWORD, dst); + } + + @Override + public final void ensureUniquePC() { + nop(); + } + + public final void nop() { + nop(1); + } + + public void nop(int 
count) { + int i = count; + if (UseNormalNop) { + assert i > 0 : " "; + // The fancy nops aren't currently recognized by debuggers making it a + // pain to disassemble code while debugging. If assert are on clearly + // speed is not an issue so simply use the single byte traditional nop + // to do alignment. + + for (; i > 0; i--) { + emitByte(0x90); + } + return; + } + + if (UseAddressNop) { + // + // Using multi-bytes nops "0x0F 0x1F [Address]" for AMD. + // 1: 0x90 + // 2: 0x66 0x90 + // 3: 0x66 0x66 0x90 (don't use "0x0F 0x1F 0x00" - need patching safe padding) + // 4: 0x0F 0x1F 0x40 0x00 + // 5: 0x0F 0x1F 0x44 0x00 0x00 + // 6: 0x66 0x0F 0x1F 0x44 0x00 0x00 + // 7: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00 + // 8: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 + // 9: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 + // 10: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 + // 11: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 + + // The rest coding is AMD specific - use consecutive Address nops + + // 12: 0x66 0x0F 0x1F 0x44 0x00 0x00 0x66 0x0F 0x1F 0x44 0x00 0x00 + // 13: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00 0x66 0x0F 0x1F 0x44 0x00 0x00 + // 14: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00 + // 15: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00 + // 16: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 + // Size prefixes (0x66) are added for larger sizes + + while (i >= 22) { + i -= 11; + emitByte(0x66); // size prefix + emitByte(0x66); // size prefix + emitByte(0x66); // size prefix + addrNop8(); + } + // Generate first nop for size between 21-12 + switch (i) { + case 21: + i -= 11; + emitByte(0x66); // size prefix + emitByte(0x66); // size prefix + emitByte(0x66); // size prefix + addrNop8(); + break; + case 20: + case 19: + i -= 10; + emitByte(0x66); // size prefix + emitByte(0x66); // size prefix + addrNop8(); + break; + case 18: + case 17: + i -= 9; + emitByte(0x66); // size prefix + addrNop8(); + break; + case 16: + case 15: + i -= 8; + addrNop8(); + break; + case 14: + case 13: + i -= 7; + addrNop7(); + break; + case 12: + i -= 6; + emitByte(0x66); // size prefix + addrNop5(); + break; + default: + assert i < 12; + } + + // Generate second nop for size between 11-1 + switch (i) { + case 11: + emitByte(0x66); // size prefix + emitByte(0x66); // size prefix + emitByte(0x66); // size prefix + addrNop8(); + break; + case 10: + emitByte(0x66); // size prefix + emitByte(0x66); // size prefix + addrNop8(); + break; + case 9: + emitByte(0x66); // size prefix + addrNop8(); + break; + case 8: + addrNop8(); + break; + case 7: + addrNop7(); + break; + case 6: + emitByte(0x66); // size prefix + addrNop5(); + break; + case 5: + addrNop5(); + break; + case 4: + addrNop4(); + break; + case 3: + // Don't use "0x0F 0x1F 0x00" - need patching safe padding + emitByte(0x66); // size prefix + emitByte(0x66); // size prefix + emitByte(0x90); // nop + break; + case 2: + emitByte(0x66); // size prefix + emitByte(0x90); // nop + break; + case 1: + emitByte(0x90); // nop + break; + default: + assert i == 0; + } + return; + } + + // Using nops with size prefixes "0x66 0x90". 
+ // From AMD Optimization Guide: + // 1: 0x90 + // 2: 0x66 0x90 + // 3: 0x66 0x66 0x90 + // 4: 0x66 0x66 0x66 0x90 + // 5: 0x66 0x66 0x90 0x66 0x90 + // 6: 0x66 0x66 0x90 0x66 0x66 0x90 + // 7: 0x66 0x66 0x66 0x90 0x66 0x66 0x90 + // 8: 0x66 0x66 0x66 0x90 0x66 0x66 0x66 0x90 + // 9: 0x66 0x66 0x90 0x66 0x66 0x90 0x66 0x66 0x90 + // 10: 0x66 0x66 0x66 0x90 0x66 0x66 0x90 0x66 0x66 0x90 + // + while (i > 12) { + i -= 4; + emitByte(0x66); // size prefix + emitByte(0x66); + emitByte(0x66); + emitByte(0x90); // nop + } + // 1 - 12 nops + if (i > 8) { + if (i > 9) { + i -= 1; + emitByte(0x66); + } + i -= 3; + emitByte(0x66); + emitByte(0x66); + emitByte(0x90); + } + // 1 - 8 nops + if (i > 4) { + if (i > 6) { + i -= 1; + emitByte(0x66); + } + i -= 3; + emitByte(0x66); + emitByte(0x66); + emitByte(0x90); + } + switch (i) { + case 4: + emitByte(0x66); + emitByte(0x66); + emitByte(0x66); + emitByte(0x90); + break; + case 3: + emitByte(0x66); + emitByte(0x66); + emitByte(0x90); + break; + case 2: + emitByte(0x66); + emitByte(0x90); + break; + case 1: + emitByte(0x90); + break; + default: + assert i == 0; + } + } + + public final void orl(Register dst, Register src) { + OR.rmOp.emit(this, DWORD, dst, src); + } + + public final void orl(Register dst, int imm32) { + OR.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32); + } + + public final void pop(Register dst) { + int encode = prefixAndEncode(dst.encoding); + emitByte(0x58 | encode); + } + + public void popfq() { + emitByte(0x9D); + } + + public final void ptest(Register dst, Register src) { + assert supports(CPUFeature.SSE4_1); + assert dst.getRegisterCategory().equals(AMD64.XMM) && src.getRegisterCategory().equals(AMD64.XMM); + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, /* rexVexW */ false, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, target); + int encode = simdPrefixAndEncode(dst, Register.None, src, VexSimdPrefix.VEX_SIMD_66, VexOpcode.VEX_OPCODE_0F_38, attributes); + emitByte(0x17); + emitByte(0xC0 | encode); + } + + public final void vptest(Register dst, Register src) { + assert supports(CPUFeature.AVX); + assert dst.getRegisterCategory().equals(AMD64.XMM) && src.getRegisterCategory().equals(AMD64.XMM); + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_256bit, /* rexVexW */ false, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, target); + int encode = vexPrefixAndEncode(dst, Register.None, src, VexSimdPrefix.VEX_SIMD_66, VexOpcode.VEX_OPCODE_0F_38, attributes); + emitByte(0x17); + emitByte(0xC0 | encode); + } + + public final void push(Register src) { + int encode = prefixAndEncode(src.encoding); + emitByte(0x50 | encode); + } + + public void pushfq() { + emitByte(0x9c); + } + + public final void paddd(Register dst, Register src) { + assert dst.getRegisterCategory().equals(AMD64.XMM) && src.getRegisterCategory().equals(AMD64.XMM); + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, /* rexVexW */ false, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, target); + int encode = simdPrefixAndEncode(dst, dst, src, VexSimdPrefix.VEX_SIMD_66, VexOpcode.VEX_OPCODE_0F, attributes); + emitByte(0xFE); + emitByte(0xC0 | encode); + } + + public final void paddq(Register dst, Register src) { + assert dst.getRegisterCategory().equals(AMD64.XMM) && src.getRegisterCategory().equals(AMD64.XMM); + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, /* 
rexVexW */ false, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, target); + int encode = simdPrefixAndEncode(dst, dst, src, VexSimdPrefix.VEX_SIMD_66, VexOpcode.VEX_OPCODE_0F, attributes); + emitByte(0xD4); + emitByte(0xC0 | encode); + } + + public final void pextrw(Register dst, Register src, int imm8) { + assert dst.getRegisterCategory().equals(AMD64.CPU) && src.getRegisterCategory().equals(AMD64.XMM); + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, /* rexVexW */ false, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, target); + int encode = simdPrefixAndEncode(dst, Register.None, src, VexSimdPrefix.VEX_SIMD_66, VexOpcode.VEX_OPCODE_0F, attributes); + emitByte(0xC5); + emitByte(0xC0 | encode); + emitByte(imm8); + } + + public final void pinsrw(Register dst, Register src, int imm8) { + assert dst.getRegisterCategory().equals(AMD64.XMM) && src.getRegisterCategory().equals(AMD64.CPU); + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, /* rexVexW */ false, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, target); + int encode = simdPrefixAndEncode(dst, dst, src, VexSimdPrefix.VEX_SIMD_66, VexOpcode.VEX_OPCODE_0F, attributes); + emitByte(0xC4); + emitByte(0xC0 | encode); + emitByte(imm8); + } + + public final void por(Register dst, Register src) { + assert dst.getRegisterCategory().equals(AMD64.XMM) && src.getRegisterCategory().equals(AMD64.XMM); + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, /* rexVexW */ false, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, target); + int encode = simdPrefixAndEncode(dst, dst, src, VexSimdPrefix.VEX_SIMD_66, VexOpcode.VEX_OPCODE_0F, attributes); + emitByte(0xEB); + emitByte(0xC0 | encode); + } + + public final void pand(Register dst, Register src) { + assert dst.getRegisterCategory().equals(AMD64.XMM) && src.getRegisterCategory().equals(AMD64.XMM); + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, /* rexVexW */ false, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, target); + int encode = simdPrefixAndEncode(dst, dst, src, VexSimdPrefix.VEX_SIMD_66, VexOpcode.VEX_OPCODE_0F, attributes); + emitByte(0xDB); + emitByte(0xC0 | encode); + } + + public final void pxor(Register dst, Register src) { + assert dst.getRegisterCategory().equals(AMD64.XMM) && src.getRegisterCategory().equals(AMD64.XMM); + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, /* rexVexW */ false, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, target); + int encode = simdPrefixAndEncode(dst, dst, src, VexSimdPrefix.VEX_SIMD_66, VexOpcode.VEX_OPCODE_0F, attributes); + emitByte(0xEF); + emitByte(0xC0 | encode); + } + + public final void vpxor(Register dst, Register nds, Register src) { + assert supports(CPUFeature.AVX); + assert dst.getRegisterCategory().equals(AMD64.XMM) && src.getRegisterCategory().equals(AMD64.XMM); + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_256bit, /* rexVexW */ false, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, target); + int encode = vexPrefixAndEncode(dst, nds, src, VexSimdPrefix.VEX_SIMD_66, VexOpcode.VEX_OPCODE_0F, attributes); + emitByte(0xEF); + emitByte(0xC0 | encode); + } + + public final void pslld(Register dst, int imm8) { + assert isUByte(imm8) : "invalid value"; + assert 
dst.getRegisterCategory().equals(AMD64.XMM); + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, /* rexVexW */ false, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, target); + // XMM6 is for /6 encoding: 66 0F 72 /6 ib + int encode = simdPrefixAndEncode(AMD64.xmm6, dst, dst, VexSimdPrefix.VEX_SIMD_66, VexOpcode.VEX_OPCODE_0F, attributes); + emitByte(0x72); + emitByte(0xC0 | encode); + emitByte(imm8 & 0xFF); + } + + public final void psllq(Register dst, Register shift) { + assert dst.getRegisterCategory().equals(AMD64.XMM) && shift.getRegisterCategory().equals(AMD64.XMM); + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, /* rexVexW */ false, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, target); + int encode = simdPrefixAndEncode(dst, dst, shift, VexSimdPrefix.VEX_SIMD_66, VexOpcode.VEX_OPCODE_0F, attributes); + emitByte(0xF3); + emitByte(0xC0 | encode); + } + + public final void psllq(Register dst, int imm8) { + assert isUByte(imm8) : "invalid value"; + assert dst.getRegisterCategory().equals(AMD64.XMM); + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, /* rexVexW */ false, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, target); + // XMM6 is for /6 encoding: 66 0F 73 /6 ib + int encode = simdPrefixAndEncode(AMD64.xmm6, dst, dst, VexSimdPrefix.VEX_SIMD_66, VexOpcode.VEX_OPCODE_0F, attributes); + emitByte(0x73); + emitByte(0xC0 | encode); + emitByte(imm8); + } + + public final void psrad(Register dst, int imm8) { + assert isUByte(imm8) : "invalid value"; + assert dst.getRegisterCategory().equals(AMD64.XMM); + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, /* rexVexW */ false, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, target); + // XMM4 is for /2 encoding: 66 0F 72 /4 ib + int encode = simdPrefixAndEncode(AMD64.xmm4, dst, dst, VexSimdPrefix.VEX_SIMD_66, VexOpcode.VEX_OPCODE_0F, attributes); + emitByte(0x72); + emitByte(0xC0 | encode); + emitByte(imm8); + } + + public final void psrld(Register dst, int imm8) { + assert isUByte(imm8) : "invalid value"; + assert dst.getRegisterCategory().equals(AMD64.XMM); + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, /* rexVexW */ false, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, target); + // XMM2 is for /2 encoding: 66 0F 72 /2 ib + int encode = simdPrefixAndEncode(AMD64.xmm2, dst, dst, VexSimdPrefix.VEX_SIMD_66, VexOpcode.VEX_OPCODE_0F, attributes); + emitByte(0x72); + emitByte(0xC0 | encode); + emitByte(imm8); + } + + public final void psrlq(Register dst, int imm8) { + assert isUByte(imm8) : "invalid value"; + assert dst.getRegisterCategory().equals(AMD64.XMM); + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, /* rexVexW */ false, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, target); + // XMM2 is for /2 encoding: 66 0F 73 /2 ib + int encode = simdPrefixAndEncode(AMD64.xmm2, dst, dst, VexSimdPrefix.VEX_SIMD_66, VexOpcode.VEX_OPCODE_0F, attributes); + emitByte(0x73); + emitByte(0xC0 | encode); + emitByte(imm8); + } + + public final void pshufd(Register dst, Register src, int imm8) { + assert isUByte(imm8) : "invalid value"; + assert dst.getRegisterCategory().equals(AMD64.XMM) && src.getRegisterCategory().equals(AMD64.XMM); + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, /* 
rexVexW */ false, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, target); + int encode = simdPrefixAndEncode(dst, Register.None, src, VexSimdPrefix.VEX_SIMD_66, VexOpcode.VEX_OPCODE_0F, attributes); + emitByte(0x70); + emitByte(0xC0 | encode); + emitByte(imm8); + } + + public final void psubd(Register dst, Register src) { + assert dst.getRegisterCategory().equals(AMD64.XMM) && src.getRegisterCategory().equals(AMD64.XMM); + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, /* rexVexW */ false, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, target); + int encode = simdPrefixAndEncode(dst, dst, src, VexSimdPrefix.VEX_SIMD_66, VexOpcode.VEX_OPCODE_0F, attributes); + emitByte(0xFA); + emitByte(0xC0 | encode); + } + + public final void rcpps(Register dst, Register src) { + assert dst.getRegisterCategory().equals(AMD64.XMM) && src.getRegisterCategory().equals(AMD64.XMM); + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, /* rexVexW */ false, /* legacyMode */ true, /* noMaskReg */ false, /* usesVl */ false, target); + int encode = simdPrefixAndEncode(dst, Register.None, src, VexSimdPrefix.VEX_SIMD_NONE, VexOpcode.VEX_OPCODE_0F, attributes); + emitByte(0x53); + emitByte(0xC0 | encode); + } + + public final void ret(int imm16) { + if (imm16 == 0) { + emitByte(0xC3); + } else { + emitByte(0xC2); + emitShort(imm16); + } + } + + public final void sarl(Register dst, int imm8) { + int encode = prefixAndEncode(dst.encoding); + assert isShiftCount(imm8 >> 1) : "illegal shift count"; + if (imm8 == 1) { + emitByte(0xD1); + emitByte(0xF8 | encode); + } else { + emitByte(0xC1); + emitByte(0xF8 | encode); + emitByte(imm8); + } + } + + public final void shll(Register dst, int imm8) { + assert isShiftCount(imm8 >> 1) : "illegal shift count"; + int encode = prefixAndEncode(dst.encoding); + if (imm8 == 1) { + emitByte(0xD1); + emitByte(0xE0 | encode); + } else { + emitByte(0xC1); + emitByte(0xE0 | encode); + emitByte(imm8); + } + } + + public final void shll(Register dst) { + int encode = prefixAndEncode(dst.encoding); + emitByte(0xD3); + emitByte(0xE0 | encode); + } + + public final void shrl(Register dst, int imm8) { + assert isShiftCount(imm8 >> 1) : "illegal shift count"; + int encode = prefixAndEncode(dst.encoding); + emitByte(0xC1); + emitByte(0xE8 | encode); + emitByte(imm8); + } + + public final void shrl(Register dst) { + int encode = prefixAndEncode(dst.encoding); + emitByte(0xD3); + emitByte(0xE8 | encode); + } + + public final void subl(AMD64Address dst, int imm32) { + SUB.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32); + } + + public final void subl(Register dst, int imm32) { + SUB.getMIOpcode(DWORD, isByte(imm32)).emit(this, DWORD, dst, imm32); + } + + public final void subl(Register dst, Register src) { + SUB.rmOp.emit(this, DWORD, dst, src); + } + + public final void subpd(Register dst, Register src) { + assert dst.getRegisterCategory().equals(AMD64.XMM) && src.getRegisterCategory().equals(AMD64.XMM); + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, /* rexVexW */ false, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, target); + int encode = simdPrefixAndEncode(dst, dst, src, VexSimdPrefix.VEX_SIMD_66, VexOpcode.VEX_OPCODE_0F, attributes); + emitByte(0x5C); + emitByte(0xC0 | encode); + } + + public final void subsd(Register dst, Register src) { + assert dst.getRegisterCategory().equals(AMD64.XMM) && 
src.getRegisterCategory().equals(AMD64.XMM); + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, /* rexVexW */ false, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, target); + int encode = simdPrefixAndEncode(dst, dst, src, VexSimdPrefix.VEX_SIMD_F2, VexOpcode.VEX_OPCODE_0F, attributes); + emitByte(0x5C); + emitByte(0xC0 | encode); + } + + public final void subsd(Register dst, AMD64Address src) { + assert dst.getRegisterCategory().equals(AMD64.XMM); + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, /* rexVexW */ false, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, target); + simdPrefix(dst, dst, src, VexSimdPrefix.VEX_SIMD_F2, VexOpcode.VEX_OPCODE_0F, attributes); + emitByte(0x5C); + emitOperandHelper(dst, src, 0); + } + + public final void testl(Register dst, int imm32) { + // not using emitArith because test + // doesn't support sign-extension of + // 8bit operands + int encode = dst.encoding; + if (encode == 0) { + emitByte(0xA9); + } else { + encode = prefixAndEncode(encode); + emitByte(0xF7); + emitByte(0xC0 | encode); + } + emitInt(imm32); + } + + public final void testl(Register dst, Register src) { + int encode = prefixAndEncode(dst.encoding, src.encoding); + emitByte(0x85); + emitByte(0xC0 | encode); + } + + public final void testl(Register dst, AMD64Address src) { + prefix(src, dst); + emitByte(0x85); + emitOperandHelper(dst, src, 0); + } + + public final void unpckhpd(Register dst, Register src) { + assert dst.getRegisterCategory().equals(AMD64.XMM) && src.getRegisterCategory().equals(AMD64.XMM); + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, /* rexVexW */ false, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, target); + int encode = simdPrefixAndEncode(dst, dst, src, VexSimdPrefix.VEX_SIMD_66, VexOpcode.VEX_OPCODE_0F, attributes); + emitByte(0x15); + emitByte(0xC0 | encode); + } + + public final void unpcklpd(Register dst, Register src) { + assert dst.getRegisterCategory().equals(AMD64.XMM) && src.getRegisterCategory().equals(AMD64.XMM); + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, /* rexVexW */ false, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, target); + int encode = simdPrefixAndEncode(dst, dst, src, VexSimdPrefix.VEX_SIMD_66, VexOpcode.VEX_OPCODE_0F, attributes); + emitByte(0x14); + emitByte(0xC0 | encode); + } + + public final void xorl(Register dst, Register src) { + XOR.rmOp.emit(this, DWORD, dst, src); + } + + public final void xorpd(Register dst, Register src) { + assert dst.getRegisterCategory().equals(AMD64.XMM) && src.getRegisterCategory().equals(AMD64.XMM); + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, /* rexVexW */ false, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, target); + int encode = simdPrefixAndEncode(dst, dst, src, VexSimdPrefix.VEX_SIMD_66, VexOpcode.VEX_OPCODE_0F, attributes); + emitByte(0x57); + emitByte(0xC0 | encode); + } + + public final void xorps(Register dst, Register src) { + assert dst.getRegisterCategory().equals(AMD64.XMM) && src.getRegisterCategory().equals(AMD64.XMM); + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, /* rexVexW */ false, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, target); + int encode = simdPrefixAndEncode(dst, dst, src, VexSimdPrefix.VEX_SIMD_NONE, VexOpcode.VEX_OPCODE_0F, 
attributes); + emitByte(0x57); + emitByte(0xC0 | encode); + } + + protected final void decl(Register dst) { + // Use two-byte form (one-byte form is a REX prefix in 64-bit mode) + int encode = prefixAndEncode(dst.encoding); + emitByte(0xFF); + emitByte(0xC8 | encode); + } + + protected final void incl(Register dst) { + // Use two-byte form (one-byte from is a REX prefix in 64-bit mode) + int encode = prefixAndEncode(dst.encoding); + emitByte(0xFF); + emitByte(0xC0 | encode); + } + + private int prefixAndEncode(int regEnc) { + return prefixAndEncode(regEnc, false); + } + + private int prefixAndEncode(int regEnc, boolean byteinst) { + if (regEnc >= 8) { + emitByte(Prefix.REXB); + return regEnc - 8; + } else if (byteinst && regEnc >= 4) { + emitByte(Prefix.REX); + } + return regEnc; + } + + private int prefixqAndEncode(int regEnc) { + if (regEnc < 8) { + emitByte(Prefix.REXW); + return regEnc; + } else { + emitByte(Prefix.REXWB); + return regEnc - 8; + } + } + + private int prefixAndEncode(int dstEnc, int srcEnc) { + return prefixAndEncode(dstEnc, false, srcEnc, false); + } + + private int prefixAndEncode(int dstEncoding, boolean dstIsByte, int srcEncoding, boolean srcIsByte) { + int srcEnc = srcEncoding; + int dstEnc = dstEncoding; + if (dstEnc < 8) { + if (srcEnc >= 8) { + emitByte(Prefix.REXB); + srcEnc -= 8; + } else if ((srcIsByte && srcEnc >= 4) || (dstIsByte && dstEnc >= 4)) { + emitByte(Prefix.REX); + } + } else { + if (srcEnc < 8) { + emitByte(Prefix.REXR); + } else { + emitByte(Prefix.REXRB); + srcEnc -= 8; + } + dstEnc -= 8; + } + return dstEnc << 3 | srcEnc; + } + + /** + * Creates prefix and the encoding of the lower 6 bits of the ModRM-Byte. It emits an operand + * prefix. If the given operands exceed 3 bits, the 4th bit is encoded in the prefix. + * + * @param regEncoding the encoding of the register part of the ModRM-Byte + * @param rmEncoding the encoding of the r/m part of the ModRM-Byte + * @return the lower 6 bits of the ModRM-Byte that should be emitted + */ + private int prefixqAndEncode(int regEncoding, int rmEncoding) { + int rmEnc = rmEncoding; + int regEnc = regEncoding; + if (regEnc < 8) { + if (rmEnc < 8) { + emitByte(Prefix.REXW); + } else { + emitByte(Prefix.REXWB); + rmEnc -= 8; + } + } else { + if (rmEnc < 8) { + emitByte(Prefix.REXWR); + } else { + emitByte(Prefix.REXWRB); + rmEnc -= 8; + } + regEnc -= 8; + } + return regEnc << 3 | rmEnc; + } + + private void vexPrefix(int rxb, int ndsEncoding, int pre, int opc, AMD64InstructionAttr attributes) { + int vectorLen = attributes.getVectorLen(); + boolean vexW = attributes.isRexVexW(); + boolean isXorB = ((rxb & 0x3) > 0); + if (isXorB || vexW || (opc == VexOpcode.VEX_OPCODE_0F_38) || (opc == VexOpcode.VEX_OPCODE_0F_3A)) { + emitByte(Prefix.VEX_3BYTES); + + int byte1 = (rxb << 5); + byte1 = ((~byte1) & 0xE0) | opc; + emitByte(byte1); + + int byte2 = ((~ndsEncoding) & 0xf) << 3; + byte2 |= (vexW ? VexPrefix.VEX_W : 0) | ((vectorLen > 0) ? 4 : 0) | pre; + emitByte(byte2); + } else { + emitByte(Prefix.VEX_2BYTES); + + int byte1 = ((rxb & 0x4) > 0) ? VexPrefix.VEX_R : 0; + byte1 = (~byte1) & 0x80; + byte1 |= ((~ndsEncoding) & 0xf) << 3; + byte1 |= ((vectorLen > 0) ? 4 : 0) | pre; + emitByte(byte1); + } + } + + private void vexPrefix(AMD64Address adr, Register nds, Register src, int pre, int opc, AMD64InstructionAttr attributes) { + int rxb = getRXB(src, adr); + int ndsEncoding = nds.isValid() ? 
nds.encoding : 0; + vexPrefix(rxb, ndsEncoding, pre, opc, attributes); + setCurAttributes(attributes); + } + + private int vexPrefixAndEncode(Register dst, Register nds, Register src, int pre, int opc, AMD64InstructionAttr attributes) { + int rxb = getRXB(dst, src); + int ndsEncoding = nds.isValid() ? nds.encoding : 0; + vexPrefix(rxb, ndsEncoding, pre, opc, attributes); + // return modrm byte components for operands + return (((dst.encoding & 7) << 3) | (src.encoding & 7)); + } + + private void simdPrefix(Register xreg, Register nds, AMD64Address adr, int pre, int opc, AMD64InstructionAttr attributes) { + if (supports(CPUFeature.AVX)) { + vexPrefix(adr, nds, xreg, pre, opc, attributes); + } else { + switch (pre) { + case VexSimdPrefix.VEX_SIMD_66: + emitByte(0x66); + break; + case VexSimdPrefix.VEX_SIMD_F2: + emitByte(0xF2); + break; + case VexSimdPrefix.VEX_SIMD_F3: + emitByte(0xF3); + break; + } + if (attributes.isRexVexW()) { + prefixq(adr, xreg); + } else { + prefix(adr, xreg); + } + switch (opc) { + case VexOpcode.VEX_OPCODE_0F: + emitByte(0x0F); + break; + case VexOpcode.VEX_OPCODE_0F_38: + emitByte(0x0F); + emitByte(0x38); + break; + case VexOpcode.VEX_OPCODE_0F_3A: + emitByte(0x0F); + emitByte(0x3A); + break; + } + } + } + + private int simdPrefixAndEncode(Register dst, Register nds, Register src, int pre, int opc, AMD64InstructionAttr attributes) { + if (supports(CPUFeature.AVX)) { + return vexPrefixAndEncode(dst, nds, src, pre, opc, attributes); + } else { + switch (pre) { + case VexSimdPrefix.VEX_SIMD_66: + emitByte(0x66); + break; + case VexSimdPrefix.VEX_SIMD_F2: + emitByte(0xF2); + break; + case VexSimdPrefix.VEX_SIMD_F3: + emitByte(0xF3); + break; + } + int encode; + int dstEncoding = dst.encoding; + int srcEncoding = src.encoding; + if (attributes.isRexVexW()) { + encode = prefixqAndEncode(dstEncoding, srcEncoding); + } else { + encode = prefixAndEncode(dstEncoding, srcEncoding); + } + switch (opc) { + case VexOpcode.VEX_OPCODE_0F: + emitByte(0x0F); + break; + case VexOpcode.VEX_OPCODE_0F_38: + emitByte(0x0F); + emitByte(0x38); + break; + case VexOpcode.VEX_OPCODE_0F_3A: + emitByte(0x0F); + emitByte(0x3A); + break; + } + return encode; + } + } + + private static boolean needsRex(Register reg) { + return reg.encoding >= MinEncodingNeedsRex; + } + + private void prefix(AMD64Address adr) { + if (needsRex(adr.getBase())) { + if (needsRex(adr.getIndex())) { + emitByte(Prefix.REXXB); + } else { + emitByte(Prefix.REXB); + } + } else { + if (needsRex(adr.getIndex())) { + emitByte(Prefix.REXX); + } + } + } + + private void prefixq(AMD64Address adr) { + if (needsRex(adr.getBase())) { + if (needsRex(adr.getIndex())) { + emitByte(Prefix.REXWXB); + } else { + emitByte(Prefix.REXWB); + } + } else { + if (needsRex(adr.getIndex())) { + emitByte(Prefix.REXWX); + } else { + emitByte(Prefix.REXW); + } + } + } + + private void prefix(AMD64Address adr, Register reg) { + prefix(adr, reg, false); + } + + private void prefix(AMD64Address adr, Register reg, boolean byteinst) { + if (reg.encoding < 8) { + if (needsRex(adr.getBase())) { + if (needsRex(adr.getIndex())) { + emitByte(Prefix.REXXB); + } else { + emitByte(Prefix.REXB); + } + } else { + if (needsRex(adr.getIndex())) { + emitByte(Prefix.REXX); + } else if (byteinst && reg.encoding >= 4) { + emitByte(Prefix.REX); + } + } + } else { + if (needsRex(adr.getBase())) { + if (needsRex(adr.getIndex())) { + emitByte(Prefix.REXRXB); + } else { + emitByte(Prefix.REXRB); + } + } else { + if (needsRex(adr.getIndex())) { + emitByte(Prefix.REXRX); + } 
else { + emitByte(Prefix.REXR); + } + } + } + } + + private void prefixq(AMD64Address adr, Register src) { + if (src.encoding < 8) { + if (needsRex(adr.getBase())) { + if (needsRex(adr.getIndex())) { + emitByte(Prefix.REXWXB); + } else { + emitByte(Prefix.REXWB); + } + } else { + if (needsRex(adr.getIndex())) { + emitByte(Prefix.REXWX); + } else { + emitByte(Prefix.REXW); + } + } + } else { + if (needsRex(adr.getBase())) { + if (needsRex(adr.getIndex())) { + emitByte(Prefix.REXWRXB); + } else { + emitByte(Prefix.REXWRB); + } + } else { + if (needsRex(adr.getIndex())) { + emitByte(Prefix.REXWRX); + } else { + emitByte(Prefix.REXWR); + } + } + } + } + + public final void addq(Register dst, int imm32) { + ADD.getMIOpcode(QWORD, isByte(imm32)).emit(this, QWORD, dst, imm32); + } + + public final void addq(AMD64Address dst, int imm32) { + ADD.getMIOpcode(QWORD, isByte(imm32)).emit(this, QWORD, dst, imm32); + } + + public final void addq(Register dst, Register src) { + ADD.rmOp.emit(this, QWORD, dst, src); + } + + public final void addq(AMD64Address dst, Register src) { + ADD.mrOp.emit(this, QWORD, dst, src); + } + + public final void andq(Register dst, int imm32) { + AND.getMIOpcode(QWORD, isByte(imm32)).emit(this, QWORD, dst, imm32); + } + + public final void bsrq(Register dst, Register src) { + int encode = prefixqAndEncode(dst.encoding(), src.encoding()); + emitByte(0x0F); + emitByte(0xBD); + emitByte(0xC0 | encode); + } + + public final void bswapq(Register reg) { + int encode = prefixqAndEncode(reg.encoding); + emitByte(0x0F); + emitByte(0xC8 | encode); + } + + public final void cdqq() { + emitByte(Prefix.REXW); + emitByte(0x99); + } + + public final void cmovq(ConditionFlag cc, Register dst, Register src) { + int encode = prefixqAndEncode(dst.encoding, src.encoding); + emitByte(0x0F); + emitByte(0x40 | cc.getValue()); + emitByte(0xC0 | encode); + } + + public final void cmovq(ConditionFlag cc, Register dst, AMD64Address src) { + prefixq(src, dst); + emitByte(0x0F); + emitByte(0x40 | cc.getValue()); + emitOperandHelper(dst, src, 0); + } + + public final void cmpq(Register dst, int imm32) { + CMP.getMIOpcode(QWORD, isByte(imm32)).emit(this, QWORD, dst, imm32); + } + + public final void cmpq(Register dst, Register src) { + CMP.rmOp.emit(this, QWORD, dst, src); + } + + public final void cmpq(Register dst, AMD64Address src) { + CMP.rmOp.emit(this, QWORD, dst, src); + } + + public final void cmpxchgq(Register reg, AMD64Address adr) { + prefixq(adr, reg); + emitByte(0x0F); + emitByte(0xB1); + emitOperandHelper(reg, adr, 0); + } + + public final void cvtdq2pd(Register dst, Register src) { + assert dst.getRegisterCategory().equals(AMD64.XMM) && src.getRegisterCategory().equals(AMD64.XMM); + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, /* rexVexW */ false, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, target); + int encode = simdPrefixAndEncode(dst, Register.None, src, VexSimdPrefix.VEX_SIMD_F3, VexOpcode.VEX_OPCODE_0F, attributes); + emitByte(0xE6); + emitByte(0xC0 | encode); + } + + public final void cvtsi2sdq(Register dst, Register src) { + assert dst.getRegisterCategory().equals(AMD64.XMM) && src.getRegisterCategory().equals(AMD64.CPU); + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, /* rexVexW */ true, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, target); + int encode = simdPrefixAndEncode(dst, dst, src, VexSimdPrefix.VEX_SIMD_F2, VexOpcode.VEX_OPCODE_0F, attributes); + 
emitByte(0x2A); + emitByte(0xC0 | encode); + } + + public final void cvttsd2siq(Register dst, Register src) { + assert dst.getRegisterCategory().equals(AMD64.CPU) && src.getRegisterCategory().equals(AMD64.XMM); + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, /* rexVexW */ true, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, target); + int encode = simdPrefixAndEncode(dst, Register.None, src, VexSimdPrefix.VEX_SIMD_F2, VexOpcode.VEX_OPCODE_0F, attributes); + emitByte(0x2C); + emitByte(0xC0 | encode); + } + + public final void cvttpd2dq(Register dst, Register src) { + assert dst.getRegisterCategory().equals(AMD64.XMM) && src.getRegisterCategory().equals(AMD64.XMM); + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, /* rexVexW */ false, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, target); + int encode = simdPrefixAndEncode(dst, Register.None, src, VexSimdPrefix.VEX_SIMD_66, VexOpcode.VEX_OPCODE_0F, attributes); + emitByte(0xE6); + emitByte(0xC0 | encode); + } + + protected final void decq(Register dst) { + // Use two-byte form (one-byte form is a REX prefix in 64-bit mode) + int encode = prefixqAndEncode(dst.encoding); + emitByte(0xFF); + emitByte(0xC8 | encode); + } + + public final void decq(AMD64Address dst) { + DEC.emit(this, QWORD, dst); + } + + public final void imulq(Register dst, Register src) { + int encode = prefixqAndEncode(dst.encoding, src.encoding); + emitByte(0x0F); + emitByte(0xAF); + emitByte(0xC0 | encode); + } + + public final void incq(Register dst) { + // Don't use it directly. Use the macro assembler's incrementq() instead. + // Use two-byte form (one-byte form is a REX prefix in 64-bit mode) + int encode = prefixqAndEncode(dst.encoding); + emitByte(0xFF); + emitByte(0xC0 | encode); + } + + public final void incq(AMD64Address dst) { + INC.emit(this, QWORD, dst); + } + + public final void movq(Register dst, long imm64) { + int encode = prefixqAndEncode(dst.encoding); + emitByte(0xB8 | encode); + emitLong(imm64); + } + + public final void movslq(Register dst, int imm32) { + int encode = prefixqAndEncode(dst.encoding); + emitByte(0xC7); + emitByte(0xC0 | encode); + emitInt(imm32); + } + + public final void movdq(Register dst, AMD64Address src) { + assert dst.getRegisterCategory().equals(AMD64.XMM); + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, /* rexVexW */ true, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, target); + simdPrefix(dst, Register.None, src, VexSimdPrefix.VEX_SIMD_66, VexOpcode.VEX_OPCODE_0F, attributes); + emitByte(0x6E); + emitOperandHelper(dst, src, 0); + } + + public final void movdq(AMD64Address dst, Register src) { + assert src.getRegisterCategory().equals(AMD64.XMM); + // swap src/dst to get correct prefix + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, /* rexVexW */ true, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, target); + simdPrefix(src, Register.None, dst, VexSimdPrefix.VEX_SIMD_66, VexOpcode.VEX_OPCODE_0F, attributes); + emitByte(0x7E); + emitOperandHelper(src, dst, 0); + } + + public final void movdq(Register dst, Register src) { + if (dst.getRegisterCategory().equals(AMD64.XMM) && src.getRegisterCategory().equals(AMD64.CPU)) { + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, /* rexVexW */ true, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, target); + int encode = 
simdPrefixAndEncode(dst, Register.None, src, VexSimdPrefix.VEX_SIMD_66, VexOpcode.VEX_OPCODE_0F, attributes); + emitByte(0x6E); + emitByte(0xC0 | encode); + } else if (src.getRegisterCategory().equals(AMD64.XMM) && dst.getRegisterCategory().equals(AMD64.CPU)) { + // swap src/dst to get correct prefix + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, /* rexVexW */ true, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, target); + int encode = simdPrefixAndEncode(src, Register.None, dst, VexSimdPrefix.VEX_SIMD_66, VexOpcode.VEX_OPCODE_0F, attributes); + emitByte(0x7E); + emitByte(0xC0 | encode); + } else { + throw new InternalError("should not reach here"); + } + } + + public final void movdl(Register dst, Register src) { + if (dst.getRegisterCategory().equals(AMD64.XMM) && src.getRegisterCategory().equals(AMD64.CPU)) { + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, /* rexVexW */ false, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, target); + int encode = simdPrefixAndEncode(dst, Register.None, src, VexSimdPrefix.VEX_SIMD_66, VexOpcode.VEX_OPCODE_0F, attributes); + emitByte(0x6E); + emitByte(0xC0 | encode); + } else if (src.getRegisterCategory().equals(AMD64.XMM) && dst.getRegisterCategory().equals(AMD64.CPU)) { + // swap src/dst to get correct prefix + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, /* rexVexW */ false, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, target); + int encode = simdPrefixAndEncode(src, Register.None, dst, VexSimdPrefix.VEX_SIMD_66, VexOpcode.VEX_OPCODE_0F, attributes); + emitByte(0x7E); + emitByte(0xC0 | encode); + } else { + throw new InternalError("should not reach here"); + } + } + + public final void movddup(Register dst, Register src) { + assert supports(CPUFeature.SSE3); + assert dst.getRegisterCategory().equals(AMD64.XMM); + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, /* rexVexW */ false, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, target); + int encode = simdPrefixAndEncode(dst, Register.None, src, VexSimdPrefix.VEX_SIMD_F2, VexOpcode.VEX_OPCODE_0F, attributes); + emitByte(0x12); + emitByte(0xC0 | encode); + } + + public final void movdqu(Register dst, AMD64Address src) { + assert dst.getRegisterCategory().equals(AMD64.XMM); + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, /* rexVexW */ false, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, target); + simdPrefix(dst, Register.None, src, VexSimdPrefix.VEX_SIMD_F3, VexOpcode.VEX_OPCODE_0F, attributes); + emitByte(0x6F); + emitOperandHelper(dst, src, 0); + } + + public final void movdqu(Register dst, Register src) { + assert dst.getRegisterCategory().equals(AMD64.XMM); + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, /* rexVexW */ false, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, target); + int encode = simdPrefixAndEncode(dst, Register.None, src, VexSimdPrefix.VEX_SIMD_F3, VexOpcode.VEX_OPCODE_0F, attributes); + emitByte(0x6F); + emitByte(0xC0 | encode); + } + + public final void vmovdqu(Register dst, AMD64Address src) { + assert supports(CPUFeature.AVX); + assert dst.getRegisterCategory().equals(AMD64.XMM); + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_256bit, /* rexVexW */ false, /* legacyMode */ false, /* noMaskReg */ false, /* 
usesVl */ false, target); + vexPrefix(src, Register.None, dst, VexSimdPrefix.VEX_SIMD_F3, VexOpcode.VEX_OPCODE_0F, attributes); + emitByte(0x6F); + emitOperandHelper(dst, src, 0); + } + + public final void vzeroupper() { + assert supports(CPUFeature.AVX); + AMD64InstructionAttr attributes = new AMD64InstructionAttr(AvxVectorLen.AVX_128bit, /* rexVexW */ false, /* legacyMode */ false, /* noMaskReg */ false, /* usesVl */ false, target); + vexPrefixAndEncode(AMD64.xmm0, AMD64.xmm0, AMD64.xmm0, VexSimdPrefix.VEX_SIMD_NONE, VexOpcode.VEX_OPCODE_0F, attributes); + emitByte(0x77); + } + + public final void movslq(AMD64Address dst, int imm32) { + prefixq(dst); + emitByte(0xC7); + emitOperandHelper(0, dst, 4); + emitInt(imm32); + } + + public final void movslq(Register dst, AMD64Address src) { + prefixq(src, dst); + emitByte(0x63); + emitOperandHelper(dst, src, 0); + } + + public final void movslq(Register dst, Register src) { + int encode = prefixqAndEncode(dst.encoding, src.encoding); + emitByte(0x63); + emitByte(0xC0 | encode); + } + + public final void negq(Register dst) { + int encode = prefixqAndEncode(dst.encoding); + emitByte(0xF7); + emitByte(0xD8 | encode); + } + + public final void orq(Register dst, Register src) { + OR.rmOp.emit(this, QWORD, dst, src); + } + + public final void shlq(Register dst, int imm8) { + assert isShiftCount(imm8 >> 1) : "illegal shift count"; + int encode = prefixqAndEncode(dst.encoding); + if (imm8 == 1) { + emitByte(0xD1); + emitByte(0xE0 | encode); + } else { + emitByte(0xC1); + emitByte(0xE0 | encode); + emitByte(imm8); + } + } + + public final void shlq(Register dst) { + int encode = prefixqAndEncode(dst.encoding); + emitByte(0xD3); + emitByte(0xE0 | encode); + } + + public final void shrq(Register dst, int imm8) { + assert isShiftCount(imm8 >> 1) : "illegal shift count"; + int encode = prefixqAndEncode(dst.encoding); + if (imm8 == 1) { + emitByte(0xD1); + emitByte(0xE8 | encode); + } else { + emitByte(0xC1); + emitByte(0xE8 | encode); + emitByte(imm8); + } + } + + public final void shrq(Register dst) { + int encode = prefixqAndEncode(dst.encoding); + emitByte(0xD3); + emitByte(0xE8 | encode); + } + + public final void sbbq(Register dst, Register src) { + SBB.rmOp.emit(this, QWORD, dst, src); + } + + public final void subq(Register dst, int imm32) { + SUB.getMIOpcode(QWORD, isByte(imm32)).emit(this, QWORD, dst, imm32); + } + + public final void subq(AMD64Address dst, int imm32) { + SUB.getMIOpcode(QWORD, isByte(imm32)).emit(this, QWORD, dst, imm32); + } + + public final void subqWide(Register dst, int imm32) { + // don't use the sign-extending version, forcing a 32-bit immediate + SUB.getMIOpcode(QWORD, false).emit(this, QWORD, dst, imm32); + } + + public final void subq(Register dst, Register src) { + SUB.rmOp.emit(this, QWORD, dst, src); + } + + public final void testq(Register dst, Register src) { + int encode = prefixqAndEncode(dst.encoding, src.encoding); + emitByte(0x85); + emitByte(0xC0 | encode); + } + + public final void xaddl(AMD64Address dst, Register src) { + prefix(dst, src); + emitByte(0x0F); + emitByte(0xC1); + emitOperandHelper(src, dst, 0); + } + + public final void xaddq(AMD64Address dst, Register src) { + prefixq(dst, src); + emitByte(0x0F); + emitByte(0xC1); + emitOperandHelper(src, dst, 0); + } + + public final void xchgl(Register dst, AMD64Address src) { + prefix(src, dst); + emitByte(0x87); + emitOperandHelper(dst, src, 0); + } + + public final void xchgq(Register dst, AMD64Address src) { + prefixq(src, dst); + emitByte(0x87); + 
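// 0x87 is XCHG r/m64, r64; with a memory operand the exchange is implicitly locked by the processor, so no explicit lock() prefix is needed. + 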
+    public final void xchgq(Register dst, AMD64Address src) {
+        prefixq(src, dst);
+        emitByte(0x87);
+        emitOperandHelper(dst, src, 0);
+    }
+
+    public final void membar(int barriers) {
+        if (target.isMP) {
+            // We only have to handle StoreLoad
+            if ((barriers & STORE_LOAD) != 0) {
+                // All usable chips support "locked" instructions which suffice as barriers
+                // and are much faster than the alternative of using the cpuid instruction.
+                // We use a locked add [rsp], 0 here, which is conveniently a no-op apart
+                // from clobbering the flags.
+                // Any change to this code may need to revisit other places where this idiom
+                // is used, in particular the orderAccess code.
+                lock();
+                addl(new AMD64Address(rsp, 0), 0); // Assert the lock# signal here
+            }
+        }
+    }
+
+    @Override
+    protected final void patchJumpTarget(int branch, int branchTarget) {
+        int op = getByte(branch);
+        assert op == 0xE8 // call
+                        || op == 0x00 // jump table entry
+                        || op == 0xE9 // jmp
+                        || op == 0xEB // short jmp
+                        || (op & 0xF0) == 0x70 // short jcc
+                        || (op == 0x0F && (getByte(branch + 1) & 0xF0) == 0x80) // jcc
+        : "Invalid opcode at patch point branch=" + branch + ", branchTarget=" + branchTarget + ", op=" + op;
+
+        if (op == 0x00) {
+            int offsetToJumpTableBase = getShort(branch + 1);
+            int jumpTableBase = branch - offsetToJumpTableBase;
+            int imm32 = branchTarget - jumpTableBase;
+            emitInt(imm32, branch);
+        } else if (op == 0xEB || (op & 0xF0) == 0x70) {
+            // short offset operators (jmp and jcc)
+            final int imm8 = branchTarget - (branch + 2);
+            /*
+             * Since a wrongly patched short branch can lead to code that appears to work but
+             * misbehaves, we always fail with an exception here instead of relying on an assert.
+             */
+            if (!NumUtil.isByte(imm8)) {
+                throw new InternalError("branch displacement out of range: " + imm8);
+            }
+            emitByte(imm8, branch + 1);
+        } else {
+            int off = 1;
+            if (op == 0x0F) {
+                off = 2;
+            }
+            int imm32 = branchTarget - (branch + 4 + off);
+            emitInt(imm32, branch + off);
+        }
+    }
+
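+    // Worked example (illustrative, derived from patchJumpTarget above): for a six-byte jcc that
+    // starts at position p (0x0F 0x8x followed by a 32-bit displacement), the patch writes
+    // branchTarget - (p + 6) at p + 2, i.e. the displacement is relative to the end of the
+    // instruction; for a two-byte short jmp at p, it writes the byte branchTarget - (p + 2) at p + 1.
+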
+    public void nullCheck(AMD64Address address) {
+        testl(AMD64.rax, address);
+    }
+
+    @Override
+    public void align(int modulus) {
+        if (position() % modulus != 0) {
+            nop(modulus - (position() % modulus));
+        }
+    }
+
+    /**
+     * Emits a direct call instruction. Note that the actual call target is not specified, because
+     * all calls need patching anyway. Therefore, 0 is emitted as the call target, and the caller
+     * is responsible for adding the call address to the appropriate patching tables.
+     */
+    public final void call() {
+        if (codePatchingAnnotationConsumer != null) {
+            int pos = position();
+            codePatchingAnnotationConsumer.accept(new ImmediateOperandAnnotation(pos, pos + 1, 4, pos + 5));
+        }
+        emitByte(0xE8);
+        emitInt(0);
+    }
+
+    public final void call(Register src) {
+        int encode = prefixAndEncode(src.encoding);
+        emitByte(0xFF);
+        emitByte(0xD0 | encode);
+    }
+
+    public final void int3() {
+        emitByte(0xCC);
+    }
+
+    public final void pause() {
+        emitByte(0xF3);
+        emitByte(0x90);
+    }
+
+    private void emitx87(int b1, int b2, int i) {
+        assert 0 <= i && i < 8 : "illegal stack offset";
+        emitByte(b1);
+        emitByte(b2 + i);
+    }
+
+    public final void fldd(AMD64Address src) {
+        emitByte(0xDD);
+        emitOperandHelper(0, src, 0);
+    }
+
+    public final void flds(AMD64Address src) {
+        emitByte(0xD9);
+        emitOperandHelper(0, src, 0);
+    }
+
+    public final void fldln2() {
+        emitByte(0xD9);
+        emitByte(0xED);
+    }
+
+    public final void fldlg2() {
+        emitByte(0xD9);
+        emitByte(0xEC);
+    }
+
+    public final void fyl2x() {
+        emitByte(0xD9);
+        emitByte(0xF1);
+    }
+
+    public final void fstps(AMD64Address src) {
+        emitByte(0xD9);
+        emitOperandHelper(3, src, 0);
+    }
+
+    public final void fstpd(AMD64Address src) {
+        emitByte(0xDD);
+        emitOperandHelper(3, src, 0);
+    }
+
+    private void emitFPUArith(int b1, int b2, int i) {
+        assert 0 <= i && i < 8 : "illegal FPU register: " + i;
+        emitByte(b1);
+        emitByte(b2 + i);
+    }
+
+    public void ffree(int i) {
+        emitFPUArith(0xDD, 0xC0, i);
+    }
+
+    public void fincstp() {
+        emitByte(0xD9);
+        emitByte(0xF7);
+    }
+
+    public void fxch(int i) {
+        emitFPUArith(0xD9, 0xC8, i);
+    }
+
+    public void fnstswAX() {
+        emitByte(0xDF);
+        emitByte(0xE0);
+    }
+
+    public void fwait() {
+        emitByte(0x9B);
+    }
+
+    public void fprem() {
+        emitByte(0xD9);
+        emitByte(0xF8);
+    }
+
+    public final void fsin() {
+        emitByte(0xD9);
+        emitByte(0xFE);
+    }
+
+    public final void fcos() {
+        emitByte(0xD9);
+        emitByte(0xFF);
+    }
+
+    public final void fptan() {
+        emitByte(0xD9);
+        emitByte(0xF2);
+    }
+
+    public final void fstp(int i) {
+        emitx87(0xDD, 0xD8, i);
+    }
+
+    @Override
+    public AMD64Address makeAddress(Register base, int displacement) {
+        return new AMD64Address(base, displacement);
+    }
+
+    @Override
+    public AMD64Address getPlaceholder(int instructionStartPosition) {
+        return new AMD64Address(rip, Register.None, Scale.Times1, 0, instructionStartPosition);
+    }
+
+    private void prefetchPrefix(AMD64Address src) {
+        prefix(src);
+        emitByte(0x0F);
+    }
+
+    public void prefetchnta(AMD64Address src) {
+        prefetchPrefix(src);
+        emitByte(0x18);
+        emitOperandHelper(0, src, 0);
+    }
+
+    void prefetchr(AMD64Address src) {
+        assert supports(CPUFeature.AMD_3DNOW_PREFETCH);
+        prefetchPrefix(src);
+        emitByte(0x0D);
+        emitOperandHelper(0, src, 0);
+    }
+
+    public void prefetcht0(AMD64Address src) {
+        assert supports(CPUFeature.SSE);
+        prefetchPrefix(src);
+        emitByte(0x18);
+        emitOperandHelper(1, src, 0);
+    }
+
+    public void prefetcht1(AMD64Address src) {
+        assert supports(CPUFeature.SSE);
+        prefetchPrefix(src);
+        emitByte(0x18);
+        emitOperandHelper(2, src, 0);
+    }
+
+    public void prefetcht2(AMD64Address src) {
+        assert supports(CPUFeature.SSE);
+        prefix(src);
+        emitByte(0x0f);
+        emitByte(0x18);
+        emitOperandHelper(3, src, 0);
+    }
+
+    public void prefetchw(AMD64Address src) {
+        assert supports(CPUFeature.AMD_3DNOW_PREFETCH);
+        prefix(src);
+        emitByte(0x0f);
+        emitByte(0x0D);
+        emitOperandHelper(1, src, 0);
+    }
+
+    public void rdtsc() {
+        emitByte(0x0F);
+        emitByte(0x31);
+    }
+
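+    // Illustrative note, not part of the original change: the SSE prefetch variants above share
+    // opcode 0x0F 0x18 and differ only in the ModRM reg field passed to emitOperandHelper
+    // (0 = prefetchnta, 1 = prefetcht0, 2 = prefetcht1, 3 = prefetcht2), while prefetchr and
+    // prefetchw use the 3DNow! opcode 0x0F 0x0D with reg fields 0 and 1, respectively.
+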
+    /**
+     * Emits an instruction which is considered to be illegal. This is used if we deliberately
+     * want to crash the program (debugging etc.).
+     */
+    public void illegal() {
+        emitByte(0x0f);
+        emitByte(0x0b);
+    }
+}