< prev index next >

src/cpu/x86/vm/assembler_x86.hpp

Print this page

        

*** 434,444 **** AddressLiteral base() { return _base; } Address index() { return _index; } }; ! const int FPUStateSizeInWords = NOT_LP64(27) LP64_ONLY( 512 / wordSize); // The Intel x86/Amd64 Assembler: Pure assembler doing NO optimizations on the instruction // level (e.g. mov rax, 0 is not translated into xor rax, rax!); i.e., what you write // is what you get. The Assembler is generating code into a CodeBuffer. --- 434,444 ---- AddressLiteral base() { return _base; } Address index() { return _index; } }; ! const int FPUStateSizeInWords = NOT_LP64(27) LP64_ONLY( 512*2 / wordSize); // The Intel x86/Amd64 Assembler: Pure assembler doing NO optimizations on the instruction // level (e.g. mov rax, 0 is not translated into xor rax, rax!); i.e., what you write // is what you get. The Assembler is generating code into a CodeBuffer.
*** 499,518 **** REX_WRB = 0x4D, REX_WRX = 0x4E, REX_WRXB = 0x4F, VEX_3bytes = 0xC4, ! VEX_2bytes = 0xC5 }; enum VexPrefix { VEX_B = 0x20, VEX_X = 0x40, VEX_R = 0x80, VEX_W = 0x80 }; enum VexSimdPrefix { VEX_SIMD_NONE = 0x0, VEX_SIMD_66 = 0x1, VEX_SIMD_F3 = 0x2, VEX_SIMD_F2 = 0x3 --- 499,527 ---- REX_WRB = 0x4D, REX_WRX = 0x4E, REX_WRXB = 0x4F, VEX_3bytes = 0xC4, ! VEX_2bytes = 0xC5, ! EVEX_4bytes = 0x62 }; enum VexPrefix { VEX_B = 0x20, VEX_X = 0x40, VEX_R = 0x80, VEX_W = 0x80 }; + enum ExexPrefix { + EVEX_F = 0x04, + EVEX_V = 0x08, + EVEX_Rb = 0x10, + EVEX_X = 0x40, + EVEX_Z = 0x80 + }; + enum VexSimdPrefix { VEX_SIMD_NONE = 0x0, VEX_SIMD_66 = 0x1, VEX_SIMD_F3 = 0x2, VEX_SIMD_F2 = 0x3
*** 523,532 **** --- 532,572 ---- VEX_OPCODE_0F = 0x1, VEX_OPCODE_0F_38 = 0x2, VEX_OPCODE_0F_3A = 0x3 }; + enum AvxVectorLen { + AVX_128bit = 0x0, + AVX_256bit = 0x1, + AVX_512bit = 0x2, + AVX_NoVec = 0x4 + }; + + enum EvexTupleType { + EVEX_FV = 0, + EVEX_HV = 4, + EVEX_FVM = 6, + EVEX_T1S = 7, + EVEX_T1F = 11, + EVEX_T2 = 13, + EVEX_T4 = 15, + EVEX_T8 = 17, + EVEX_HVM = 18, + EVEX_QVM = 19, + EVEX_OVM = 20, + EVEX_M128 = 21, + EVEX_DUP = 22, + EVEX_ETUP = 23 + }; + + enum EvexInputSizeInBits { + EVEX_8bit = 0, + EVEX_16bit = 1, + EVEX_32bit = 2, + EVEX_64bit = 3 + }; + enum WhichOperand { // input to locate_operand, and format code for relocations imm_operand = 0, // embedded 32-bit|64-bit immediate operand disp32_operand = 1, // embedded 32-bit displacement or address call32_operand = 2, // embedded 32-bit self-relative displacement
*** 550,559 **** --- 590,604 ---- // This does mean you'll get a linker/runtime error if you use a 64bit only instruction // in a 32bit vm. This is somewhat unfortunate but keeps the ifdef noise down. private: + int evex_encoding; + int input_size_in_bits; + int avx_vector_len; + int tuple_type; + bool is_evex_instruction; // 64bit prefixes int prefix_and_encode(int reg_enc, bool byteinst = false); int prefixq_and_encode(int reg_enc);
*** 576,705 **** int rex_prefix_and_encode(int dst_enc, int src_enc, VexSimdPrefix pre, VexOpcode opc, bool rex_w); void vex_prefix(bool vex_r, bool vex_b, bool vex_x, bool vex_w, int nds_enc, VexSimdPrefix pre, VexOpcode opc, ! bool vector256); void vex_prefix(Address adr, int nds_enc, int xreg_enc, VexSimdPrefix pre, VexOpcode opc, ! bool vex_w, bool vector256); void vex_prefix(XMMRegister dst, XMMRegister nds, Address src, ! VexSimdPrefix pre, bool vector256 = false) { int dst_enc = dst->encoding(); int nds_enc = nds->is_valid() ? nds->encoding() : 0; ! vex_prefix(src, nds_enc, dst_enc, pre, VEX_OPCODE_0F, false, vector256); } ! void vex_prefix_0F38(Register dst, Register nds, Address src) { bool vex_w = false; ! bool vector256 = false; vex_prefix(src, nds->encoding(), dst->encoding(), ! VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, vector256); } ! void vex_prefix_0F38_q(Register dst, Register nds, Address src) { bool vex_w = true; ! bool vector256 = false; vex_prefix(src, nds->encoding(), dst->encoding(), ! VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, vector256); } int vex_prefix_and_encode(int dst_enc, int nds_enc, int src_enc, VexSimdPrefix pre, VexOpcode opc, ! bool vex_w, bool vector256); ! int vex_prefix_0F38_and_encode(Register dst, Register nds, Register src) { bool vex_w = false; ! bool vector256 = false; return vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), ! VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, vector256); } ! int vex_prefix_0F38_and_encode_q(Register dst, Register nds, Register src) { bool vex_w = true; ! bool vector256 = false; return vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), ! VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, vector256); } int vex_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegister src, ! VexSimdPrefix pre, bool vector256 = false, ! VexOpcode opc = VEX_OPCODE_0F) { int src_enc = src->encoding(); int dst_enc = dst->encoding(); int nds_enc = nds->is_valid() ? nds->encoding() : 0; ! 
return vex_prefix_and_encode(dst_enc, nds_enc, src_enc, pre, opc, false, vector256); } void simd_prefix(XMMRegister xreg, XMMRegister nds, Address adr, ! VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F, ! bool rex_w = false, bool vector256 = false); ! void simd_prefix(XMMRegister dst, Address src, ! VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F) { ! simd_prefix(dst, xnoreg, src, pre, opc); } ! void simd_prefix(Address dst, XMMRegister src, VexSimdPrefix pre) { ! simd_prefix(src, dst, pre); } void simd_prefix_q(XMMRegister dst, XMMRegister nds, Address src, ! VexSimdPrefix pre) { bool rex_w = true; ! simd_prefix(dst, nds, src, pre, VEX_OPCODE_0F, rex_w); } int simd_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegister src, ! VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F, ! bool rex_w = false, bool vector256 = false); // Move/convert 32-bit integer value. int simd_prefix_and_encode(XMMRegister dst, XMMRegister nds, Register src, ! VexSimdPrefix pre) { // It is OK to cast from Register to XMMRegister to pass argument here // since only encoding is used in simd_prefix_and_encode() and number of // Gen and Xmm registers are the same. ! return simd_prefix_and_encode(dst, nds, as_XMMRegister(src->encoding()), pre); } ! int simd_prefix_and_encode(XMMRegister dst, Register src, VexSimdPrefix pre) { ! return simd_prefix_and_encode(dst, xnoreg, src, pre); } int simd_prefix_and_encode(Register dst, XMMRegister src, ! VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F) { ! return simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, pre, opc); } // Move/convert 64-bit integer value. int simd_prefix_and_encode_q(XMMRegister dst, XMMRegister nds, Register src, ! VexSimdPrefix pre) { bool rex_w = true; ! return simd_prefix_and_encode(dst, nds, as_XMMRegister(src->encoding()), pre, VEX_OPCODE_0F, rex_w); } ! int simd_prefix_and_encode_q(XMMRegister dst, Register src, VexSimdPrefix pre) { ! 
return simd_prefix_and_encode_q(dst, xnoreg, src, pre); } int simd_prefix_and_encode_q(Register dst, XMMRegister src, ! VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F) { bool rex_w = true; ! return simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, pre, opc, rex_w); } // Helper functions for groups of instructions void emit_arith_b(int op1, int op2, Register dst, int imm8); void emit_arith(int op1, int op2, Register dst, int32_t imm32); // Force generation of a 4 byte immediate value even if it fits into 8bit void emit_arith_imm32(int op1, int op2, Register dst, int32_t imm32); void emit_arith(int op1, int op2, Register dst, Register src); ! void emit_simd_arith(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre); ! void emit_simd_arith(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre); ! void emit_simd_arith_nonds(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre); ! void emit_simd_arith_nonds(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre); void emit_vex_arith(int opcode, XMMRegister dst, XMMRegister nds, ! Address src, VexSimdPrefix pre, bool vector256); void emit_vex_arith(int opcode, XMMRegister dst, XMMRegister nds, ! XMMRegister src, VexSimdPrefix pre, bool vector256); void emit_operand(Register reg, Register base, Register index, Address::ScaleFactor scale, int disp, RelocationHolder const& rspec, --- 621,799 ---- int rex_prefix_and_encode(int dst_enc, int src_enc, VexSimdPrefix pre, VexOpcode opc, bool rex_w); void vex_prefix(bool vex_r, bool vex_b, bool vex_x, bool vex_w, int nds_enc, VexSimdPrefix pre, VexOpcode opc, ! int vector_len); ! ! void evex_prefix(bool vex_r, bool vex_b, bool vex_x, bool vex_w, bool evex_r, bool evex_v, ! int nds_enc, VexSimdPrefix pre, VexOpcode opc, ! bool is_extended_context, bool is_merge_context, ! int vector_len, bool no_mask_reg ); void vex_prefix(Address adr, int nds_enc, int xreg_enc, VexSimdPrefix pre, VexOpcode opc, ! bool vex_w, int vector_len, ! 
bool legacy_mode = false, bool no_mask_reg = false); void vex_prefix(XMMRegister dst, XMMRegister nds, Address src, ! VexSimdPrefix pre, int vector_len = AVX_128bit, ! bool no_mask_reg = false, bool legacy_mode = false) { ! int dst_enc = dst->encoding(); ! int nds_enc = nds->is_valid() ? nds->encoding() : 0; ! vex_prefix(src, nds_enc, dst_enc, pre, VEX_OPCODE_0F, false, vector_len, legacy_mode, no_mask_reg); ! } ! ! void vex_prefix_q(XMMRegister dst, XMMRegister nds, Address src, ! VexSimdPrefix pre, int vector_len = AVX_128bit, ! bool no_mask_reg = false) { int dst_enc = dst->encoding(); int nds_enc = nds->is_valid() ? nds->encoding() : 0; ! vex_prefix(src, nds_enc, dst_enc, pre, VEX_OPCODE_0F, true, vector_len, false, no_mask_reg); } ! void vex_prefix_0F38(Register dst, Register nds, Address src, bool no_mask_reg = false) { bool vex_w = false; ! int vector_len = AVX_128bit; vex_prefix(src, nds->encoding(), dst->encoding(), ! VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, ! vector_len, no_mask_reg); } ! void vex_prefix_0F38_q(Register dst, Register nds, Address src, bool no_mask_reg = false) { bool vex_w = true; ! int vector_len = AVX_128bit; vex_prefix(src, nds->encoding(), dst->encoding(), ! VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, ! vector_len, no_mask_reg); } int vex_prefix_and_encode(int dst_enc, int nds_enc, int src_enc, VexSimdPrefix pre, VexOpcode opc, ! bool vex_w, int vector_len, ! bool legacy_mode, bool no_mask_reg); ! int vex_prefix_0F38_and_encode(Register dst, Register nds, Register src, bool no_mask_reg = false) { bool vex_w = false; ! int vector_len = AVX_128bit; return vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), ! VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, vector_len, ! false, no_mask_reg); } ! int vex_prefix_0F38_and_encode_q(Register dst, Register nds, Register src, bool no_mask_reg = false) { bool vex_w = true; ! int vector_len = AVX_128bit; return vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), ! 
VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, vector_len, ! false, no_mask_reg); } int vex_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegister src, ! VexSimdPrefix pre, int vector_len = AVX_128bit, ! VexOpcode opc = VEX_OPCODE_0F, bool legacy_mode = false, ! bool no_mask_reg = false) { int src_enc = src->encoding(); int dst_enc = dst->encoding(); int nds_enc = nds->is_valid() ? nds->encoding() : 0; ! return vex_prefix_and_encode(dst_enc, nds_enc, src_enc, pre, opc, false, vector_len, legacy_mode, no_mask_reg); } void simd_prefix(XMMRegister xreg, XMMRegister nds, Address adr, ! VexSimdPrefix pre, bool no_mask_reg, VexOpcode opc = VEX_OPCODE_0F, ! bool rex_w = false, int vector_len = AVX_128bit, bool legacy_mode = false); ! void simd_prefix(XMMRegister dst, Address src, VexSimdPrefix pre, ! bool no_mask_reg, VexOpcode opc = VEX_OPCODE_0F) { ! simd_prefix(dst, xnoreg, src, pre, no_mask_reg, opc); } ! void simd_prefix(Address dst, XMMRegister src, VexSimdPrefix pre, bool no_mask_reg) { ! simd_prefix(src, dst, pre, no_mask_reg); } void simd_prefix_q(XMMRegister dst, XMMRegister nds, Address src, ! VexSimdPrefix pre, bool no_mask_reg = false) { bool rex_w = true; ! simd_prefix(dst, nds, src, pre, no_mask_reg, VEX_OPCODE_0F, rex_w); } int simd_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegister src, ! VexSimdPrefix pre, bool no_mask_reg, ! VexOpcode opc = VEX_OPCODE_0F, ! bool rex_w = false, int vector_len = AVX_128bit, ! bool legacy_mode = false); ! ! int kreg_prefix_and_encode(KRegister dst, KRegister nds, KRegister src, ! VexSimdPrefix pre, bool no_mask_reg, ! VexOpcode opc = VEX_OPCODE_0F, ! bool rex_w = false, int vector_len = AVX_128bit); ! ! int kreg_prefix_and_encode(KRegister dst, KRegister nds, Register src, ! VexSimdPrefix pre, bool no_mask_reg, ! VexOpcode opc = VEX_OPCODE_0F, ! bool rex_w = false, int vector_len = AVX_128bit); // Move/convert 32-bit integer value. int simd_prefix_and_encode(XMMRegister dst, XMMRegister nds, Register src, ! 
VexSimdPrefix pre, bool no_mask_reg) { // It is OK to cast from Register to XMMRegister to pass argument here // since only encoding is used in simd_prefix_and_encode() and number of // Gen and Xmm registers are the same. ! return simd_prefix_and_encode(dst, nds, as_XMMRegister(src->encoding()), pre, no_mask_reg, VEX_OPCODE_0F); } ! int simd_prefix_and_encode(XMMRegister dst, Register src, VexSimdPrefix pre, bool no_mask_reg) { ! return simd_prefix_and_encode(dst, xnoreg, src, pre, no_mask_reg); } int simd_prefix_and_encode(Register dst, XMMRegister src, ! VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F, ! bool no_mask_reg = false) { ! return simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, pre, no_mask_reg, opc); } // Move/convert 64-bit integer value. int simd_prefix_and_encode_q(XMMRegister dst, XMMRegister nds, Register src, ! VexSimdPrefix pre, bool no_mask_reg = false) { bool rex_w = true; ! return simd_prefix_and_encode(dst, nds, as_XMMRegister(src->encoding()), pre, no_mask_reg, VEX_OPCODE_0F, rex_w); } ! int simd_prefix_and_encode_q(XMMRegister dst, Register src, VexSimdPrefix pre, bool no_mask_reg) { ! return simd_prefix_and_encode_q(dst, xnoreg, src, pre, no_mask_reg); } int simd_prefix_and_encode_q(Register dst, XMMRegister src, ! VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F, ! bool no_mask_reg = false) { bool rex_w = true; ! return simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, pre, no_mask_reg, opc, rex_w); } // Helper functions for groups of instructions void emit_arith_b(int op1, int op2, Register dst, int imm8); void emit_arith(int op1, int op2, Register dst, int32_t imm32); // Force generation of a 4 byte immediate value even if it fits into 8bit void emit_arith_imm32(int op1, int op2, Register dst, int32_t imm32); void emit_arith(int op1, int op2, Register dst, Register src); ! 
void emit_simd_arith(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre, bool no_mask_reg = false, bool legacy_mode = false); ! void emit_simd_arith_q(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre, bool no_mask_reg = false); ! void emit_simd_arith(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre, bool no_mask_reg = false, bool legacy_mode = false); ! void emit_simd_arith_q(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre, bool no_mask_reg = false); ! void emit_simd_arith_nonds(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre, bool no_mask_reg = false); ! void emit_simd_arith_nonds_q(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre, bool no_mask_reg = false); ! void emit_simd_arith_nonds(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre, bool no_mask_reg = false, bool legacy_mode = false); ! void emit_simd_arith_nonds_q(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre, bool no_mask_reg = false); void emit_vex_arith(int opcode, XMMRegister dst, XMMRegister nds, ! Address src, VexSimdPrefix pre, int vector_len, ! bool no_mask_reg = false, bool legacy_mode = false); ! void emit_vex_arith_q(int opcode, XMMRegister dst, XMMRegister nds, ! Address src, VexSimdPrefix pre, int vector_len, ! bool no_mask_reg = false); void emit_vex_arith(int opcode, XMMRegister dst, XMMRegister nds, ! XMMRegister src, VexSimdPrefix pre, int vector_len, ! bool no_mask_reg = false, bool legacy_mode = false); ! void emit_vex_arith_q(int opcode, XMMRegister dst, XMMRegister nds, ! XMMRegister src, VexSimdPrefix pre, int vector_len, ! bool no_mask_reg = false); ! ! bool emit_compressed_disp_byte(int &disp); void emit_operand(Register reg, Register base, Register index, Address::ScaleFactor scale, int disp, RelocationHolder const& rspec,
*** 821,843 **** void prefix(Prefix p); public: // Creation ! Assembler(CodeBuffer* code) : AbstractAssembler(code) {} // Decoding static address locate_operand(address inst, WhichOperand which); static address locate_next_instruction(address inst); // Utilities static bool is_polling_page_far() NOT_LP64({ return false;}); // Generic instructions // Does 32bit or 64bit as needed for the platform. In some sense these // belong in macro assembler but there is no need for both varieties to exist void lea(Register dst, Address src); void mov(Register dst, Register src); void pusha(); --- 915,949 ---- void prefix(Prefix p); public: // Creation ! Assembler(CodeBuffer* code) : AbstractAssembler(code) { ! init_attributes(); ! } // Decoding static address locate_operand(address inst, WhichOperand which); static address locate_next_instruction(address inst); // Utilities static bool is_polling_page_far() NOT_LP64({ return false;}); + static bool query_compressed_disp_byte(int disp, bool is_evex_inst, int vector_len, + int cur_tuple_type, int in_size_in_bits, int cur_encoding); // Generic instructions // Does 32bit or 64bit as needed for the platform. In some sense these // belong in macro assembler but there is no need for both varieties to exist + void init_attributes(void) { + evex_encoding = 0; + input_size_in_bits = 0; + avx_vector_len = AVX_NoVec; + tuple_type = EVEX_ETUP; + is_evex_instruction = false; + } + void lea(Register dst, Address src); void mov(Register dst, Register src); void pusha();
*** 1334,1343 **** --- 1440,1455 ---- void movb(Address dst, Register src); void movb(Address dst, int imm8); void movb(Register dst, Address src); + void kmovq(KRegister dst, KRegister src); + void kmovql(KRegister dst, Register src); + void kmovdl(KRegister dst, Register src); + void kmovq(Address dst, KRegister src); + void kmovq(KRegister dst, Address src); + void movdl(XMMRegister dst, Register src); void movdl(Register dst, XMMRegister src); void movdl(XMMRegister dst, Address src); void movdl(Address dst, XMMRegister src);
*** 1357,1366 **** --- 1469,1483 ---- // Move Unaligned 256bit Vector void vmovdqu(Address dst, XMMRegister src); void vmovdqu(XMMRegister dst, Address src); void vmovdqu(XMMRegister dst, XMMRegister src); + // Move Unaligned 512bit Vector + void evmovdqu(Address dst, XMMRegister src, int vector_len); + void evmovdqu(XMMRegister dst, Address src, int vector_len); + void evmovdqu(XMMRegister dst, XMMRegister src, int vector_len); + // Move lower 64bit to high 64bit in 128bit register void movlhps(XMMRegister dst, XMMRegister src); void movl(Register dst, int32_t imm32); void movl(Address dst, int32_t imm32);
*** 1482,1495 **** void orq(Register dst, Register src); // Pack with unsigned saturation void packuswb(XMMRegister dst, XMMRegister src); void packuswb(XMMRegister dst, Address src); ! void vpackuswb(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); // Permutation of 64bit words ! void vpermq(XMMRegister dst, XMMRegister src, int imm8, bool vector256); void pause(); // SSE4.2 string instructions void pcmpestri(XMMRegister xmm1, XMMRegister xmm2, int imm8); --- 1599,1612 ---- void orq(Register dst, Register src); // Pack with unsigned saturation void packuswb(XMMRegister dst, XMMRegister src); void packuswb(XMMRegister dst, Address src); ! void vpackuswb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); // Permutation of 64bit words ! void vpermq(XMMRegister dst, XMMRegister src, int imm8, int vector_len); void pause(); // SSE4.2 string instructions void pcmpestri(XMMRegister xmm1, XMMRegister xmm2, int imm8);
*** 1730,1893 **** //====================VECTOR ARITHMETIC===================================== // Add Packed Floating-Point Values void addpd(XMMRegister dst, XMMRegister src); void addps(XMMRegister dst, XMMRegister src); ! void vaddpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); ! void vaddps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); ! void vaddpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256); ! void vaddps(XMMRegister dst, XMMRegister nds, Address src, bool vector256); // Subtract Packed Floating-Point Values void subpd(XMMRegister dst, XMMRegister src); void subps(XMMRegister dst, XMMRegister src); ! void vsubpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); ! void vsubps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); ! void vsubpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256); ! void vsubps(XMMRegister dst, XMMRegister nds, Address src, bool vector256); // Multiply Packed Floating-Point Values void mulpd(XMMRegister dst, XMMRegister src); void mulps(XMMRegister dst, XMMRegister src); ! void vmulpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); ! void vmulps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); ! void vmulpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256); ! void vmulps(XMMRegister dst, XMMRegister nds, Address src, bool vector256); // Divide Packed Floating-Point Values void divpd(XMMRegister dst, XMMRegister src); void divps(XMMRegister dst, XMMRegister src); ! void vdivpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); ! void vdivps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); ! void vdivpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256); ! 
void vdivps(XMMRegister dst, XMMRegister nds, Address src, bool vector256); // Bitwise Logical AND of Packed Floating-Point Values void andpd(XMMRegister dst, XMMRegister src); void andps(XMMRegister dst, XMMRegister src); ! void vandpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); ! void vandps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); ! void vandpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256); ! void vandps(XMMRegister dst, XMMRegister nds, Address src, bool vector256); // Bitwise Logical XOR of Packed Floating-Point Values void xorpd(XMMRegister dst, XMMRegister src); void xorps(XMMRegister dst, XMMRegister src); ! void vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); ! void vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); ! void vxorpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256); ! void vxorps(XMMRegister dst, XMMRegister nds, Address src, bool vector256); // Add horizontal packed integers ! void vphaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); ! void vphaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); void phaddw(XMMRegister dst, XMMRegister src); void phaddd(XMMRegister dst, XMMRegister src); // Add packed integers void paddb(XMMRegister dst, XMMRegister src); void paddw(XMMRegister dst, XMMRegister src); void paddd(XMMRegister dst, XMMRegister src); void paddq(XMMRegister dst, XMMRegister src); ! void vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); ! void vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); ! void vpaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); ! void vpaddq(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); ! void vpaddb(XMMRegister dst, XMMRegister nds, Address src, bool vector256); ! void vpaddw(XMMRegister dst, XMMRegister nds, Address src, bool vector256); ! 
void vpaddd(XMMRegister dst, XMMRegister nds, Address src, bool vector256); ! void vpaddq(XMMRegister dst, XMMRegister nds, Address src, bool vector256); // Sub packed integers void psubb(XMMRegister dst, XMMRegister src); void psubw(XMMRegister dst, XMMRegister src); void psubd(XMMRegister dst, XMMRegister src); void psubq(XMMRegister dst, XMMRegister src); ! void vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); ! void vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); ! void vpsubd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); ! void vpsubq(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); ! void vpsubb(XMMRegister dst, XMMRegister nds, Address src, bool vector256); ! void vpsubw(XMMRegister dst, XMMRegister nds, Address src, bool vector256); ! void vpsubd(XMMRegister dst, XMMRegister nds, Address src, bool vector256); ! void vpsubq(XMMRegister dst, XMMRegister nds, Address src, bool vector256); // Multiply packed integers (only shorts and ints) void pmullw(XMMRegister dst, XMMRegister src); void pmulld(XMMRegister dst, XMMRegister src); ! void vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); ! void vpmulld(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); ! void vpmullw(XMMRegister dst, XMMRegister nds, Address src, bool vector256); ! void vpmulld(XMMRegister dst, XMMRegister nds, Address src, bool vector256); // Shift left packed integers void psllw(XMMRegister dst, int shift); void pslld(XMMRegister dst, int shift); void psllq(XMMRegister dst, int shift); void psllw(XMMRegister dst, XMMRegister shift); void pslld(XMMRegister dst, XMMRegister shift); void psllq(XMMRegister dst, XMMRegister shift); ! void vpsllw(XMMRegister dst, XMMRegister src, int shift, bool vector256); ! void vpslld(XMMRegister dst, XMMRegister src, int shift, bool vector256); ! void vpsllq(XMMRegister dst, XMMRegister src, int shift, bool vector256); ! 
void vpsllw(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256); ! void vpslld(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256); ! void vpsllq(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256); // Logical shift right packed integers void psrlw(XMMRegister dst, int shift); void psrld(XMMRegister dst, int shift); void psrlq(XMMRegister dst, int shift); void psrlw(XMMRegister dst, XMMRegister shift); void psrld(XMMRegister dst, XMMRegister shift); void psrlq(XMMRegister dst, XMMRegister shift); ! void vpsrlw(XMMRegister dst, XMMRegister src, int shift, bool vector256); ! void vpsrld(XMMRegister dst, XMMRegister src, int shift, bool vector256); ! void vpsrlq(XMMRegister dst, XMMRegister src, int shift, bool vector256); ! void vpsrlw(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256); ! void vpsrld(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256); ! void vpsrlq(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256); // Arithmetic shift right packed integers (only shorts and ints, no instructions for longs) void psraw(XMMRegister dst, int shift); void psrad(XMMRegister dst, int shift); void psraw(XMMRegister dst, XMMRegister shift); void psrad(XMMRegister dst, XMMRegister shift); ! void vpsraw(XMMRegister dst, XMMRegister src, int shift, bool vector256); ! void vpsrad(XMMRegister dst, XMMRegister src, int shift, bool vector256); ! void vpsraw(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256); ! void vpsrad(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256); // And packed integers void pand(XMMRegister dst, XMMRegister src); ! void vpand(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); ! void vpand(XMMRegister dst, XMMRegister nds, Address src, bool vector256); // Or packed integers void por(XMMRegister dst, XMMRegister src); ! void vpor(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); ! 
void vpor(XMMRegister dst, XMMRegister nds, Address src, bool vector256); // Xor packed integers void pxor(XMMRegister dst, XMMRegister src); ! void vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); ! void vpxor(XMMRegister dst, XMMRegister nds, Address src, bool vector256); // Copy low 128bit into high 128bit of YMM registers. void vinsertf128h(XMMRegister dst, XMMRegister nds, XMMRegister src); void vinserti128h(XMMRegister dst, XMMRegister nds, XMMRegister src); void vextractf128h(XMMRegister dst, XMMRegister src); // Load/store high 128bit of YMM registers which does not destroy other half. void vinsertf128h(XMMRegister dst, Address src); void vinserti128h(XMMRegister dst, Address src); void vextractf128h(Address dst, XMMRegister src); void vextracti128h(Address dst, XMMRegister src); // duplicate 4-bytes integer data from src into 8 locations in dest void vpbroadcastd(XMMRegister dst, XMMRegister src); // Carry-Less Multiplication Quadword void pclmulqdq(XMMRegister dst, XMMRegister src, int mask); void vpclmulqdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int mask); // AVX instruction which is used to clear upper 128 bits of YMM registers and --- 1847,2029 ---- //====================VECTOR ARITHMETIC===================================== // Add Packed Floating-Point Values void addpd(XMMRegister dst, XMMRegister src); void addps(XMMRegister dst, XMMRegister src); ! void vaddpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); ! void vaddps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); ! void vaddpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len); ! void vaddps(XMMRegister dst, XMMRegister nds, Address src, int vector_len); // Subtract Packed Floating-Point Values void subpd(XMMRegister dst, XMMRegister src); void subps(XMMRegister dst, XMMRegister src); ! void vsubpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); ! 
void vsubps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); ! void vsubpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len); ! void vsubps(XMMRegister dst, XMMRegister nds, Address src, int vector_len); // Multiply Packed Floating-Point Values void mulpd(XMMRegister dst, XMMRegister src); void mulps(XMMRegister dst, XMMRegister src); ! void vmulpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); ! void vmulps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); ! void vmulpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len); ! void vmulps(XMMRegister dst, XMMRegister nds, Address src, int vector_len); // Divide Packed Floating-Point Values void divpd(XMMRegister dst, XMMRegister src); void divps(XMMRegister dst, XMMRegister src); ! void vdivpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); ! void vdivps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); ! void vdivpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len); ! void vdivps(XMMRegister dst, XMMRegister nds, Address src, int vector_len); // Bitwise Logical AND of Packed Floating-Point Values void andpd(XMMRegister dst, XMMRegister src); void andps(XMMRegister dst, XMMRegister src); ! void vandpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); ! void vandps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); ! void vandpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len); ! void vandps(XMMRegister dst, XMMRegister nds, Address src, int vector_len); // Bitwise Logical XOR of Packed Floating-Point Values void xorpd(XMMRegister dst, XMMRegister src); void xorps(XMMRegister dst, XMMRegister src); ! void vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); ! void vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); ! void vxorpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len); ! 
void vxorps(XMMRegister dst, XMMRegister nds, Address src, int vector_len); // Add horizontal packed integers ! void vphaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); ! void vphaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); void phaddw(XMMRegister dst, XMMRegister src); void phaddd(XMMRegister dst, XMMRegister src); // Add packed integers void paddb(XMMRegister dst, XMMRegister src); void paddw(XMMRegister dst, XMMRegister src); void paddd(XMMRegister dst, XMMRegister src); void paddq(XMMRegister dst, XMMRegister src); ! void vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); ! void vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); ! void vpaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); ! void vpaddq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); ! void vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len); ! void vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len); ! void vpaddd(XMMRegister dst, XMMRegister nds, Address src, int vector_len); ! void vpaddq(XMMRegister dst, XMMRegister nds, Address src, int vector_len); // Sub packed integers void psubb(XMMRegister dst, XMMRegister src); void psubw(XMMRegister dst, XMMRegister src); void psubd(XMMRegister dst, XMMRegister src); void psubq(XMMRegister dst, XMMRegister src); ! void vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); ! void vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); ! void vpsubd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); ! void vpsubq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); ! void vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len); ! void vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len); ! void vpsubd(XMMRegister dst, XMMRegister nds, Address src, int vector_len); ! 
// Memory-source form of vpsubq (completes the packed-integer subtract group above).
void vpsubq(XMMRegister dst, XMMRegister nds, Address src, int vector_len);

// Multiply packed integers (only shorts and ints)
// vpmullq is declared as well (qword multiply); vector_len as in AvxVectorLen.
void pmullw(XMMRegister dst, XMMRegister src);
void pmulld(XMMRegister dst, XMMRegister src);
void vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void vpmulld(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void vpmullq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
void vpmulld(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
void vpmullq(XMMRegister dst, XMMRegister nds, Address src, int vector_len);

// Shift left packed integers
// Two shift-count forms: an immediate (int shift) or a count held in an XMM register.
void psllw(XMMRegister dst, int shift);
void pslld(XMMRegister dst, int shift);
void psllq(XMMRegister dst, int shift);
void psllw(XMMRegister dst, XMMRegister shift);
void pslld(XMMRegister dst, XMMRegister shift);
void psllq(XMMRegister dst, XMMRegister shift);
void vpsllw(XMMRegister dst, XMMRegister src, int shift, int vector_len);
void vpslld(XMMRegister dst, XMMRegister src, int shift, int vector_len);
void vpsllq(XMMRegister dst, XMMRegister src, int shift, int vector_len);
void vpsllw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
void vpslld(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
void vpsllq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);

// Logical shift right packed integers
void psrlw(XMMRegister dst, int shift);
void psrld(XMMRegister dst, int shift);
void psrlq(XMMRegister dst, int shift);
void psrlw(XMMRegister dst, XMMRegister shift);
void psrld(XMMRegister dst, XMMRegister shift);
void psrlq(XMMRegister dst, XMMRegister shift);
void vpsrlw(XMMRegister dst, XMMRegister src, int shift, int vector_len);
void vpsrld(XMMRegister dst, XMMRegister src, int shift, int vector_len);
// Remaining logical shift-right forms (complete the group started above).
void vpsrlq(XMMRegister dst, XMMRegister src, int shift, int vector_len);
void vpsrlw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
void vpsrld(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
void vpsrlq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);

// Arithmetic shift right packed integers (only shorts and ints, no instructions for longs)
void psraw(XMMRegister dst, int shift);
void psrad(XMMRegister dst, int shift);
void psraw(XMMRegister dst, XMMRegister shift);
void psrad(XMMRegister dst, XMMRegister shift);
void vpsraw(XMMRegister dst, XMMRegister src, int shift, int vector_len);
void vpsrad(XMMRegister dst, XMMRegister src, int shift, int vector_len);
void vpsraw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
void vpsrad(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);

// And packed integers
void pand(XMMRegister dst, XMMRegister src);
void vpand(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void vpand(XMMRegister dst, XMMRegister nds, Address src, int vector_len);

// Or packed integers
void por(XMMRegister dst, XMMRegister src);
void vpor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void vpor(XMMRegister dst, XMMRegister nds, Address src, int vector_len);

// Xor packed integers
void pxor(XMMRegister dst, XMMRegister src);
void vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void vpxor(XMMRegister dst, XMMRegister nds, Address src, int vector_len);

// Copy low 128bit into high 128bit of YMM registers.
// f/i variants cover the floating-point and integer instruction encodings.
void vinsertf128h(XMMRegister dst, XMMRegister nds, XMMRegister src);
void vinserti128h(XMMRegister dst, XMMRegister nds, XMMRegister src);
void vextractf128h(XMMRegister dst, XMMRegister src);
void vextracti128h(XMMRegister dst, XMMRegister src);

// Load/store high 128bit of YMM registers which does not destroy other half.
// Memory forms of the 128-bit insert/extract (load/store high half of a YMM register).
void vinsertf128h(XMMRegister dst, Address src);
void vinserti128h(XMMRegister dst, Address src);
void vextractf128h(Address dst, XMMRegister src);
void vextracti128h(Address dst, XMMRegister src);

// Copy low 256bit into high 256bit of ZMM registers.
void vinserti64x4h(XMMRegister dst, XMMRegister nds, XMMRegister src);
void vinsertf64x4h(XMMRegister dst, XMMRegister nds, XMMRegister src);
void vextracti64x4h(XMMRegister dst, XMMRegister src);
void vextractf64x4h(XMMRegister dst, XMMRegister src);
void vextractf64x4h(Address dst, XMMRegister src);
void vinsertf64x4h(XMMRegister dst, Address src);

// Copy targeted 128bit segments of the ZMM registers
// value selects which 128-bit lane is extracted.
void vextracti64x2h(XMMRegister dst, XMMRegister src, int value);
void vextractf64x2h(XMMRegister dst, XMMRegister src, int value);
void vextractf32x4h(XMMRegister dst, XMMRegister src, int value);

// duplicate 4-bytes integer data from src into 8 locations in dest
void vpbroadcastd(XMMRegister dst, XMMRegister src);

// duplicate 4-bytes integer data from src into vector_len locations in dest
void evpbroadcastd(XMMRegister dst, XMMRegister src, int vector_len);

// Carry-Less Multiplication Quadword
// mask is the immediate selecting which qword halves of each operand are multiplied.
void pclmulqdq(XMMRegister dst, XMMRegister src, int mask);
void vpclmulqdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int mask);

// AVX instruction which is used to clear upper 128 bits of YMM registers and
< prev index next >