--- old/src/cpu/x86/vm/assembler_x86.cpp 2015-04-23 08:25:11.149343300 -0700 +++ new/src/cpu/x86/vm/assembler_x86.cpp 2015-04-23 08:25:10.933343300 -0700 @@ -54,6 +54,36 @@ #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") // Implementation of AddressLiteral +// A 2-D table for managing compressed displacement(disp8) on EVEX enabled platforms. +unsigned char tuple_table[Assembler::EVEX_ETUP + 1][Assembler::AVX_512bit + 1] = { + // -----------------Table 4.5 -------------------- // + 16, 32, 64, // EVEX_FV(0) + 4, 4, 4, // EVEX_FV(1) - with Evex.b + 16, 32, 64, // EVEX_FV(2) - with Evex.w + 8, 8, 8, // EVEX_FV(3) - with Evex.w and Evex.b + 8, 16, 32, // EVEX_HV(0) + 4, 4, 4, // EVEX_HV(1) - with Evex.b + // -----------------Table 4.6 -------------------- // + 16, 32, 64, // EVEX_FVM(0) + 1, 1, 1, // EVEX_T1S(0) + 2, 2, 2, // EVEX_T1S(1) + 4, 4, 4, // EVEX_T1S(2) + 8, 8, 8, // EVEX_T1S(3) + 4, 4, 4, // EVEX_T1F(0) + 8, 8, 8, // EVEX_T1F(1) + 8, 8, 8, // EVEX_T2(0) + 0, 16, 16, // EVEX_T2(1) + 0, 16, 16, // EVEX_T4(0) + 0, 0, 32, // EVEX_T4(1) + 0, 0, 32, // EVEX_T8(0) + 8, 16, 32, // EVEX_HVM(0) + 4, 8, 16, // EVEX_QVM(0) + 2, 4, 8, // EVEX_OVM(0) + 16, 16, 16, // EVEX_M128(0) + 8, 32, 64, // EVEX_DUP(0) + 0, 0, 0 // EVEX_NTUP +}; + AddressLiteral::AddressLiteral(address target, relocInfo::relocType rtype) { _is_lval = false; _target = target; @@ -183,8 +213,9 @@ // make this go away someday void Assembler::emit_data(jint data, relocInfo::relocType rtype, int format) { if (rtype == relocInfo::none) - emit_int32(data); - else emit_data(data, Relocation::spec_simple(rtype), format); + emit_int32(data); + else + emit_data(data, Relocation::spec_simple(rtype), format); } void Assembler::emit_data(jint data, RelocationHolder const& rspec, int format) { @@ -273,6 +304,177 @@ } +bool Assembler::query_compressed_disp_byte(int disp, bool is_evex_inst, int vector_len, + int cur_tuple_type, int in_size_in_bits, int cur_encoding) { + int mod_idx = 0; + // We will test 
if the displacement fits the compressed format and if so + // apply the compression to the displacement iff the result fits in 8 bits. + if (VM_Version::supports_evex() && is_evex_inst) { + switch (cur_tuple_type) { + case EVEX_FV: + if ((cur_encoding & VEX_W) == VEX_W) { + mod_idx = ((cur_encoding & EVEX_Rb) == EVEX_Rb) ? 3 : 2; // Evex.w set: tuple_table rows EVEX_FV(2)/EVEX_FV(3) + } else { + mod_idx = ((cur_encoding & EVEX_Rb) == EVEX_Rb) ? 1 : 0; + } + break; + + case EVEX_HV: + mod_idx = ((cur_encoding & EVEX_Rb) == EVEX_Rb) ? 1 : 0; + break; + + case EVEX_FVM: + break; + + case EVEX_T1S: + switch (in_size_in_bits) { + case EVEX_8bit: + break; + + case EVEX_16bit: + mod_idx = 1; + break; + + case EVEX_32bit: + mod_idx = 2; + break; + + case EVEX_64bit: + mod_idx = 3; + break; + } + break; + + case EVEX_T1F: + case EVEX_T2: + case EVEX_T4: + mod_idx = (in_size_in_bits == EVEX_64bit) ? 1 : 0; + break; + + case EVEX_T8: + break; + + case EVEX_HVM: + break; + + case EVEX_QVM: + break; + + case EVEX_OVM: + break; + + case EVEX_M128: + break; + + case EVEX_DUP: + break; + + default: + assert(0, "no valid evex tuple_table entry"); + break; + } + + if (vector_len >= AVX_128bit && vector_len <= AVX_512bit) { + int disp_factor = tuple_table[cur_tuple_type + mod_idx][vector_len]; + if ((disp % disp_factor) == 0) { + int new_disp = disp / disp_factor; + if ((-0x80 <= new_disp && new_disp < 0x80)) { + disp = new_disp; + } + } else { + return false; + } + } + } + return (-0x80 <= disp && disp < 0x80); +} + + +bool Assembler::emit_compressed_disp_byte(int &disp) { + int mod_idx = 0; + // We will test if the displacement fits the compressed format and if so + // apply the compression to the displacement iff the result fits in 8 bits. + if (VM_Version::supports_evex() && is_evex_instruction) { + switch (tuple_type) { + case EVEX_FV: + if ((evex_encoding & VEX_W) == VEX_W) { + mod_idx = ((evex_encoding & EVEX_Rb) == EVEX_Rb) ? 3 : 2; // Evex.w set: tuple_table rows EVEX_FV(2)/EVEX_FV(3) + } else { + mod_idx = ((evex_encoding & EVEX_Rb) == EVEX_Rb) ? 
1 : 0; + } + break; + + case EVEX_HV: + mod_idx = ((evex_encoding & EVEX_Rb) == EVEX_Rb) ? 1 : 0; + break; + + case EVEX_FVM: + break; + + case EVEX_T1S: + switch (input_size_in_bits) { + case EVEX_8bit: + break; + + case EVEX_16bit: + mod_idx = 1; + break; + + case EVEX_32bit: + mod_idx = 2; + break; + + case EVEX_64bit: + mod_idx = 3; + break; + } + break; + + case EVEX_T1F: + case EVEX_T2: + case EVEX_T4: + mod_idx = (input_size_in_bits == EVEX_64bit) ? 1 : 0; + break; + + case EVEX_T8: + break; + + case EVEX_HVM: + break; + + case EVEX_QVM: + break; + + case EVEX_OVM: + break; + + case EVEX_M128: + break; + + case EVEX_DUP: + break; + + default: + assert(0, "no valid evex tuple_table entry"); + break; + } + + if (avx_vector_len >= AVX_128bit && avx_vector_len <= AVX_512bit) { + int disp_factor = tuple_table[tuple_type + mod_idx][avx_vector_len]; + if ((disp % disp_factor) == 0) { + int new_disp = disp / disp_factor; + if (is8bit(new_disp)) { + disp = new_disp; + } + } else { + return false; + } + } + } + return is8bit(disp); +} + + void Assembler::emit_operand(Register reg, Register base, Register index, Address::ScaleFactor scale, int disp, RelocationHolder const& rspec, @@ -296,7 +498,7 @@ assert(index != rsp, "illegal addressing mode"); emit_int8(0x04 | regenc); emit_int8(scale << 6 | indexenc | baseenc); - } else if (is8bit(disp) && rtype == relocInfo::none) { + } else if (rtype == relocInfo::none && emit_compressed_disp_byte(disp)) { // rtype first: the call may rewrite disp // [base + index*scale + imm8] // [01 reg 100][ss index base] imm8 assert(index != rsp, "illegal addressing mode"); @@ -318,7 +520,7 @@ // [00 reg 100][00 100 100] emit_int8(0x04 | regenc); emit_int8(0x24); - } else if (is8bit(disp) && rtype == relocInfo::none) { + } else if (rtype == relocInfo::none && emit_compressed_disp_byte(disp)) { // rtype first: the call may rewrite disp // [rsp + imm8] // [01 reg 100][00 100 100] disp8 emit_int8(0x44 | regenc); @@ -339,7 +541,7 @@ // [base] // [00 reg base] emit_int8(0x00 | regenc | baseenc); - } else if (is8bit(disp) && 
rtype == relocInfo::none) { + } else if (rtype == relocInfo::none && emit_compressed_disp_byte(disp)) { // rtype first: the call may rewrite disp // [base + disp8] // [01 reg base] disp8 emit_int8(0x40 | regenc | baseenc); @@ -389,11 +591,20 @@ emit_data(disp, rspec, disp32_operand); } } + is_evex_instruction = false; } void Assembler::emit_operand(XMMRegister reg, Register base, Register index, Address::ScaleFactor scale, int disp, RelocationHolder const& rspec) { + if (UseAVX > 2) { + int xreg_enc = reg->encoding(); + if (xreg_enc > 15) { + XMMRegister new_reg = as_XMMRegister(xreg_enc & 0xf); + emit_operand((Register)new_reg, base, index, scale, disp, rspec); + return; + } + } emit_operand((Register)reg, base, index, scale, disp, rspec); } @@ -686,6 +897,29 @@ debug_only(has_disp32 = true); // has both kinds of operands! break; + case 0x62: // EVEX_4bytes + assert((UseAVX > 0), "shouldn't have EVEX prefix"); + assert(ip == inst+1, "no prefixes allowed"); + // no EVEX collisions, all instructions that have 0x62 opcodes + // have EVEX versions and are subopcodes of 0x66 + ip++; // skip P0 and examine W in P1 + is_64bit = ((VEX_W & *ip) == VEX_W); + ip++; // move to P2 + ip++; // skip P2, move to opcode + // To find the end of instruction (which == end_pc_operand). + switch (0xFF & *ip) { + case 0x61: // pcmpestri r, r/a, #8 + case 0x70: // pshufd r, r/a, #8 + case 0x73: // psrldq r, #8 + tail_size = 1; // the imm8 + break; + default: + break; + } + ip++; // skip opcode + debug_only(has_disp32 = true); // has both kinds of operands! 
+ break; + case 0xD1: // sal a, 1; sar a, 1; shl a, 1; shr a, 1 case 0xD3: // sal a, %cl; sar a, %cl; shl a, %cl; shr a, %cl case 0xD9: // fld_s a; fst_s a; fstp_s a; fldcw a @@ -985,12 +1219,22 @@ void Assembler::addsd(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - emit_simd_arith(0x58, dst, src, VEX_SIMD_F2); + if (VM_Version::supports_evex()) { + emit_simd_arith_q(0x58, dst, src, VEX_SIMD_F2); + } else { + emit_simd_arith(0x58, dst, src, VEX_SIMD_F2); + } } void Assembler::addsd(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - emit_simd_arith(0x58, dst, src, VEX_SIMD_F2); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_T1S; + input_size_in_bits = EVEX_64bit; + emit_simd_arith_q(0x58, dst, src, VEX_SIMD_F2); + } else { + emit_simd_arith(0x58, dst, src, VEX_SIMD_F2); + } } void Assembler::addss(XMMRegister dst, XMMRegister src) { @@ -1000,20 +1244,26 @@ void Assembler::addss(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse(), "")); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_T1S; + input_size_in_bits = EVEX_32bit; + } emit_simd_arith(0x58, dst, src, VEX_SIMD_F3); } void Assembler::aesdec(XMMRegister dst, Address src) { assert(VM_Version::supports_aes(), ""); InstructionMark im(this); - simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38); + simd_prefix(dst, dst, src, VEX_SIMD_66, false, + VEX_OPCODE_0F_38, false, AVX_128bit, true); emit_int8((unsigned char)0xDE); emit_operand(dst, src); } void Assembler::aesdec(XMMRegister dst, XMMRegister src) { assert(VM_Version::supports_aes(), ""); - int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38); + int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, false, + VEX_OPCODE_0F_38, false, AVX_128bit, true); emit_int8((unsigned char)0xDE); emit_int8(0xC0 | encode); } @@ -1021,14 +1271,16 @@ void Assembler::aesdeclast(XMMRegister dst, Address src) { 
assert(VM_Version::supports_aes(), ""); InstructionMark im(this); - simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38); + simd_prefix(dst, dst, src, VEX_SIMD_66, false, + VEX_OPCODE_0F_38, false, AVX_128bit, true); emit_int8((unsigned char)0xDF); emit_operand(dst, src); } void Assembler::aesdeclast(XMMRegister dst, XMMRegister src) { assert(VM_Version::supports_aes(), ""); - int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38); + int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, false, + VEX_OPCODE_0F_38, false, AVX_128bit, true); emit_int8((unsigned char)0xDF); emit_int8((unsigned char)(0xC0 | encode)); } @@ -1036,14 +1288,16 @@ void Assembler::aesenc(XMMRegister dst, Address src) { assert(VM_Version::supports_aes(), ""); InstructionMark im(this); - simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38); + simd_prefix(dst, dst, src, VEX_SIMD_66, false, + VEX_OPCODE_0F_38, false, AVX_128bit, true); emit_int8((unsigned char)0xDC); emit_operand(dst, src); } void Assembler::aesenc(XMMRegister dst, XMMRegister src) { assert(VM_Version::supports_aes(), ""); - int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38); + int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, false, + VEX_OPCODE_0F_38, false, AVX_128bit, true); emit_int8((unsigned char)0xDC); emit_int8(0xC0 | encode); } @@ -1051,14 +1305,16 @@ void Assembler::aesenclast(XMMRegister dst, Address src) { assert(VM_Version::supports_aes(), ""); InstructionMark im(this); - simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38); + simd_prefix(dst, dst, src, VEX_SIMD_66, false, + VEX_OPCODE_0F_38, false, AVX_128bit, true); emit_int8((unsigned char)0xDD); emit_operand(dst, src); } void Assembler::aesenclast(XMMRegister dst, XMMRegister src) { assert(VM_Version::supports_aes(), ""); - int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38); + int encode = simd_prefix_and_encode(dst, dst, src, 
VEX_SIMD_66, false, + VEX_OPCODE_0F_38, false, AVX_128bit, true); emit_int8((unsigned char)0xDD); emit_int8((unsigned char)(0xC0 | encode)); } @@ -1091,7 +1347,7 @@ void Assembler::andnl(Register dst, Register src1, Register src2) { assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported"); - int encode = vex_prefix_0F38_and_encode(dst, src1, src2); + int encode = vex_prefix_0F38_and_encode(dst, src1, src2, false); emit_int8((unsigned char)0xF2); emit_int8((unsigned char)(0xC0 | encode)); } @@ -1099,7 +1355,7 @@ void Assembler::andnl(Register dst, Register src1, Address src2) { InstructionMark im(this); assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported"); - vex_prefix_0F38(dst, src1, src2); + vex_prefix_0F38(dst, src1, src2, false); emit_int8((unsigned char)0xF2); emit_operand(dst, src2); } @@ -1126,7 +1382,7 @@ void Assembler::blsil(Register dst, Register src) { assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported"); - int encode = vex_prefix_0F38_and_encode(rbx, dst, src); + int encode = vex_prefix_0F38_and_encode(rbx, dst, src, false); emit_int8((unsigned char)0xF3); emit_int8((unsigned char)(0xC0 | encode)); } @@ -1134,14 +1390,14 @@ void Assembler::blsil(Register dst, Address src) { InstructionMark im(this); assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported"); - vex_prefix_0F38(rbx, dst, src); + vex_prefix_0F38(rbx, dst, src, false); emit_int8((unsigned char)0xF3); emit_operand(rbx, src); } void Assembler::blsmskl(Register dst, Register src) { assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported"); - int encode = vex_prefix_0F38_and_encode(rdx, dst, src); + int encode = vex_prefix_0F38_and_encode(rdx, dst, src, false); emit_int8((unsigned char)0xF3); emit_int8((unsigned char)(0xC0 | encode)); } @@ -1149,14 +1405,14 @@ void Assembler::blsmskl(Register dst, Address src) { InstructionMark im(this); 
assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported"); - vex_prefix_0F38(rdx, dst, src); + vex_prefix_0F38(rdx, dst, src, false); emit_int8((unsigned char)0xF3); emit_operand(rdx, src); } void Assembler::blsrl(Register dst, Register src) { assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported"); - int encode = vex_prefix_0F38_and_encode(rcx, dst, src); + int encode = vex_prefix_0F38_and_encode(rcx, dst, src, false); emit_int8((unsigned char)0xF3); emit_int8((unsigned char)(0xC0 | encode)); } @@ -1164,7 +1420,7 @@ void Assembler::blsrl(Register dst, Address src) { InstructionMark im(this); assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported"); - vex_prefix_0F38(rcx, dst, src); + vex_prefix_0F38(rcx, dst, src, false); emit_int8((unsigned char)0xF3); emit_operand(rcx, src); } @@ -1312,22 +1568,36 @@ // NOTE: dbx seems to decode this as comiss even though the // 0x66 is there. Strangly ucomisd comes out correct NOT_LP64(assert(VM_Version::supports_sse2(), "")); - emit_simd_arith_nonds(0x2F, dst, src, VEX_SIMD_66); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_T1S; + input_size_in_bits = EVEX_64bit; + emit_simd_arith_nonds_q(0x2F, dst, src, VEX_SIMD_66, true); + } else { + emit_simd_arith_nonds(0x2F, dst, src, VEX_SIMD_66); + } } void Assembler::comisd(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - emit_simd_arith_nonds(0x2F, dst, src, VEX_SIMD_66); + if (VM_Version::supports_evex()) { + emit_simd_arith_nonds_q(0x2F, dst, src, VEX_SIMD_66, true); + } else { + emit_simd_arith_nonds(0x2F, dst, src, VEX_SIMD_66); + } } void Assembler::comiss(XMMRegister dst, Address src) { + if (VM_Version::supports_evex()) { + tuple_type = EVEX_T1S; + input_size_in_bits = EVEX_32bit; + } NOT_LP64(assert(VM_Version::supports_sse(), "")); - emit_simd_arith_nonds(0x2F, dst, src, VEX_SIMD_NONE); + emit_simd_arith_nonds(0x2F, dst, src, VEX_SIMD_NONE, true); 
} void Assembler::comiss(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse(), "")); - emit_simd_arith_nonds(0x2F, dst, src, VEX_SIMD_NONE); + emit_simd_arith_nonds(0x2F, dst, src, VEX_SIMD_NONE, true); } void Assembler::cpuid() { @@ -1347,36 +1617,61 @@ void Assembler::cvtsd2ss(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - emit_simd_arith(0x5A, dst, src, VEX_SIMD_F2); + if (VM_Version::supports_evex()) { + emit_simd_arith_q(0x5A, dst, src, VEX_SIMD_F2); + } else { + emit_simd_arith(0x5A, dst, src, VEX_SIMD_F2); + } } void Assembler::cvtsd2ss(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - emit_simd_arith(0x5A, dst, src, VEX_SIMD_F2); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_T1F; + input_size_in_bits = EVEX_64bit; + emit_simd_arith_q(0x5A, dst, src, VEX_SIMD_F2); + } else { + emit_simd_arith(0x5A, dst, src, VEX_SIMD_F2); + } } void Assembler::cvtsi2sdl(XMMRegister dst, Register src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2); + int encode = 0; + if (VM_Version::supports_evex()) { + encode = simd_prefix_and_encode_q(dst, dst, src, VEX_SIMD_F2, true); + } else { + encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2, false); + } emit_int8(0x2A); emit_int8((unsigned char)(0xC0 | encode)); } void Assembler::cvtsi2sdl(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - emit_simd_arith(0x2A, dst, src, VEX_SIMD_F2); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_T1S; + input_size_in_bits = EVEX_32bit; + emit_simd_arith_q(0x2A, dst, src, VEX_SIMD_F2, true); + } else { + emit_simd_arith(0x2A, dst, src, VEX_SIMD_F2); + } } void Assembler::cvtsi2ssl(XMMRegister dst, Register src) { NOT_LP64(assert(VM_Version::supports_sse(), "")); - int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3); + int encode = 
simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3, true); emit_int8(0x2A); emit_int8((unsigned char)(0xC0 | encode)); } void Assembler::cvtsi2ssl(XMMRegister dst, Address src) { + if (VM_Version::supports_evex()) { + tuple_type = EVEX_T1S; + input_size_in_bits = EVEX_32bit; + } NOT_LP64(assert(VM_Version::supports_sse(), "")); - emit_simd_arith(0x2A, dst, src, VEX_SIMD_F3); + emit_simd_arith(0x2A, dst, src, VEX_SIMD_F3, true); } void Assembler::cvtss2sd(XMMRegister dst, XMMRegister src) { @@ -1385,6 +1680,10 @@ } void Assembler::cvtss2sd(XMMRegister dst, Address src) { + if (VM_Version::supports_evex()) { + tuple_type = EVEX_T1S; + input_size_in_bits = EVEX_32bit; + } NOT_LP64(assert(VM_Version::supports_sse2(), "")); emit_simd_arith(0x5A, dst, src, VEX_SIMD_F3); } @@ -1392,14 +1691,14 @@ void Assembler::cvttsd2sil(Register dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_F2); + int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_F2, VEX_OPCODE_0F, true); emit_int8(0x2C); emit_int8((unsigned char)(0xC0 | encode)); } void Assembler::cvttss2sil(Register dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse(), "")); - int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_F3); + int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_F3, VEX_OPCODE_0F, true); emit_int8(0x2C); emit_int8((unsigned char)(0xC0 | encode)); } @@ -1414,15 +1713,29 @@ void Assembler::divsd(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - emit_simd_arith(0x5E, dst, src, VEX_SIMD_F2); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_T1S; + input_size_in_bits = EVEX_64bit; + emit_simd_arith_q(0x5E, dst, src, VEX_SIMD_F2); + } else { + emit_simd_arith(0x5E, dst, src, VEX_SIMD_F2); + } } void Assembler::divsd(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - emit_simd_arith(0x5E, dst, src, VEX_SIMD_F2); + if 
(VM_Version::supports_evex()) { + emit_simd_arith_q(0x5E, dst, src, VEX_SIMD_F2); + } else { + emit_simd_arith(0x5E, dst, src, VEX_SIMD_F2); + } } void Assembler::divss(XMMRegister dst, Address src) { + if (VM_Version::supports_evex()) { + tuple_type = EVEX_T1S; + input_size_in_bits = EVEX_32bit; + } NOT_LP64(assert(VM_Version::supports_sse(), "")); emit_simd_arith(0x5E, dst, src, VEX_SIMD_F3); } @@ -1675,7 +1988,11 @@ void Assembler::movapd(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - emit_simd_arith_nonds(0x28, dst, src, VEX_SIMD_66); + if (VM_Version::supports_evex()) { + emit_simd_arith_nonds_q(0x28, dst, src, VEX_SIMD_66, true); + } else { + emit_simd_arith_nonds(0x28, dst, src, VEX_SIMD_66); + } } void Assembler::movaps(XMMRegister dst, XMMRegister src) { @@ -1685,7 +2002,8 @@ void Assembler::movlhps(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse(), "")); - int encode = simd_prefix_and_encode(dst, src, src, VEX_SIMD_NONE); + int encode = simd_prefix_and_encode(dst, src, src, VEX_SIMD_NONE, true, VEX_OPCODE_0F, + false, AVX_128bit); emit_int8(0x16); emit_int8((unsigned char)(0xC0 | encode)); } @@ -1698,6 +2016,51 @@ emit_operand(dst, src); } +void Assembler::kmovq(KRegister dst, KRegister src) { + NOT_LP64(assert(VM_Version::supports_evex(), "")); + int encode = kreg_prefix_and_encode(dst, knoreg, src, VEX_SIMD_NONE, + true, VEX_OPCODE_0F, true); + emit_int8((unsigned char)0x90); + emit_int8((unsigned char)(0xC0 | encode)); +} + +void Assembler::kmovq(KRegister dst, Address src) { + NOT_LP64(assert(VM_Version::supports_evex(), "")); + int dst_enc = dst->encoding(); + int nds_enc = 0; + vex_prefix(src, nds_enc, dst_enc, VEX_SIMD_NONE, + VEX_OPCODE_0F, true, AVX_128bit, true, true); + emit_int8((unsigned char)0x90); + emit_operand((Register)dst, src); +} + +void Assembler::kmovq(Address dst, KRegister src) { + NOT_LP64(assert(VM_Version::supports_evex(), "")); + int src_enc = 
src->encoding(); + int nds_enc = 0; + vex_prefix(dst, nds_enc, src_enc, VEX_SIMD_NONE, + VEX_OPCODE_0F, true, AVX_128bit, true, true); + emit_int8((unsigned char)0x90); + emit_operand((Register)src, dst); +} + +void Assembler::kmovql(KRegister dst, Register src) { + NOT_LP64(assert(VM_Version::supports_evex(), "")); + bool supports_bw = VM_Version::supports_avx512bw(); + VexSimdPrefix pre = supports_bw ? VEX_SIMD_F2 : VEX_SIMD_NONE; + int encode = kreg_prefix_and_encode(dst, knoreg, src, pre, true, + VEX_OPCODE_0F, supports_bw); + emit_int8((unsigned char)0x92); + emit_int8((unsigned char)(0xC0 | encode)); +} + +void Assembler::kmovdl(KRegister dst, Register src) { + NOT_LP64(assert(VM_Version::supports_evex(), "")); + VexSimdPrefix pre = VM_Version::supports_avx512bw() ? VEX_SIMD_F2 : VEX_SIMD_NONE; + int encode = kreg_prefix_and_encode(dst, knoreg, src, pre, true, VEX_OPCODE_0F, false); + emit_int8((unsigned char)0x92); + emit_int8((unsigned char)(0xC0 | encode)); +} void Assembler::movb(Address dst, int imm8) { InstructionMark im(this); @@ -1718,7 +2081,7 @@ void Assembler::movdl(XMMRegister dst, Register src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66); + int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66, true); emit_int8(0x6E); emit_int8((unsigned char)(0xC0 | encode)); } @@ -1726,23 +2089,31 @@ void Assembler::movdl(Register dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); // swap src/dst to get correct prefix - int encode = simd_prefix_and_encode(src, dst, VEX_SIMD_66); + int encode = simd_prefix_and_encode(src, dst, VEX_SIMD_66, true); emit_int8(0x7E); emit_int8((unsigned char)(0xC0 | encode)); } void Assembler::movdl(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_T1S; + input_size_in_bits = EVEX_32bit; + } InstructionMark im(this); - simd_prefix(dst, src, 
VEX_SIMD_66); + simd_prefix(dst, src, VEX_SIMD_66, true, VEX_OPCODE_0F); emit_int8(0x6E); emit_operand(dst, src); } void Assembler::movdl(Address dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_T1S; + input_size_in_bits = EVEX_32bit; + } InstructionMark im(this); - simd_prefix(dst, src, VEX_SIMD_66); + simd_prefix(dst, src, VEX_SIMD_66, true); emit_int8(0x7E); emit_operand(src, dst); } @@ -1754,11 +2125,17 @@ void Assembler::movdqa(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_FVM; + } emit_simd_arith_nonds(0x6F, dst, src, VEX_SIMD_66); } void Assembler::movdqu(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_FVM; + } emit_simd_arith_nonds(0x6F, dst, src, VEX_SIMD_F3); } @@ -1769,8 +2146,11 @@ void Assembler::movdqu(Address dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_FVM; + } InstructionMark im(this); - simd_prefix(dst, src, VEX_SIMD_F3); + simd_prefix(dst, src, VEX_SIMD_F3, false); emit_int8(0x7F); emit_operand(src, dst); } @@ -1778,28 +2158,77 @@ // Move Unaligned 256bit Vector void Assembler::vmovdqu(XMMRegister dst, XMMRegister src) { assert(UseAVX > 0, ""); - bool vector256 = true; - int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_F3, vector256); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_FVM; + } + int vector_len = AVX_256bit; + int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_F3, vector_len); emit_int8(0x6F); emit_int8((unsigned char)(0xC0 | encode)); } void Assembler::vmovdqu(XMMRegister dst, Address src) { assert(UseAVX > 0, ""); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_FVM; + } InstructionMark im(this); - bool vector256 = true; - vex_prefix(dst, 
xnoreg, src, VEX_SIMD_F3, vector256); + int vector_len = AVX_256bit; + vex_prefix(dst, xnoreg, src, VEX_SIMD_F3, vector_len, false); emit_int8(0x6F); emit_operand(dst, src); } void Assembler::vmovdqu(Address dst, XMMRegister src) { assert(UseAVX > 0, ""); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_FVM; + } InstructionMark im(this); - bool vector256 = true; + int vector_len = AVX_256bit; // swap src<->dst for encoding assert(src != xnoreg, "sanity"); - vex_prefix(src, xnoreg, dst, VEX_SIMD_F3, vector256); + vex_prefix(src, xnoreg, dst, VEX_SIMD_F3, vector_len, false); + emit_int8(0x7F); + emit_operand(src, dst); +} + +// Move Unaligned EVEX enabled Vector (programmable : 8,16,32,64) +void Assembler::evmovdqu(XMMRegister dst, XMMRegister src, int vector_len) { + assert(UseAVX > 0, ""); + int src_enc = src->encoding(); + int dst_enc = dst->encoding(); + int encode = vex_prefix_and_encode(dst_enc, 0, src_enc, VEX_SIMD_F3, VEX_OPCODE_0F, + true, vector_len, false, false); + emit_int8(0x6F); + emit_int8((unsigned char)(0xC0 | encode)); +} + +void Assembler::evmovdqu(XMMRegister dst, Address src, int vector_len) { + assert(UseAVX > 0, ""); + InstructionMark im(this); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_FVM; + vex_prefix_q(dst, xnoreg, src, VEX_SIMD_F3, vector_len, false); + } else { + vex_prefix(dst, xnoreg, src, VEX_SIMD_F3, vector_len, false); + } + emit_int8(0x6F); + emit_operand(dst, src); +} + +void Assembler::evmovdqu(Address dst, XMMRegister src, int vector_len) { + assert(UseAVX > 0, ""); + InstructionMark im(this); + assert(src != xnoreg, "sanity"); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_FVM; + // swap src<->dst for encoding + vex_prefix_q(src, xnoreg, dst, VEX_SIMD_F3, vector_len, false); + } else { + // swap src<->dst for encoding + vex_prefix(src, xnoreg, dst, VEX_SIMD_F3, vector_len, false); + } emit_int8(0x7F); emit_operand(src, dst); } @@ -1845,7 +2274,11 @@ // The selection is done in 
MacroAssembler::movdbl() and movflt(). void Assembler::movlpd(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - emit_simd_arith(0x12, dst, src, VEX_SIMD_66); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_T1S; + input_size_in_bits = EVEX_32bit; + } + emit_simd_arith(0x12, dst, src, VEX_SIMD_66, true); } void Assembler::movq( MMXRegister dst, Address src ) { @@ -1871,7 +2304,13 @@ void Assembler::movq(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); InstructionMark im(this); - simd_prefix(dst, src, VEX_SIMD_F3); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_T1S; + input_size_in_bits = EVEX_64bit; + simd_prefix_q(dst, xnoreg, src, VEX_SIMD_F3, true); + } else { + simd_prefix(dst, src, VEX_SIMD_F3, true, VEX_OPCODE_0F); + } emit_int8(0x7E); emit_operand(dst, src); } @@ -1879,7 +2318,14 @@ void Assembler::movq(Address dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); InstructionMark im(this); - simd_prefix(dst, src, VEX_SIMD_66); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_T1S; + input_size_in_bits = EVEX_64bit; + simd_prefix(src, xnoreg, dst, VEX_SIMD_66, true, + VEX_OPCODE_0F, true, AVX_128bit); + } else { + simd_prefix(dst, src, VEX_SIMD_66, true); + } emit_int8((unsigned char)0xD6); emit_operand(src, dst); } @@ -1902,36 +2348,60 @@ void Assembler::movsd(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - emit_simd_arith(0x10, dst, src, VEX_SIMD_F2); + if (VM_Version::supports_evex()) { + emit_simd_arith_q(0x10, dst, src, VEX_SIMD_F2, true); + } else { + emit_simd_arith(0x10, dst, src, VEX_SIMD_F2); + } } void Assembler::movsd(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - emit_simd_arith_nonds(0x10, dst, src, VEX_SIMD_F2); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_T1S; + input_size_in_bits = EVEX_64bit; + emit_simd_arith_nonds_q(0x10, dst, src, 
VEX_SIMD_F2, true); + } else { + emit_simd_arith_nonds(0x10, dst, src, VEX_SIMD_F2); + } } void Assembler::movsd(Address dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); InstructionMark im(this); - simd_prefix(dst, src, VEX_SIMD_F2); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_T1S; + input_size_in_bits = EVEX_64bit; + simd_prefix_q(src, xnoreg, dst, VEX_SIMD_F2); + } else { + simd_prefix(src, xnoreg, dst, VEX_SIMD_F2, false); + } emit_int8(0x11); emit_operand(src, dst); } void Assembler::movss(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse(), "")); - emit_simd_arith(0x10, dst, src, VEX_SIMD_F3); + emit_simd_arith(0x10, dst, src, VEX_SIMD_F3, true); } void Assembler::movss(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse(), "")); - emit_simd_arith_nonds(0x10, dst, src, VEX_SIMD_F3); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_T1S; + input_size_in_bits = EVEX_32bit; + } + emit_simd_arith_nonds(0x10, dst, src, VEX_SIMD_F3, true); } void Assembler::movss(Address dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse(), "")); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_T1S; + input_size_in_bits = EVEX_32bit; + } InstructionMark im(this); - simd_prefix(dst, src, VEX_SIMD_F3); + simd_prefix(dst, src, VEX_SIMD_F3, false); emit_int8(0x11); emit_operand(src, dst); } @@ -2023,16 +2493,30 @@ void Assembler::mulsd(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - emit_simd_arith(0x59, dst, src, VEX_SIMD_F2); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_T1S; + input_size_in_bits = EVEX_64bit; + emit_simd_arith_q(0x59, dst, src, VEX_SIMD_F2); + } else { + emit_simd_arith(0x59, dst, src, VEX_SIMD_F2); + } } void Assembler::mulsd(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - emit_simd_arith(0x59, dst, src, VEX_SIMD_F2); + if (VM_Version::supports_evex()) { + 
emit_simd_arith_q(0x59, dst, src, VEX_SIMD_F2); + } else { + emit_simd_arith(0x59, dst, src, VEX_SIMD_F2); + } } void Assembler::mulss(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse(), "")); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_T1S; + input_size_in_bits = EVEX_32bit; + } emit_simd_arith(0x59, dst, src, VEX_SIMD_F3); } @@ -2332,22 +2816,30 @@ void Assembler::packuswb(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes"); - emit_simd_arith(0x67, dst, src, VEX_SIMD_66); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_FV; + input_size_in_bits = EVEX_32bit; + } + emit_simd_arith(0x67, dst, src, VEX_SIMD_66, + false, (VM_Version::supports_avx512dq() == false)); } void Assembler::packuswb(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - emit_simd_arith(0x67, dst, src, VEX_SIMD_66); + emit_simd_arith(0x67, dst, src, VEX_SIMD_66, + false, (VM_Version::supports_avx512dq() == false)); } -void Assembler::vpackuswb(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { - assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); - emit_vex_arith(0x67, dst, nds, src, VEX_SIMD_66, vector256); +void Assembler::vpackuswb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + assert(UseAVX > 0, "some form of AVX must be enabled"); + emit_vex_arith(0x67, dst, nds, src, VEX_SIMD_66, vector_len, + false, (VM_Version::supports_avx512dq() == false)); } -void Assembler::vpermq(XMMRegister dst, XMMRegister src, int imm8, bool vector256) { +void Assembler::vpermq(XMMRegister dst, XMMRegister src, int imm8, int vector_len) { assert(VM_Version::supports_avx2(), ""); - int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, true, vector256); + int encode = simd_prefix_and_encode(dst, 
xnoreg, src, VEX_SIMD_66, false, + VEX_OPCODE_0F_3A, true, vector_len); emit_int8(0x00); emit_int8(0xC0 | encode); emit_int8(imm8); @@ -2361,7 +2853,8 @@ void Assembler::pcmpestri(XMMRegister dst, Address src, int imm8) { assert(VM_Version::supports_sse4_2(), ""); InstructionMark im(this); - simd_prefix(dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A); + simd_prefix(dst, xnoreg, src, VEX_SIMD_66, false, VEX_OPCODE_0F_3A, + false, AVX_128bit, true); emit_int8(0x61); emit_operand(dst, src); emit_int8(imm8); @@ -2369,7 +2862,8 @@ void Assembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) { assert(VM_Version::supports_sse4_2(), ""); - int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_3A); + int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, false, + VEX_OPCODE_0F_3A, false, AVX_128bit, true); emit_int8(0x61); emit_int8((unsigned char)(0xC0 | encode)); emit_int8(imm8); @@ -2377,7 +2871,8 @@ void Assembler::pextrd(Register dst, XMMRegister src, int imm8) { assert(VM_Version::supports_sse4_1(), ""); - int encode = simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, false); + int encode = simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, VEX_SIMD_66, true, VEX_OPCODE_0F_3A, + false, AVX_128bit, (VM_Version::supports_avx512dq() == false)); emit_int8(0x16); emit_int8((unsigned char)(0xC0 | encode)); emit_int8(imm8); @@ -2385,7 +2880,8 @@ void Assembler::pextrq(Register dst, XMMRegister src, int imm8) { assert(VM_Version::supports_sse4_1(), ""); - int encode = simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, true); + int encode = simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, VEX_SIMD_66, true, VEX_OPCODE_0F_3A, + false, AVX_128bit, (VM_Version::supports_avx512dq() == false)); emit_int8(0x16); emit_int8((unsigned char)(0xC0 | encode)); emit_int8(imm8); @@ -2393,7 +2889,8 @@ void 
Assembler::pinsrd(XMMRegister dst, Register src, int imm8) { assert(VM_Version::supports_sse4_1(), ""); - int encode = simd_prefix_and_encode(dst, dst, as_XMMRegister(src->encoding()), VEX_SIMD_66, VEX_OPCODE_0F_3A, false); + int encode = simd_prefix_and_encode(dst, dst, as_XMMRegister(src->encoding()), VEX_SIMD_66, true, VEX_OPCODE_0F_3A, + false, AVX_128bit, (VM_Version::supports_avx512dq() == false)); emit_int8(0x22); emit_int8((unsigned char)(0xC0 | encode)); emit_int8(imm8); @@ -2401,7 +2898,8 @@ void Assembler::pinsrq(XMMRegister dst, Register src, int imm8) { assert(VM_Version::supports_sse4_1(), ""); - int encode = simd_prefix_and_encode(dst, dst, as_XMMRegister(src->encoding()), VEX_SIMD_66, VEX_OPCODE_0F_3A, true); + int encode = simd_prefix_and_encode(dst, dst, as_XMMRegister(src->encoding()), VEX_SIMD_66, true, VEX_OPCODE_0F_3A, + false, AVX_128bit, (VM_Version::supports_avx512dq() == false)); emit_int8(0x22); emit_int8((unsigned char)(0xC0 | encode)); emit_int8(imm8); @@ -2409,15 +2907,18 @@ void Assembler::pmovzxbw(XMMRegister dst, Address src) { assert(VM_Version::supports_sse4_1(), ""); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_HVM; + } InstructionMark im(this); - simd_prefix(dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38); + simd_prefix(dst, src, VEX_SIMD_66, false, VEX_OPCODE_0F_38); emit_int8(0x30); emit_operand(dst, src); } void Assembler::pmovzxbw(XMMRegister dst, XMMRegister src) { assert(VM_Version::supports_sse4_1(), ""); - int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_38); + int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, false, VEX_OPCODE_0F_38); emit_int8(0x30); emit_int8((unsigned char)(0xC0 | encode)); } @@ -2520,15 +3021,20 @@ void Assembler::pshufb(XMMRegister dst, XMMRegister src) { assert(VM_Version::supports_ssse3(), ""); - int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38); + int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, 
false, VEX_OPCODE_0F_38, + false, AVX_128bit, (VM_Version::supports_avx512bw() == false)); emit_int8(0x00); emit_int8((unsigned char)(0xC0 | encode)); } void Assembler::pshufb(XMMRegister dst, Address src) { assert(VM_Version::supports_ssse3(), ""); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_FVM; + } InstructionMark im(this); - simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38); + simd_prefix(dst, dst, src, VEX_SIMD_66, false, VEX_OPCODE_0F_38, + false, AVX_128bit, (VM_Version::supports_avx512bw() == false)); emit_int8(0x00); emit_operand(dst, src); } @@ -2545,8 +3051,12 @@ assert(isByte(mode), "invalid value"); NOT_LP64(assert(VM_Version::supports_sse2(), "")); assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes"); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_FV; + input_size_in_bits = EVEX_32bit; + } InstructionMark im(this); - simd_prefix(dst, src, VEX_SIMD_66); + simd_prefix(dst, src, VEX_SIMD_66, false); emit_int8(0x70); emit_operand(dst, src); emit_int8(mode & 0xFF); @@ -2555,7 +3065,8 @@ void Assembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) { assert(isByte(mode), "invalid value"); NOT_LP64(assert(VM_Version::supports_sse2(), "")); - emit_simd_arith_nonds(0x70, dst, src, VEX_SIMD_F2); + emit_simd_arith_nonds(0x70, dst, src, VEX_SIMD_F2, false, + (VM_Version::supports_avx512bw() == false)); emit_int8(mode & 0xFF); } @@ -2563,8 +3074,12 @@ assert(isByte(mode), "invalid value"); NOT_LP64(assert(VM_Version::supports_sse2(), "")); assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes"); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_FVM; + } InstructionMark im(this); - simd_prefix(dst, src, VEX_SIMD_F2); + simd_prefix(dst, xnoreg, src, VEX_SIMD_F2, false, VEX_OPCODE_0F, + false, AVX_128bit, (VM_Version::supports_avx512bw() == false)); emit_int8(0x70); emit_operand(dst, src); emit_int8(mode & 0xFF); @@ -2573,7 +3088,8 @@ void Assembler::psrldq(XMMRegister dst, int shift) { // Shift 
128 bit value in xmm register by number of bytes. NOT_LP64(assert(VM_Version::supports_sse2(), "")); - int encode = simd_prefix_and_encode(xmm3, dst, dst, VEX_SIMD_66); + int encode = simd_prefix_and_encode(xmm3, dst, dst, VEX_SIMD_66, true, VEX_OPCODE_0F, + false, AVX_128bit, (VM_Version::supports_avx512bw() == false)); emit_int8(0x73); emit_int8((unsigned char)(0xC0 | encode)); emit_int8(shift); @@ -2583,14 +3099,15 @@ assert(VM_Version::supports_sse4_1(), ""); assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes"); InstructionMark im(this); - simd_prefix(dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38); + simd_prefix(dst, src, VEX_SIMD_66, false, VEX_OPCODE_0F_38); emit_int8(0x17); emit_operand(dst, src); } void Assembler::ptest(XMMRegister dst, XMMRegister src) { assert(VM_Version::supports_sse4_1(), ""); - int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_38); + int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, + false, VEX_OPCODE_0F_38); emit_int8(0x17); emit_int8((unsigned char)(0xC0 | encode)); } @@ -2598,19 +3115,20 @@ void Assembler::vptest(XMMRegister dst, Address src) { assert(VM_Version::supports_avx(), ""); InstructionMark im(this); - bool vector256 = true; + int vector_len = AVX_256bit; assert(dst != xnoreg, "sanity"); int dst_enc = dst->encoding(); // swap src<->dst for encoding - vex_prefix(src, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, false, vector256); + vex_prefix(src, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, false, vector_len); emit_int8(0x17); emit_operand(dst, src); } void Assembler::vptest(XMMRegister dst, XMMRegister src) { assert(VM_Version::supports_avx(), ""); - bool vector256 = true; - int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, vector256, VEX_OPCODE_0F_38); + int vector_len = AVX_256bit; + int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, + vector_len, VEX_OPCODE_0F_38); emit_int8(0x17); emit_int8((unsigned char)(0xC0 | encode)); } @@ 
-2618,6 +3136,9 @@ void Assembler::punpcklbw(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes"); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_FVM; + } emit_simd_arith(0x60, dst, src, VEX_SIMD_66); } @@ -2629,6 +3150,10 @@ void Assembler::punpckldq(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes"); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_FV; + input_size_in_bits = EVEX_32bit; + } emit_simd_arith(0x62, dst, src, VEX_SIMD_66); } @@ -2838,12 +3363,22 @@ void Assembler::sqrtsd(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - emit_simd_arith(0x51, dst, src, VEX_SIMD_F2); + if (VM_Version::supports_evex()) { + emit_simd_arith_q(0x51, dst, src, VEX_SIMD_F2); + } else { + emit_simd_arith(0x51, dst, src, VEX_SIMD_F2); + } } void Assembler::sqrtsd(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - emit_simd_arith(0x51, dst, src, VEX_SIMD_F2); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_T1S; + input_size_in_bits = EVEX_64bit; + emit_simd_arith_q(0x51, dst, src, VEX_SIMD_F2); + } else { + emit_simd_arith(0x51, dst, src, VEX_SIMD_F2); + } } void Assembler::sqrtss(XMMRegister dst, XMMRegister src) { @@ -2857,6 +3392,10 @@ void Assembler::sqrtss(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse(), "")); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_T1S; + input_size_in_bits = EVEX_32bit; + } emit_simd_arith(0x51, dst, src, VEX_SIMD_F3); } @@ -2907,12 +3446,20 @@ void Assembler::subsd(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - emit_simd_arith(0x5C, dst, src, VEX_SIMD_F2); + if (VM_Version::supports_evex()) { + emit_simd_arith_q(0x5C, dst, src, VEX_SIMD_F2); + } else { + 
emit_simd_arith(0x5C, dst, src, VEX_SIMD_F2); + } } void Assembler::subsd(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - emit_simd_arith(0x5C, dst, src, VEX_SIMD_F2); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_T1S; + input_size_in_bits = EVEX_64bit; + } + emit_simd_arith_q(0x5C, dst, src, VEX_SIMD_F2); } void Assembler::subss(XMMRegister dst, XMMRegister src) { @@ -2922,6 +3469,10 @@ void Assembler::subss(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse(), "")); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_T1S; + input_size_in_bits = EVEX_32bit; + } emit_simd_arith(0x5C, dst, src, VEX_SIMD_F3); } @@ -2978,22 +3529,36 @@ void Assembler::ucomisd(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - emit_simd_arith_nonds(0x2E, dst, src, VEX_SIMD_66); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_T1S; + input_size_in_bits = EVEX_64bit; + emit_simd_arith_nonds_q(0x2E, dst, src, VEX_SIMD_66, true); + } else { + emit_simd_arith_nonds(0x2E, dst, src, VEX_SIMD_66); + } } void Assembler::ucomisd(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - emit_simd_arith_nonds(0x2E, dst, src, VEX_SIMD_66); + if (VM_Version::supports_evex()) { + emit_simd_arith_nonds_q(0x2E, dst, src, VEX_SIMD_66, true); + } else { + emit_simd_arith_nonds(0x2E, dst, src, VEX_SIMD_66); + } } void Assembler::ucomiss(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse(), "")); - emit_simd_arith_nonds(0x2E, dst, src, VEX_SIMD_NONE); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_T1S; + input_size_in_bits = EVEX_32bit; + } + emit_simd_arith_nonds(0x2E, dst, src, VEX_SIMD_NONE, true); } void Assembler::ucomiss(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse(), "")); - emit_simd_arith_nonds(0x2E, dst, src, VEX_SIMD_NONE); + emit_simd_arith_nonds(0x2E, dst, src, VEX_SIMD_NONE, true); } 
void Assembler::xabort(int8_t imm8) { @@ -3075,82 +3640,138 @@ void Assembler::vaddsd(XMMRegister dst, XMMRegister nds, Address src) { assert(VM_Version::supports_avx(), ""); - emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_F2, /* vector256 */ false); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_T1S; + input_size_in_bits = EVEX_64bit; + emit_vex_arith_q(0x58, dst, nds, src, VEX_SIMD_F2, AVX_128bit); + } else { + emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_F2, AVX_128bit); + } } void Assembler::vaddsd(XMMRegister dst, XMMRegister nds, XMMRegister src) { assert(VM_Version::supports_avx(), ""); - emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_F2, /* vector256 */ false); + if (VM_Version::supports_evex()) { + emit_vex_arith_q(0x58, dst, nds, src, VEX_SIMD_F2, AVX_128bit); + } else { + emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_F2, AVX_128bit); + } } void Assembler::vaddss(XMMRegister dst, XMMRegister nds, Address src) { assert(VM_Version::supports_avx(), ""); - emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_F3, /* vector256 */ false); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_T1S; + input_size_in_bits = EVEX_32bit; + } + emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_F3, AVX_128bit); } void Assembler::vaddss(XMMRegister dst, XMMRegister nds, XMMRegister src) { assert(VM_Version::supports_avx(), ""); - emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_F3, /* vector256 */ false); + emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_F3, AVX_128bit); } void Assembler::vdivsd(XMMRegister dst, XMMRegister nds, Address src) { assert(VM_Version::supports_avx(), ""); - emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_F2, /* vector256 */ false); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_T1S; + input_size_in_bits = EVEX_64bit; + emit_vex_arith_q(0x5E, dst, nds, src, VEX_SIMD_F2, AVX_128bit); + } else { + emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_F2, AVX_128bit); + } } void Assembler::vdivsd(XMMRegister dst, XMMRegister nds, XMMRegister src) { 
assert(VM_Version::supports_avx(), ""); - emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_F2, /* vector256 */ false); + if (VM_Version::supports_evex()) { + emit_vex_arith_q(0x5E, dst, nds, src, VEX_SIMD_F2, AVX_128bit); + } else { + emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_F2, AVX_128bit); + } } void Assembler::vdivss(XMMRegister dst, XMMRegister nds, Address src) { assert(VM_Version::supports_avx(), ""); - emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_F3, /* vector256 */ false); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_T1S; + input_size_in_bits = EVEX_32bit; + } + emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_F3, AVX_128bit); } void Assembler::vdivss(XMMRegister dst, XMMRegister nds, XMMRegister src) { assert(VM_Version::supports_avx(), ""); - emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_F3, /* vector256 */ false); + emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_F3, AVX_128bit); } void Assembler::vmulsd(XMMRegister dst, XMMRegister nds, Address src) { assert(VM_Version::supports_avx(), ""); - emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_F2, /* vector256 */ false); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_T1S; + input_size_in_bits = EVEX_64bit; + emit_vex_arith_q(0x59, dst, nds, src, VEX_SIMD_F2, AVX_128bit); + } else { + emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_F2, AVX_128bit); + } } void Assembler::vmulsd(XMMRegister dst, XMMRegister nds, XMMRegister src) { assert(VM_Version::supports_avx(), ""); - emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_F2, /* vector256 */ false); + if (VM_Version::supports_evex()) { + emit_vex_arith_q(0x59, dst, nds, src, VEX_SIMD_F2, AVX_128bit); + } else { + emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_F2, AVX_128bit); + } } void Assembler::vmulss(XMMRegister dst, XMMRegister nds, Address src) { assert(VM_Version::supports_avx(), ""); - emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_F3, /* vector256 */ false); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_T1S; + input_size_in_bits = EVEX_32bit; 
+ } + emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_F3, AVX_128bit); } void Assembler::vmulss(XMMRegister dst, XMMRegister nds, XMMRegister src) { assert(VM_Version::supports_avx(), ""); - emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_F3, /* vector256 */ false); + emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_F3, AVX_128bit); } void Assembler::vsubsd(XMMRegister dst, XMMRegister nds, Address src) { assert(VM_Version::supports_avx(), ""); - emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_F2, /* vector256 */ false); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_T1S; + input_size_in_bits = EVEX_64bit; + emit_vex_arith_q(0x5C, dst, nds, src, VEX_SIMD_F2, AVX_128bit); + } else { + emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_F2, AVX_128bit); + } } void Assembler::vsubsd(XMMRegister dst, XMMRegister nds, XMMRegister src) { assert(VM_Version::supports_avx(), ""); - emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_F2, /* vector256 */ false); + if (VM_Version::supports_evex()) { + emit_vex_arith_q(0x5C, dst, nds, src, VEX_SIMD_F2, AVX_128bit); + } else { + emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_F2, AVX_128bit); + } } void Assembler::vsubss(XMMRegister dst, XMMRegister nds, Address src) { assert(VM_Version::supports_avx(), ""); - emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_F3, /* vector256 */ false); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_T1S; + input_size_in_bits = EVEX_32bit; + } + emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_F3, AVX_128bit); } void Assembler::vsubss(XMMRegister dst, XMMRegister nds, XMMRegister src) { assert(VM_Version::supports_avx(), ""); - emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_F3, /* vector256 */ false); + emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_F3, AVX_128bit); } //====================VECTOR ARITHMETIC===================================== @@ -3159,7 +3780,11 @@ void Assembler::addpd(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - emit_simd_arith(0x58, dst, src, 
VEX_SIMD_66); + if (VM_Version::supports_evex()) { + emit_simd_arith_q(0x58, dst, src, VEX_SIMD_66); + } else { + emit_simd_arith(0x58, dst, src, VEX_SIMD_66); + } } void Assembler::addps(XMMRegister dst, XMMRegister src) { @@ -3167,29 +3792,47 @@ emit_simd_arith(0x58, dst, src, VEX_SIMD_NONE); } -void Assembler::vaddpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { +void Assembler::vaddpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { assert(VM_Version::supports_avx(), ""); - emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_66, vector256); + if (VM_Version::supports_evex()) { + emit_vex_arith_q(0x58, dst, nds, src, VEX_SIMD_66, vector_len); + } else { + emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_66, vector_len); + } } -void Assembler::vaddps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { +void Assembler::vaddps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { assert(VM_Version::supports_avx(), ""); - emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_NONE, vector256); + emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_NONE, vector_len); } -void Assembler::vaddpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { +void Assembler::vaddpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { assert(VM_Version::supports_avx(), ""); - emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_66, vector256); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_FV; + input_size_in_bits = EVEX_64bit; + emit_vex_arith_q(0x58, dst, nds, src, VEX_SIMD_66, vector_len); + } else { + emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_66, vector_len); + } } -void Assembler::vaddps(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { +void Assembler::vaddps(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { assert(VM_Version::supports_avx(), ""); - emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_NONE, vector256); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_FV; + 
input_size_in_bits = EVEX_32bit; + } + emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_NONE, vector_len); } void Assembler::subpd(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - emit_simd_arith(0x5C, dst, src, VEX_SIMD_66); + if (VM_Version::supports_evex()) { + emit_simd_arith_q(0x5C, dst, src, VEX_SIMD_66); + } else { + emit_simd_arith(0x5C, dst, src, VEX_SIMD_66); + } } void Assembler::subps(XMMRegister dst, XMMRegister src) { @@ -3197,29 +3840,47 @@ emit_simd_arith(0x5C, dst, src, VEX_SIMD_NONE); } -void Assembler::vsubpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { +void Assembler::vsubpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { assert(VM_Version::supports_avx(), ""); - emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_66, vector256); + if (VM_Version::supports_evex()) { + emit_vex_arith_q(0x5C, dst, nds, src, VEX_SIMD_66, vector_len); + } else { + emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_66, vector_len); + } } -void Assembler::vsubps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { +void Assembler::vsubps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { assert(VM_Version::supports_avx(), ""); - emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_NONE, vector256); + emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_NONE, vector_len); } -void Assembler::vsubpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { +void Assembler::vsubpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { assert(VM_Version::supports_avx(), ""); - emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_66, vector256); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_FV; + input_size_in_bits = EVEX_64bit; + emit_vex_arith_q(0x5C, dst, nds, src, VEX_SIMD_66, vector_len); + } else { + emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_66, vector_len); + } } -void Assembler::vsubps(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { +void 
Assembler::vsubps(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { assert(VM_Version::supports_avx(), ""); - emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_NONE, vector256); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_FV; + input_size_in_bits = EVEX_32bit; + } + emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_NONE, vector_len); } void Assembler::mulpd(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - emit_simd_arith(0x59, dst, src, VEX_SIMD_66); + if (VM_Version::supports_evex()) { + emit_simd_arith_q(0x59, dst, src, VEX_SIMD_66); + } else { + emit_simd_arith(0x59, dst, src, VEX_SIMD_66); + } } void Assembler::mulps(XMMRegister dst, XMMRegister src) { @@ -3227,29 +3888,47 @@ emit_simd_arith(0x59, dst, src, VEX_SIMD_NONE); } -void Assembler::vmulpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { +void Assembler::vmulpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { assert(VM_Version::supports_avx(), ""); - emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_66, vector256); + if (VM_Version::supports_evex()) { + emit_vex_arith_q(0x59, dst, nds, src, VEX_SIMD_66, vector_len); + } else { + emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_66, vector_len); + } } -void Assembler::vmulps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { +void Assembler::vmulps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { assert(VM_Version::supports_avx(), ""); - emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_NONE, vector256); + emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_NONE, vector_len); } -void Assembler::vmulpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { +void Assembler::vmulpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { assert(VM_Version::supports_avx(), ""); - emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_66, vector256); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_FV; + input_size_in_bits = 
EVEX_64bit; + emit_vex_arith_q(0x59, dst, nds, src, VEX_SIMD_66, vector_len); + } else { + emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_66, vector_len); + } } -void Assembler::vmulps(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { +void Assembler::vmulps(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { assert(VM_Version::supports_avx(), ""); - emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_NONE, vector256); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_FV; + input_size_in_bits = EVEX_32bit; + } + emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_NONE, vector_len); } void Assembler::divpd(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - emit_simd_arith(0x5E, dst, src, VEX_SIMD_66); + if (VM_Version::supports_evex()) { + emit_simd_arith_q(0x5E, dst, src, VEX_SIMD_66); + } else { + emit_simd_arith(0x5E, dst, src, VEX_SIMD_66); + } } void Assembler::divps(XMMRegister dst, XMMRegister src) { @@ -3257,118 +3936,199 @@ emit_simd_arith(0x5E, dst, src, VEX_SIMD_NONE); } -void Assembler::vdivpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { +void Assembler::vdivpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { assert(VM_Version::supports_avx(), ""); - emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_66, vector256); + if (VM_Version::supports_evex()) { + emit_vex_arith_q(0x5E, dst, nds, src, VEX_SIMD_66, vector_len); + } else { + emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_66, vector_len); + } } -void Assembler::vdivps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { +void Assembler::vdivps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { assert(VM_Version::supports_avx(), ""); - emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_NONE, vector256); + emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_NONE, vector_len); } -void Assembler::vdivpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { +void 
Assembler::vdivpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { assert(VM_Version::supports_avx(), ""); - emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_66, vector256); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_FV; + input_size_in_bits = EVEX_64bit; + emit_vex_arith_q(0x5E, dst, nds, src, VEX_SIMD_66, vector_len); + } else { + emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_66, vector_len); + } } -void Assembler::vdivps(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { +void Assembler::vdivps(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { assert(VM_Version::supports_avx(), ""); - emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_NONE, vector256); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_FV; + input_size_in_bits = EVEX_32bit; + } + emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_NONE, vector_len); } void Assembler::andpd(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - emit_simd_arith(0x54, dst, src, VEX_SIMD_66); + if (VM_Version::supports_evex() && VM_Version::supports_avx512dq()) { + emit_simd_arith_q(0x54, dst, src, VEX_SIMD_66); + } else { + emit_simd_arith(0x54, dst, src, VEX_SIMD_66, false, true); + } } void Assembler::andps(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse(), "")); - emit_simd_arith(0x54, dst, src, VEX_SIMD_NONE); + emit_simd_arith(0x54, dst, src, VEX_SIMD_NONE, false, + (VM_Version::supports_avx512dq() == false)); } void Assembler::andps(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse(), "")); - emit_simd_arith(0x54, dst, src, VEX_SIMD_NONE); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_FV; + input_size_in_bits = EVEX_32bit; + } + emit_simd_arith(0x54, dst, src, VEX_SIMD_NONE, + false, (VM_Version::supports_avx512dq() == false)); } void Assembler::andpd(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - emit_simd_arith(0x54, dst, src, 
VEX_SIMD_66); + if (VM_Version::supports_evex() && VM_Version::supports_avx512dq()) { + tuple_type = EVEX_FV; + input_size_in_bits = EVEX_64bit; + emit_simd_arith_q(0x54, dst, src, VEX_SIMD_66); + } else { + emit_simd_arith(0x54, dst, src, VEX_SIMD_66, false, true); + } } -void Assembler::vandpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { +void Assembler::vandpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { assert(VM_Version::supports_avx(), ""); - emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_66, vector256); + if (VM_Version::supports_evex() && VM_Version::supports_avx512dq()) { + emit_vex_arith_q(0x54, dst, nds, src, VEX_SIMD_66, vector_len); + } else { + emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_66, vector_len, true); + } } -void Assembler::vandps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { +void Assembler::vandps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { assert(VM_Version::supports_avx(), ""); - emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_NONE, vector256); + bool legacy_mode = (VM_Version::supports_avx512dq() == false); + emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_NONE, vector_len, legacy_mode); } -void Assembler::vandpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { +void Assembler::vandpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { assert(VM_Version::supports_avx(), ""); - emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_66, vector256); + if (VM_Version::supports_evex() && VM_Version::supports_avx512dq()) { + tuple_type = EVEX_FV; + input_size_in_bits = EVEX_64bit; + emit_vex_arith_q(0x54, dst, nds, src, VEX_SIMD_66, vector_len); + } else { + emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_66, vector_len, true); + } } -void Assembler::vandps(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { +void Assembler::vandps(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { 
assert(VM_Version::supports_avx(), ""); - emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_NONE, vector256); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_FV; + input_size_in_bits = EVEX_32bit; + } + emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_NONE, vector_len, + (VM_Version::supports_avx512dq() == false)); } void Assembler::xorpd(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - emit_simd_arith(0x57, dst, src, VEX_SIMD_66); + if (VM_Version::supports_evex() && VM_Version::supports_avx512dq()) { + emit_simd_arith_q(0x57, dst, src, VEX_SIMD_66); + } else { + emit_simd_arith(0x57, dst, src, VEX_SIMD_66, false, true); + } } void Assembler::xorps(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse(), "")); - emit_simd_arith(0x57, dst, src, VEX_SIMD_NONE); + emit_simd_arith(0x57, dst, src, VEX_SIMD_NONE, + false, (VM_Version::supports_avx512dq() == false)); } void Assembler::xorpd(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - emit_simd_arith(0x57, dst, src, VEX_SIMD_66); + if (VM_Version::supports_evex() && VM_Version::supports_avx512dq()) { + tuple_type = EVEX_FV; + input_size_in_bits = EVEX_64bit; + emit_simd_arith_q(0x57, dst, src, VEX_SIMD_66); + } else { + emit_simd_arith(0x57, dst, src, VEX_SIMD_66, false, true); + } } void Assembler::xorps(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse(), "")); - emit_simd_arith(0x57, dst, src, VEX_SIMD_NONE); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_FV; + input_size_in_bits = EVEX_32bit; + } + emit_simd_arith(0x57, dst, src, VEX_SIMD_NONE, false, + (VM_Version::supports_avx512dq() == false)); } -void Assembler::vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { +void Assembler::vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { assert(VM_Version::supports_avx(), ""); - emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_66, vector256); 
+ if (VM_Version::supports_evex() && VM_Version::supports_avx512dq()) { + emit_vex_arith_q(0x57, dst, nds, src, VEX_SIMD_66, vector_len); + } else { + emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_66, vector_len, true); + } } -void Assembler::vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { +void Assembler::vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { assert(VM_Version::supports_avx(), ""); - emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_NONE, vector256); + emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_NONE, vector_len, + (VM_Version::supports_avx512dq() == false)); } -void Assembler::vxorpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { +void Assembler::vxorpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { assert(VM_Version::supports_avx(), ""); - emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_66, vector256); + if (VM_Version::supports_evex() && VM_Version::supports_avx512dq()) { + tuple_type = EVEX_FV; + input_size_in_bits = EVEX_64bit; + emit_vex_arith_q(0x57, dst, nds, src, VEX_SIMD_66, vector_len); + } else { + emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_66, vector_len, true); + } } -void Assembler::vxorps(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { +void Assembler::vxorps(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { assert(VM_Version::supports_avx(), ""); - emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_NONE, vector256); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_FV; + input_size_in_bits = EVEX_32bit; + } + emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_NONE, vector_len, + (VM_Version::supports_avx512dq() == false)); } - // Integer vector arithmetic -void Assembler::vphaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { - assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); - int encode = vex_prefix_and_encode(dst, nds, src, 
VEX_SIMD_66, vector256, VEX_OPCODE_0F_38); +void Assembler::vphaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + assert(VM_Version::supports_avx() && (vector_len == 0) || + VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); + int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector_len, + VEX_OPCODE_0F_38, true, false); emit_int8(0x01); emit_int8((unsigned char)(0xC0 | encode)); } -void Assembler::vphaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { - assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); - int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector256, VEX_OPCODE_0F_38); +void Assembler::vphaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + assert(VM_Version::supports_avx() && (vector_len == 0) || + VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); + int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector_len, + VEX_OPCODE_0F_38, true, false); emit_int8(0x02); emit_int8((unsigned char)(0xC0 | encode)); } @@ -3390,61 +4150,89 @@ void Assembler::paddq(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - emit_simd_arith(0xD4, dst, src, VEX_SIMD_66); + if (VM_Version::supports_evex()) { + emit_simd_arith_q(0xD4, dst, src, VEX_SIMD_66); + } else { + emit_simd_arith(0xD4, dst, src, VEX_SIMD_66); + } } void Assembler::phaddw(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse3(), "")); - int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38); + int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, false, + VEX_OPCODE_0F_38, false, AVX_128bit, true); emit_int8(0x01); emit_int8((unsigned char)(0xC0 | encode)); } void Assembler::phaddd(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse3(), "")); - int encode = 
simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38); + int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, false, + VEX_OPCODE_0F_38, false, AVX_128bit, true); emit_int8(0x02); emit_int8((unsigned char)(0xC0 | encode)); } -void Assembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { - assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); - emit_vex_arith(0xFC, dst, nds, src, VEX_SIMD_66, vector256); +void Assembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + assert(UseAVX > 0, "requires some form of AVX"); + emit_vex_arith(0xFC, dst, nds, src, VEX_SIMD_66, vector_len, + (VM_Version::supports_avx512bw() == false)); } -void Assembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { - assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); - emit_vex_arith(0xFD, dst, nds, src, VEX_SIMD_66, vector256); +void Assembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + assert(UseAVX > 0, "requires some form of AVX"); + emit_vex_arith(0xFD, dst, nds, src, VEX_SIMD_66, vector_len, + (VM_Version::supports_avx512bw() == false)); } -void Assembler::vpaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { - assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); - emit_vex_arith(0xFE, dst, nds, src, VEX_SIMD_66, vector256); +void Assembler::vpaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + assert(UseAVX > 0, "requires some form of AVX"); + emit_vex_arith(0xFE, dst, nds, src, VEX_SIMD_66, vector_len); } -void Assembler::vpaddq(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { - assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit 
integer vectors requires AVX2"); - emit_vex_arith(0xD4, dst, nds, src, VEX_SIMD_66, vector256); +void Assembler::vpaddq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + assert(UseAVX > 0, "requires some form of AVX"); + if (VM_Version::supports_evex()) { + emit_vex_arith_q(0xD4, dst, nds, src, VEX_SIMD_66, vector_len); + } else { + emit_vex_arith(0xD4, dst, nds, src, VEX_SIMD_66, vector_len); + } } -void Assembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { - assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); - emit_vex_arith(0xFC, dst, nds, src, VEX_SIMD_66, vector256); +void Assembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { + assert(UseAVX > 0, "requires some form of AVX"); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_FVM; + } + emit_vex_arith(0xFC, dst, nds, src, VEX_SIMD_66, vector_len); } -void Assembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { - assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); - emit_vex_arith(0xFD, dst, nds, src, VEX_SIMD_66, vector256); +void Assembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { + assert(UseAVX > 0, "requires some form of AVX"); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_FVM; + } + emit_vex_arith(0xFD, dst, nds, src, VEX_SIMD_66, vector_len); } -void Assembler::vpaddd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { - assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); - emit_vex_arith(0xFE, dst, nds, src, VEX_SIMD_66, vector256); +void Assembler::vpaddd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { + assert(UseAVX > 0, "requires some form of AVX"); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_FV; + 
input_size_in_bits = EVEX_32bit; + } + emit_vex_arith(0xFE, dst, nds, src, VEX_SIMD_66, vector_len); } -void Assembler::vpaddq(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { - assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); - emit_vex_arith(0xD4, dst, nds, src, VEX_SIMD_66, vector256); +void Assembler::vpaddq(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { + assert(UseAVX > 0, "requires some form of AVX"); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_FV; + input_size_in_bits = EVEX_64bit; + emit_vex_arith_q(0xD4, dst, nds, src, VEX_SIMD_66, vector_len); + } else { + emit_vex_arith(0xD4, dst, nds, src, VEX_SIMD_66, vector_len); + } } void Assembler::psubb(XMMRegister dst, XMMRegister src) { @@ -3464,84 +4252,149 @@ void Assembler::psubq(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - emit_simd_arith(0xFB, dst, src, VEX_SIMD_66); + if (VM_Version::supports_evex()) { + emit_simd_arith_q(0xFB, dst, src, VEX_SIMD_66); + } else { + emit_simd_arith(0xFB, dst, src, VEX_SIMD_66); + } } -void Assembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { - assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); - emit_vex_arith(0xF8, dst, nds, src, VEX_SIMD_66, vector256); +void Assembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + assert(UseAVX > 0, "requires some form of AVX"); + emit_vex_arith(0xF8, dst, nds, src, VEX_SIMD_66, vector_len, + (VM_Version::supports_avx512bw() == false)); } -void Assembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { - assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); - emit_vex_arith(0xF9, dst, nds, src, VEX_SIMD_66, vector256); +void 
Assembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + assert(UseAVX > 0, "requires some form of AVX"); + emit_vex_arith(0xF9, dst, nds, src, VEX_SIMD_66, vector_len, + (VM_Version::supports_avx512bw() == false)); } -void Assembler::vpsubd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { - assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); - emit_vex_arith(0xFA, dst, nds, src, VEX_SIMD_66, vector256); +void Assembler::vpsubd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + assert(UseAVX > 0, "requires some form of AVX"); + emit_vex_arith(0xFA, dst, nds, src, VEX_SIMD_66, vector_len); } -void Assembler::vpsubq(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { - assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); - emit_vex_arith(0xFB, dst, nds, src, VEX_SIMD_66, vector256); +void Assembler::vpsubq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + assert(UseAVX > 0, "requires some form of AVX"); + if (VM_Version::supports_evex()) { + emit_vex_arith_q(0xFB, dst, nds, src, VEX_SIMD_66, vector_len); + } else { + emit_vex_arith(0xFB, dst, nds, src, VEX_SIMD_66, vector_len); + } } -void Assembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { - assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); - emit_vex_arith(0xF8, dst, nds, src, VEX_SIMD_66, vector256); +void Assembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { + assert(UseAVX > 0, "requires some form of AVX"); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_FVM; + } + emit_vex_arith(0xF8, dst, nds, src, VEX_SIMD_66, vector_len, + (VM_Version::supports_avx512bw() == false)); } -void Assembler::vpsubw(XMMRegister dst, XMMRegister nds, 
Address src, bool vector256) { - assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); - emit_vex_arith(0xF9, dst, nds, src, VEX_SIMD_66, vector256); +void Assembler::vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { + assert(UseAVX > 0, "requires some form of AVX"); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_FVM; + } + emit_vex_arith(0xF9, dst, nds, src, VEX_SIMD_66, vector_len, + (VM_Version::supports_avx512bw() == false)); } -void Assembler::vpsubd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { - assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); - emit_vex_arith(0xFA, dst, nds, src, VEX_SIMD_66, vector256); +void Assembler::vpsubd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { + assert(UseAVX > 0, "requires some form of AVX"); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_FV; + input_size_in_bits = EVEX_32bit; + } + emit_vex_arith(0xFA, dst, nds, src, VEX_SIMD_66, vector_len); } -void Assembler::vpsubq(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { - assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); - emit_vex_arith(0xFB, dst, nds, src, VEX_SIMD_66, vector256); +void Assembler::vpsubq(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { + assert(UseAVX > 0, "requires some form of AVX"); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_FV; + input_size_in_bits = EVEX_64bit; + emit_vex_arith_q(0xFB, dst, nds, src, VEX_SIMD_66, vector_len); + } else { + emit_vex_arith(0xFB, dst, nds, src, VEX_SIMD_66, vector_len); + } } void Assembler::pmullw(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - emit_simd_arith(0xD5, dst, src, VEX_SIMD_66); + emit_simd_arith(0xD5, dst, src, VEX_SIMD_66, + 
(VM_Version::supports_avx512bw() == false)); } void Assembler::pmulld(XMMRegister dst, XMMRegister src) { assert(VM_Version::supports_sse4_1(), ""); - int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38); + int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, + false, VEX_OPCODE_0F_38); emit_int8(0x40); emit_int8((unsigned char)(0xC0 | encode)); } -void Assembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { - assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); - emit_vex_arith(0xD5, dst, nds, src, VEX_SIMD_66, vector256); +void Assembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + assert(UseAVX > 0, "requires some form of AVX"); + emit_vex_arith(0xD5, dst, nds, src, VEX_SIMD_66, vector_len, + (VM_Version::supports_avx512bw() == false)); +} + +void Assembler::vpmulld(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + assert(UseAVX > 0, "requires some form of AVX"); + int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, + vector_len, VEX_OPCODE_0F_38); + emit_int8(0x40); + emit_int8((unsigned char)(0xC0 | encode)); } -void Assembler::vpmulld(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { - assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); - int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector256, VEX_OPCODE_0F_38); +void Assembler::vpmullq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + assert(UseAVX > 2, "requires some form of AVX"); + int src_enc = src->encoding(); + int dst_enc = dst->encoding(); + int nds_enc = nds->is_valid() ? 
nds->encoding() : 0; + int encode = vex_prefix_and_encode(dst_enc, nds_enc, src_enc, VEX_SIMD_66, + VEX_OPCODE_0F_38, true, vector_len, false, false); emit_int8(0x40); emit_int8((unsigned char)(0xC0 | encode)); } -void Assembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { - assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); - emit_vex_arith(0xD5, dst, nds, src, VEX_SIMD_66, vector256); +void Assembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { + assert(UseAVX > 0, "requires some form of AVX"); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_FVM; + } + emit_vex_arith(0xD5, dst, nds, src, VEX_SIMD_66, vector_len); } -void Assembler::vpmulld(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { - assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); +void Assembler::vpmulld(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { + assert(UseAVX > 0, "requires some form of AVX"); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_FV; + input_size_in_bits = EVEX_32bit; + } InstructionMark im(this); int dst_enc = dst->encoding(); int nds_enc = nds->is_valid() ? nds->encoding() : 0; - vex_prefix(src, nds_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, false, vector256); + vex_prefix(src, nds_enc, dst_enc, VEX_SIMD_66, + VEX_OPCODE_0F_38, false, vector_len); + emit_int8(0x40); + emit_operand(dst, src); +} + +void Assembler::vpmullq(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { + assert(UseAVX > 0, "requires some form of AVX"); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_FV; + input_size_in_bits = EVEX_64bit; + } + InstructionMark im(this); + int dst_enc = dst->encoding(); + int nds_enc = nds->is_valid() ? 
nds->encoding() : 0; + vex_prefix(src, nds_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, true, vector_len); emit_int8(0x40); emit_operand(dst, src); } @@ -3550,7 +4403,8 @@ void Assembler::psllw(XMMRegister dst, int shift) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); // XMM6 is for /6 encoding: 66 0F 71 /6 ib - int encode = simd_prefix_and_encode(xmm6, dst, dst, VEX_SIMD_66); + int encode = simd_prefix_and_encode(xmm6, dst, dst, VEX_SIMD_66, false, VEX_OPCODE_0F, + false, AVX_128bit, (VM_Version::supports_avx512bw() == false)); emit_int8(0x71); emit_int8((unsigned char)(0xC0 | encode)); emit_int8(shift & 0xFF); @@ -3559,7 +4413,7 @@ void Assembler::pslld(XMMRegister dst, int shift) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); // XMM6 is for /6 encoding: 66 0F 72 /6 ib - int encode = simd_prefix_and_encode(xmm6, dst, dst, VEX_SIMD_66); + int encode = simd_prefix_and_encode(xmm6, dst, dst, VEX_SIMD_66, false); emit_int8(0x72); emit_int8((unsigned char)(0xC0 | encode)); emit_int8(shift & 0xFF); @@ -3568,7 +4422,7 @@ void Assembler::psllq(XMMRegister dst, int shift) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); // XMM6 is for /6 encoding: 66 0F 73 /6 ib - int encode = simd_prefix_and_encode(xmm6, dst, dst, VEX_SIMD_66); + int encode = simd_prefix_and_encode(xmm6, dst, dst, VEX_SIMD_66, false, VEX_OPCODE_0F, true); emit_int8(0x73); emit_int8((unsigned char)(0xC0 | encode)); emit_int8(shift & 0xFF); @@ -3576,7 +4430,8 @@ void Assembler::psllw(XMMRegister dst, XMMRegister shift) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - emit_simd_arith(0xF1, dst, shift, VEX_SIMD_66); + emit_simd_arith(0xF1, dst, shift, VEX_SIMD_66, false, + (VM_Version::supports_avx512bw() == false)); } void Assembler::pslld(XMMRegister dst, XMMRegister shift) { @@ -3586,50 +4441,65 @@ void Assembler::psllq(XMMRegister dst, XMMRegister shift) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - emit_simd_arith(0xF3, dst, shift, VEX_SIMD_66); + if 
(VM_Version::supports_evex()) { + emit_simd_arith_q(0xF3, dst, shift, VEX_SIMD_66); + } else { + emit_simd_arith(0xF3, dst, shift, VEX_SIMD_66); + } } -void Assembler::vpsllw(XMMRegister dst, XMMRegister src, int shift, bool vector256) { - assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); +void Assembler::vpsllw(XMMRegister dst, XMMRegister src, int shift, int vector_len) { + assert(UseAVX > 0, "requires some form of AVX"); // XMM6 is for /6 encoding: 66 0F 71 /6 ib - emit_vex_arith(0x71, xmm6, dst, src, VEX_SIMD_66, vector256); + emit_vex_arith(0x71, xmm6, dst, src, VEX_SIMD_66, vector_len, + (VM_Version::supports_avx512bw() == false)); emit_int8(shift & 0xFF); } -void Assembler::vpslld(XMMRegister dst, XMMRegister src, int shift, bool vector256) { - assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); +void Assembler::vpslld(XMMRegister dst, XMMRegister src, int shift, int vector_len) { + assert(UseAVX > 0, "requires some form of AVX"); // XMM6 is for /6 encoding: 66 0F 72 /6 ib - emit_vex_arith(0x72, xmm6, dst, src, VEX_SIMD_66, vector256); + emit_vex_arith(0x72, xmm6, dst, src, VEX_SIMD_66, vector_len); emit_int8(shift & 0xFF); } -void Assembler::vpsllq(XMMRegister dst, XMMRegister src, int shift, bool vector256) { - assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); +void Assembler::vpsllq(XMMRegister dst, XMMRegister src, int shift, int vector_len) { + assert(UseAVX > 0, "requires some form of AVX"); // XMM6 is for /6 encoding: 66 0F 73 /6 ib - emit_vex_arith(0x73, xmm6, dst, src, VEX_SIMD_66, vector256); + if (VM_Version::supports_evex()) { + emit_vex_arith_q(0x73, xmm6, dst, src, VEX_SIMD_66, vector_len); + } else { + emit_vex_arith(0x73, xmm6, dst, src, VEX_SIMD_66, vector_len); + } emit_int8(shift & 0xFF); } -void Assembler::vpsllw(XMMRegister 
dst, XMMRegister src, XMMRegister shift, bool vector256) { - assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); - emit_vex_arith(0xF1, dst, src, shift, VEX_SIMD_66, vector256); +void Assembler::vpsllw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) { + assert(UseAVX > 0, "requires some form of AVX"); + emit_vex_arith(0xF1, dst, src, shift, VEX_SIMD_66, vector_len, + (VM_Version::supports_avx512bw() == false)); } -void Assembler::vpslld(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256) { - assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); - emit_vex_arith(0xF2, dst, src, shift, VEX_SIMD_66, vector256); +void Assembler::vpslld(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) { + assert(UseAVX > 0, "requires some form of AVX"); + emit_vex_arith(0xF2, dst, src, shift, VEX_SIMD_66, vector_len); } -void Assembler::vpsllq(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256) { - assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); - emit_vex_arith(0xF3, dst, src, shift, VEX_SIMD_66, vector256); +void Assembler::vpsllq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) { + assert(UseAVX > 0, "requires some form of AVX"); + if (VM_Version::supports_evex()) { + emit_vex_arith_q(0xF3, dst, src, shift, VEX_SIMD_66, vector_len); + } else { + emit_vex_arith(0xF3, dst, src, shift, VEX_SIMD_66, vector_len); + } } // Shift packed integers logically right by specified number of bits. 
void Assembler::psrlw(XMMRegister dst, int shift) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); // XMM2 is for /2 encoding: 66 0F 71 /2 ib - int encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66); + int encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66, false, VEX_OPCODE_0F, + (VM_Version::supports_avx512bw() == false)); emit_int8(0x71); emit_int8((unsigned char)(0xC0 | encode)); emit_int8(shift & 0xFF); @@ -3638,7 +4508,7 @@ void Assembler::psrld(XMMRegister dst, int shift) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); // XMM2 is for /2 encoding: 66 0F 72 /2 ib - int encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66); + int encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66, false); emit_int8(0x72); emit_int8((unsigned char)(0xC0 | encode)); emit_int8(shift & 0xFF); @@ -3649,7 +4519,12 @@ // shifts 128 bit value in xmm register by number of bytes. NOT_LP64(assert(VM_Version::supports_sse2(), "")); // XMM2 is for /2 encoding: 66 0F 73 /2 ib - int encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66); + int encode = 0; + if (VM_Version::supports_evex() && VM_Version::supports_avx512bw()) { + encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66, true, VEX_OPCODE_0F, false); + } else { + encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66, false, VEX_OPCODE_0F, true); + } emit_int8(0x73); emit_int8((unsigned char)(0xC0 | encode)); emit_int8(shift & 0xFF); @@ -3657,7 +4532,8 @@ void Assembler::psrlw(XMMRegister dst, XMMRegister shift) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - emit_simd_arith(0xD1, dst, shift, VEX_SIMD_66); + emit_simd_arith(0xD1, dst, shift, VEX_SIMD_66, false, + (VM_Version::supports_avx512bw() == false)); } void Assembler::psrld(XMMRegister dst, XMMRegister shift) { @@ -3667,50 +4543,65 @@ void Assembler::psrlq(XMMRegister dst, XMMRegister shift) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - emit_simd_arith(0xD3, dst, shift, VEX_SIMD_66); + if 
(VM_Version::supports_evex()) { + emit_simd_arith_q(0xD3, dst, shift, VEX_SIMD_66); + } else { + emit_simd_arith(0xD3, dst, shift, VEX_SIMD_66); + } } -void Assembler::vpsrlw(XMMRegister dst, XMMRegister src, int shift, bool vector256) { - assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); +void Assembler::vpsrlw(XMMRegister dst, XMMRegister src, int shift, int vector_len) { + assert(UseAVX > 0, "requires some form of AVX"); // XMM2 is for /2 encoding: 66 0F 73 /2 ib - emit_vex_arith(0x71, xmm2, dst, src, VEX_SIMD_66, vector256); + emit_vex_arith(0x71, xmm2, dst, src, VEX_SIMD_66, vector_len, + (VM_Version::supports_avx512bw() == false)); emit_int8(shift & 0xFF); } -void Assembler::vpsrld(XMMRegister dst, XMMRegister src, int shift, bool vector256) { - assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); +void Assembler::vpsrld(XMMRegister dst, XMMRegister src, int shift, int vector_len) { + assert(UseAVX > 0, "requires some form of AVX"); // XMM2 is for /2 encoding: 66 0F 73 /2 ib - emit_vex_arith(0x72, xmm2, dst, src, VEX_SIMD_66, vector256); + emit_vex_arith(0x72, xmm2, dst, src, VEX_SIMD_66, vector_len); emit_int8(shift & 0xFF); } -void Assembler::vpsrlq(XMMRegister dst, XMMRegister src, int shift, bool vector256) { - assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); +void Assembler::vpsrlq(XMMRegister dst, XMMRegister src, int shift, int vector_len) { + assert(UseAVX > 0, "requires some form of AVX"); // XMM2 is for /2 encoding: 66 0F 73 /2 ib - emit_vex_arith(0x73, xmm2, dst, src, VEX_SIMD_66, vector256); + if (VM_Version::supports_evex()) { + emit_vex_arith_q(0x73, xmm2, dst, src, VEX_SIMD_66, vector_len); + } else { + emit_vex_arith(0x73, xmm2, dst, src, VEX_SIMD_66, vector_len); + } emit_int8(shift & 0xFF); } -void Assembler::vpsrlw(XMMRegister 
dst, XMMRegister src, XMMRegister shift, bool vector256) { - assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); - emit_vex_arith(0xD1, dst, src, shift, VEX_SIMD_66, vector256); +void Assembler::vpsrlw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) { + assert(UseAVX > 0, "requires some form of AVX"); + emit_vex_arith(0xD1, dst, src, shift, VEX_SIMD_66, vector_len, + (VM_Version::supports_avx512bw() == false)); } -void Assembler::vpsrld(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256) { - assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); - emit_vex_arith(0xD2, dst, src, shift, VEX_SIMD_66, vector256); +void Assembler::vpsrld(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) { + assert(UseAVX > 0, "requires some form of AVX"); + emit_vex_arith(0xD2, dst, src, shift, VEX_SIMD_66, vector_len); } -void Assembler::vpsrlq(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256) { - assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); - emit_vex_arith(0xD3, dst, src, shift, VEX_SIMD_66, vector256); +void Assembler::vpsrlq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) { + assert(UseAVX > 0, "requires some form of AVX"); + if (VM_Version::supports_evex()) { + emit_vex_arith_q(0xD3, dst, src, shift, VEX_SIMD_66, vector_len); + } else { + emit_vex_arith(0xD3, dst, src, shift, VEX_SIMD_66, vector_len); + } } // Shift packed integers arithmetically right by specified number of bits. 
void Assembler::psraw(XMMRegister dst, int shift) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); // XMM4 is for /4 encoding: 66 0F 71 /4 ib - int encode = simd_prefix_and_encode(xmm4, dst, dst, VEX_SIMD_66); + int encode = simd_prefix_and_encode(xmm4, dst, dst, VEX_SIMD_66, false, VEX_OPCODE_0F, + (VM_Version::supports_avx512bw() == false)); emit_int8(0x71); emit_int8((unsigned char)(0xC0 | encode)); emit_int8(shift & 0xFF); @@ -3719,7 +4610,7 @@ void Assembler::psrad(XMMRegister dst, int shift) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); // XMM4 is for /4 encoding: 66 0F 72 /4 ib - int encode = simd_prefix_and_encode(xmm4, dst, dst, VEX_SIMD_66); + int encode = simd_prefix_and_encode(xmm4, dst, dst, VEX_SIMD_66, false); emit_int8(0x72); emit_int8((unsigned char)(0xC0 | encode)); emit_int8(shift & 0xFF); @@ -3727,7 +4618,8 @@ void Assembler::psraw(XMMRegister dst, XMMRegister shift) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - emit_simd_arith(0xE1, dst, shift, VEX_SIMD_66); + emit_simd_arith(0xE1, dst, shift, VEX_SIMD_66, + (VM_Version::supports_avx512bw() == false)); } void Assembler::psrad(XMMRegister dst, XMMRegister shift) { @@ -3735,28 +4627,30 @@ emit_simd_arith(0xE2, dst, shift, VEX_SIMD_66); } -void Assembler::vpsraw(XMMRegister dst, XMMRegister src, int shift, bool vector256) { - assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); +void Assembler::vpsraw(XMMRegister dst, XMMRegister src, int shift, int vector_len) { + assert(UseAVX > 0, "requires some form of AVX"); // XMM4 is for /4 encoding: 66 0F 71 /4 ib - emit_vex_arith(0x71, xmm4, dst, src, VEX_SIMD_66, vector256); + emit_vex_arith(0x71, xmm4, dst, src, VEX_SIMD_66, vector_len, + (VM_Version::supports_avx512bw() == false)); emit_int8(shift & 0xFF); } -void Assembler::vpsrad(XMMRegister dst, XMMRegister src, int shift, bool vector256) { - assert(VM_Version::supports_avx() && !vector256 || 
VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); +void Assembler::vpsrad(XMMRegister dst, XMMRegister src, int shift, int vector_len) { + assert(UseAVX > 0, "requires some form of AVX"); // XMM4 is for /4 encoding: 66 0F 71 /4 ib - emit_vex_arith(0x72, xmm4, dst, src, VEX_SIMD_66, vector256); + emit_vex_arith(0x72, xmm4, dst, src, VEX_SIMD_66, vector_len); emit_int8(shift & 0xFF); } -void Assembler::vpsraw(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256) { - assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); - emit_vex_arith(0xE1, dst, src, shift, VEX_SIMD_66, vector256); +void Assembler::vpsraw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) { + assert(UseAVX > 0, "requires some form of AVX"); + emit_vex_arith(0xE1, dst, src, shift, VEX_SIMD_66, vector_len, + (VM_Version::supports_avx512bw() == false)); } -void Assembler::vpsrad(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256) { - assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); - emit_vex_arith(0xE2, dst, src, shift, VEX_SIMD_66, vector256); +void Assembler::vpsrad(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) { + assert(UseAVX > 0, "requires some form of AVX"); + emit_vex_arith(0xE2, dst, src, shift, VEX_SIMD_66, vector_len); } @@ -3766,14 +4660,18 @@ emit_simd_arith(0xDB, dst, src, VEX_SIMD_66); } -void Assembler::vpand(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { - assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); - emit_vex_arith(0xDB, dst, nds, src, VEX_SIMD_66, vector256); +void Assembler::vpand(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + assert(UseAVX > 0, "requires some form of AVX"); + emit_vex_arith(0xDB, dst, nds, src, VEX_SIMD_66, 
vector_len); } -void Assembler::vpand(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { - assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); - emit_vex_arith(0xDB, dst, nds, src, VEX_SIMD_66, vector256); +void Assembler::vpand(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { + assert(UseAVX > 0, "requires some form of AVX"); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_FV; + input_size_in_bits = EVEX_32bit; + } + emit_vex_arith(0xDB, dst, nds, src, VEX_SIMD_66, vector_len); } void Assembler::por(XMMRegister dst, XMMRegister src) { @@ -3781,14 +4679,18 @@ emit_simd_arith(0xEB, dst, src, VEX_SIMD_66); } -void Assembler::vpor(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { - assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); - emit_vex_arith(0xEB, dst, nds, src, VEX_SIMD_66, vector256); +void Assembler::vpor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + assert(UseAVX > 0, "requires some form of AVX"); + emit_vex_arith(0xEB, dst, nds, src, VEX_SIMD_66, vector_len); } -void Assembler::vpor(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { - assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); - emit_vex_arith(0xEB, dst, nds, src, VEX_SIMD_66, vector256); +void Assembler::vpor(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { + assert(UseAVX > 0, "requires some form of AVX"); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_FV; + input_size_in_bits = EVEX_32bit; + } + emit_vex_arith(0xEB, dst, nds, src, VEX_SIMD_66, vector_len); } void Assembler::pxor(XMMRegister dst, XMMRegister src) { @@ -3796,21 +4698,25 @@ emit_simd_arith(0xEF, dst, src, VEX_SIMD_66); } -void Assembler::vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, bool 
vector256) { - assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); - emit_vex_arith(0xEF, dst, nds, src, VEX_SIMD_66, vector256); +void Assembler::vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + assert(UseAVX > 0, "requires some form of AVX"); + emit_vex_arith(0xEF, dst, nds, src, VEX_SIMD_66, vector_len); } -void Assembler::vpxor(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { - assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); - emit_vex_arith(0xEF, dst, nds, src, VEX_SIMD_66, vector256); +void Assembler::vpxor(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { + assert(UseAVX > 0, "requires some form of AVX"); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_FV; + input_size_in_bits = EVEX_32bit; + } + emit_vex_arith(0xEF, dst, nds, src, VEX_SIMD_66, vector_len); } void Assembler::vinsertf128h(XMMRegister dst, XMMRegister nds, XMMRegister src) { assert(VM_Version::supports_avx(), ""); - bool vector256 = true; - int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector256, VEX_OPCODE_0F_3A); + int vector_len = AVX_256bit; + int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector_len, VEX_OPCODE_0F_3A); emit_int8(0x18); emit_int8((unsigned char)(0xC0 | encode)); // 0x00 - insert into lower 128 bits @@ -3818,14 +4724,51 @@ emit_int8(0x01); } +void Assembler::vinsertf64x4h(XMMRegister dst, XMMRegister nds, XMMRegister src) { + assert(VM_Version::supports_evex(), ""); + int vector_len = AVX_512bit; + int src_enc = src->encoding(); + int dst_enc = dst->encoding(); + int nds_enc = nds->is_valid() ? 
nds->encoding() : 0; + int encode = vex_prefix_and_encode(dst_enc, nds_enc, src_enc, VEX_SIMD_66, + VEX_OPCODE_0F_3A, true, vector_len, false, false); + emit_int8(0x1A); + emit_int8((unsigned char)(0xC0 | encode)); + // 0x00 - insert into lower 256 bits + // 0x01 - insert into upper 256 bits + emit_int8(0x01); +} + +void Assembler::vinsertf64x4h(XMMRegister dst, Address src) { + assert(VM_Version::supports_avx(), ""); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_T4; + input_size_in_bits = EVEX_64bit; + } + InstructionMark im(this); + int vector_len = AVX_512bit; + assert(dst != xnoreg, "sanity"); + int dst_enc = dst->encoding(); + // swap src<->dst for encoding + vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, true, vector_len); + emit_int8(0x1A); + emit_operand(dst, src); + // 0x01 - insert into upper 256 bits + emit_int8(0x01); +} + void Assembler::vinsertf128h(XMMRegister dst, Address src) { assert(VM_Version::supports_avx(), ""); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_T4; + input_size_in_bits = EVEX_32bit; + } InstructionMark im(this); - bool vector256 = true; + int vector_len = AVX_256bit; assert(dst != xnoreg, "sanity"); int dst_enc = dst->encoding(); // swap src<->dst for encoding - vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, false, vector256); + vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, false, vector_len); emit_int8(0x18); emit_operand(dst, src); // 0x01 - insert into upper 128 bits @@ -3834,8 +4777,8 @@ void Assembler::vextractf128h(XMMRegister dst, XMMRegister src) { assert(VM_Version::supports_avx(), ""); - bool vector256 = true; - int encode = vex_prefix_and_encode(src, xnoreg, dst, VEX_SIMD_66, vector256, VEX_OPCODE_0F_3A); + int vector_len = AVX_256bit; + int encode = vex_prefix_and_encode(src, xnoreg, dst, VEX_SIMD_66, vector_len, VEX_OPCODE_0F_3A); emit_int8(0x19); emit_int8((unsigned char)(0xC0 | encode)); // 0x00 - insert into lower 128 bits @@ -3845,11
+4788,15 @@ void Assembler::vextractf128h(Address dst, XMMRegister src) { assert(VM_Version::supports_avx(), ""); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_T4; + input_size_in_bits = EVEX_32bit; + } InstructionMark im(this); - bool vector256 = true; + int vector_len = AVX_256bit; assert(src != xnoreg, "sanity"); int src_enc = src->encoding(); - vex_prefix(dst, 0, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, false, vector256); + vex_prefix(dst, 0, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, false, vector_len); emit_int8(0x19); emit_operand(src, dst); // 0x01 - extract from upper 128 bits @@ -3858,8 +4805,8 @@ void Assembler::vinserti128h(XMMRegister dst, XMMRegister nds, XMMRegister src) { assert(VM_Version::supports_avx2(), ""); - bool vector256 = true; - int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector256, VEX_OPCODE_0F_3A); + int vector_len = AVX_256bit; + int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector_len, VEX_OPCODE_0F_3A); emit_int8(0x38); emit_int8((unsigned char)(0xC0 | encode)); // 0x00 - insert into lower 128 bits @@ -3867,38 +4814,169 @@ emit_int8(0x01); } +void Assembler::vinserti64x4h(XMMRegister dst, XMMRegister nds, XMMRegister src) { + assert(VM_Version::supports_evex(), ""); + int vector_len = AVX_512bit; + int src_enc = src->encoding(); + int dst_enc = dst->encoding(); + int nds_enc = nds->is_valid() ? 
nds->encoding() : 0; + int encode = vex_prefix_and_encode(dst_enc, nds_enc, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, + VM_Version::supports_avx512dq(), vector_len, false, false); + emit_int8(0x38); + emit_int8((unsigned char)(0xC0 | encode)); + // 0x00 - insert into lower 256 bits + // 0x01 - insert into upper 256 bits + emit_int8(0x01); +} + void Assembler::vinserti128h(XMMRegister dst, Address src) { assert(VM_Version::supports_avx2(), ""); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_T4; + input_size_in_bits = EVEX_32bit; + } InstructionMark im(this); - bool vector256 = true; + int vector_len = AVX_256bit; assert(dst != xnoreg, "sanity"); int dst_enc = dst->encoding(); // swap src<->dst for encoding - vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, false, vector256); + vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, false, vector_len); emit_int8(0x38); emit_operand(dst, src); // 0x01 - insert into upper 128 bits emit_int8(0x01); } +void Assembler::vextracti128h(XMMRegister dst, XMMRegister src) { + assert(VM_Version::supports_avx(), ""); + int vector_len = AVX_256bit; + int encode = vex_prefix_and_encode(src, xnoreg, dst, VEX_SIMD_66, vector_len, VEX_OPCODE_0F_3A); + emit_int8(0x39); + emit_int8((unsigned char)(0xC0 | encode)); + // 0x00 - extract from lower 128 bits + // 0x01 - extract from upper 128 bits + emit_int8(0x01); +} + void Assembler::vextracti128h(Address dst, XMMRegister src) { assert(VM_Version::supports_avx2(), ""); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_T4; + input_size_in_bits = EVEX_32bit; + } InstructionMark im(this); - bool vector256 = true; + int vector_len = AVX_256bit; assert(src != xnoreg, "sanity"); int src_enc = src->encoding(); - vex_prefix(dst, 0, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, false, vector256); + vex_prefix(dst, 0, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, false, vector_len); emit_int8(0x39); emit_operand(src, dst); // 0x01 - extract from upper 128 bits
emit_int8(0x01); } +void Assembler::vextracti64x4h(XMMRegister dst, XMMRegister src) { + assert(VM_Version::supports_evex(), ""); + int vector_len = AVX_512bit; + int src_enc = src->encoding(); + int dst_enc = dst->encoding(); + int encode = vex_prefix_and_encode(src_enc, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, + true, vector_len, false, false); + emit_int8(0x3B); + emit_int8((unsigned char)(0xC0 | encode)); + // 0x01 - extract from upper 256 bits + emit_int8(0x01); +} + +void Assembler::vextracti64x2h(XMMRegister dst, XMMRegister src, int value) { + assert(VM_Version::supports_evex(), ""); + int vector_len = AVX_512bit; + int src_enc = src->encoding(); + int dst_enc = dst->encoding(); + int encode = vex_prefix_and_encode(src_enc, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, + VM_Version::supports_avx512dq(), vector_len, false, false); + emit_int8(0x39); + emit_int8((unsigned char)(0xC0 | encode)); + // 0x01 - extract from bits 255:128 + // 0x02 - extract from bits 383:256 + // 0x03 - extract from bits 511:384 + emit_int8(value & 0x3); +} + +void Assembler::vextractf64x4h(XMMRegister dst, XMMRegister src) { + assert(VM_Version::supports_evex(), ""); + int vector_len = AVX_512bit; + int src_enc = src->encoding(); + int dst_enc = dst->encoding(); + int encode = vex_prefix_and_encode(src_enc, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, + VM_Version::supports_avx512dq(), vector_len, false, false); + emit_int8(0x1B); + emit_int8((unsigned char)(0xC0 | encode)); + // 0x01 - extract from upper 256 bits + emit_int8(0x01); +} + +void Assembler::vextractf64x4h(Address dst, XMMRegister src) { + assert(VM_Version::supports_avx2(), ""); + tuple_type = EVEX_T4; + input_size_in_bits = EVEX_64bit; + InstructionMark im(this); + int vector_len = AVX_512bit; + assert(src != xnoreg, "sanity"); + int src_enc = src->encoding(); + vex_prefix(dst, 0, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, + VM_Version::supports_avx512dq(), vector_len); + emit_int8(0x1B); + emit_operand(src, dst); + // 
0x01 - extract from upper 256 bits + emit_int8(0x01); +} + +void Assembler::vextractf32x4h(XMMRegister dst, XMMRegister src, int value) { + assert(VM_Version::supports_evex(), ""); + int vector_len = AVX_512bit; + int src_enc = src->encoding(); + int dst_enc = dst->encoding(); + int encode = vex_prefix_and_encode(src_enc, 0, dst_enc, VEX_SIMD_66, + VEX_OPCODE_0F_3A, false, vector_len, false, false); + emit_int8(0x19); + emit_int8((unsigned char)(0xC0 | encode)); + // 0x01 - extract from bits 255:128 + // 0x02 - extract from bits 383:256 + // 0x03 - extract from bits 511:384 + emit_int8(value & 0x3); +} + +void Assembler::vextractf64x2h(XMMRegister dst, XMMRegister src, int value) { + assert(VM_Version::supports_evex(), ""); + int vector_len = AVX_512bit; + int src_enc = src->encoding(); + int dst_enc = dst->encoding(); + int encode = vex_prefix_and_encode(src_enc, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, + VM_Version::supports_avx512dq(), vector_len, false, false); + emit_int8(0x19); + emit_int8((unsigned char)(0xC0 | encode)); + // 0x01 - extract from bits 255:128 + // 0x02 - extract from bits 383:256 + // 0x03 - extract from bits 511:384 + emit_int8(value & 0x3); +} + // duplicate 4-bytes integer data from src into 8 locations in dest void Assembler::vpbroadcastd(XMMRegister dst, XMMRegister src) { assert(VM_Version::supports_avx2(), ""); - bool vector256 = true; - int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, vector256, VEX_OPCODE_0F_38); + int vector_len = AVX_256bit; + int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, + vector_len, VEX_OPCODE_0F_38, false); + emit_int8(0x58); + emit_int8((unsigned char)(0xC0 | encode)); +} + +// duplicate 4-bytes integer data from src into 8 locations in dest +void Assembler::evpbroadcastd(XMMRegister dst, XMMRegister src, int vector_len) { + assert(VM_Version::supports_evex(), ""); + int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, + vector_len, VEX_OPCODE_0F_38, false);
emit_int8(0x58); emit_int8((unsigned char)(0xC0 | encode)); } @@ -3906,7 +4984,8 @@ // Carry-Less Multiplication Quadword void Assembler::pclmulqdq(XMMRegister dst, XMMRegister src, int mask) { assert(VM_Version::supports_clmul(), ""); - int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A); + int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, false, + VEX_OPCODE_0F_3A, false, AVX_128bit, true); emit_int8(0x44); emit_int8((unsigned char)(0xC0 | encode)); emit_int8((unsigned char)mask); @@ -3915,8 +4994,9 @@ // Carry-Less Multiplication Quadword void Assembler::vpclmulqdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int mask) { assert(VM_Version::supports_avx() && VM_Version::supports_clmul(), ""); - bool vector256 = false; - int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector256, VEX_OPCODE_0F_3A); + int vector_len = AVX_128bit; + int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, + vector_len, VEX_OPCODE_0F_3A, true); emit_int8(0x44); emit_int8((unsigned char)(0xC0 | encode)); emit_int8((unsigned char)mask); @@ -3924,8 +5004,11 @@ void Assembler::vzeroupper() { assert(VM_Version::supports_avx(), ""); - (void)vex_prefix_and_encode(xmm0, xmm0, xmm0, VEX_SIMD_NONE); - emit_int8(0x77); + if (UseAVX < 3) + { + (void)vex_prefix_and_encode(xmm0, xmm0, xmm0, VEX_SIMD_NONE); + emit_int8(0x77); + } } @@ -4442,7 +5525,7 @@ } -void Assembler::vex_prefix(bool vex_r, bool vex_b, bool vex_x, bool vex_w, int nds_enc, VexSimdPrefix pre, VexOpcode opc, bool vector256) { +void Assembler::vex_prefix(bool vex_r, bool vex_b, bool vex_x, bool vex_w, int nds_enc, VexSimdPrefix pre, VexOpcode opc, int vector_len) { if (vex_b || vex_x || vex_w || (opc == VEX_OPCODE_0F_38) || (opc == VEX_OPCODE_0F_3A)) { prefix(VEX_3bytes); @@ -4452,7 +5535,7 @@ emit_int8(byte1); int byte2 = ((~nds_enc) & 0xf) << 3; - byte2 |= (vex_w ? VEX_W : 0) | (vector256 ? 4 : 0) | pre; + byte2 |= (vex_w ? VEX_W : 0) | ((vector_len > 0) ? 
4 : 0) | pre; emit_int8(byte2); } else { prefix(VEX_2bytes); @@ -4460,89 +5543,237 @@ int byte1 = vex_r ? VEX_R : 0; byte1 = (~byte1) & 0x80; byte1 |= ((~nds_enc) & 0xf) << 3; - byte1 |= (vector256 ? 4 : 0) | pre; + byte1 |= ((vector_len > 0 ) ? 4 : 0) | pre; emit_int8(byte1); } } -void Assembler::vex_prefix(Address adr, int nds_enc, int xreg_enc, VexSimdPrefix pre, VexOpcode opc, bool vex_w, bool vector256){ +// This is a 4 byte encoding +void Assembler::evex_prefix(bool vex_r, bool vex_b, bool vex_x, bool vex_w, bool evex_r, bool evex_v, + int nds_enc, VexSimdPrefix pre, VexOpcode opc, + bool is_extended_context, bool is_merge_context, + int vector_len, bool no_mask_reg ){ + // EVEX 0x62 prefix + prefix(EVEX_4bytes); + evex_encoding = (vex_w ? VEX_W : 0) | (evex_r ? EVEX_Rb : 0); + + // P0: byte 2, initialized to RXBR`00mm + // instead of not'd + int byte2 = (vex_r ? VEX_R : 0) | (vex_x ? VEX_X : 0) | (vex_b ? VEX_B : 0) | (evex_r ? EVEX_Rb : 0); + byte2 = (~byte2) & 0xF0; + // confine opc opcode extensions in mm bits to lower two bits + // of form {0F, 0F_38, 0F_3A} + byte2 |= opc; + emit_int8(byte2); + + // P1: byte 3 as Wvvvv1pp + int byte3 = ((~nds_enc) & 0xf) << 3; + // p[10] is always 1 + byte3 |= EVEX_F; + byte3 |= (vex_w & 1) << 7; + // confine pre opcode extensions in pp bits to lower two bits + // of form {66, F3, F2} + byte3 |= pre; + emit_int8(byte3); + + // P2: byte 4 as zL'Lbv'aaa + int byte4 = (no_mask_reg) ? 0 : 1; // kregs are implemented in the low 3 bits as aaa (hard code k1, it will be initialized for now) + // EVEX.v` for extending EVEX.vvvv or VIDX + byte4 |= (evex_v ? 0: EVEX_V); + // third EVEX.b for broadcast actions + byte4 |= (is_extended_context ? EVEX_Rb : 0); + // fourth EVEX.L'L for vector length : 0 is 128, 1 is 256, 2 is 512, currently we do not support 1024 + byte4 |= ((vector_len) & 0x3) << 5; + // last is EVEX.z for zero/merge actions + byte4 |= (is_merge_context ?
EVEX_Z : 0); + emit_int8(byte4); +} + +void Assembler::vex_prefix(Address adr, int nds_enc, int xreg_enc, VexSimdPrefix pre, + VexOpcode opc, bool vex_w, int vector_len, bool legacy_mode, bool no_mask_reg) { bool vex_r = (xreg_enc >= 8); bool vex_b = adr.base_needs_rex(); bool vex_x = adr.index_needs_rex(); - vex_prefix(vex_r, vex_b, vex_x, vex_w, nds_enc, pre, opc, vector256); + avx_vector_len = vector_len; + + // if vector length is turned off, revert to AVX for vectors smaller than AVX_512bit + if (VM_Version::supports_avx512vl() == false) { + switch (vector_len) { + case AVX_128bit: + case AVX_256bit: + legacy_mode = true; + break; + } + } + + if ((UseAVX > 2) && (legacy_mode == false)) + { + bool evex_r = (xreg_enc >= 16); + bool evex_v = (nds_enc >= 16); + is_evex_instruction = true; + evex_prefix(vex_r, vex_b, vex_x, vex_w, evex_r, evex_v, nds_enc, pre, opc, false, false, vector_len, no_mask_reg); + } else { + vex_prefix(vex_r, vex_b, vex_x, vex_w, nds_enc, pre, opc, vector_len); + } } -int Assembler::vex_prefix_and_encode(int dst_enc, int nds_enc, int src_enc, VexSimdPrefix pre, VexOpcode opc, bool vex_w, bool vector256) { +int Assembler::vex_prefix_and_encode(int dst_enc, int nds_enc, int src_enc, VexSimdPrefix pre, VexOpcode opc, + bool vex_w, int vector_len, bool legacy_mode, bool no_mask_reg ) { bool vex_r = (dst_enc >= 8); bool vex_b = (src_enc >= 8); bool vex_x = false; - vex_prefix(vex_r, vex_b, vex_x, vex_w, nds_enc, pre, opc, vector256); + avx_vector_len = vector_len; + + // if vector length is turned off, revert to AVX for vectors smaller than AVX_512bit + if (VM_Version::supports_avx512vl() == false) { + switch (vector_len) { + case AVX_128bit: + case AVX_256bit: + legacy_mode = true; + break; + } + } + + if ((UseAVX > 2) && (legacy_mode == false)) + { + bool evex_r = (dst_enc >= 16); + bool evex_v = (nds_enc >= 16); + // can use vex_x as bank extender on rm encoding + vex_x = (src_enc >= 16); + evex_prefix(vex_r, vex_b, vex_x, vex_w, evex_r, 
evex_v, nds_enc, pre, opc, false, false, vector_len, no_mask_reg); + } else { + vex_prefix(vex_r, vex_b, vex_x, vex_w, nds_enc, pre, opc, vector_len); + } + + // return modrm byte components for operands return (((dst_enc & 7) << 3) | (src_enc & 7)); } -void Assembler::simd_prefix(XMMRegister xreg, XMMRegister nds, Address adr, VexSimdPrefix pre, VexOpcode opc, bool rex_w, bool vector256) { +void Assembler::simd_prefix(XMMRegister xreg, XMMRegister nds, Address adr, VexSimdPrefix pre, + bool no_mask_reg, VexOpcode opc, bool rex_w, int vector_len, bool legacy_mode) { if (UseAVX > 0) { int xreg_enc = xreg->encoding(); int nds_enc = nds->is_valid() ? nds->encoding() : 0; - vex_prefix(adr, nds_enc, xreg_enc, pre, opc, rex_w, vector256); + vex_prefix(adr, nds_enc, xreg_enc, pre, opc, rex_w, vector_len, legacy_mode, no_mask_reg); } else { assert((nds == xreg) || (nds == xnoreg), "wrong sse encoding"); rex_prefix(adr, xreg, pre, opc, rex_w); } } -int Assembler::simd_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegister src, VexSimdPrefix pre, VexOpcode opc, bool rex_w, bool vector256) { +int Assembler::simd_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegister src, VexSimdPrefix pre, + bool no_mask_reg, VexOpcode opc, bool rex_w, int vector_len, bool legacy_mode) { int dst_enc = dst->encoding(); int src_enc = src->encoding(); if (UseAVX > 0) { int nds_enc = nds->is_valid() ? 
nds->encoding() : 0; - return vex_prefix_and_encode(dst_enc, nds_enc, src_enc, pre, opc, rex_w, vector256); + return vex_prefix_and_encode(dst_enc, nds_enc, src_enc, pre, opc, rex_w, vector_len, legacy_mode, no_mask_reg); } else { assert((nds == dst) || (nds == src) || (nds == xnoreg), "wrong sse encoding"); return rex_prefix_and_encode(dst_enc, src_enc, pre, opc, rex_w); } } -void Assembler::emit_simd_arith(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre) { +int Assembler::kreg_prefix_and_encode(KRegister dst, KRegister nds, KRegister src, VexSimdPrefix pre, + bool no_mask_reg, VexOpcode opc, bool rex_w, int vector_len) { + int dst_enc = dst->encoding(); + int src_enc = src->encoding(); + int nds_enc = nds->is_valid() ? nds->encoding() : 0; + return vex_prefix_and_encode(dst_enc, nds_enc, src_enc, pre, opc, rex_w, vector_len, true, no_mask_reg); +} + +int Assembler::kreg_prefix_and_encode(KRegister dst, KRegister nds, Register src, VexSimdPrefix pre, + bool no_mask_reg, VexOpcode opc, bool rex_w, int vector_len) { + int dst_enc = dst->encoding(); + int src_enc = src->encoding(); + int nds_enc = nds->is_valid() ? 
nds->encoding() : 0; + return vex_prefix_and_encode(dst_enc, nds_enc, src_enc, pre, opc, rex_w, vector_len, true, no_mask_reg); +} + +void Assembler::emit_simd_arith(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre, bool no_mask_reg, bool legacy_mode) { + InstructionMark im(this); + simd_prefix(dst, dst, src, pre, no_mask_reg, VEX_OPCODE_0F, false, AVX_128bit, legacy_mode); + emit_int8(opcode); + emit_operand(dst, src); +} + +void Assembler::emit_simd_arith_q(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre, bool no_mask_reg) { InstructionMark im(this); - simd_prefix(dst, dst, src, pre); + simd_prefix_q(dst, dst, src, pre, no_mask_reg); emit_int8(opcode); emit_operand(dst, src); } -void Assembler::emit_simd_arith(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre) { - int encode = simd_prefix_and_encode(dst, dst, src, pre); +void Assembler::emit_simd_arith(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre, bool no_mask_reg, bool legacy_mode) { + int encode = simd_prefix_and_encode(dst, dst, src, pre, no_mask_reg, VEX_OPCODE_0F, false, AVX_128bit, legacy_mode); + emit_int8(opcode); + emit_int8((unsigned char)(0xC0 | encode)); +} + +void Assembler::emit_simd_arith_q(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre, bool no_mask_reg) { + int encode = simd_prefix_and_encode(dst, dst, src, pre, no_mask_reg, VEX_OPCODE_0F, true, AVX_128bit); emit_int8(opcode); emit_int8((unsigned char)(0xC0 | encode)); } // Versions with no second source register (non-destructive source). 
-void Assembler::emit_simd_arith_nonds(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre) { +void Assembler::emit_simd_arith_nonds(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre, bool opNoRegMask) { InstructionMark im(this); - simd_prefix(dst, xnoreg, src, pre); + simd_prefix(dst, xnoreg, src, pre, opNoRegMask); emit_int8(opcode); emit_operand(dst, src); } -void Assembler::emit_simd_arith_nonds(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre) { - int encode = simd_prefix_and_encode(dst, xnoreg, src, pre); +void Assembler::emit_simd_arith_nonds_q(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre, bool opNoRegMask) { + InstructionMark im(this); + simd_prefix_q(dst, xnoreg, src, pre, opNoRegMask); + emit_int8(opcode); + emit_operand(dst, src); +} + +void Assembler::emit_simd_arith_nonds(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre, bool no_mask_reg, bool legacy_mode) { + int encode = simd_prefix_and_encode(dst, xnoreg, src, pre, no_mask_reg, VEX_OPCODE_0F, legacy_mode, AVX_128bit); + emit_int8(opcode); + emit_int8((unsigned char)(0xC0 | encode)); +} + +void Assembler::emit_simd_arith_nonds_q(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre, bool no_mask_reg) { + int encode = simd_prefix_and_encode(dst, xnoreg, src, pre, no_mask_reg, VEX_OPCODE_0F, true, AVX_128bit); emit_int8(opcode); emit_int8((unsigned char)(0xC0 | encode)); } // 3-operands AVX instructions -void Assembler::emit_vex_arith(int opcode, XMMRegister dst, XMMRegister nds, - Address src, VexSimdPrefix pre, bool vector256) { +void Assembler::emit_vex_arith(int opcode, XMMRegister dst, XMMRegister nds, Address src, + VexSimdPrefix pre, int vector_len, bool no_mask_reg, bool legacy_mode) { InstructionMark im(this); - vex_prefix(dst, nds, src, pre, vector256); + vex_prefix(dst, nds, src, pre, vector_len, no_mask_reg, legacy_mode); emit_int8(opcode); emit_operand(dst, src); } -void Assembler::emit_vex_arith(int opcode, 
XMMRegister dst, XMMRegister nds, - XMMRegister src, VexSimdPrefix pre, bool vector256) { - int encode = vex_prefix_and_encode(dst, nds, src, pre, vector256); +void Assembler::emit_vex_arith_q(int opcode, XMMRegister dst, XMMRegister nds, + Address src, VexSimdPrefix pre, int vector_len, bool no_mask_reg) { + InstructionMark im(this); + vex_prefix_q(dst, nds, src, pre, vector_len, no_mask_reg); + emit_int8(opcode); + emit_operand(dst, src); +} + +void Assembler::emit_vex_arith(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, + VexSimdPrefix pre, int vector_len, bool no_mask_reg, bool legacy_mode) { + int encode = vex_prefix_and_encode(dst, nds, src, pre, vector_len, VEX_OPCODE_0F, false, no_mask_reg); + emit_int8(opcode); + emit_int8((unsigned char)(0xC0 | encode)); +} + +void Assembler::emit_vex_arith_q(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, + VexSimdPrefix pre, int vector_len, bool no_mask_reg) { + int src_enc = src->encoding(); + int dst_enc = dst->encoding(); + int nds_enc = nds->is_valid() ? 
nds->encoding() : 0; + int encode = vex_prefix_and_encode(dst_enc, nds_enc, src_enc, pre, VEX_OPCODE_0F, true, vector_len, false, no_mask_reg); emit_int8(opcode); emit_int8((unsigned char)(0xC0 | encode)); } @@ -5040,6 +6271,10 @@ } void Assembler::andnq(Register dst, Register src1, Address src2) { + if (VM_Version::supports_evex()) { + tuple_type = EVEX_T1S; + input_size_in_bits = EVEX_64bit; + } InstructionMark im(this); assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported"); vex_prefix_0F38_q(dst, src1, src2); @@ -5181,44 +6416,52 @@ void Assembler::cvtsi2sdq(XMMRegister dst, Register src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - int encode = simd_prefix_and_encode_q(dst, dst, src, VEX_SIMD_F2); + int encode = simd_prefix_and_encode_q(dst, dst, src, VEX_SIMD_F2, true); emit_int8(0x2A); emit_int8((unsigned char)(0xC0 | encode)); } void Assembler::cvtsi2sdq(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_T1S; + input_size_in_bits = EVEX_32bit; + } InstructionMark im(this); - simd_prefix_q(dst, dst, src, VEX_SIMD_F2); + simd_prefix_q(dst, dst, src, VEX_SIMD_F2, true); emit_int8(0x2A); emit_operand(dst, src); } void Assembler::cvtsi2ssq(XMMRegister dst, Register src) { NOT_LP64(assert(VM_Version::supports_sse(), "")); - int encode = simd_prefix_and_encode_q(dst, dst, src, VEX_SIMD_F3); + int encode = simd_prefix_and_encode_q(dst, dst, src, VEX_SIMD_F3, true); emit_int8(0x2A); emit_int8((unsigned char)(0xC0 | encode)); } void Assembler::cvtsi2ssq(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse(), "")); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_T1S; + input_size_in_bits = EVEX_32bit; + } InstructionMark im(this); - simd_prefix_q(dst, dst, src, VEX_SIMD_F3); + simd_prefix_q(dst, dst, src, VEX_SIMD_F3, true); emit_int8(0x2A); emit_operand(dst, src); } void Assembler::cvttsd2siq(Register dst, 
XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - int encode = simd_prefix_and_encode_q(dst, src, VEX_SIMD_F2); + int encode = simd_prefix_and_encode_q(dst, src, VEX_SIMD_F2, VEX_OPCODE_0F, true); emit_int8(0x2C); emit_int8((unsigned char)(0xC0 | encode)); } void Assembler::cvttss2siq(Register dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse(), "")); - int encode = simd_prefix_and_encode_q(dst, src, VEX_SIMD_F3); + int encode = simd_prefix_and_encode_q(dst, src, VEX_SIMD_F3, VEX_OPCODE_0F, true); emit_int8(0x2C); emit_int8((unsigned char)(0xC0 | encode)); } @@ -5387,7 +6630,7 @@ void Assembler::movdq(XMMRegister dst, Register src) { // table D-1 says MMX/SSE2 NOT_LP64(assert(VM_Version::supports_sse2(), "")); - int encode = simd_prefix_and_encode_q(dst, src, VEX_SIMD_66); + int encode = simd_prefix_and_encode_q(dst, src, VEX_SIMD_66, true); emit_int8(0x6E); emit_int8((unsigned char)(0xC0 | encode)); } @@ -5396,7 +6639,7 @@ // table D-1 says MMX/SSE2 NOT_LP64(assert(VM_Version::supports_sse2(), "")); // swap src/dst to get correct prefix - int encode = simd_prefix_and_encode_q(src, dst, VEX_SIMD_66); + int encode = simd_prefix_and_encode_q(src, dst, VEX_SIMD_66, true); emit_int8(0x7E); emit_int8((unsigned char)(0xC0 | encode)); } @@ -5529,7 +6772,8 @@ void Assembler::mulxq(Register dst1, Register dst2, Register src) { assert(VM_Version::supports_bmi2(), "bit manipulation instructions not supported"); - int encode = vex_prefix_and_encode(dst1->encoding(), dst2->encoding(), src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F_38, true, false); + int encode = vex_prefix_and_encode(dst1->encoding(), dst2->encoding(), src->encoding(), + VEX_SIMD_F2, VEX_OPCODE_0F_38, true, AVX_128bit, true, false); emit_int8((unsigned char)0xF6); emit_int8((unsigned char)(0xC0 | encode)); } @@ -5678,7 +6922,8 @@ void Assembler::rorxq(Register dst, Register src, int imm8) { assert(VM_Version::supports_bmi2(), "bit manipulation instructions not supported"); - 
int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F_3A, true, false); + int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_F2, + VEX_OPCODE_0F_3A, true, AVX_128bit, true, false); emit_int8((unsigned char)0xF0); emit_int8((unsigned char)(0xC0 | encode)); emit_int8(imm8); --- old/src/cpu/x86/vm/assembler_x86.hpp 2015-04-23 08:25:15.099343300 -0700 +++ new/src/cpu/x86/vm/assembler_x86.hpp 2015-04-23 08:25:14.898343300 -0700 @@ -436,7 +436,7 @@ }; -const int FPUStateSizeInWords = NOT_LP64(27) LP64_ONLY( 512 / wordSize); +const int FPUStateSizeInWords = NOT_LP64(27) LP64_ONLY( 512*2 / wordSize); // The Intel x86/Amd64 Assembler: Pure assembler doing NO optimizations on the instruction // level (e.g. mov rax, 0 is not translated into xor rax, rax!); i.e., what you write @@ -501,7 +501,8 @@ REX_WRXB = 0x4F, VEX_3bytes = 0xC4, - VEX_2bytes = 0xC5 + VEX_2bytes = 0xC5, + EVEX_4bytes = 0x62 }; enum VexPrefix { @@ -511,6 +512,14 @@ VEX_W = 0x80 }; + enum ExexPrefix { + EVEX_F = 0x04, + EVEX_V = 0x08, + EVEX_Rb = 0x10, + EVEX_X = 0x40, + EVEX_Z = 0x80 + }; + enum VexSimdPrefix { VEX_SIMD_NONE = 0x0, VEX_SIMD_66 = 0x1, @@ -525,6 +534,37 @@ VEX_OPCODE_0F_3A = 0x3 }; + enum AvxVectorLen { + AVX_128bit = 0x0, + AVX_256bit = 0x1, + AVX_512bit = 0x2, + AVX_NoVec = 0x4 + }; + + enum EvexTupleType { + EVEX_FV = 0, + EVEX_HV = 4, + EVEX_FVM = 6, + EVEX_T1S = 7, + EVEX_T1F = 11, + EVEX_T2 = 13, + EVEX_T4 = 15, + EVEX_T8 = 17, + EVEX_HVM = 18, + EVEX_QVM = 19, + EVEX_OVM = 20, + EVEX_M128 = 21, + EVEX_DUP = 22, + EVEX_ETUP = 23 + }; + + enum EvexInputSizeInBits { + EVEX_8bit = 0, + EVEX_16bit = 1, + EVEX_32bit = 2, + EVEX_64bit = 3 + }; + enum WhichOperand { // input to locate_operand, and format code for relocations imm_operand = 0, // embedded 32-bit|64-bit immediate operand @@ -552,6 +592,11 @@ private: + int evex_encoding; + int input_size_in_bits; + int avx_vector_len; + int tuple_type; + bool 
is_evex_instruction; // 64bit prefixes int prefix_and_encode(int reg_enc, bool byteinst = false); @@ -578,108 +623,143 @@ void vex_prefix(bool vex_r, bool vex_b, bool vex_x, bool vex_w, int nds_enc, VexSimdPrefix pre, VexOpcode opc, - bool vector256); + int vector_len); + + void evex_prefix(bool vex_r, bool vex_b, bool vex_x, bool vex_w, bool evex_r, bool evex_v, + int nds_enc, VexSimdPrefix pre, VexOpcode opc, + bool is_extended_context, bool is_merge_context, + int vector_len, bool no_mask_reg ); void vex_prefix(Address adr, int nds_enc, int xreg_enc, VexSimdPrefix pre, VexOpcode opc, - bool vex_w, bool vector256); + bool vex_w, int vector_len, + bool legacy_mode = false, bool no_mask_reg = false); void vex_prefix(XMMRegister dst, XMMRegister nds, Address src, - VexSimdPrefix pre, bool vector256 = false) { + VexSimdPrefix pre, int vector_len = AVX_128bit, + bool no_mask_reg = false, bool legacy_mode = false) { int dst_enc = dst->encoding(); int nds_enc = nds->is_valid() ? nds->encoding() : 0; - vex_prefix(src, nds_enc, dst_enc, pre, VEX_OPCODE_0F, false, vector256); + vex_prefix(src, nds_enc, dst_enc, pre, VEX_OPCODE_0F, false, vector_len, legacy_mode, no_mask_reg); } - void vex_prefix_0F38(Register dst, Register nds, Address src) { + void vex_prefix_q(XMMRegister dst, XMMRegister nds, Address src, + VexSimdPrefix pre, int vector_len = AVX_128bit, + bool no_mask_reg = false) { + int dst_enc = dst->encoding(); + int nds_enc = nds->is_valid() ? 
nds->encoding() : 0; + vex_prefix(src, nds_enc, dst_enc, pre, VEX_OPCODE_0F, true, vector_len, false, no_mask_reg); + } + + void vex_prefix_0F38(Register dst, Register nds, Address src, bool no_mask_reg = false) { bool vex_w = false; - bool vector256 = false; + int vector_len = AVX_128bit; vex_prefix(src, nds->encoding(), dst->encoding(), - VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, vector256); + VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, + vector_len, /* legacy_mode */ false, no_mask_reg); } - void vex_prefix_0F38_q(Register dst, Register nds, Address src) { + void vex_prefix_0F38_q(Register dst, Register nds, Address src, bool no_mask_reg = false) { bool vex_w = true; - bool vector256 = false; + int vector_len = AVX_128bit; vex_prefix(src, nds->encoding(), dst->encoding(), - VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, vector256); + VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, + vector_len, /* legacy_mode */ false, no_mask_reg); } int vex_prefix_and_encode(int dst_enc, int nds_enc, int src_enc, VexSimdPrefix pre, VexOpcode opc, - bool vex_w, bool vector256); + bool vex_w, int vector_len, + bool legacy_mode, bool no_mask_reg); - int vex_prefix_0F38_and_encode(Register dst, Register nds, Register src) { + int vex_prefix_0F38_and_encode(Register dst, Register nds, Register src, bool no_mask_reg = false) { bool vex_w = false; - bool vector256 = false; + int vector_len = AVX_128bit; return vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), - VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, vector256); + VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, vector_len, + false, no_mask_reg); } - int vex_prefix_0F38_and_encode_q(Register dst, Register nds, Register src) { + int vex_prefix_0F38_and_encode_q(Register dst, Register nds, Register src, bool no_mask_reg = false) { bool vex_w = true; - bool vector256 = false; + int vector_len = AVX_128bit; return vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), - VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, vector256); + VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, vector_len, + 
false, no_mask_reg); } int vex_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegister src, - VexSimdPrefix pre, bool vector256 = false, - VexOpcode opc = VEX_OPCODE_0F) { + VexSimdPrefix pre, int vector_len = AVX_128bit, + VexOpcode opc = VEX_OPCODE_0F, bool legacy_mode = false, + bool no_mask_reg = false) { int src_enc = src->encoding(); int dst_enc = dst->encoding(); int nds_enc = nds->is_valid() ? nds->encoding() : 0; - return vex_prefix_and_encode(dst_enc, nds_enc, src_enc, pre, opc, false, vector256); + return vex_prefix_and_encode(dst_enc, nds_enc, src_enc, pre, opc, false, vector_len, legacy_mode, no_mask_reg); } void simd_prefix(XMMRegister xreg, XMMRegister nds, Address adr, - VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F, - bool rex_w = false, bool vector256 = false); + VexSimdPrefix pre, bool no_mask_reg, VexOpcode opc = VEX_OPCODE_0F, + bool rex_w = false, int vector_len = AVX_128bit, bool legacy_mode = false); - void simd_prefix(XMMRegister dst, Address src, - VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F) { - simd_prefix(dst, xnoreg, src, pre, opc); + void simd_prefix(XMMRegister dst, Address src, VexSimdPrefix pre, + bool no_mask_reg, VexOpcode opc = VEX_OPCODE_0F) { + simd_prefix(dst, xnoreg, src, pre, no_mask_reg, opc); } - void simd_prefix(Address dst, XMMRegister src, VexSimdPrefix pre) { - simd_prefix(src, dst, pre); + void simd_prefix(Address dst, XMMRegister src, VexSimdPrefix pre, bool no_mask_reg) { + simd_prefix(src, dst, pre, no_mask_reg); } void simd_prefix_q(XMMRegister dst, XMMRegister nds, Address src, - VexSimdPrefix pre) { + VexSimdPrefix pre, bool no_mask_reg = false) { bool rex_w = true; - simd_prefix(dst, nds, src, pre, VEX_OPCODE_0F, rex_w); + simd_prefix(dst, nds, src, pre, no_mask_reg, VEX_OPCODE_0F, rex_w); } int simd_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegister src, - VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F, - bool rex_w = false, bool vector256 = false); + VexSimdPrefix pre, bool 
no_mask_reg, + VexOpcode opc = VEX_OPCODE_0F, + bool rex_w = false, int vector_len = AVX_128bit, + bool legacy_mode = false); + + int kreg_prefix_and_encode(KRegister dst, KRegister nds, KRegister src, + VexSimdPrefix pre, bool no_mask_reg, + VexOpcode opc = VEX_OPCODE_0F, + bool rex_w = false, int vector_len = AVX_128bit); + + int kreg_prefix_and_encode(KRegister dst, KRegister nds, Register src, + VexSimdPrefix pre, bool no_mask_reg, + VexOpcode opc = VEX_OPCODE_0F, + bool rex_w = false, int vector_len = AVX_128bit); // Move/convert 32-bit integer value. int simd_prefix_and_encode(XMMRegister dst, XMMRegister nds, Register src, - VexSimdPrefix pre) { + VexSimdPrefix pre, bool no_mask_reg) { // It is OK to cast from Register to XMMRegister to pass argument here // since only encoding is used in simd_prefix_and_encode() and number of // Gen and Xmm registers are the same. - return simd_prefix_and_encode(dst, nds, as_XMMRegister(src->encoding()), pre); + return simd_prefix_and_encode(dst, nds, as_XMMRegister(src->encoding()), pre, no_mask_reg, VEX_OPCODE_0F); } - int simd_prefix_and_encode(XMMRegister dst, Register src, VexSimdPrefix pre) { - return simd_prefix_and_encode(dst, xnoreg, src, pre); + int simd_prefix_and_encode(XMMRegister dst, Register src, VexSimdPrefix pre, bool no_mask_reg) { + return simd_prefix_and_encode(dst, xnoreg, src, pre, no_mask_reg); } int simd_prefix_and_encode(Register dst, XMMRegister src, - VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F) { - return simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, pre, opc); + VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F, + bool no_mask_reg = false) { + return simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, pre, no_mask_reg, opc); } // Move/convert 64-bit integer value. 
int simd_prefix_and_encode_q(XMMRegister dst, XMMRegister nds, Register src, - VexSimdPrefix pre) { + VexSimdPrefix pre, bool no_mask_reg = false) { bool rex_w = true; - return simd_prefix_and_encode(dst, nds, as_XMMRegister(src->encoding()), pre, VEX_OPCODE_0F, rex_w); + return simd_prefix_and_encode(dst, nds, as_XMMRegister(src->encoding()), pre, no_mask_reg, VEX_OPCODE_0F, rex_w); } - int simd_prefix_and_encode_q(XMMRegister dst, Register src, VexSimdPrefix pre) { - return simd_prefix_and_encode_q(dst, xnoreg, src, pre); + int simd_prefix_and_encode_q(XMMRegister dst, Register src, VexSimdPrefix pre, bool no_mask_reg) { + return simd_prefix_and_encode_q(dst, xnoreg, src, pre, no_mask_reg); } int simd_prefix_and_encode_q(Register dst, XMMRegister src, - VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F) { + VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F, + bool no_mask_reg = false) { bool rex_w = true; - return simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, pre, opc, rex_w); + return simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, pre, no_mask_reg, opc, rex_w); } // Helper functions for groups of instructions @@ -690,14 +770,28 @@ void emit_arith_imm32(int op1, int op2, Register dst, int32_t imm32); void emit_arith(int op1, int op2, Register dst, Register src); - void emit_simd_arith(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre); - void emit_simd_arith(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre); - void emit_simd_arith_nonds(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre); - void emit_simd_arith_nonds(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre); + void emit_simd_arith(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre, bool no_mask_reg = false, bool legacy_mode = false); + void emit_simd_arith_q(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre, bool no_mask_reg = false); + void emit_simd_arith(int opcode, XMMRegister dst, 
XMMRegister src, VexSimdPrefix pre, bool no_mask_reg = false, bool legacy_mode = false); + void emit_simd_arith_q(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre, bool no_mask_reg = false); + void emit_simd_arith_nonds(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre, bool no_mask_reg = false); + void emit_simd_arith_nonds_q(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre, bool no_mask_reg = false); + void emit_simd_arith_nonds(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre, bool no_mask_reg = false, bool legacy_mode = false); + void emit_simd_arith_nonds_q(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre, bool no_mask_reg = false); void emit_vex_arith(int opcode, XMMRegister dst, XMMRegister nds, - Address src, VexSimdPrefix pre, bool vector256); + Address src, VexSimdPrefix pre, int vector_len, + bool no_mask_reg = false, bool legacy_mode = false); + void emit_vex_arith_q(int opcode, XMMRegister dst, XMMRegister nds, + Address src, VexSimdPrefix pre, int vector_len, + bool no_mask_reg = false); void emit_vex_arith(int opcode, XMMRegister dst, XMMRegister nds, - XMMRegister src, VexSimdPrefix pre, bool vector256); + XMMRegister src, VexSimdPrefix pre, int vector_len, + bool no_mask_reg = false, bool legacy_mode = false); + void emit_vex_arith_q(int opcode, XMMRegister dst, XMMRegister nds, + XMMRegister src, VexSimdPrefix pre, int vector_len, + bool no_mask_reg = false); + + bool emit_compressed_disp_byte(int &disp); void emit_operand(Register reg, Register base, Register index, Address::ScaleFactor scale, @@ -823,7 +917,9 @@ public: // Creation - Assembler(CodeBuffer* code) : AbstractAssembler(code) {} + Assembler(CodeBuffer* code) : AbstractAssembler(code) { + init_attributes(); + } // Decoding static address locate_operand(address inst, WhichOperand which); @@ -831,11 +927,21 @@ // Utilities static bool is_polling_page_far() NOT_LP64({ return false;}); + static bool 
query_compressed_disp_byte(int disp, bool is_evex_inst, int vector_len, + int cur_tuple_type, int in_size_in_bits, int cur_encoding); // Generic instructions // Does 32bit or 64bit as needed for the platform. In some sense these // belong in macro assembler but there is no need for both varieties to exist + void init_attributes(void) { + evex_encoding = 0; + input_size_in_bits = 0; + avx_vector_len = AVX_NoVec; + tuple_type = EVEX_ETUP; + is_evex_instruction = false; + } + void lea(Register dst, Address src); void mov(Register dst, Register src); @@ -1336,6 +1442,12 @@ void movb(Address dst, int imm8); void movb(Register dst, Address src); + void kmovq(KRegister dst, KRegister src); + void kmovql(KRegister dst, Register src); + void kmovdl(KRegister dst, Register src); + void kmovq(Address dst, KRegister src); + void kmovq(KRegister dst, Address src); + void movdl(XMMRegister dst, Register src); void movdl(Register dst, XMMRegister src); void movdl(XMMRegister dst, Address src); @@ -1359,6 +1471,11 @@ void vmovdqu(XMMRegister dst, Address src); void vmovdqu(XMMRegister dst, XMMRegister src); + // Move Unaligned 512bit Vector + void evmovdqu(Address dst, XMMRegister src, int vector_len); + void evmovdqu(XMMRegister dst, Address src, int vector_len); + void evmovdqu(XMMRegister dst, XMMRegister src, int vector_len); + // Move lower 64bit to high 64bit in 128bit register void movlhps(XMMRegister dst, XMMRegister src); @@ -1484,10 +1601,10 @@ // Pack with unsigned saturation void packuswb(XMMRegister dst, XMMRegister src); void packuswb(XMMRegister dst, Address src); - void vpackuswb(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); + void vpackuswb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); // Pemutation of 64bit words - void vpermq(XMMRegister dst, XMMRegister src, int imm8, bool vector256); + void vpermq(XMMRegister dst, XMMRegister src, int imm8, int vector_len); void pause(); @@ -1732,54 +1849,54 @@ // Add Packed 
Floating-Point Values void addpd(XMMRegister dst, XMMRegister src); void addps(XMMRegister dst, XMMRegister src); - void vaddpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); - void vaddps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); - void vaddpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256); - void vaddps(XMMRegister dst, XMMRegister nds, Address src, bool vector256); + void vaddpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); + void vaddps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); + void vaddpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len); + void vaddps(XMMRegister dst, XMMRegister nds, Address src, int vector_len); // Subtract Packed Floating-Point Values void subpd(XMMRegister dst, XMMRegister src); void subps(XMMRegister dst, XMMRegister src); - void vsubpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); - void vsubps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); - void vsubpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256); - void vsubps(XMMRegister dst, XMMRegister nds, Address src, bool vector256); + void vsubpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); + void vsubps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); + void vsubpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len); + void vsubps(XMMRegister dst, XMMRegister nds, Address src, int vector_len); // Multiply Packed Floating-Point Values void mulpd(XMMRegister dst, XMMRegister src); void mulps(XMMRegister dst, XMMRegister src); - void vmulpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); - void vmulps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); - void vmulpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256); - void vmulps(XMMRegister dst, XMMRegister nds, Address src, bool vector256); + void 
vmulpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); + void vmulps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); + void vmulpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len); + void vmulps(XMMRegister dst, XMMRegister nds, Address src, int vector_len); // Divide Packed Floating-Point Values void divpd(XMMRegister dst, XMMRegister src); void divps(XMMRegister dst, XMMRegister src); - void vdivpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); - void vdivps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); - void vdivpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256); - void vdivps(XMMRegister dst, XMMRegister nds, Address src, bool vector256); + void vdivpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); + void vdivps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); + void vdivpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len); + void vdivps(XMMRegister dst, XMMRegister nds, Address src, int vector_len); // Bitwise Logical AND of Packed Floating-Point Values void andpd(XMMRegister dst, XMMRegister src); void andps(XMMRegister dst, XMMRegister src); - void vandpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); - void vandps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); - void vandpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256); - void vandps(XMMRegister dst, XMMRegister nds, Address src, bool vector256); + void vandpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); + void vandps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); + void vandpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len); + void vandps(XMMRegister dst, XMMRegister nds, Address src, int vector_len); // Bitwise Logical XOR of Packed Floating-Point Values void xorpd(XMMRegister dst, XMMRegister src); void xorps(XMMRegister dst, 
XMMRegister src); - void vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); - void vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); - void vxorpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256); - void vxorps(XMMRegister dst, XMMRegister nds, Address src, bool vector256); + void vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); + void vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); + void vxorpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len); + void vxorps(XMMRegister dst, XMMRegister nds, Address src, int vector_len); // Add horizontal packed integers - void vphaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); - void vphaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); + void vphaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); + void vphaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); void phaddw(XMMRegister dst, XMMRegister src); void phaddd(XMMRegister dst, XMMRegister src); @@ -1788,36 +1905,38 @@ void paddw(XMMRegister dst, XMMRegister src); void paddd(XMMRegister dst, XMMRegister src); void paddq(XMMRegister dst, XMMRegister src); - void vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); - void vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); - void vpaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); - void vpaddq(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); - void vpaddb(XMMRegister dst, XMMRegister nds, Address src, bool vector256); - void vpaddw(XMMRegister dst, XMMRegister nds, Address src, bool vector256); - void vpaddd(XMMRegister dst, XMMRegister nds, Address src, bool vector256); - void vpaddq(XMMRegister dst, XMMRegister nds, Address src, bool vector256); + void vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int 
vector_len); + void vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); + void vpaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); + void vpaddq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); + void vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len); + void vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len); + void vpaddd(XMMRegister dst, XMMRegister nds, Address src, int vector_len); + void vpaddq(XMMRegister dst, XMMRegister nds, Address src, int vector_len); // Sub packed integers void psubb(XMMRegister dst, XMMRegister src); void psubw(XMMRegister dst, XMMRegister src); void psubd(XMMRegister dst, XMMRegister src); void psubq(XMMRegister dst, XMMRegister src); - void vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); - void vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); - void vpsubd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); - void vpsubq(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); - void vpsubb(XMMRegister dst, XMMRegister nds, Address src, bool vector256); - void vpsubw(XMMRegister dst, XMMRegister nds, Address src, bool vector256); - void vpsubd(XMMRegister dst, XMMRegister nds, Address src, bool vector256); - void vpsubq(XMMRegister dst, XMMRegister nds, Address src, bool vector256); + void vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); + void vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); + void vpsubd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); + void vpsubq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); + void vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len); + void vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len); + void vpsubd(XMMRegister dst, XMMRegister nds, Address src, int vector_len); + void 
vpsubq(XMMRegister dst, XMMRegister nds, Address src, int vector_len); // Multiply packed integers (only shorts and ints) void pmullw(XMMRegister dst, XMMRegister src); void pmulld(XMMRegister dst, XMMRegister src); - void vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); - void vpmulld(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); - void vpmullw(XMMRegister dst, XMMRegister nds, Address src, bool vector256); - void vpmulld(XMMRegister dst, XMMRegister nds, Address src, bool vector256); + void vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); + void vpmulld(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); + void vpmullq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); + void vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len); + void vpmulld(XMMRegister dst, XMMRegister nds, Address src, int vector_len); + void vpmullq(XMMRegister dst, XMMRegister nds, Address src, int vector_len); // Shift left packed integers void psllw(XMMRegister dst, int shift); @@ -1826,12 +1945,12 @@ void psllw(XMMRegister dst, XMMRegister shift); void pslld(XMMRegister dst, XMMRegister shift); void psllq(XMMRegister dst, XMMRegister shift); - void vpsllw(XMMRegister dst, XMMRegister src, int shift, bool vector256); - void vpslld(XMMRegister dst, XMMRegister src, int shift, bool vector256); - void vpsllq(XMMRegister dst, XMMRegister src, int shift, bool vector256); - void vpsllw(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256); - void vpslld(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256); - void vpsllq(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256); + void vpsllw(XMMRegister dst, XMMRegister src, int shift, int vector_len); + void vpslld(XMMRegister dst, XMMRegister src, int shift, int vector_len); + void vpsllq(XMMRegister dst, XMMRegister src, int shift, int vector_len); + void vpsllw(XMMRegister 
dst, XMMRegister src, XMMRegister shift, int vector_len); + void vpslld(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len); + void vpsllq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len); // Logical shift right packed integers void psrlw(XMMRegister dst, int shift); @@ -1840,42 +1959,43 @@ void psrlw(XMMRegister dst, XMMRegister shift); void psrld(XMMRegister dst, XMMRegister shift); void psrlq(XMMRegister dst, XMMRegister shift); - void vpsrlw(XMMRegister dst, XMMRegister src, int shift, bool vector256); - void vpsrld(XMMRegister dst, XMMRegister src, int shift, bool vector256); - void vpsrlq(XMMRegister dst, XMMRegister src, int shift, bool vector256); - void vpsrlw(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256); - void vpsrld(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256); - void vpsrlq(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256); + void vpsrlw(XMMRegister dst, XMMRegister src, int shift, int vector_len); + void vpsrld(XMMRegister dst, XMMRegister src, int shift, int vector_len); + void vpsrlq(XMMRegister dst, XMMRegister src, int shift, int vector_len); + void vpsrlw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len); + void vpsrld(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len); + void vpsrlq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len); // Arithmetic shift right packed integers (only shorts and ints, no instructions for longs) void psraw(XMMRegister dst, int shift); void psrad(XMMRegister dst, int shift); void psraw(XMMRegister dst, XMMRegister shift); void psrad(XMMRegister dst, XMMRegister shift); - void vpsraw(XMMRegister dst, XMMRegister src, int shift, bool vector256); - void vpsrad(XMMRegister dst, XMMRegister src, int shift, bool vector256); - void vpsraw(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256); - void vpsrad(XMMRegister dst, XMMRegister src, XMMRegister 
shift, bool vector256); + void vpsraw(XMMRegister dst, XMMRegister src, int shift, int vector_len); + void vpsrad(XMMRegister dst, XMMRegister src, int shift, int vector_len); + void vpsraw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len); + void vpsrad(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len); // And packed integers void pand(XMMRegister dst, XMMRegister src); - void vpand(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); - void vpand(XMMRegister dst, XMMRegister nds, Address src, bool vector256); + void vpand(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); + void vpand(XMMRegister dst, XMMRegister nds, Address src, int vector_len); // Or packed integers void por(XMMRegister dst, XMMRegister src); - void vpor(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); - void vpor(XMMRegister dst, XMMRegister nds, Address src, bool vector256); + void vpor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); + void vpor(XMMRegister dst, XMMRegister nds, Address src, int vector_len); // Xor packed integers void pxor(XMMRegister dst, XMMRegister src); - void vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); - void vpxor(XMMRegister dst, XMMRegister nds, Address src, bool vector256); + void vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); + void vpxor(XMMRegister dst, XMMRegister nds, Address src, int vector_len); // Copy low 128bit into high 128bit of YMM registers. void vinsertf128h(XMMRegister dst, XMMRegister nds, XMMRegister src); void vinserti128h(XMMRegister dst, XMMRegister nds, XMMRegister src); void vextractf128h(XMMRegister dst, XMMRegister src); + void vextracti128h(XMMRegister dst, XMMRegister src); // Load/store high 128bit of YMM registers which does not destroy other half. 
void vinsertf128h(XMMRegister dst, Address src); @@ -1883,9 +2003,25 @@ void vextractf128h(Address dst, XMMRegister src); void vextracti128h(Address dst, XMMRegister src); + // Copy low 256bit into high 256bit of ZMM registers. + void vinserti64x4h(XMMRegister dst, XMMRegister nds, XMMRegister src); + void vinsertf64x4h(XMMRegister dst, XMMRegister nds, XMMRegister src); + void vextracti64x4h(XMMRegister dst, XMMRegister src); + void vextractf64x4h(XMMRegister dst, XMMRegister src); + void vextractf64x4h(Address dst, XMMRegister src); + void vinsertf64x4h(XMMRegister dst, Address src); + + // Copy targeted 128bit segments of the ZMM registers + void vextracti64x2h(XMMRegister dst, XMMRegister src, int value); + void vextractf64x2h(XMMRegister dst, XMMRegister src, int value); + void vextractf32x4h(XMMRegister dst, XMMRegister src, int value); + // duplicate 4-bytes integer data from src into 8 locations in dest void vpbroadcastd(XMMRegister dst, XMMRegister src); + // duplicate 4-bytes integer data from src into vector_len locations in dest + void evpbroadcastd(XMMRegister dst, XMMRegister src, int vector_len); + // Carry-Less Multiplication Quadword void pclmulqdq(XMMRegister dst, XMMRegister src, int mask); void vpclmulqdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int mask); --- old/src/cpu/x86/vm/c1_FrameMap_x86.cpp 2015-04-23 08:25:18.986343300 -0700 +++ new/src/cpu/x86/vm/c1_FrameMap_x86.cpp 2015-04-23 08:25:18.769343300 -0700 @@ -233,13 +233,30 @@ _xmm_regs[13] = xmm13; _xmm_regs[14] = xmm14; _xmm_regs[15] = xmm15; + _xmm_regs[16] = xmm16; + _xmm_regs[17] = xmm17; + _xmm_regs[18] = xmm18; + _xmm_regs[19] = xmm19; + _xmm_regs[20] = xmm20; + _xmm_regs[21] = xmm21; + _xmm_regs[22] = xmm22; + _xmm_regs[23] = xmm23; + _xmm_regs[24] = xmm24; + _xmm_regs[25] = xmm25; + _xmm_regs[26] = xmm26; + _xmm_regs[27] = xmm27; + _xmm_regs[28] = xmm28; + _xmm_regs[29] = xmm29; + _xmm_regs[30] = xmm30; + _xmm_regs[31] = xmm31; #endif // _LP64 for (int i = 0; i < 8; i++) 
{ _caller_save_fpu_regs[i] = LIR_OprFact::single_fpu(i); } - for (int i = 0; i < nof_caller_save_xmm_regs ; i++) { + int num_caller_save_xmm_regs = get_num_caller_save_xmms(); + for (int i = 0; i < num_caller_save_xmm_regs; i++) { _caller_save_xmm_regs[i] = LIR_OprFact::single_xmm(i); } --- old/src/cpu/x86/vm/c1_FrameMap_x86.hpp 2015-04-23 08:25:22.925343300 -0700 +++ new/src/cpu/x86/vm/c1_FrameMap_x86.hpp 2015-04-23 08:25:22.629343300 -0700 @@ -152,6 +152,16 @@ return range; } + static int get_num_caller_save_xmms(void) { + int num_caller_save_xmm_regs = nof_caller_save_xmm_regs; +#ifdef _LP64 + if (UseAVX < 3) { + num_caller_save_xmm_regs = num_caller_save_xmm_regs / 2; + } +#endif + return num_caller_save_xmm_regs; + } + static int nof_caller_save_cpu_regs() { return adjust_reg_range(pd_nof_caller_save_cpu_regs_frame_map); } static int last_cpu_reg() { return adjust_reg_range(pd_last_cpu_reg); } static int last_byte_reg() { return adjust_reg_range(pd_last_byte_reg); } --- old/src/cpu/x86/vm/c1_LinearScan_x86.hpp 2015-04-23 08:25:27.358343300 -0700 +++ new/src/cpu/x86/vm/c1_LinearScan_x86.hpp 2015-04-23 08:25:27.151343300 -0700 @@ -85,8 +85,9 @@ tty->print_cr("killing XMMs for trig"); } #endif + int num_caller_save_xmm_regs = FrameMap::get_num_caller_save_xmms(); int op_id = op->id(); - for (int xmm = 0; xmm < FrameMap::nof_caller_save_xmm_regs; xmm++) { + for (int xmm = 0; xmm < num_caller_save_xmm_regs; xmm++) { LIR_Opr opr = FrameMap::caller_save_xmm_reg_at(xmm); add_temp(reg_num(opr), op_id, noUse, T_ILLEGAL); } @@ -100,6 +101,12 @@ // Implementation of LinearScanWalker inline bool LinearScanWalker::pd_init_regs_for_alloc(Interval* cur) { + int last_xmm_reg = pd_last_xmm_reg; +#ifdef _LP64 + if (UseAVX < 3) { + last_xmm_reg = pd_first_xmm_reg + (pd_nof_xmm_regs_frame_map / 2) - 1; + } +#endif /* _LP64: halve the XMM range only where FrameMap::get_num_caller_save_xmms does */ if (allocator()->gen()->is_vreg_flag_set(cur->reg_num(), LIRGenerator::byte_reg)) { assert(cur->type() != T_FLOAT && cur->type() != T_DOUBLE, "cpu regs only"); _first_reg = pd_first_byte_reg; 
@@ -107,7 +112,7 @@ return true; } else if ((UseSSE >= 1 && cur->type() == T_FLOAT) || (UseSSE >= 2 && cur->type() == T_DOUBLE)) { _first_reg = pd_first_xmm_reg; - _last_reg = pd_last_xmm_reg; + _last_reg = last_xmm_reg; return true; } --- old/src/cpu/x86/vm/c1_Runtime1_x86.cpp 2015-04-23 08:25:32.486343300 -0700 +++ new/src/cpu/x86/vm/c1_Runtime1_x86.cpp 2015-04-23 08:25:32.132343300 -0700 @@ -323,7 +323,7 @@ LP64_ONLY(num_rt_args = 0); LP64_ONLY(assert((reg_save_frame_size * VMRegImpl::stack_slot_size) % 16 == 0, "must be 16 byte aligned");) int frame_size_in_slots = reg_save_frame_size + num_rt_args; // args + thread - sasm->set_frame_size(frame_size_in_slots / VMRegImpl::slots_per_word ); + sasm->set_frame_size(frame_size_in_slots / VMRegImpl::slots_per_word); // record saved value locations in an OopMap // locations are offsets from sp after runtime call; num_rt_args is number of arguments in call, including thread @@ -362,6 +362,13 @@ map->set_callee_saved(VMRegImpl::stack2reg(r15H_off + num_rt_args), r15->as_VMReg()->next()); #endif // _LP64 + int xmm_bypass_limit = FrameMap::nof_xmm_regs; +#ifdef _LP64 + if (UseAVX < 3) { + xmm_bypass_limit = xmm_bypass_limit / 2; + } +#endif + if (save_fpu_registers) { if (UseSSE < 2) { int fpu_off = float_regs_as_doubles_off; @@ -380,11 +387,13 @@ if (UseSSE >= 2) { int xmm_off = xmm_regs_as_doubles_off; for (int n = 0; n < FrameMap::nof_xmm_regs; n++) { - VMReg xmm_name_0 = as_XMMRegister(n)->as_VMReg(); - map->set_callee_saved(VMRegImpl::stack2reg(xmm_off + num_rt_args), xmm_name_0); - // %%% This is really a waste but we'll keep things as they were for now - if (true) { - map->set_callee_saved(VMRegImpl::stack2reg(xmm_off + 1 + num_rt_args), xmm_name_0->next()); + if (n < xmm_bypass_limit) { + VMReg xmm_name_0 = as_XMMRegister(n)->as_VMReg(); + map->set_callee_saved(VMRegImpl::stack2reg(xmm_off + num_rt_args), xmm_name_0); + // %%% This is really a waste but we'll keep things as they were for now + if (true) { + 
map->set_callee_saved(VMRegImpl::stack2reg(xmm_off + 1 + num_rt_args), xmm_name_0->next()); + } } xmm_off += 2; } @@ -393,8 +402,10 @@ } else if (UseSSE == 1) { int xmm_off = xmm_regs_as_doubles_off; for (int n = 0; n < FrameMap::nof_xmm_regs; n++) { - VMReg xmm_name_0 = as_XMMRegister(n)->as_VMReg(); - map->set_callee_saved(VMRegImpl::stack2reg(xmm_off + num_rt_args), xmm_name_0); + if (n < xmm_bypass_limit) { + VMReg xmm_name_0 = as_XMMRegister(n)->as_VMReg(); + map->set_callee_saved(VMRegImpl::stack2reg(xmm_off + num_rt_args), xmm_name_0); + } xmm_off += 2; } assert(xmm_off == float_regs_as_doubles_off, "incorrect number of xmm registers"); @@ -474,6 +485,24 @@ __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 104), xmm13); __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 112), xmm14); __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 120), xmm15); + if (UseAVX > 2) { + __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 128), xmm16); + __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 136), xmm17); + __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 144), xmm18); + __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 152), xmm19); + __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 160), xmm20); + __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 168), xmm21); + __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 176), xmm22); + __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 184), xmm23); + __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 192), xmm24); + __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 200), xmm25); + __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 208), 
xmm26); + __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 216), xmm27); + __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 224), xmm28); + __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 232), xmm29); + __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 240), xmm30); + __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 248), xmm31); + } #endif // _LP64 } else if (UseSSE == 1) { // save XMM registers as float because double not supported without SSE2 @@ -516,6 +545,24 @@ __ movdbl(xmm13, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 104)); __ movdbl(xmm14, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 112)); __ movdbl(xmm15, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 120)); + if (UseAVX > 2) { + __ movdbl(xmm16, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 128)); + __ movdbl(xmm17, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 136)); + __ movdbl(xmm18, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 144)); + __ movdbl(xmm19, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 152)); + __ movdbl(xmm20, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 160)); + __ movdbl(xmm21, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 168)); + __ movdbl(xmm22, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 176)); + __ movdbl(xmm23, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 184)); + __ movdbl(xmm24, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 192)); + __ movdbl(xmm25, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 200)); + __ movdbl(xmm26, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 208)); + __ movdbl(xmm27, Address(rsp, 
xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 216)); + __ movdbl(xmm28, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 224)); + __ movdbl(xmm29, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 232)); + __ movdbl(xmm30, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 240)); + __ movdbl(xmm31, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 248)); + } #endif // _LP64 } else if (UseSSE == 1) { // restore XMM registers --- old/src/cpu/x86/vm/c2_init_x86.cpp 2015-04-23 08:25:35.952343300 -0700 +++ new/src/cpu/x86/vm/c2_init_x86.cpp 2015-04-23 08:25:35.761343300 -0700 @@ -25,6 +25,7 @@ #include "precompiled.hpp" #include "opto/compile.hpp" #include "opto/node.hpp" +#include "opto/optoreg.hpp" // processor dependent initialization for i486 @@ -37,4 +38,24 @@ ConditionalMoveLimit = 0; } #endif // AMD64 + + if (UseAVX < 3) { + int delta = XMMRegisterImpl::max_slots_per_register * XMMRegisterImpl::number_of_registers; + int bottom = ConcreteRegisterImpl::max_fpr; + int top = bottom + delta; + int middle = bottom + (delta / 2); + int xmm_slots = XMMRegisterImpl::max_slots_per_register; + int lower = xmm_slots / 2; + // mark bad every register that we cannot get to if AVX less than 3, we have all slots in the array + // Note: vm2opto is allocated to ConcreteRegisterImpl::number_of_registers + for (int i = bottom; i < middle; i += xmm_slots) { + for (OptoReg::Name j = OptoReg::Name(i + lower); j= 2) { + if (UseAVX > 2) { + movl(rbx, 0xffff); +#ifdef _LP64 + kmovql(k1, rbx); +#else + kmovdl(k1, rbx); +#endif + } #ifdef COMPILER2 if (MaxVectorSize > 16) { assert(UseAVX > 0, "256bit vectors are supported only with AVX"); @@ -7039,8 +7047,39 @@ { assert( UseSSE >= 2, "supported cpu only" ); Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes; + if (UseAVX > 2) { + movl(rtmp, 0xffff); +#ifdef _LP64 + kmovql(k1, rtmp); +#else + kmovdl(k1, rtmp); +#endif + } 
movdl(xtmp, value); - if (UseAVX >= 2 && UseUnalignedLoadStores) { + if (UseAVX > 2 && UseUnalignedLoadStores) { + // Fill 64-byte chunks + Label L_fill_64_bytes_loop, L_check_fill_32_bytes; + evpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit); + + subl(count, 16 << shift); + jcc(Assembler::less, L_check_fill_32_bytes); + align(16); + + BIND(L_fill_64_bytes_loop); + evmovdqu(Address(to, 0), xtmp, Assembler::AVX_512bit); + addptr(to, 64); + subl(count, 16 << shift); + jcc(Assembler::greaterEqual, L_fill_64_bytes_loop); + + BIND(L_check_fill_32_bytes); + addl(count, 8 << shift); + jccb(Assembler::less, L_check_fill_8_bytes); + evmovdqu(Address(to, 0), xtmp, Assembler::AVX_256bit); + addptr(to, 32); + subl(count, 8 << shift); + + BIND(L_check_fill_8_bytes); + } else if (UseAVX == 2 && UseUnalignedLoadStores) { // Fill 64-byte chunks Label L_fill_64_bytes_loop, L_check_fill_32_bytes; vpbroadcastd(xtmp, xtmp); @@ -7175,11 +7214,11 @@ bind(L_copy_32_chars); vmovdqu(tmp3Reg, Address(src, len, Address::times_2, -64)); vmovdqu(tmp4Reg, Address(src, len, Address::times_2, -32)); - vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector256 */ true); + vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1); vptest(tmp2Reg, tmp1Reg); // check for Unicode chars in vector jccb(Assembler::notZero, L_copy_32_chars_exit); - vpackuswb(tmp3Reg, tmp3Reg, tmp4Reg, /* vector256 */ true); - vpermq(tmp4Reg, tmp3Reg, 0xD8, /* vector256 */ true); + vpackuswb(tmp3Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1); + vpermq(tmp4Reg, tmp3Reg, 0xD8, /* vector_len */ 1); vmovdqu(Address(dst, len, Address::times_1, -32), tmp4Reg); bind(L_chars_32_check); @@ -7202,13 +7241,13 @@ vmovdqu(tmp2Reg, Address(src, len, Address::times_2, -32)); vptest(tmp2Reg, tmp1Reg); jccb(Assembler::notZero, L_copy_16_chars_exit); - vpackuswb(tmp2Reg, tmp2Reg, tmp1Reg, /* vector256 */ true); - vpermq(tmp3Reg, tmp2Reg, 0xD8, /* vector256 */ true); + vpackuswb(tmp2Reg, tmp2Reg, tmp1Reg, /* vector_len */ 1); + vpermq(tmp3Reg, tmp2Reg, 0xD8, /* 
vector_len */ 1); } else { if (UseAVX > 0) { movdqu(tmp3Reg, Address(src, len, Address::times_2, -32)); movdqu(tmp4Reg, Address(src, len, Address::times_2, -16)); - vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector256 */ false); + vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 0); } else { movdqu(tmp3Reg, Address(src, len, Address::times_2, -32)); por(tmp2Reg, tmp3Reg); @@ -7747,7 +7786,7 @@ if (UseAVX > 0) { vpclmulhdq(xtmp, xK, xcrc); // [123:64] vpclmulldq(xcrc, xK, xcrc); // [63:0] - vpxor(xcrc, xcrc, Address(buf, offset), false /* vector256 */); + vpxor(xcrc, xcrc, Address(buf, offset), 0 /* vector_len */); pxor(xcrc, xtmp); } else { movdqa(xtmp, xcrc); @@ -7891,7 +7930,7 @@ movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr())); if (UseAVX > 0) { vpclmulqdq(xmm2, xmm0, xmm1, 0x1); - vpand(xmm3, xmm0, xmm2, false /* vector256 */); + vpand(xmm3, xmm0, xmm2, 0 /* vector_len */); vpclmulqdq(xmm0, xmm0, xmm3, 0x1); } else { movdqa(xmm2, xmm0); --- old/src/cpu/x86/vm/macroAssembler_x86.hpp 2015-04-23 08:25:49.079343300 -0700 +++ new/src/cpu/x86/vm/macroAssembler_x86.hpp 2015-04-23 08:25:48.665343300 -0700 @@ -1024,13 +1024,13 @@ void vaddss(XMMRegister dst, XMMRegister nds, Address src) { Assembler::vaddss(dst, nds, src); } void vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src); - void vandpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { Assembler::vandpd(dst, nds, src, vector256); } - void vandpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { Assembler::vandpd(dst, nds, src, vector256); } - void vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256); - - void vandps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { Assembler::vandps(dst, nds, src, vector256); } - void vandps(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { Assembler::vandps(dst, nds, src, vector256); } - void vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool 
vector256); + void vandpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { Assembler::vandpd(dst, nds, src, vector_len); } + void vandpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { Assembler::vandpd(dst, nds, src, vector_len); } + void vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len); + + void vandps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { Assembler::vandps(dst, nds, src, vector_len); } + void vandps(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { Assembler::vandps(dst, nds, src, vector_len); } + void vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len); void vdivsd(XMMRegister dst, XMMRegister nds, XMMRegister src) { Assembler::vdivsd(dst, nds, src); } void vdivsd(XMMRegister dst, XMMRegister nds, Address src) { Assembler::vdivsd(dst, nds, src); } @@ -1058,25 +1058,25 @@ // AVX Vector instructions - void vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { Assembler::vxorpd(dst, nds, src, vector256); } - void vxorpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { Assembler::vxorpd(dst, nds, src, vector256); } - void vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256); - - void vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { Assembler::vxorps(dst, nds, src, vector256); } - void vxorps(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { Assembler::vxorps(dst, nds, src, vector256); } - void vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256); - - void vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { - if (UseAVX > 1 || !vector256) // vpxor 256 bit is available only in AVX2 - Assembler::vpxor(dst, nds, src, vector256); + void vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { Assembler::vxorpd(dst, nds, src, vector_len); } + void vxorpd(XMMRegister dst, 
XMMRegister nds, Address src, int vector_len) { Assembler::vxorpd(dst, nds, src, vector_len); } + void vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len); + + void vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { Assembler::vxorps(dst, nds, src, vector_len); } + void vxorps(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { Assembler::vxorps(dst, nds, src, vector_len); } + void vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len); + + void vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + if (UseAVX > 1 || (vector_len < 1)) // vpxor 256 bit is available only in AVX2 + Assembler::vpxor(dst, nds, src, vector_len); else - Assembler::vxorpd(dst, nds, src, vector256); + Assembler::vxorpd(dst, nds, src, vector_len); } - void vpxor(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { - if (UseAVX > 1 || !vector256) // vpxor 256 bit is available only in AVX2 - Assembler::vpxor(dst, nds, src, vector256); + void vpxor(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { + if (UseAVX > 1 || (vector_len < 1)) // vpxor 256 bit is available only in AVX2 + Assembler::vpxor(dst, nds, src, vector_len); else - Assembler::vxorpd(dst, nds, src, vector256); + Assembler::vxorpd(dst, nds, src, vector_len); } // Simple version for AVX2 256bit vectors --- old/src/cpu/x86/vm/register_definitions_x86.cpp 2015-04-23 08:25:54.825343300 -0700 +++ new/src/cpu/x86/vm/register_definitions_x86.cpp 2015-04-23 08:25:54.486343300 -0700 @@ -68,6 +68,22 @@ REGISTER_DEFINITION(XMMRegister, xmm13); REGISTER_DEFINITION(XMMRegister, xmm14); REGISTER_DEFINITION(XMMRegister, xmm15); +REGISTER_DEFINITION(XMMRegister, xmm16); +REGISTER_DEFINITION(XMMRegister, xmm17); +REGISTER_DEFINITION(XMMRegister, xmm18); +REGISTER_DEFINITION(XMMRegister, xmm19); +REGISTER_DEFINITION(XMMRegister, xmm20); +REGISTER_DEFINITION(XMMRegister, xmm21); +REGISTER_DEFINITION(XMMRegister, 
xmm22); +REGISTER_DEFINITION(XMMRegister, xmm23); +REGISTER_DEFINITION(XMMRegister, xmm24); +REGISTER_DEFINITION(XMMRegister, xmm25); +REGISTER_DEFINITION(XMMRegister, xmm26); +REGISTER_DEFINITION(XMMRegister, xmm27); +REGISTER_DEFINITION(XMMRegister, xmm28); +REGISTER_DEFINITION(XMMRegister, xmm29); +REGISTER_DEFINITION(XMMRegister, xmm30); +REGISTER_DEFINITION(XMMRegister, xmm31); REGISTER_DEFINITION(Register, c_rarg0); REGISTER_DEFINITION(Register, c_rarg1); @@ -123,5 +139,15 @@ REGISTER_DEFINITION(MMXRegister, mmx6 ); REGISTER_DEFINITION(MMXRegister, mmx7 ); +REGISTER_DEFINITION(KRegister, knoreg); +REGISTER_DEFINITION(KRegister, k0); +REGISTER_DEFINITION(KRegister, k1); +REGISTER_DEFINITION(KRegister, k2); +REGISTER_DEFINITION(KRegister, k3); +REGISTER_DEFINITION(KRegister, k4); +REGISTER_DEFINITION(KRegister, k5); +REGISTER_DEFINITION(KRegister, k6); +REGISTER_DEFINITION(KRegister, k7); + // JSR 292 REGISTER_DEFINITION(Register, rbp_mh_SP_save); --- old/src/cpu/x86/vm/register_x86.cpp 2015-04-23 08:25:58.997343300 -0700 +++ new/src/cpu/x86/vm/register_x86.cpp 2015-04-23 08:25:58.789343300 -0700 @@ -31,11 +31,13 @@ const int ConcreteRegisterImpl::max_gpr = RegisterImpl::number_of_registers << 1; #endif // AMD64 - const int ConcreteRegisterImpl::max_fpr = ConcreteRegisterImpl::max_gpr + - 2 * FloatRegisterImpl::number_of_registers; + 2 * FloatRegisterImpl::number_of_registers; const int ConcreteRegisterImpl::max_xmm = ConcreteRegisterImpl::max_fpr + - 8 * XMMRegisterImpl::number_of_registers; + XMMRegisterImpl::max_slots_per_register * XMMRegisterImpl::number_of_registers; +const int ConcreteRegisterImpl::max_kpr = ConcreteRegisterImpl::max_xmm + + KRegisterImpl::max_slots_per_register * KRegisterImpl::number_of_registers; + const char* RegisterImpl::name() const { const char* names[number_of_registers] = { #ifndef AMD64 @@ -59,8 +61,17 @@ const char* names[number_of_registers] = { "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7" #ifdef AMD64 - ,"xmm8", 
"xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15" + ,"xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15" + ,"xmm16", "xmm17", "xmm18", "xmm19", "xmm20", "xmm21", "xmm22", "xmm23" + ,"xmm24", "xmm25", "xmm26", "xmm27", "xmm28", "xmm29", "xmm30", "xmm31" #endif // AMD64 }; return is_valid() ? names[encoding()] : "xnoreg"; } + +const char* KRegisterImpl::name() const { + const char* names[number_of_registers] = { + "k0", "k1", "k2", "k3", "k4", "k5", "k6", "k7" + }; + return is_valid() ? names[encoding()] : "knoreg"; +} --- old/src/cpu/x86/vm/register_x86.hpp 2015-04-23 08:26:02.505343300 -0700 +++ new/src/cpu/x86/vm/register_x86.hpp 2015-04-23 08:26:02.298343300 -0700 @@ -45,10 +45,12 @@ enum { #ifndef AMD64 number_of_registers = 8, - number_of_byte_registers = 4 + number_of_byte_registers = 4, + max_slots_per_register = 1 #else number_of_registers = 16, - number_of_byte_registers = 16 + number_of_byte_registers = 16, + max_slots_per_register = 1 #endif // AMD64 }; @@ -143,9 +145,11 @@ public: enum { #ifndef AMD64 - number_of_registers = 8 + number_of_registers = 8, + max_slots_per_register = 16 // 512-bit #else - number_of_registers = 16 + number_of_registers = 32, + max_slots_per_register = 16 // 512-bit #endif // AMD64 }; @@ -183,6 +187,22 @@ CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm13, (13)); CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm14, (14)); CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm15, (15)); +CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm16, (16)); +CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm17, (17)); +CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm18, (18)); +CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm19, (19)); +CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm20, (20)); +CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm21, (21)); +CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm22, (22)); +CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm23, (23)); +CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm24, (24)); 
+CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm25, (25)); +CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm26, (26)); +CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm27, (27)); +CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm28, (28)); +CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm29, (29)); +CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm30, (30)); +CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm31, (31)); #endif // AMD64 // Only used by the 32bit stubGenerator. These can't be described by vmreg and hence @@ -200,6 +220,46 @@ CONSTANT_REGISTER_DECLARATION(MMXRegister, mmx6 , ( 6)); CONSTANT_REGISTER_DECLARATION(MMXRegister, mmx7 , ( 7)); +// Use KRegister as shortcut +class KRegisterImpl; +typedef KRegisterImpl* KRegister; + +inline KRegister as_KRegister(int encoding) { + return (KRegister)(intptr_t)encoding; +} + +// The implementation of mask (K) registers for AVX3 enabled chips +class KRegisterImpl : public AbstractRegisterImpl { +public: + enum { + number_of_registers = 8, + max_slots_per_register = 1 + }; + + // construction + friend KRegister as_KRegister(int encoding); + + inline VMReg as_VMReg(); + + // derived registers, offsets, and addresses + KRegister successor() const { return as_KRegister(encoding() + 1); } + + // accessors + int encoding() const { assert(is_valid(), err_msg("invalid register (%d)", (int)(intptr_t)this)); return (intptr_t)this; } + bool is_valid() const { return 0 <= (intptr_t)this && (intptr_t)this < number_of_registers; } + const char* name() const; +}; + +// The Mask registers, for AVX3 enabled and up chips +CONSTANT_REGISTER_DECLARATION(KRegister, knoreg, (-1)); +CONSTANT_REGISTER_DECLARATION(KRegister, k0, (0)); +CONSTANT_REGISTER_DECLARATION(KRegister, k1, (1)); +CONSTANT_REGISTER_DECLARATION(KRegister, k2, (2)); +CONSTANT_REGISTER_DECLARATION(KRegister, k3, (3)); +CONSTANT_REGISTER_DECLARATION(KRegister, k4, (4)); +CONSTANT_REGISTER_DECLARATION(KRegister, k5, (5)); +CONSTANT_REGISTER_DECLARATION(KRegister, k6, (6));
+CONSTANT_REGISTER_DECLARATION(KRegister, k7, (7)); // Need to know the total number of registers of all sorts for SharedInfo. // Define a class that exports it. @@ -211,18 +271,20 @@ // There is no requirement that any ordering here matches any ordering c2 gives // it's optoregs. - number_of_registers = RegisterImpl::number_of_registers + + number_of_registers = RegisterImpl::number_of_registers + #ifdef AMD64 - RegisterImpl::number_of_registers + // "H" half of a 64bit register + RegisterImpl::number_of_registers + // "H" half of a 64bit register #endif // AMD64 - 2 * FloatRegisterImpl::number_of_registers + - 8 * XMMRegisterImpl::number_of_registers + - 1 // eflags + 2 * FloatRegisterImpl::number_of_registers + + XMMRegisterImpl::max_slots_per_register * XMMRegisterImpl::number_of_registers + + KRegisterImpl::number_of_registers + // mask registers + 1 // eflags }; static const int max_gpr; static const int max_fpr; static const int max_xmm; + static const int max_kpr; }; --- old/src/cpu/x86/vm/sharedRuntime_x86_32.cpp 2015-04-23 08:26:06.412343300 -0700 +++ new/src/cpu/x86/vm/sharedRuntime_x86_32.cpp 2015-04-23 08:26:06.193343300 -0700 @@ -117,9 +117,9 @@ int vect_words = 0; #ifdef COMPILER2 if (save_vectors) { - assert(UseAVX > 0, "256bit vectors are supported only with AVX"); - assert(MaxVectorSize == 32, "only 256bit vectors are supported now"); - // Save upper half of YMM registes + assert(UseAVX > 0, "512bit vectors are supported only with EVEX"); + assert(MaxVectorSize == 64, "only 512bit vectors are supported now"); + // Save upper half of ZMM/YMM registers : vect_words = 8 * 16 / wordSize; additional_frame_words += vect_words; } @@ -216,6 +216,17 @@ __ vextractf128h(Address(rsp, 80),xmm5); __ vextractf128h(Address(rsp, 96),xmm6); __ vextractf128h(Address(rsp,112),xmm7); + if (UseAVX > 2) { + __ subptr(rsp, 256); // Save upper half of ZMM registes + __ vextractf64x4h(Address(rsp, 0), xmm0); + __ vextractf64x4h(Address(rsp, 32), xmm1); + __ 
vextractf64x4h(Address(rsp, 64), xmm2); + __ vextractf64x4h(Address(rsp, 96), xmm3); + __ vextractf64x4h(Address(rsp, 128), xmm4); + __ vextractf64x4h(Address(rsp, 160), xmm5); + __ vextractf64x4h(Address(rsp, 192), xmm6); + __ vextractf64x4h(Address(rsp, 224), xmm7); + } } // Set an oopmap for the call site. This oopmap will map all @@ -283,8 +294,8 @@ int additional_frame_bytes = 0; #ifdef COMPILER2 if (restore_vectors) { - assert(UseAVX > 0, "256bit vectors are supported only with AVX"); - assert(MaxVectorSize == 32, "only 256bit vectors are supported now"); + assert(UseAVX > 0, "512bit vectors are supported only with EVEX"); + assert(MaxVectorSize == 64, "only 512bit vectors are supported now"); additional_frame_bytes = 128; } #else @@ -324,6 +335,18 @@ __ vinsertf128h(xmm6, Address(rsp, 96)); __ vinsertf128h(xmm7, Address(rsp,112)); __ addptr(rsp, additional_frame_bytes); + if (UseAVX > 2) { + additional_frame_bytes = 256; + __ vinsertf64x4h(xmm0, Address(rsp, 0)); + __ vinsertf64x4h(xmm1, Address(rsp, 32)); + __ vinsertf64x4h(xmm2, Address(rsp, 64)); + __ vinsertf64x4h(xmm3, Address(rsp, 96)); + __ vinsertf64x4h(xmm4, Address(rsp, 128)); + __ vinsertf64x4h(xmm5, Address(rsp, 160)); + __ vinsertf64x4h(xmm6, Address(rsp, 192)); + __ vinsertf64x4h(xmm7, Address(rsp, 224)); + __ addptr(rsp, additional_frame_bytes); + } } __ pop_FPU_state(); __ addptr(rsp, FPU_regs_live*wordSize); // Pop FPU registers --- old/src/cpu/x86/vm/sharedRuntime_x86_64.cpp 2015-04-23 08:26:09.841343300 -0700 +++ new/src/cpu/x86/vm/sharedRuntime_x86_64.cpp 2015-04-23 08:26:09.644343300 -0700 @@ -86,7 +86,23 @@ DEF_XMM_OFFS(13), DEF_XMM_OFFS(14), DEF_XMM_OFFS(15), - fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt), + DEF_XMM_OFFS(16), + DEF_XMM_OFFS(17), + DEF_XMM_OFFS(18), + DEF_XMM_OFFS(19), + DEF_XMM_OFFS(20), + DEF_XMM_OFFS(21), + DEF_XMM_OFFS(22), + DEF_XMM_OFFS(23), + DEF_XMM_OFFS(24), + DEF_XMM_OFFS(25), + DEF_XMM_OFFS(26), + DEF_XMM_OFFS(27), + 
DEF_XMM_OFFS(28), + DEF_XMM_OFFS(29), + DEF_XMM_OFFS(30), + DEF_XMM_OFFS(31), + fpu_state_end = fpu_state_off + ((FPUStateSizeInWords - 1)*wordSize / BytesPerInt), fpu_stateH_end, r15_off, r15H_off, r14_off, r14H_off, @@ -136,13 +152,21 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors) { int vect_words = 0; + int num_xmm_regs = 16; + if (UseAVX > 2) { + num_xmm_regs = 32; + } #ifdef COMPILER2 if (save_vectors) { - assert(UseAVX > 0, "256bit vectors are supported only with AVX"); - assert(MaxVectorSize == 32, "only 256bit vectors are supported now"); - // Save upper half of YMM registes - vect_words = 16 * 16 / wordSize; + assert(UseAVX > 0, "512bit vectors are supported only with EVEX"); + assert(MaxVectorSize == 64, "only 512bit vectors are supported now"); + // Save upper half of YMM registers + vect_words = 16 * num_xmm_regs / wordSize; additional_frame_words += vect_words; + if (UseAVX > 2) { + // Save upper half of ZMM registers as well + additional_frame_words += vect_words; + } } #else assert(!save_vectors, "vectors are generated only by C2"); @@ -150,7 +174,7 @@ // Always make the frame size 16-byte aligned int frame_size_in_bytes = round_to(additional_frame_words*wordSize + - reg_save_size*BytesPerInt, 16); + reg_save_size*BytesPerInt, num_xmm_regs); // OopMap frame size is in compiler stack slots (jint's) not bytes or words int frame_size_in_slots = frame_size_in_bytes / BytesPerInt; // The caller will allocate additional_frame_words @@ -169,24 +193,77 @@ __ push_CPU_state(); // Push a multiple of 16 bytes if (vect_words > 0) { - assert(vect_words*wordSize == 256, ""); - __ subptr(rsp, 256); // Save upper half of YMM registes - __ vextractf128h(Address(rsp, 0),xmm0); - __ vextractf128h(Address(rsp, 16),xmm1); - __ vextractf128h(Address(rsp, 32),xmm2); - __ vextractf128h(Address(rsp, 48),xmm3); - __ vextractf128h(Address(rsp, 64),xmm4); - __ vextractf128h(Address(rsp, 
80),xmm5); - __ vextractf128h(Address(rsp, 96),xmm6); - __ vextractf128h(Address(rsp,112),xmm7); - __ vextractf128h(Address(rsp,128),xmm8); - __ vextractf128h(Address(rsp,144),xmm9); - __ vextractf128h(Address(rsp,160),xmm10); - __ vextractf128h(Address(rsp,176),xmm11); - __ vextractf128h(Address(rsp,192),xmm12); - __ vextractf128h(Address(rsp,208),xmm13); - __ vextractf128h(Address(rsp,224),xmm14); - __ vextractf128h(Address(rsp,240),xmm15); + assert(vect_words*wordSize >= 256, ""); + __ subptr(rsp, 256); // Save upper half of YMM registes(0..15) + __ vextractf128h(Address(rsp, 0), xmm0); + __ vextractf128h(Address(rsp, 16), xmm1); + __ vextractf128h(Address(rsp, 32), xmm2); + __ vextractf128h(Address(rsp, 48), xmm3); + __ vextractf128h(Address(rsp, 64), xmm4); + __ vextractf128h(Address(rsp, 80), xmm5); + __ vextractf128h(Address(rsp, 96), xmm6); + __ vextractf128h(Address(rsp, 112), xmm7); + __ vextractf128h(Address(rsp, 128), xmm8); + __ vextractf128h(Address(rsp, 144), xmm9); + __ vextractf128h(Address(rsp, 160), xmm10); + __ vextractf128h(Address(rsp, 176), xmm11); + __ vextractf128h(Address(rsp, 192), xmm12); + __ vextractf128h(Address(rsp, 208), xmm13); + __ vextractf128h(Address(rsp, 224), xmm14); + __ vextractf128h(Address(rsp, 240), xmm15); + if (UseAVX > 2) { + __ subptr(rsp, 256); // Save upper half of YMM registes(16..31) + __ vextractf128h(Address(rsp, 0), xmm16); + __ vextractf128h(Address(rsp, 16), xmm17); + __ vextractf128h(Address(rsp, 32), xmm18); + __ vextractf128h(Address(rsp, 48), xmm19); + __ vextractf128h(Address(rsp, 64), xmm20); + __ vextractf128h(Address(rsp, 80), xmm21); + __ vextractf128h(Address(rsp, 96), xmm22); + __ vextractf128h(Address(rsp, 112), xmm23); + __ vextractf128h(Address(rsp, 128), xmm24); + __ vextractf128h(Address(rsp, 144), xmm25); + __ vextractf128h(Address(rsp, 160), xmm26); + __ vextractf128h(Address(rsp, 176), xmm27); + __ vextractf128h(Address(rsp, 192), xmm28); + __ vextractf128h(Address(rsp, 208), xmm29); + __ 
vextractf128h(Address(rsp, 224), xmm30); + __ vextractf128h(Address(rsp, 240), xmm31); + // Now handle the ZMM registers (0..31) + __ subptr(rsp, 1024); // Save upper half of ZMM registes + __ vextractf64x4h(Address(rsp, 0), xmm0); + __ vextractf64x4h(Address(rsp, 32), xmm1); + __ vextractf64x4h(Address(rsp, 64), xmm2); + __ vextractf64x4h(Address(rsp, 96), xmm3); + __ vextractf64x4h(Address(rsp, 128), xmm4); + __ vextractf64x4h(Address(rsp, 160), xmm5); + __ vextractf64x4h(Address(rsp, 192), xmm6); + __ vextractf64x4h(Address(rsp, 224), xmm7); + __ vextractf64x4h(Address(rsp, 256), xmm8); + __ vextractf64x4h(Address(rsp, 288), xmm9); + __ vextractf64x4h(Address(rsp, 320), xmm10); + __ vextractf64x4h(Address(rsp, 352), xmm11); + __ vextractf64x4h(Address(rsp, 384), xmm12); + __ vextractf64x4h(Address(rsp, 416), xmm13); + __ vextractf64x4h(Address(rsp, 448), xmm14); + __ vextractf64x4h(Address(rsp, 480), xmm15); + __ vextractf64x4h(Address(rsp, 512), xmm16); + __ vextractf64x4h(Address(rsp, 544), xmm17); + __ vextractf64x4h(Address(rsp, 576), xmm18); + __ vextractf64x4h(Address(rsp, 608), xmm19); + __ vextractf64x4h(Address(rsp, 640), xmm20); + __ vextractf64x4h(Address(rsp, 672), xmm21); + __ vextractf64x4h(Address(rsp, 704), xmm22); + __ vextractf64x4h(Address(rsp, 736), xmm23); + __ vextractf64x4h(Address(rsp, 768), xmm24); + __ vextractf64x4h(Address(rsp, 800), xmm25); + __ vextractf64x4h(Address(rsp, 832), xmm26); + __ vextractf64x4h(Address(rsp, 864), xmm27); + __ vextractf64x4h(Address(rsp, 896), xmm28); + __ vextractf64x4h(Address(rsp, 928), xmm29); + __ vextractf64x4h(Address(rsp, 960), xmm30); + __ vextractf64x4h(Address(rsp, 992), xmm31); + } } if (frame::arg_reg_save_area_bytes != 0) { // Allocate argument register save area @@ -235,6 +312,24 @@ map->set_callee_saved(STACK_OFFSET(xmm13_off), xmm13->as_VMReg()); map->set_callee_saved(STACK_OFFSET(xmm14_off), xmm14->as_VMReg()); map->set_callee_saved(STACK_OFFSET(xmm15_off), xmm15->as_VMReg()); + if 
(UseAVX > 2) { + map->set_callee_saved(STACK_OFFSET(xmm16_off), xmm16->as_VMReg()); + map->set_callee_saved(STACK_OFFSET(xmm17_off), xmm17->as_VMReg()); + map->set_callee_saved(STACK_OFFSET(xmm18_off), xmm18->as_VMReg()); + map->set_callee_saved(STACK_OFFSET(xmm19_off), xmm19->as_VMReg()); + map->set_callee_saved(STACK_OFFSET(xmm20_off), xmm20->as_VMReg()); + map->set_callee_saved(STACK_OFFSET(xmm21_off), xmm21->as_VMReg()); + map->set_callee_saved(STACK_OFFSET(xmm22_off), xmm22->as_VMReg()); + map->set_callee_saved(STACK_OFFSET(xmm23_off), xmm23->as_VMReg()); + map->set_callee_saved(STACK_OFFSET(xmm24_off), xmm24->as_VMReg()); + map->set_callee_saved(STACK_OFFSET(xmm25_off), xmm25->as_VMReg()); + map->set_callee_saved(STACK_OFFSET(xmm26_off), xmm26->as_VMReg()); + map->set_callee_saved(STACK_OFFSET(xmm27_off), xmm27->as_VMReg()); + map->set_callee_saved(STACK_OFFSET(xmm28_off), xmm28->as_VMReg()); + map->set_callee_saved(STACK_OFFSET(xmm29_off), xmm29->as_VMReg()); + map->set_callee_saved(STACK_OFFSET(xmm30_off), xmm30->as_VMReg()); + map->set_callee_saved(STACK_OFFSET(xmm31_off), xmm31->as_VMReg()); + } // %%% These should all be a waste but we'll keep things as they were for now if (true) { @@ -269,6 +364,24 @@ map->set_callee_saved(STACK_OFFSET(xmm13H_off), xmm13->as_VMReg()->next()); map->set_callee_saved(STACK_OFFSET(xmm14H_off), xmm14->as_VMReg()->next()); map->set_callee_saved(STACK_OFFSET(xmm15H_off), xmm15->as_VMReg()->next()); + if (UseAVX > 2) { // H slots map to the second VMReg half, as for xmm0..xmm15 above + map->set_callee_saved(STACK_OFFSET(xmm16H_off), xmm16->as_VMReg()->next()); + map->set_callee_saved(STACK_OFFSET(xmm17H_off), xmm17->as_VMReg()->next()); + map->set_callee_saved(STACK_OFFSET(xmm18H_off), xmm18->as_VMReg()->next()); + map->set_callee_saved(STACK_OFFSET(xmm19H_off), xmm19->as_VMReg()->next()); + map->set_callee_saved(STACK_OFFSET(xmm20H_off), xmm20->as_VMReg()->next()); + map->set_callee_saved(STACK_OFFSET(xmm21H_off), xmm21->as_VMReg()->next()); + map->set_callee_saved(STACK_OFFSET(xmm22H_off), xmm22->as_VMReg()->next()); +
map->set_callee_saved(STACK_OFFSET(xmm23H_off), xmm23->as_VMReg()->next()); + map->set_callee_saved(STACK_OFFSET(xmm24H_off), xmm24->as_VMReg()->next()); + map->set_callee_saved(STACK_OFFSET(xmm25H_off), xmm25->as_VMReg()->next()); + map->set_callee_saved(STACK_OFFSET(xmm26H_off), xmm26->as_VMReg()->next()); + map->set_callee_saved(STACK_OFFSET(xmm27H_off), xmm27->as_VMReg()->next()); + map->set_callee_saved(STACK_OFFSET(xmm28H_off), xmm28->as_VMReg()->next()); + map->set_callee_saved(STACK_OFFSET(xmm29H_off), xmm29->as_VMReg()->next()); + map->set_callee_saved(STACK_OFFSET(xmm30H_off), xmm30->as_VMReg()->next()); + map->set_callee_saved(STACK_OFFSET(xmm31H_off), xmm31->as_VMReg()->next()); + } } return map; @@ -281,9 +394,9 @@ } #ifdef COMPILER2 if (restore_vectors) { - // Restore upper half of YMM registes. - assert(UseAVX > 0, "256bit vectors are supported only with AVX"); - assert(MaxVectorSize == 32, "only 256bit vectors are supported now"); + // Restore upper half of YMM registes (0..15) + assert(UseAVX > 0, "512bit vectors are supported only with AVX"); + assert(MaxVectorSize == 64, "only 512bit vectors are supported now"); __ vinsertf128h(xmm0, Address(rsp, 0)); __ vinsertf128h(xmm1, Address(rsp, 16)); __ vinsertf128h(xmm2, Address(rsp, 32)); @@ -301,6 +414,60 @@ __ vinsertf128h(xmm14, Address(rsp,224)); __ vinsertf128h(xmm15, Address(rsp,240)); __ addptr(rsp, 256); + if (UseAVX > 2) { + // Restore upper half of YMM registes (16..31) + __ vinsertf128h(xmm16, Address(rsp, 0)); + __ vinsertf128h(xmm17, Address(rsp, 16)); + __ vinsertf128h(xmm18, Address(rsp, 32)); + __ vinsertf128h(xmm19, Address(rsp, 48)); + __ vinsertf128h(xmm20, Address(rsp, 64)); + __ vinsertf128h(xmm21, Address(rsp, 80)); + __ vinsertf128h(xmm22, Address(rsp, 96)); + __ vinsertf128h(xmm23, Address(rsp,112)); + __ vinsertf128h(xmm24, Address(rsp,128)); + __ vinsertf128h(xmm25, Address(rsp,144)); + __ vinsertf128h(xmm26, Address(rsp,160)); + __ vinsertf128h(xmm27, Address(rsp,176)); + __ vinsertf128h(xmm28, Address(rsp,192)); + __ vinsertf128h(xmm29,
Address(rsp,208)); + __ vinsertf128h(xmm30, Address(rsp,224)); + __ vinsertf128h(xmm31, Address(rsp,240)); + __ addptr(rsp, 256); + // Restore upper half of ZMM registes. + __ vinsertf64x4h(xmm0, Address(rsp, 0)); + __ vinsertf64x4h(xmm1, Address(rsp, 32)); + __ vinsertf64x4h(xmm2, Address(rsp, 64)); + __ vinsertf64x4h(xmm3, Address(rsp, 96)); + __ vinsertf64x4h(xmm4, Address(rsp, 128)); + __ vinsertf64x4h(xmm5, Address(rsp, 160)); + __ vinsertf64x4h(xmm6, Address(rsp, 192)); + __ vinsertf64x4h(xmm7, Address(rsp, 224)); + __ vinsertf64x4h(xmm8, Address(rsp, 256)); + __ vinsertf64x4h(xmm9, Address(rsp, 288)); + __ vinsertf64x4h(xmm10, Address(rsp, 320)); + __ vinsertf64x4h(xmm11, Address(rsp, 352)); + __ vinsertf64x4h(xmm12, Address(rsp, 384)); + __ vinsertf64x4h(xmm13, Address(rsp, 416)); + __ vinsertf64x4h(xmm14, Address(rsp, 448)); + __ vinsertf64x4h(xmm15, Address(rsp, 480)); + __ vinsertf64x4h(xmm16, Address(rsp, 512)); + __ vinsertf64x4h(xmm17, Address(rsp, 544)); + __ vinsertf64x4h(xmm18, Address(rsp, 576)); + __ vinsertf64x4h(xmm19, Address(rsp, 608)); + __ vinsertf64x4h(xmm20, Address(rsp, 640)); + __ vinsertf64x4h(xmm21, Address(rsp, 672)); + __ vinsertf64x4h(xmm22, Address(rsp, 704)); + __ vinsertf64x4h(xmm23, Address(rsp, 736)); + __ vinsertf64x4h(xmm24, Address(rsp, 768)); + __ vinsertf64x4h(xmm25, Address(rsp, 800)); + __ vinsertf64x4h(xmm26, Address(rsp, 832)); + __ vinsertf64x4h(xmm27, Address(rsp, 864)); + __ vinsertf64x4h(xmm28, Address(rsp, 896)); + __ vinsertf64x4h(xmm29, Address(rsp, 928)); + __ vinsertf64x4h(xmm30, Address(rsp, 960)); + __ vinsertf64x4h(xmm31, Address(rsp, 992)); + __ addptr(rsp, 1024); // release the 1024-byte ZMM area; the save path reserved it with subptr(rsp, 1024) + } } #else assert(!restore_vectors, "vectors are generated only by C2"); --- old/src/cpu/x86/vm/stubGenerator_x86_32.cpp 2015-04-23 08:26:13.346343300 -0700 +++ new/src/cpu/x86/vm/stubGenerator_x86_32.cpp 2015-04-23 08:26:13.151343300 -0700 @@ -166,6 +166,13 @@ __ movptr(saved_rdi, rdi); __ movptr(saved_rsi, rsi); __ movptr(saved_rbx, rbx); + + 
// provide initial value for required masks + if (UseAVX > 2) { + __ movl(rbx, 0xffff); + __ kmovdl(k1, rbx); + } + // save and initialize %mxcsr if (sse_save) { Label skip_ldmx; @@ -794,7 +801,10 @@ __ BIND(L_copy_64_bytes_loop); if (UseUnalignedLoadStores) { - if (UseAVX >= 2) { + if (UseAVX > 2) { + __ evmovdqu(xmm0, Address(from, 0), Assembler::AVX_512bit); + __ evmovdqu(Address(from, to_from, Address::times_1, 0), xmm0, Assembler::AVX_512bit); + } else if (UseAVX == 2) { __ vmovdqu(xmm0, Address(from, 0)); __ vmovdqu(Address(from, to_from, Address::times_1, 0), xmm0); __ vmovdqu(xmm1, Address(from, 32)); @@ -833,7 +843,7 @@ __ subl(qword_count, 8); __ jcc(Assembler::greaterEqual, L_copy_64_bytes_loop); - if (UseUnalignedLoadStores && (UseAVX >= 2)) { + if (UseUnalignedLoadStores && (UseAVX == 2)) { // clean upper bits of YMM registers __ vzeroupper(); } --- old/src/cpu/x86/vm/stubGenerator_x86_64.cpp 2015-04-23 08:26:16.822343300 -0700 +++ new/src/cpu/x86/vm/stubGenerator_x86_64.cpp 2015-04-23 08:26:16.628343300 -0700 @@ -137,8 +137,10 @@ // [ return_from_Java ] <--- rsp // [ argument word n ] // ... 
- // -28 [ argument word 1 ] - // -27 [ saved xmm15 ] <--- rsp_after_call + // -60 [ argument word 1 ] + // -59 [ saved xmm31 ] <--- rsp after_call + // [ saved xmm16-xmm30 ] (EVEX enabled, else the space is blank) + // -27 [ saved xmm15 ] // [ saved xmm7-xmm14 ] // -9 [ saved xmm6 ] (each xmm register takes 2 slots) // -7 [ saved r15 ] @@ -166,7 +168,7 @@ enum call_stub_layout { #ifdef _WIN64 xmm_save_first = 6, // save from xmm6 - xmm_save_last = 15, // to xmm15 + xmm_save_last = 31, // to xmm31 xmm_save_base = -9, rsp_after_call_off = xmm_save_base - 2 * (xmm_save_last - xmm_save_first), // -27 r15_off = -7, @@ -262,9 +264,19 @@ __ movptr(r13_save, r13); __ movptr(r14_save, r14); __ movptr(r15_save, r15); + if (UseAVX > 2) { + __ movl(rbx, 0xffff); + __ kmovql(k1, rbx); + } #ifdef _WIN64 - for (int i = 6; i <= 15; i++) { - __ movdqu(xmm_save(i), as_XMMRegister(i)); + if (UseAVX > 2) { + for (int i = 6; i <= 31; i++) { + __ movdqu(xmm_save(i), as_XMMRegister(i)); + } + } else { + for (int i = 6; i <= 15; i++) { + __ movdqu(xmm_save(i), as_XMMRegister(i)); + } } const Address rdi_save(rbp, rdi_off * wordSize); @@ -1318,7 +1330,10 @@ Label L_end; // Copy 64-bytes per iteration __ BIND(L_loop); - if (UseAVX >= 2) { + if (UseAVX > 2) { + __ evmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56), Assembler::AVX_512bit); + __ evmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0, Assembler::AVX_512bit); + } else if (UseAVX == 2) { __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56)); __ vmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0); __ vmovdqu(xmm1, Address(end_from, qword_count, Address::times_8, -24)); @@ -1394,7 +1409,10 @@ Label L_end; // Copy 64-bytes per iteration __ BIND(L_loop); - if (UseAVX >= 2) { + if (UseAVX > 2) { + __ evmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32), Assembler::AVX_512bit); + __ evmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0, 
Assembler::AVX_512bit); + } else if (UseAVX == 2) { __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32)); __ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0); __ vmovdqu(xmm1, Address(from, qword_count, Address::times_8, 0)); --- old/src/cpu/x86/vm/vm_version_x86.cpp 2015-04-23 08:26:20.292343300 -0700 +++ new/src/cpu/x86/vm/vm_version_x86.cpp 2015-04-23 08:26:20.072343300 -0700 @@ -35,7 +35,7 @@ int VM_Version::_cpu; int VM_Version::_model; int VM_Version::_stepping; -int VM_Version::_cpuFeatures; +uint64_t VM_Version::_cpuFeatures; const char* VM_Version::_features_str = ""; VM_Version::CpuidInfo VM_Version::_cpuid_info = { 0, }; @@ -45,7 +45,7 @@ address VM_Version::_cpuinfo_cont_addr = 0; static BufferBlob* stub_blob; -static const int stub_size = 600; +static const int stub_size = 1000; extern "C" { typedef void (*get_cpu_info_stub_t)(void*); @@ -60,15 +60,16 @@ address generate_get_cpu_info() { // Flags to test CPU type. - const uint32_t HS_EFL_AC = 0x40000; - const uint32_t HS_EFL_ID = 0x200000; + const uint32_t HS_EFL_AC = 0x40000; + const uint32_t HS_EFL_ID = 0x200000; // Values for when we don't have a CPUID instruction. 
const int CPU_FAMILY_SHIFT = 8; - const uint32_t CPU_FAMILY_386 = (3 << CPU_FAMILY_SHIFT); - const uint32_t CPU_FAMILY_486 = (4 << CPU_FAMILY_SHIFT); + const uint32_t CPU_FAMILY_386 = (3 << CPU_FAMILY_SHIFT); + const uint32_t CPU_FAMILY_486 = (4 << CPU_FAMILY_SHIFT); Label detect_486, cpu486, detect_586, std_cpuid1, std_cpuid4; - Label sef_cpuid, ext_cpuid, ext_cpuid1, ext_cpuid5, ext_cpuid7, done; + Label sef_cpuid, ext_cpuid, ext_cpuid1, ext_cpuid5, ext_cpuid7, done, wrapup; + Label legacy_setup, save_restore_except, legacy_save_restore, start_simd_check; StubCodeMark mark(this, "VM_Version", "get_cpu_info_stub"); # define __ _masm-> @@ -241,53 +242,6 @@ __ movl(Address(rsi, 0), rax); __ movl(Address(rsi, 4), rdx); - __ andl(rax, 0x6); // xcr0 bits sse | ymm - __ cmpl(rax, 0x6); - __ jccb(Assembler::notEqual, sef_cpuid); // jump if AVX is not supported - - // - // Some OSs have a bug when upper 128bits of YMM - // registers are not restored after a signal processing. - // Generate SEGV here (reference through NULL) - // and check upper YMM bits after it. - // - VM_Version::set_avx_cpuFeatures(); // Enable temporary to pass asserts - intx saved_useavx = UseAVX; - intx saved_usesse = UseSSE; - UseAVX = 1; - UseSSE = 2; - - // load value into all 32 bytes of ymm7 register - __ movl(rcx, VM_Version::ymm_test_value()); - - __ movdl(xmm0, rcx); - __ pshufd(xmm0, xmm0, 0x00); - __ vinsertf128h(xmm0, xmm0, xmm0); - __ vmovdqu(xmm7, xmm0); -#ifdef _LP64 - __ vmovdqu(xmm8, xmm0); - __ vmovdqu(xmm15, xmm0); -#endif - - __ xorl(rsi, rsi); - VM_Version::set_cpuinfo_segv_addr( __ pc() ); - // Generate SEGV - __ movl(rax, Address(rsi, 0)); - - VM_Version::set_cpuinfo_cont_addr( __ pc() ); - // Returns here after signal. Save xmm0 to check it later. 
- __ lea(rsi, Address(rbp, in_bytes(VM_Version::ymm_save_offset()))); - __ vmovdqu(Address(rsi, 0), xmm0); - __ vmovdqu(Address(rsi, 32), xmm7); -#ifdef _LP64 - __ vmovdqu(Address(rsi, 64), xmm8); - __ vmovdqu(Address(rsi, 96), xmm15); -#endif - - VM_Version::clean_cpuFeatures(); - UseAVX = saved_useavx; - UseSSE = saved_usesse; - // // cpuid(0x7) Structured Extended Features // @@ -364,9 +318,143 @@ __ movl(Address(rsi,12), rdx); // - // return + // Check if OS has enabled XGETBV instruction to access XCR0 + // (OSXSAVE feature flag) and CPU supports AVX // + __ lea(rsi, Address(rbp, in_bytes(VM_Version::std_cpuid1_offset()))); + __ movl(rcx, 0x18000000); // cpuid1 bits osxsave | avx + __ andl(rcx, Address(rsi, 8)); // cpuid1 bits osxsave | avx + __ cmpl(rcx, 0x18000000); + __ jccb(Assembler::notEqual, done); // jump if AVX is not supported + + __ movl(rax, 0x6); + __ andl(rax, Address(rbp, in_bytes(VM_Version::xem_xcr0_offset()))); // xcr0 bits sse | ymm + __ cmpl(rax, 0x6); + __ jccb(Assembler::equal, start_simd_check); // return if AVX is not supported + + // we need to bridge farther than imm8, so we use this island as a thunk __ bind(done); + __ jmp(wrapup); + + __ bind(start_simd_check); + // + // Some OSs have a bug when upper 128/256bits of YMM/ZMM + // registers are not restored after a signal processing. + // Generate SEGV here (reference through NULL) + // and check upper YMM/ZMM bits after it. 
+ // + intx saved_useavx = UseAVX; + intx saved_usesse = UseSSE; + // check _cpuid_info.sef_cpuid7_ebx.bits.avx512f + __ lea(rsi, Address(rbp, in_bytes(VM_Version::sef_cpuid7_offset()))); + __ movl(rax, 0x10000); + __ andl(rax, Address(rsi, 4)); // xcr0 bits sse | ymm + __ cmpl(rax, 0x10000); + __ jccb(Assembler::notEqual, legacy_setup); // jump if EVEX is not supported + // check _cpuid_info.xem_xcr0_eax.bits.opmask + // check _cpuid_info.xem_xcr0_eax.bits.zmm512 + // check _cpuid_info.xem_xcr0_eax.bits.zmm32 + __ movl(rax, 0xE0); + __ andl(rax, Address(rbp, in_bytes(VM_Version::xem_xcr0_offset()))); // xcr0 bits sse | ymm + __ cmpl(rax, 0xE0); + __ jccb(Assembler::notEqual, legacy_setup); // jump if EVEX is not supported + + // EVEX setup: run in lowest evex mode + VM_Version::set_evex_cpuFeatures(); // Enable temporary to pass asserts + UseAVX = 3; + UseSSE = 2; + // load value into all 64 bytes of zmm7 register + __ movl(rcx, VM_Version::ymm_test_value()); + __ movdl(xmm0, rcx); + __ movl(rcx, 0xffff); +#ifdef _LP64 + __ kmovql(k1, rcx); +#else + __ kmovdl(k1, rcx); +#endif + __ evpbroadcastd(xmm0, xmm0, Assembler::AVX_512bit); + __ evmovdqu(xmm7, xmm0, Assembler::AVX_512bit); +#ifdef _LP64 + __ evmovdqu(xmm8, xmm0, Assembler::AVX_512bit); + __ evmovdqu(xmm31, xmm0, Assembler::AVX_512bit); +#endif + VM_Version::clean_cpuFeatures(); + __ jmp(save_restore_except); + + __ bind(legacy_setup); + // AVX setup + VM_Version::set_avx_cpuFeatures(); // Enable temporary to pass asserts + UseAVX = 1; + UseSSE = 2; + // load value into all 32 bytes of ymm7 register + __ movl(rcx, VM_Version::ymm_test_value()); + + __ movdl(xmm0, rcx); + __ pshufd(xmm0, xmm0, 0x00); + __ vinsertf128h(xmm0, xmm0, xmm0); + __ vmovdqu(xmm7, xmm0); +#ifdef _LP64 + __ vmovdqu(xmm8, xmm0); + __ vmovdqu(xmm15, xmm0); +#endif + VM_Version::clean_cpuFeatures(); + + __ bind(save_restore_except); + __ xorl(rsi, rsi); + VM_Version::set_cpuinfo_segv_addr(__ pc()); + // Generate SEGV + __ movl(rax, 
Address(rsi, 0)); + + VM_Version::set_cpuinfo_cont_addr(__ pc()); + // Returns here after signal. Save xmm0 to check it later. + + // check _cpuid_info.sef_cpuid7_ebx.bits.avx512f + __ lea(rsi, Address(rbp, in_bytes(VM_Version::sef_cpuid7_offset()))); + __ movl(rax, 0x10000); + __ andl(rax, Address(rsi, 4)); + __ cmpl(rax, 0x10000); + __ jccb(Assembler::notEqual, legacy_save_restore); + // check _cpuid_info.xem_xcr0_eax.bits.opmask + // check _cpuid_info.xem_xcr0_eax.bits.zmm512 + // check _cpuid_info.xem_xcr0_eax.bits.zmm32 + __ movl(rax, 0xE0); + __ andl(rax, Address(rbp, in_bytes(VM_Version::xem_xcr0_offset()))); // xcr0 bits sse | ymm + __ cmpl(rax, 0xE0); + __ jccb(Assembler::notEqual, legacy_save_restore); + + // EVEX check: run in lowest evex mode + VM_Version::set_evex_cpuFeatures(); // Enable temporary to pass asserts + UseAVX = 3; + UseSSE = 2; + __ lea(rsi, Address(rbp, in_bytes(VM_Version::zmm_save_offset()))); + __ evmovdqu(Address(rsi, 0), xmm0, Assembler::AVX_512bit); + __ evmovdqu(Address(rsi, 64), xmm7, Assembler::AVX_512bit); +#ifdef _LP64 + __ evmovdqu(Address(rsi, 128), xmm8, Assembler::AVX_512bit); + __ evmovdqu(Address(rsi, 192), xmm31, Assembler::AVX_512bit); +#endif + VM_Version::clean_cpuFeatures(); + UseAVX = saved_useavx; + UseSSE = saved_usesse; + __ jmp(wrapup); + + __ bind(legacy_save_restore); + // AVX check + VM_Version::set_avx_cpuFeatures(); // Enable temporary to pass asserts + UseAVX = 1; + UseSSE = 2; + __ lea(rsi, Address(rbp, in_bytes(VM_Version::ymm_save_offset()))); + __ vmovdqu(Address(rsi, 0), xmm0); + __ vmovdqu(Address(rsi, 32), xmm7); +#ifdef _LP64 + __ vmovdqu(Address(rsi, 64), xmm8); + __ vmovdqu(Address(rsi, 96), xmm15); +#endif + VM_Version::clean_cpuFeatures(); + UseAVX = saved_useavx; + UseSSE = saved_usesse; + + __ bind(wrapup); __ popf(); __ pop(rsi); __ pop(rbx); @@ -470,6 +558,29 @@ if (UseSSE < 1) _cpuFeatures &= ~CPU_SSE; + // first try initial setting and detect what we can support + if (UseAVX > 0) { + if 
(UseAVX > 2 && supports_evex()) { + UseAVX = 3; + } else if (UseAVX > 1 && supports_avx2()) { + UseAVX = 2; + } else if (UseAVX > 0 && supports_avx()) { + UseAVX = 1; + } else { + UseAVX = 0; + } + } else if (UseAVX < 0) { + UseAVX = 0; + } + + if (UseAVX < 3) { + _cpuFeatures &= ~CPU_AVX512F; + _cpuFeatures &= ~CPU_AVX512DQ; + _cpuFeatures &= ~CPU_AVX512CD; + _cpuFeatures &= ~CPU_AVX512BW; + _cpuFeatures &= ~CPU_AVX512VL; + } + if (UseAVX < 2) _cpuFeatures &= ~CPU_AVX2; @@ -485,7 +596,7 @@ } char buf[256]; - jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s", + jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s", cores_per_cpu(), threads_per_core(), cpu_family(), _model, _stepping, (supports_cmov() ? ", cmov" : ""), @@ -515,7 +626,8 @@ (supports_tscinv() ? ", tscinv": ""), (supports_bmi1() ? ", bmi1" : ""), (supports_bmi2() ? ", bmi2" : ""), - (supports_adx() ? ", adx" : "")); + (supports_adx() ? ", adx" : ""), + (supports_evex() ? ", evex" : "")); _features_str = os::strdup(buf); // UseSSE is set to the smaller of what hardware supports and what @@ -532,13 +644,6 @@ if (!supports_sse ()) // Drop to 0 if no SSE support UseSSE = 0; - if (UseAVX > 2) UseAVX=2; - if (UseAVX < 0) UseAVX=0; - if (!supports_avx2()) // Drop to 1 if no AVX2 support - UseAVX = MIN2((intx)1,UseAVX); - if (!supports_avx ()) // Drop to 0 if no AVX support - UseAVX = 0; - // Use AES instructions if available. 
if (supports_aes()) { if (FLAG_IS_DEFAULT(UseAES)) { @@ -609,7 +714,8 @@ if ((_model == CPU_MODEL_HASWELL_E3) || (_model == CPU_MODEL_HASWELL_E7 && _stepping < 3) || (_model == CPU_MODEL_BROADWELL && _stepping < 4)) { - if (!UnlockExperimentalVMOptions) { + // currently a collision between SKL and HSW_E3 + if (!UnlockExperimentalVMOptions && UseAVX < 3) { vm_exit_during_initialization("UseRTMLocking is only available as experimental option on this platform. It must be enabled via -XX:+UnlockExperimentalVMOptions flag."); } else { warning("UseRTMLocking is only available as experimental option on this platform."); @@ -662,10 +768,10 @@ if (MaxVectorSize > 0) { if (!is_power_of_2(MaxVectorSize)) { warning("MaxVectorSize must be a power of 2"); - FLAG_SET_DEFAULT(MaxVectorSize, 32); + FLAG_SET_DEFAULT(MaxVectorSize, 64); } - if (MaxVectorSize > 32) { - FLAG_SET_DEFAULT(MaxVectorSize, 32); + if (MaxVectorSize > 64) { + FLAG_SET_DEFAULT(MaxVectorSize, 64); } if (MaxVectorSize > 16 && (UseAVX == 0 || !os_supports_avx_vectors())) { // 32 bytes vectors (in YMM) are only supported with AVX+ --- old/src/cpu/x86/vm/vm_version_x86.hpp 2015-04-23 08:26:23.868343300 -0700 +++ new/src/cpu/x86/vm/vm_version_x86.hpp 2015-04-23 08:26:23.671343300 -0700 @@ -208,20 +208,33 @@ bmi2 : 1, erms : 1, : 1, - rtm : 1, - : 7, - adx : 1, - : 12; + rtm : 1, + : 4, + avx512f : 1, + avx512dq : 1, + : 1, + adx : 1, + : 6, + avx512pf : 1, + avx512er : 1, + avx512cd : 1, + : 1, + avx512bw : 1, + avx512vl : 1; } bits; }; union XemXcr0Eax { uint32_t value; struct { - uint32_t x87 : 1, - sse : 1, - ymm : 1, - : 29; + uint32_t x87 : 1, + sse : 1, + ymm : 1, + : 2, + opmask : 1, + zmm512 : 1, + zmm32 : 1, + : 24; } bits; }; @@ -229,43 +242,51 @@ static int _cpu; static int _model; static int _stepping; - static int _cpuFeatures; // features returned by the "cpuid" instruction - // 0 if this instruction is not available + static uint64_t _cpuFeatures; // features returned by the "cpuid" instruction + // 0 
if this instruction is not available static const char* _features_str; static address _cpuinfo_segv_addr; // address of instruction which causes SEGV static address _cpuinfo_cont_addr; // address of instruction after the one which causes SEGV enum { - CPU_CX8 = (1 << 0), // next bits are from cpuid 1 (EDX) - CPU_CMOV = (1 << 1), - CPU_FXSR = (1 << 2), - CPU_HT = (1 << 3), - CPU_MMX = (1 << 4), - CPU_3DNOW_PREFETCH = (1 << 5), // Processor supports 3dnow prefetch and prefetchw instructions - // may not necessarily support other 3dnow instructions - CPU_SSE = (1 << 6), - CPU_SSE2 = (1 << 7), - CPU_SSE3 = (1 << 8), // SSE3 comes from cpuid 1 (ECX) - CPU_SSSE3 = (1 << 9), - CPU_SSE4A = (1 << 10), - CPU_SSE4_1 = (1 << 11), - CPU_SSE4_2 = (1 << 12), - CPU_POPCNT = (1 << 13), - CPU_LZCNT = (1 << 14), - CPU_TSC = (1 << 15), - CPU_TSCINV = (1 << 16), - CPU_AVX = (1 << 17), - CPU_AVX2 = (1 << 18), - CPU_AES = (1 << 19), - CPU_ERMS = (1 << 20), // enhanced 'rep movsb/stosb' instructions - CPU_CLMUL = (1 << 21), // carryless multiply for CRC - CPU_BMI1 = (1 << 22), - CPU_BMI2 = (1 << 23), - CPU_RTM = (1 << 24), // Restricted Transactional Memory instructions - CPU_ADX = (1 << 25) + CPU_CX8 = (1 << 0), // next bits are from cpuid 1 (EDX) + CPU_CMOV = (1 << 1), + CPU_FXSR = (1 << 2), + CPU_HT = (1 << 3), + CPU_MMX = (1 << 4), + CPU_3DNOW_PREFETCH = (1 << 5), // Processor supports 3dnow prefetch and prefetchw instructions + // may not necessarily support other 3dnow instructions + CPU_SSE = (1 << 6), + CPU_SSE2 = (1 << 7), + CPU_SSE3 = (1 << 8), // SSE3 comes from cpuid 1 (ECX) + CPU_SSSE3 = (1 << 9), + CPU_SSE4A = (1 << 10), + CPU_SSE4_1 = (1 << 11), + CPU_SSE4_2 = (1 << 12), + CPU_POPCNT = (1 << 13), + CPU_LZCNT = (1 << 14), + CPU_TSC = (1 << 15), + CPU_TSCINV = (1 << 16), + CPU_AVX = (1 << 17), + CPU_AVX2 = (1 << 18), + CPU_AES = (1 << 19), + CPU_ERMS = (1 << 20), // enhanced 'rep movsb/stosb' instructions + CPU_CLMUL = (1 << 21), // carryless multiply for CRC + CPU_BMI1 = (1 
<< 22), + CPU_BMI2 = (1 << 23), + CPU_RTM = (1 << 24), // Restricted Transactional Memory instructions + CPU_ADX = (1 << 25), + CPU_AVX512F = (1 << 26), // AVX 512bit foundation instructions + CPU_AVX512DQ = (1 << 27), + CPU_AVX512PF = (1 << 28), + CPU_AVX512ER = (1 << 29), + CPU_AVX512CD = (1 << 30) } cpuFeatureFlags; +#define CPU_AVX512BW ((uint64_t)1 << 31) // kept out of the enum: (1 << 31) overflows int and would sign-extend into bits 32..63 of _cpuFeatures +#define CPU_AVX512VL ((uint64_t)1 << 32) // EVEX instructions with smaller vector length : enum values are limited to 31 usable bits + enum { // AMD CPU_FAMILY_AMD_11H = 0x11, @@ -282,7 +303,8 @@ CPU_MODEL_IVYBRIDGE_EP = 0x3a, CPU_MODEL_HASWELL_E3 = 0x3c, CPU_MODEL_HASWELL_E7 = 0x3f, - CPU_MODEL_BROADWELL = 0x3d + CPU_MODEL_BROADWELL = 0x3d, + CPU_MODEL_SKYLAKE = CPU_MODEL_HASWELL_E3 } cpuExtendedFamily; // cpuid information block. All info derived from executing cpuid with @@ -376,6 +398,9 @@ // Space to save ymm registers after signal handle int ymm_save[8*4]; // Save ymm0, ymm7, ymm8, ymm15 + + // Space to save zmm registers after signal handle + int zmm_save[16*4]; // Save zmm0, zmm7, zmm8, zmm31 }; // The actual cpuid info block @@ -404,8 +429,8 @@ return result; } - static uint32_t feature_flags() { - uint32_t result = 0; + static uint64_t feature_flags() { + uint64_t result = 0; if (_cpuid_info.std_cpuid1_edx.bits.cmpxchg8 != 0) result |= CPU_CX8; if (_cpuid_info.std_cpuid1_edx.bits.cmov != 0) @@ -440,6 +465,24 @@ result |= CPU_AVX; if (_cpuid_info.sef_cpuid7_ebx.bits.avx2 != 0) result |= CPU_AVX2; + if (_cpuid_info.sef_cpuid7_ebx.bits.avx512f != 0 && + _cpuid_info.xem_xcr0_eax.bits.opmask != 0 && + _cpuid_info.xem_xcr0_eax.bits.zmm512 != 0 && + _cpuid_info.xem_xcr0_eax.bits.zmm32 != 0) { + result |= CPU_AVX512F; + if (_cpuid_info.sef_cpuid7_ebx.bits.avx512cd != 0) + result |= CPU_AVX512CD; + if (_cpuid_info.sef_cpuid7_ebx.bits.avx512dq != 0) + result |= CPU_AVX512DQ; + if (_cpuid_info.sef_cpuid7_ebx.bits.avx512pf != 0) + result |= CPU_AVX512PF; + if (_cpuid_info.sef_cpuid7_ebx.bits.avx512er != 0) + result |= CPU_AVX512ER; + if 
(_cpuid_info.sef_cpuid7_ebx.bits.avx512bw != 0) + result |= CPU_AVX512BW; + if (_cpuid_info.sef_cpuid7_ebx.bits.avx512vl != 0) + result |= CPU_AVX512VL; + } } if(_cpuid_info.sef_cpuid7_ebx.bits.bmi1 != 0) result |= CPU_BMI1; @@ -484,18 +527,31 @@ } static bool os_supports_avx_vectors() { - if (!supports_avx()) { - return false; - } - // Verify that OS save/restore all bits of AVX registers - // during signal processing. - int nreg = 2 LP64_ONLY(+2); - for (int i = 0; i < 8 * nreg; i++) { // 32 bytes per ymm register - if (_cpuid_info.ymm_save[i] != ymm_test_value()) { - return false; + bool retVal = false; + if (supports_evex()) { + // Verify that OS save/restore all bits of EVEX registers + // during signal processing. + int nreg = 2 LP64_ONLY(+2); + retVal = true; + for (int i = 0; i < 16 * nreg; i++) { // 64 bytes per zmm register + if (_cpuid_info.zmm_save[i] != ymm_test_value()) { + retVal = false; + break; + } + } + } else if (supports_avx()) { + // Verify that OS save/restore all bits of AVX registers + // during signal processing. 
+ int nreg = 2 LP64_ONLY(+2); + retVal = true; + for (int i = 0; i < 8 * nreg; i++) { // 32 bytes per ymm register + if (_cpuid_info.ymm_save[i] != ymm_test_value()) { + retVal = false; + break; + } } } - return true; + return retVal; } static void get_processor_features(); @@ -515,6 +571,7 @@ static ByteSize tpl_cpuidB2_offset() { return byte_offset_of(CpuidInfo, tpl_cpuidB2_eax); } static ByteSize xem_xcr0_offset() { return byte_offset_of(CpuidInfo, xem_xcr0_eax); } static ByteSize ymm_save_offset() { return byte_offset_of(CpuidInfo, ymm_save); } + static ByteSize zmm_save_offset() { return byte_offset_of(CpuidInfo, zmm_save); } // The value used to check ymm register after signal handle static int ymm_test_value() { return 0xCAFEBABE; } @@ -527,6 +584,7 @@ static void clean_cpuFeatures() { _cpuFeatures = 0; } static void set_avx_cpuFeatures() { _cpuFeatures = (CPU_SSE | CPU_SSE2 | CPU_AVX); } + static void set_evex_cpuFeatures() { _cpuFeatures = (CPU_AVX512F | CPU_SSE | CPU_SSE2 ); } // Initialization @@ -636,7 +694,14 @@ static bool supports_rtm() { return (_cpuFeatures & CPU_RTM) != 0; } static bool supports_bmi1() { return (_cpuFeatures & CPU_BMI1) != 0; } static bool supports_bmi2() { return (_cpuFeatures & CPU_BMI2) != 0; } - static bool supports_adx() { return (_cpuFeatures & CPU_ADX) != 0; } + static bool supports_adx() { return (_cpuFeatures & CPU_ADX) != 0; } + static bool supports_evex() { return (_cpuFeatures & CPU_AVX512F) != 0; } + static bool supports_avx512dq() { return (_cpuFeatures & CPU_AVX512DQ) != 0; } + static bool supports_avx512pf() { return (_cpuFeatures & CPU_AVX512PF) != 0; } + static bool supports_avx512er() { return (_cpuFeatures & CPU_AVX512ER) != 0; } + static bool supports_avx512cd() { return (_cpuFeatures & CPU_AVX512CD) != 0; } + static bool supports_avx512bw() { return (_cpuFeatures & CPU_AVX512BW) != 0; } + static bool supports_avx512vl() { return (_cpuFeatures & CPU_AVX512VL) != 0; } // Intel features static bool 
is_intel_family_core() { return is_intel() && extended_cpu_family() == CPU_FAMILY_INTEL_CORE; } --- old/src/cpu/x86/vm/vmreg_x86.cpp 2015-04-23 08:26:27.614343300 -0700 +++ new/src/cpu/x86/vm/vmreg_x86.cpp 2015-04-23 08:26:27.418343300 -0700 @@ -47,13 +47,22 @@ } XMMRegister xreg = ::as_XMMRegister(0); - for ( ; i < ConcreteRegisterImpl::max_xmm ; ) { - for (int j = 0 ; j < 8 ; j++) { + for (; i < ConcreteRegisterImpl::max_xmm;) { + for (int j = 0 ; j < XMMRegisterImpl::max_slots_per_register ; j++) { regName[i++] = xreg->name(); } xreg = xreg->successor(); } + + KRegister kreg = ::as_KRegister(0); + for (; i < ConcreteRegisterImpl::max_kpr;) { + for (int j = 0; j < KRegisterImpl::max_slots_per_register; j++) { + regName[i++] = kreg->name(); + } + kreg = kreg->successor(); + } + for ( ; i < ConcreteRegisterImpl::number_of_registers ; i ++ ) { - regName[i] = "NON-GPR-FPR-XMM"; + regName[i] = "NON-GPR-FPR-XMM-KREG"; } } --- old/src/cpu/x86/vm/vmreg_x86.hpp 2015-04-23 08:26:31.655343300 -0700 +++ new/src/cpu/x86/vm/vmreg_x86.hpp 2015-04-23 08:26:31.460343300 -0700 @@ -36,7 +36,24 @@ } inline bool is_XMMRegister() { - return value() >= ConcreteRegisterImpl::max_fpr && value() < ConcreteRegisterImpl::max_xmm; + int uarch_max_xmm = ConcreteRegisterImpl::max_xmm; + +#ifdef _LP64 + if (UseAVX < 3) { + int half_xmm = (XMMRegisterImpl::max_slots_per_register * XMMRegisterImpl::number_of_registers) / 2; + uarch_max_xmm -= half_xmm; + } +#endif + + return (value() >= ConcreteRegisterImpl::max_fpr && value() < uarch_max_xmm); +} + +inline bool is_KRegister() { + if (UseAVX > 2) { + return value() >= ConcreteRegisterImpl::max_xmm && value() < ConcreteRegisterImpl::max_kpr; + } else { + return false; + } } inline Register as_Register() { @@ -59,7 +76,13 @@ inline XMMRegister as_XMMRegister() { assert( is_XMMRegister() && is_even(value()), "must be" ); // Yuk - return ::as_XMMRegister((value() - ConcreteRegisterImpl::max_fpr) >> 3); + return ::as_XMMRegister((value() - 
ConcreteRegisterImpl::max_fpr) >> 4); +} + +inline KRegister as_KRegister() { + assert(is_KRegister(), "must be"); + // Yuk + return ::as_KRegister((value() - ConcreteRegisterImpl::max_xmm)); } inline bool is_concrete() { --- old/src/cpu/x86/vm/vmreg_x86.inline.hpp 2015-04-23 08:26:35.867343300 -0700 +++ new/src/cpu/x86/vm/vmreg_x86.inline.hpp 2015-04-23 08:26:35.670343300 -0700 @@ -39,7 +39,11 @@ } inline VMReg XMMRegisterImpl::as_VMReg() { - return VMRegImpl::as_VMReg((encoding() << 3) + ConcreteRegisterImpl::max_fpr); + return VMRegImpl::as_VMReg((encoding() << 4) + ConcreteRegisterImpl::max_fpr); +} + +inline VMReg KRegisterImpl::as_VMReg() { + return VMRegImpl::as_VMReg(encoding() + ConcreteRegisterImpl::max_xmm); } #endif // CPU_X86_VM_VMREG_X86_INLINE_HPP --- old/src/cpu/x86/vm/x86.ad 2015-04-23 08:26:40.026343300 -0700 +++ new/src/cpu/x86/vm/x86.ad 2015-04-23 08:26:39.808343300 -0700 @@ -59,15 +59,19 @@ // // The encoding number is the actual bit-pattern placed into the opcodes. -// XMM registers. 256-bit registers or 8 words each, labeled (a)-h. +// XMM registers. 512-bit registers or 16 words each, labeled (a)-p. // Word a in each register holds a Float, words ab hold a Double. // The whole registers are used in SSE4.2 version intrinsics, // array copy stubs and superword operations (see UseSSE42Intrinsics, // UseXMMForArrayCopy and UseSuperword flags). -// XMM8-XMM15 must be encoded with REX (VEX for UseAVX). +// For pre EVEX enabled architectures: +// XMM8-XMM15 must be encoded with REX (VEX for UseAVX) +// For EVEX enabled architectures: +// XMM8-XMM31 must be encoded with REX (EVEX for UseAVX). 
+// // Linux ABI: No register preserved across function calls // XMM0-XMM7 might hold parameters -// Windows ABI: XMM6-XMM15 preserved across function calls +// Windows ABI: XMM6-XMM31 preserved across function calls // XMM0-XMM3 might hold parameters reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()); @@ -78,6 +82,14 @@ reg_def XMM0f( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(5)); reg_def XMM0g( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(6)); reg_def XMM0h( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(7)); +reg_def XMM0i( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(8)); +reg_def XMM0j( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(9)); +reg_def XMM0k( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(10)); +reg_def XMM0l( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(11)); +reg_def XMM0m( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(12)); +reg_def XMM0n( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(13)); +reg_def XMM0o( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(14)); +reg_def XMM0p( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(15)); reg_def XMM1 ( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()); reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(1)); @@ -87,6 +99,14 @@ reg_def XMM1f( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(5)); reg_def XMM1g( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(6)); reg_def XMM1h( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(7)); +reg_def XMM1i( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(8)); +reg_def XMM1j( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(9)); +reg_def XMM1k( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(10)); +reg_def XMM1l( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(11)); +reg_def XMM1m( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(12)); +reg_def XMM1n( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(13)); +reg_def XMM1o( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(14)); +reg_def XMM1p( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(15)); reg_def XMM2 ( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()); reg_def XMM2b( 
SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(1)); @@ -96,6 +116,14 @@ reg_def XMM2f( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(5)); reg_def XMM2g( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(6)); reg_def XMM2h( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(7)); +reg_def XMM2i( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(8)); +reg_def XMM2j( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(9)); +reg_def XMM2k( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(10)); +reg_def XMM2l( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(11)); +reg_def XMM2m( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(12)); +reg_def XMM2n( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(13)); +reg_def XMM2o( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(14)); +reg_def XMM2p( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(15)); reg_def XMM3 ( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()); reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(1)); @@ -105,6 +133,14 @@ reg_def XMM3f( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(5)); reg_def XMM3g( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(6)); reg_def XMM3h( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(7)); +reg_def XMM3i( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(8)); +reg_def XMM3j( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(9)); +reg_def XMM3k( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(10)); +reg_def XMM3l( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(11)); +reg_def XMM3m( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(12)); +reg_def XMM3n( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(13)); +reg_def XMM3o( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(14)); +reg_def XMM3p( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(15)); reg_def XMM4 ( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()); reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(1)); @@ -114,6 +150,14 @@ reg_def XMM4f( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(5)); reg_def XMM4g( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(6)); reg_def XMM4h( SOC, SOC, Op_RegF, 4, 
xmm4->as_VMReg()->next(7)); +reg_def XMM4i( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(8)); +reg_def XMM4j( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(9)); +reg_def XMM4k( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(10)); +reg_def XMM4l( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(11)); +reg_def XMM4m( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(12)); +reg_def XMM4n( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(13)); +reg_def XMM4o( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(14)); +reg_def XMM4p( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(15)); reg_def XMM5 ( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()); reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(1)); @@ -123,6 +167,14 @@ reg_def XMM5f( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(5)); reg_def XMM5g( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(6)); reg_def XMM5h( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(7)); +reg_def XMM5i( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(8)); +reg_def XMM5j( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(9)); +reg_def XMM5k( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(10)); +reg_def XMM5l( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(11)); +reg_def XMM5m( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(12)); +reg_def XMM5n( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(13)); +reg_def XMM5o( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(14)); +reg_def XMM5p( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(15)); #ifdef _WIN64 @@ -134,6 +186,14 @@ reg_def XMM6f( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next(5)); reg_def XMM6g( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next(6)); reg_def XMM6h( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next(7)); +reg_def XMM6i( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next(8)); +reg_def XMM6j( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next(9)); +reg_def XMM6k( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next(10)); +reg_def XMM6l( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next(11)); +reg_def XMM6m( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next(12)); 
+reg_def XMM6n( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next(13)); +reg_def XMM6o( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next(14)); +reg_def XMM6p( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next(15)); reg_def XMM7 ( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()); reg_def XMM7b( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next(1)); @@ -143,6 +203,14 @@ reg_def XMM7f( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next(5)); reg_def XMM7g( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next(6)); reg_def XMM7h( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next(7)); +reg_def XMM7i( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next(8)); +reg_def XMM7j( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next(9)); +reg_def XMM7k( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next(10)); +reg_def XMM7l( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next(11)); +reg_def XMM7m( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next(12)); +reg_def XMM7n( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next(13)); +reg_def XMM7o( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next(14)); +reg_def XMM7p( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next(15)); reg_def XMM8 ( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()); reg_def XMM8b( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next(1)); @@ -152,6 +220,14 @@ reg_def XMM8f( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next(5)); reg_def XMM8g( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next(6)); reg_def XMM8h( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next(7)); +reg_def XMM8i( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next(8)); +reg_def XMM8j( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next(9)); +reg_def XMM8k( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next(10)); +reg_def XMM8l( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next(11)); +reg_def XMM8m( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next(12)); +reg_def XMM8n( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next(13)); +reg_def XMM8o( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next(14)); +reg_def XMM8p( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next(15)); reg_def XMM9 ( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()); 
reg_def XMM9b( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next(1)); @@ -161,6 +237,14 @@ reg_def XMM9f( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next(5)); reg_def XMM9g( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next(6)); reg_def XMM9h( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next(7)); +reg_def XMM9i( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next(8)); +reg_def XMM9j( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next(9)); +reg_def XMM9k( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next(10)); +reg_def XMM9l( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next(11)); +reg_def XMM9m( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next(12)); +reg_def XMM9n( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next(13)); +reg_def XMM9o( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next(14)); +reg_def XMM9p( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next(15)); reg_def XMM10 ( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()); reg_def XMM10b( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next(1)); @@ -170,6 +254,14 @@ reg_def XMM10f( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next(5)); reg_def XMM10g( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next(6)); reg_def XMM10h( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next(7)); +reg_def XMM10i( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next(8)); +reg_def XMM10j( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next(9)); +reg_def XMM10k( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next(10)); +reg_def XMM10l( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next(11)); +reg_def XMM10m( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next(12)); +reg_def XMM10n( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next(13)); +reg_def XMM10o( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next(14)); +reg_def XMM10p( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next(15)); reg_def XMM11 ( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()); reg_def XMM11b( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next(1)); @@ -179,6 +271,14 @@ reg_def XMM11f( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next(5)); reg_def XMM11g( SOC, SOE, Op_RegF, 11, 
xmm11->as_VMReg()->next(6)); reg_def XMM11h( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next(7)); +reg_def XMM11i( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next(8)); +reg_def XMM11j( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next(9)); +reg_def XMM11k( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next(10)); +reg_def XMM11l( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next(11)); +reg_def XMM11m( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next(12)); +reg_def XMM11n( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next(13)); +reg_def XMM11o( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next(14)); +reg_def XMM11p( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next(15)); reg_def XMM12 ( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()); reg_def XMM12b( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next(1)); @@ -188,6 +288,14 @@ reg_def XMM12f( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next(5)); reg_def XMM12g( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next(6)); reg_def XMM12h( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next(7)); +reg_def XMM12i( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next(8)); +reg_def XMM12j( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next(9)); +reg_def XMM12k( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next(10)); +reg_def XMM12l( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next(11)); +reg_def XMM12m( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next(12)); +reg_def XMM12n( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next(13)); +reg_def XMM12o( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next(14)); +reg_def XMM12p( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next(15)); reg_def XMM13 ( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()); reg_def XMM13b( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next(1)); @@ -197,6 +305,14 @@ reg_def XMM13f( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next(5)); reg_def XMM13g( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next(6)); reg_def XMM13h( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next(7)); +reg_def XMM13i( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next(8)); +reg_def 
XMM13j( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next(9)); +reg_def XMM13k( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next(10)); +reg_def XMM13l( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next(11)); +reg_def XMM13m( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next(12)); +reg_def XMM13n( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next(13)); +reg_def XMM13o( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next(14)); +reg_def XMM13p( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next(15)); reg_def XMM14 ( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()); reg_def XMM14b( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next(1)); @@ -206,6 +322,14 @@ reg_def XMM14f( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next(5)); reg_def XMM14g( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next(6)); reg_def XMM14h( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next(7)); +reg_def XMM14i( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next(8)); +reg_def XMM14j( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next(9)); +reg_def XMM14k( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next(10)); +reg_def XMM14l( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next(11)); +reg_def XMM14m( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next(12)); +reg_def XMM14n( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next(13)); +reg_def XMM14o( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next(14)); +reg_def XMM14p( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next(15)); reg_def XMM15 ( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()); reg_def XMM15b( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next(1)); @@ -215,6 +339,285 @@ reg_def XMM15f( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next(5)); reg_def XMM15g( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next(6)); reg_def XMM15h( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next(7)); +reg_def XMM15i( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next(8)); +reg_def XMM15j( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next(9)); +reg_def XMM15k( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next(10)); +reg_def XMM15l( SOC, SOE, Op_RegF, 15, 
xmm15->as_VMReg()->next(11)); +reg_def XMM15m( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next(12)); +reg_def XMM15n( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next(13)); +reg_def XMM15o( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next(14)); +reg_def XMM15p( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next(15)); + +reg_def XMM16 ( SOC, SOE, Op_RegF, 16, xmm16->as_VMReg()); +reg_def XMM16b( SOC, SOE, Op_RegF, 16, xmm16->as_VMReg()->next(1)); +reg_def XMM16c( SOC, SOE, Op_RegF, 16, xmm16->as_VMReg()->next(2)); +reg_def XMM16d( SOC, SOE, Op_RegF, 16, xmm16->as_VMReg()->next(3)); +reg_def XMM16e( SOC, SOE, Op_RegF, 16, xmm16->as_VMReg()->next(4)); +reg_def XMM16f( SOC, SOE, Op_RegF, 16, xmm16->as_VMReg()->next(5)); +reg_def XMM16g( SOC, SOE, Op_RegF, 16, xmm16->as_VMReg()->next(6)); +reg_def XMM16h( SOC, SOE, Op_RegF, 16, xmm16->as_VMReg()->next(7)); +reg_def XMM16i( SOC, SOE, Op_RegF, 16, xmm16->as_VMReg()->next(8)); +reg_def XMM16j( SOC, SOE, Op_RegF, 16, xmm16->as_VMReg()->next(9)); +reg_def XMM16k( SOC, SOE, Op_RegF, 16, xmm16->as_VMReg()->next(10)); +reg_def XMM16l( SOC, SOE, Op_RegF, 16, xmm16->as_VMReg()->next(11)); +reg_def XMM16m( SOC, SOE, Op_RegF, 16, xmm16->as_VMReg()->next(12)); +reg_def XMM16n( SOC, SOE, Op_RegF, 16, xmm16->as_VMReg()->next(13)); +reg_def XMM16o( SOC, SOE, Op_RegF, 16, xmm16->as_VMReg()->next(14)); +reg_def XMM16p( SOC, SOE, Op_RegF, 16, xmm16->as_VMReg()->next(15)); + +reg_def XMM17 ( SOC, SOE, Op_RegF, 17, xmm17->as_VMReg()); +reg_def XMM17b( SOC, SOE, Op_RegF, 17, xmm17->as_VMReg()->next(1)); +reg_def XMM17c( SOC, SOE, Op_RegF, 17, xmm17->as_VMReg()->next(2)); +reg_def XMM17d( SOC, SOE, Op_RegF, 17, xmm17->as_VMReg()->next(3)); +reg_def XMM17e( SOC, SOE, Op_RegF, 17, xmm17->as_VMReg()->next(4)); +reg_def XMM17f( SOC, SOE, Op_RegF, 17, xmm17->as_VMReg()->next(5)); +reg_def XMM17g( SOC, SOE, Op_RegF, 17, xmm17->as_VMReg()->next(6)); +reg_def XMM17h( SOC, SOE, Op_RegF, 17, xmm17->as_VMReg()->next(7)); +reg_def XMM17i( SOC, SOE, Op_RegF, 17,
xmm17->as_VMReg()->next(8)); +reg_def XMM17j( SOC, SOE, Op_RegF, 17, xmm17->as_VMReg()->next(9)); +reg_def XMM17k( SOC, SOE, Op_RegF, 17, xmm17->as_VMReg()->next(10)); +reg_def XMM17l( SOC, SOE, Op_RegF, 17, xmm17->as_VMReg()->next(11)); +reg_def XMM17m( SOC, SOE, Op_RegF, 17, xmm17->as_VMReg()->next(12)); +reg_def XMM17n( SOC, SOE, Op_RegF, 17, xmm17->as_VMReg()->next(13)); +reg_def XMM17o( SOC, SOE, Op_RegF, 17, xmm17->as_VMReg()->next(14)); +reg_def XMM17p( SOC, SOE, Op_RegF, 17, xmm17->as_VMReg()->next(15)); + +reg_def XMM18 ( SOC, SOE, Op_RegF, 18, xmm18->as_VMReg()); +reg_def XMM18b( SOC, SOE, Op_RegF, 18, xmm18->as_VMReg()->next(1)); +reg_def XMM18c( SOC, SOE, Op_RegF, 18, xmm18->as_VMReg()->next(2)); +reg_def XMM18d( SOC, SOE, Op_RegF, 18, xmm18->as_VMReg()->next(3)); +reg_def XMM18e( SOC, SOE, Op_RegF, 18, xmm18->as_VMReg()->next(4)); +reg_def XMM18f( SOC, SOE, Op_RegF, 18, xmm18->as_VMReg()->next(5)); +reg_def XMM18g( SOC, SOE, Op_RegF, 18, xmm18->as_VMReg()->next(6)); +reg_def XMM18h( SOC, SOE, Op_RegF, 18, xmm18->as_VMReg()->next(7)); +reg_def XMM18i( SOC, SOE, Op_RegF, 18, xmm18->as_VMReg()->next(8)); +reg_def XMM18j( SOC, SOE, Op_RegF, 18, xmm18->as_VMReg()->next(9)); +reg_def XMM18k( SOC, SOE, Op_RegF, 18, xmm18->as_VMReg()->next(10)); +reg_def XMM18l( SOC, SOE, Op_RegF, 18, xmm18->as_VMReg()->next(11)); +reg_def XMM18m( SOC, SOE, Op_RegF, 18, xmm18->as_VMReg()->next(12)); +reg_def XMM18n( SOC, SOE, Op_RegF, 18, xmm18->as_VMReg()->next(13)); +reg_def XMM18o( SOC, SOE, Op_RegF, 18, xmm18->as_VMReg()->next(14)); +reg_def XMM18p( SOC, SOE, Op_RegF, 18, xmm18->as_VMReg()->next(15)); + +reg_def XMM19 ( SOC, SOE, Op_RegF, 19, xmm19->as_VMReg()); +reg_def XMM19b( SOC, SOE, Op_RegF, 19, xmm19->as_VMReg()->next(1)); +reg_def XMM19c( SOC, SOE, Op_RegF, 19, xmm19->as_VMReg()->next(2)); +reg_def XMM19d( SOC, SOE, Op_RegF, 19, xmm19->as_VMReg()->next(3)); +reg_def XMM19e( SOC, SOE, Op_RegF, 19, xmm19->as_VMReg()->next(4)); +reg_def XMM19f( SOC, SOE, Op_RegF, 19, 
xmm19->as_VMReg()->next(5)); +reg_def XMM19g( SOC, SOE, Op_RegF, 19, xmm19->as_VMReg()->next(6)); +reg_def XMM19h( SOC, SOE, Op_RegF, 19, xmm19->as_VMReg()->next(7)); +reg_def XMM19i( SOC, SOE, Op_RegF, 19, xmm19->as_VMReg()->next(8)); +reg_def XMM19j( SOC, SOE, Op_RegF, 19, xmm19->as_VMReg()->next(9)); +reg_def XMM19k( SOC, SOE, Op_RegF, 19, xmm19->as_VMReg()->next(10)); +reg_def XMM19l( SOC, SOE, Op_RegF, 19, xmm19->as_VMReg()->next(11)); +reg_def XMM19m( SOC, SOE, Op_RegF, 19, xmm19->as_VMReg()->next(12)); +reg_def XMM19n( SOC, SOE, Op_RegF, 19, xmm19->as_VMReg()->next(13)); +reg_def XMM19o( SOC, SOE, Op_RegF, 19, xmm19->as_VMReg()->next(14)); +reg_def XMM19p( SOC, SOE, Op_RegF, 19, xmm19->as_VMReg()->next(15)); + +reg_def XMM20 ( SOC, SOE, Op_RegF, 20, xmm20->as_VMReg()); +reg_def XMM20b( SOC, SOE, Op_RegF, 20, xmm20->as_VMReg()->next(1)); +reg_def XMM20c( SOC, SOE, Op_RegF, 20, xmm20->as_VMReg()->next(2)); +reg_def XMM20d( SOC, SOE, Op_RegF, 20, xmm20->as_VMReg()->next(3)); +reg_def XMM20e( SOC, SOE, Op_RegF, 20, xmm20->as_VMReg()->next(4)); +reg_def XMM20f( SOC, SOE, Op_RegF, 20, xmm20->as_VMReg()->next(5)); +reg_def XMM20g( SOC, SOE, Op_RegF, 20, xmm20->as_VMReg()->next(6)); +reg_def XMM20h( SOC, SOE, Op_RegF, 20, xmm20->as_VMReg()->next(7)); +reg_def XMM20i( SOC, SOE, Op_RegF, 20, xmm20->as_VMReg()->next(8)); +reg_def XMM20j( SOC, SOE, Op_RegF, 20, xmm20->as_VMReg()->next(9)); +reg_def XMM20k( SOC, SOE, Op_RegF, 20, xmm20->as_VMReg()->next(10)); +reg_def XMM20l( SOC, SOE, Op_RegF, 20, xmm20->as_VMReg()->next(11)); +reg_def XMM20m( SOC, SOE, Op_RegF, 20, xmm20->as_VMReg()->next(12)); +reg_def XMM20n( SOC, SOE, Op_RegF, 20, xmm20->as_VMReg()->next(13)); +reg_def XMM20o( SOC, SOE, Op_RegF, 20, xmm20->as_VMReg()->next(14)); +reg_def XMM20p( SOC, SOE, Op_RegF, 20, xmm20->as_VMReg()->next(15)); + +reg_def XMM21 ( SOC, SOE, Op_RegF, 21, xmm21->as_VMReg()); +reg_def XMM21b( SOC, SOE, Op_RegF, 21, xmm21->as_VMReg()->next(1)); +reg_def XMM21c( SOC, SOE, Op_RegF, 21, 
xmm21->as_VMReg()->next(2)); +reg_def XMM21d( SOC, SOE, Op_RegF, 21, xmm21->as_VMReg()->next(3)); +reg_def XMM21e( SOC, SOE, Op_RegF, 21, xmm21->as_VMReg()->next(4)); +reg_def XMM21f( SOC, SOE, Op_RegF, 21, xmm21->as_VMReg()->next(5)); +reg_def XMM21g( SOC, SOE, Op_RegF, 21, xmm21->as_VMReg()->next(6)); +reg_def XMM21h( SOC, SOE, Op_RegF, 21, xmm21->as_VMReg()->next(7)); +reg_def XMM21i( SOC, SOE, Op_RegF, 21, xmm21->as_VMReg()->next(8)); +reg_def XMM21j( SOC, SOE, Op_RegF, 21, xmm21->as_VMReg()->next(9)); +reg_def XMM21k( SOC, SOE, Op_RegF, 21, xmm21->as_VMReg()->next(10)); +reg_def XMM21l( SOC, SOE, Op_RegF, 21, xmm21->as_VMReg()->next(11)); +reg_def XMM21m( SOC, SOE, Op_RegF, 21, xmm21->as_VMReg()->next(12)); +reg_def XMM21n( SOC, SOE, Op_RegF, 21, xmm21->as_VMReg()->next(13)); +reg_def XMM21o( SOC, SOE, Op_RegF, 21, xmm21->as_VMReg()->next(14)); +reg_def XMM21p( SOC, SOE, Op_RegF, 21, xmm21->as_VMReg()->next(15)); + +reg_def XMM22 ( SOC, SOE, Op_RegF, 22, xmm22->as_VMReg()); +reg_def XMM22b( SOC, SOE, Op_RegF, 22, xmm22->as_VMReg()->next(1)); +reg_def XMM22c( SOC, SOE, Op_RegF, 22, xmm22->as_VMReg()->next(2)); +reg_def XMM22d( SOC, SOE, Op_RegF, 22, xmm22->as_VMReg()->next(3)); +reg_def XMM22e( SOC, SOE, Op_RegF, 22, xmm22->as_VMReg()->next(4)); +reg_def XMM22f( SOC, SOE, Op_RegF, 22, xmm22->as_VMReg()->next(5)); +reg_def XMM22g( SOC, SOE, Op_RegF, 22, xmm22->as_VMReg()->next(6)); +reg_def XMM22h( SOC, SOE, Op_RegF, 22, xmm22->as_VMReg()->next(7)); +reg_def XMM22i( SOC, SOE, Op_RegF, 22, xmm22->as_VMReg()->next(8)); +reg_def XMM22j( SOC, SOE, Op_RegF, 22, xmm22->as_VMReg()->next(9)); +reg_def XMM22k( SOC, SOE, Op_RegF, 22, xmm22->as_VMReg()->next(10)); +reg_def XMM22l( SOC, SOE, Op_RegF, 22, xmm22->as_VMReg()->next(11)); +reg_def XMM22m( SOC, SOE, Op_RegF, 22, xmm22->as_VMReg()->next(12)); +reg_def XMM22n( SOC, SOE, Op_RegF, 22, xmm22->as_VMReg()->next(13)); +reg_def XMM22o( SOC, SOE, Op_RegF, 22, xmm22->as_VMReg()->next(14)); +reg_def XMM22p( SOC, SOE, 
Op_RegF, 22, xmm22->as_VMReg()->next(15)); + +reg_def XMM23 ( SOC, SOE, Op_RegF, 23, xmm23->as_VMReg()); +reg_def XMM23b( SOC, SOE, Op_RegF, 23, xmm23->as_VMReg()->next(1)); +reg_def XMM23c( SOC, SOE, Op_RegF, 23, xmm23->as_VMReg()->next(2)); +reg_def XMM23d( SOC, SOE, Op_RegF, 23, xmm23->as_VMReg()->next(3)); +reg_def XMM23e( SOC, SOE, Op_RegF, 23, xmm23->as_VMReg()->next(4)); +reg_def XMM23f( SOC, SOE, Op_RegF, 23, xmm23->as_VMReg()->next(5)); +reg_def XMM23g( SOC, SOE, Op_RegF, 23, xmm23->as_VMReg()->next(6)); +reg_def XMM23h( SOC, SOE, Op_RegF, 23, xmm23->as_VMReg()->next(7)); +reg_def XMM23i( SOC, SOE, Op_RegF, 23, xmm23->as_VMReg()->next(8)); +reg_def XMM23j( SOC, SOE, Op_RegF, 23, xmm23->as_VMReg()->next(9)); +reg_def XMM23k( SOC, SOE, Op_RegF, 23, xmm23->as_VMReg()->next(10)); +reg_def XMM23l( SOC, SOE, Op_RegF, 23, xmm23->as_VMReg()->next(11)); +reg_def XMM23m( SOC, SOE, Op_RegF, 23, xmm23->as_VMReg()->next(12)); +reg_def XMM23n( SOC, SOE, Op_RegF, 23, xmm23->as_VMReg()->next(13)); +reg_def XMM23o( SOC, SOE, Op_RegF, 23, xmm23->as_VMReg()->next(14)); +reg_def XMM23p( SOC, SOE, Op_RegF, 23, xmm23->as_VMReg()->next(15)); + +reg_def XMM24 ( SOC, SOE, Op_RegF, 24, xmm24->as_VMReg()); +reg_def XMM24b( SOC, SOE, Op_RegF, 24, xmm24->as_VMReg()->next(1)); +reg_def XMM24c( SOC, SOE, Op_RegF, 24, xmm24->as_VMReg()->next(2)); +reg_def XMM24d( SOC, SOE, Op_RegF, 24, xmm24->as_VMReg()->next(3)); +reg_def XMM24e( SOC, SOE, Op_RegF, 24, xmm24->as_VMReg()->next(4)); +reg_def XMM24f( SOC, SOE, Op_RegF, 24, xmm24->as_VMReg()->next(5)); +reg_def XMM24g( SOC, SOE, Op_RegF, 24, xmm24->as_VMReg()->next(6)); +reg_def XMM24h( SOC, SOE, Op_RegF, 24, xmm24->as_VMReg()->next(7)); +reg_def XMM24i( SOC, SOE, Op_RegF, 24, xmm24->as_VMReg()->next(8)); +reg_def XMM24j( SOC, SOE, Op_RegF, 24, xmm24->as_VMReg()->next(9)); +reg_def XMM24k( SOC, SOE, Op_RegF, 24, xmm24->as_VMReg()->next(10)); +reg_def XMM24l( SOC, SOE, Op_RegF, 24, xmm24->as_VMReg()->next(11)); +reg_def XMM24m( SOC, SOE, 
Op_RegF, 24, xmm24->as_VMReg()->next(12)); +reg_def XMM24n( SOC, SOE, Op_RegF, 24, xmm24->as_VMReg()->next(13)); +reg_def XMM24o( SOC, SOE, Op_RegF, 24, xmm24->as_VMReg()->next(14)); +reg_def XMM24p( SOC, SOE, Op_RegF, 24, xmm24->as_VMReg()->next(15)); + +reg_def XMM25 ( SOC, SOE, Op_RegF, 25, xmm25->as_VMReg()); +reg_def XMM25b( SOC, SOE, Op_RegF, 25, xmm25->as_VMReg()->next(1)); +reg_def XMM25c( SOC, SOE, Op_RegF, 25, xmm25->as_VMReg()->next(2)); +reg_def XMM25d( SOC, SOE, Op_RegF, 25, xmm25->as_VMReg()->next(3)); +reg_def XMM25e( SOC, SOE, Op_RegF, 25, xmm25->as_VMReg()->next(4)); +reg_def XMM25f( SOC, SOE, Op_RegF, 25, xmm25->as_VMReg()->next(5)); +reg_def XMM25g( SOC, SOE, Op_RegF, 25, xmm25->as_VMReg()->next(6)); +reg_def XMM25h( SOC, SOE, Op_RegF, 25, xmm25->as_VMReg()->next(7)); +reg_def XMM25i( SOC, SOE, Op_RegF, 25, xmm25->as_VMReg()->next(8)); +reg_def XMM25j( SOC, SOE, Op_RegF, 25, xmm25->as_VMReg()->next(9)); +reg_def XMM25k( SOC, SOE, Op_RegF, 25, xmm25->as_VMReg()->next(10)); +reg_def XMM25l( SOC, SOE, Op_RegF, 25, xmm25->as_VMReg()->next(11)); +reg_def XMM25m( SOC, SOE, Op_RegF, 25, xmm25->as_VMReg()->next(12)); +reg_def XMM25n( SOC, SOE, Op_RegF, 25, xmm25->as_VMReg()->next(13)); +reg_def XMM25o( SOC, SOE, Op_RegF, 25, xmm25->as_VMReg()->next(14)); +reg_def XMM25p( SOC, SOE, Op_RegF, 25, xmm25->as_VMReg()->next(15)); + +reg_def XMM26 ( SOC, SOE, Op_RegF, 26, xmm26->as_VMReg()); +reg_def XMM26b( SOC, SOE, Op_RegF, 26, xmm26->as_VMReg()->next(1)); +reg_def XMM26c( SOC, SOE, Op_RegF, 26, xmm26->as_VMReg()->next(2)); +reg_def XMM26d( SOC, SOE, Op_RegF, 26, xmm26->as_VMReg()->next(3)); +reg_def XMM26e( SOC, SOE, Op_RegF, 26, xmm26->as_VMReg()->next(4)); +reg_def XMM26f( SOC, SOE, Op_RegF, 26, xmm26->as_VMReg()->next(5)); +reg_def XMM26g( SOC, SOE, Op_RegF, 26, xmm26->as_VMReg()->next(6)); +reg_def XMM26h( SOC, SOE, Op_RegF, 26, xmm26->as_VMReg()->next(7)); +reg_def XMM26i( SOC, SOE, Op_RegF, 26, xmm26->as_VMReg()->next(8)); +reg_def XMM26j( SOC, SOE, 
Op_RegF, 26, xmm26->as_VMReg()->next(9)); +reg_def XMM26k( SOC, SOE, Op_RegF, 26, xmm26->as_VMReg()->next(10)); +reg_def XMM26l( SOC, SOE, Op_RegF, 26, xmm26->as_VMReg()->next(11)); +reg_def XMM26m( SOC, SOE, Op_RegF, 26, xmm26->as_VMReg()->next(12)); +reg_def XMM26n( SOC, SOE, Op_RegF, 26, xmm26->as_VMReg()->next(13)); +reg_def XMM26o( SOC, SOE, Op_RegF, 26, xmm26->as_VMReg()->next(14)); +reg_def XMM26p( SOC, SOE, Op_RegF, 26, xmm26->as_VMReg()->next(15)); + +reg_def XMM27b( SOC, SOE, Op_RegF, 27, xmm27->as_VMReg()->next(1)); +reg_def XMM27c( SOC, SOE, Op_RegF, 27, xmm27->as_VMReg()->next(2)); +reg_def XMM27d( SOC, SOE, Op_RegF, 27, xmm27->as_VMReg()->next(3)); +reg_def XMM27e( SOC, SOE, Op_RegF, 27, xmm27->as_VMReg()->next(4)); +reg_def XMM27f( SOC, SOE, Op_RegF, 27, xmm27->as_VMReg()->next(5)); +reg_def XMM27g( SOC, SOE, Op_RegF, 27, xmm27->as_VMReg()->next(6)); +reg_def XMM27h( SOC, SOE, Op_RegF, 27, xmm27->as_VMReg()->next(7)); +reg_def XMM27i( SOC, SOE, Op_RegF, 27, xmm27->as_VMReg()->next(8)); +reg_def XMM27j( SOC, SOE, Op_RegF, 27, xmm27->as_VMReg()->next(9)); +reg_def XMM27k( SOC, SOE, Op_RegF, 27, xmm27->as_VMReg()->next(10)); +reg_def XMM27l( SOC, SOE, Op_RegF, 27, xmm27->as_VMReg()->next(11)); +reg_def XMM27m( SOC, SOE, Op_RegF, 27, xmm27->as_VMReg()->next(12)); +reg_def XMM27n( SOC, SOE, Op_RegF, 27, xmm27->as_VMReg()->next(13)); +reg_def XMM27o( SOC, SOE, Op_RegF, 27, xmm27->as_VMReg()->next(14)); +reg_def XMM27p( SOC, SOE, Op_RegF, 27, xmm27->as_VMReg()->next(15)); + +reg_def XMM28 ( SOC, SOE, Op_RegF, 28, xmm28->as_VMReg()); +reg_def XMM28b( SOC, SOE, Op_RegF, 28, xmm28->as_VMReg()->next(1)); +reg_def XMM28c( SOC, SOE, Op_RegF, 28, xmm28->as_VMReg()->next(2)); +reg_def XMM28d( SOC, SOE, Op_RegF, 28, xmm28->as_VMReg()->next(3)); +reg_def XMM28e( SOC, SOE, Op_RegF, 28, xmm28->as_VMReg()->next(4)); +reg_def XMM28f( SOC, SOE, Op_RegF, 28, xmm28->as_VMReg()->next(5)); +reg_def XMM28g( SOC, SOE, Op_RegF, 28, xmm28->as_VMReg()->next(6)); +reg_def XMM28h(
SOC, SOE, Op_RegF, 28, xmm28->as_VMReg()->next(7)); +reg_def XMM28i( SOC, SOE, Op_RegF, 28, xmm28->as_VMReg()->next(8)); +reg_def XMM28j( SOC, SOE, Op_RegF, 28, xmm28->as_VMReg()->next(9)); +reg_def XMM28k( SOC, SOE, Op_RegF, 28, xmm28->as_VMReg()->next(10)); +reg_def XMM28l( SOC, SOE, Op_RegF, 28, xmm28->as_VMReg()->next(11)); +reg_def XMM28m( SOC, SOE, Op_RegF, 28, xmm28->as_VMReg()->next(12)); +reg_def XMM28n( SOC, SOE, Op_RegF, 28, xmm28->as_VMReg()->next(13)); +reg_def XMM28o( SOC, SOE, Op_RegF, 28, xmm28->as_VMReg()->next(14)); +reg_def XMM28p( SOC, SOE, Op_RegF, 28, xmm28->as_VMReg()->next(15)); + +reg_def XMM29 ( SOC, SOE, Op_RegF, 29, xmm29->as_VMReg()); +reg_def XMM29b( SOC, SOE, Op_RegF, 29, xmm29->as_VMReg()->next(1)); +reg_def XMM29c( SOC, SOE, Op_RegF, 29, xmm29->as_VMReg()->next(2)); +reg_def XMM29d( SOC, SOE, Op_RegF, 29, xmm29->as_VMReg()->next(3)); +reg_def XMM29e( SOC, SOE, Op_RegF, 29, xmm29->as_VMReg()->next(4)); +reg_def XMM29f( SOC, SOE, Op_RegF, 29, xmm29->as_VMReg()->next(5)); +reg_def XMM29g( SOC, SOE, Op_RegF, 29, xmm29->as_VMReg()->next(6)); +reg_def XMM29h( SOC, SOE, Op_RegF, 29, xmm29->as_VMReg()->next(7)); +reg_def XMM29i( SOC, SOE, Op_RegF, 29, xmm29->as_VMReg()->next(8)); +reg_def XMM29j( SOC, SOE, Op_RegF, 29, xmm29->as_VMReg()->next(9)); +reg_def XMM29k( SOC, SOE, Op_RegF, 29, xmm29->as_VMReg()->next(10)); +reg_def XMM29l( SOC, SOE, Op_RegF, 29, xmm29->as_VMReg()->next(11)); +reg_def XMM29m( SOC, SOE, Op_RegF, 29, xmm29->as_VMReg()->next(12)); +reg_def XMM29n( SOC, SOE, Op_RegF, 29, xmm29->as_VMReg()->next(13)); +reg_def XMM29o( SOC, SOE, Op_RegF, 29, xmm29->as_VMReg()->next(14)); +reg_def XMM29p( SOC, SOE, Op_RegF, 29, xmm29->as_VMReg()->next(15)); + +reg_def XMM30 ( SOC, SOE, Op_RegF, 30, xmm30->as_VMReg()); +reg_def XMM30b( SOC, SOE, Op_RegF, 30, xmm30->as_VMReg()->next(1)); +reg_def XMM30c( SOC, SOE, Op_RegF, 30, xmm30->as_VMReg()->next(2)); +reg_def XMM30d( SOC, SOE, Op_RegF, 30, xmm30->as_VMReg()->next(3)); +reg_def XMM30e( 
SOC, SOE, Op_RegF, 30, xmm30->as_VMReg()->next(4)); +reg_def XMM30f( SOC, SOE, Op_RegF, 30, xmm30->as_VMReg()->next(5)); +reg_def XMM30g( SOC, SOE, Op_RegF, 30, xmm30->as_VMReg()->next(6)); +reg_def XMM30h( SOC, SOE, Op_RegF, 30, xmm30->as_VMReg()->next(7)); +reg_def XMM30i( SOC, SOE, Op_RegF, 30, xmm30->as_VMReg()->next(8)); +reg_def XMM30j( SOC, SOE, Op_RegF, 30, xmm30->as_VMReg()->next(9)); +reg_def XMM30k( SOC, SOE, Op_RegF, 30, xmm30->as_VMReg()->next(10)); +reg_def XMM30l( SOC, SOE, Op_RegF, 30, xmm30->as_VMReg()->next(11)); +reg_def XMM30m( SOC, SOE, Op_RegF, 30, xmm30->as_VMReg()->next(12)); +reg_def XMM30n( SOC, SOE, Op_RegF, 30, xmm30->as_VMReg()->next(13)); +reg_def XMM30o( SOC, SOE, Op_RegF, 30, xmm30->as_VMReg()->next(14)); +reg_def XMM30p( SOC, SOE, Op_RegF, 30, xmm30->as_VMReg()->next(15)); + +reg_def XMM31 ( SOC, SOE, Op_RegF, 31, xmm31->as_VMReg()); +reg_def XMM31b( SOC, SOE, Op_RegF, 31, xmm31->as_VMReg()->next(1)); +reg_def XMM31c( SOC, SOE, Op_RegF, 31, xmm31->as_VMReg()->next(2)); +reg_def XMM31d( SOC, SOE, Op_RegF, 31, xmm31->as_VMReg()->next(3)); +reg_def XMM31e( SOC, SOE, Op_RegF, 31, xmm31->as_VMReg()->next(4)); +reg_def XMM31f( SOC, SOE, Op_RegF, 31, xmm31->as_VMReg()->next(5)); +reg_def XMM31g( SOC, SOE, Op_RegF, 31, xmm31->as_VMReg()->next(6)); +reg_def XMM31h( SOC, SOE, Op_RegF, 31, xmm31->as_VMReg()->next(7)); +reg_def XMM31i( SOC, SOE, Op_RegF, 31, xmm31->as_VMReg()->next(8)); +reg_def XMM31j( SOC, SOE, Op_RegF, 31, xmm31->as_VMReg()->next(9)); +reg_def XMM31k( SOC, SOE, Op_RegF, 31, xmm31->as_VMReg()->next(10)); +reg_def XMM31l( SOC, SOE, Op_RegF, 31, xmm31->as_VMReg()->next(11)); +reg_def XMM31m( SOC, SOE, Op_RegF, 31, xmm31->as_VMReg()->next(12)); +reg_def XMM31n( SOC, SOE, Op_RegF, 31, xmm31->as_VMReg()->next(13)); +reg_def XMM31o( SOC, SOE, Op_RegF, 31, xmm31->as_VMReg()->next(14)); +reg_def XMM31p( SOC, SOE, Op_RegF, 31, xmm31->as_VMReg()->next(15)); #else // _WIN64 @@ -226,6 +629,14 @@ reg_def XMM6f( SOC, SOC, Op_RegF, 6,
xmm6->as_VMReg()->next(5)); reg_def XMM6g( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(6)); reg_def XMM6h( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(7)); +reg_def XMM6i( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(8)); +reg_def XMM6j( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(9)); +reg_def XMM6k( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(10)); +reg_def XMM6l( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(11)); +reg_def XMM6m( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(12)); +reg_def XMM6n( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(13)); +reg_def XMM6o( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(14)); +reg_def XMM6p( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(15)); reg_def XMM7 ( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()); reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(1)); @@ -235,6 +646,14 @@ reg_def XMM7f( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(5)); reg_def XMM7g( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(6)); reg_def XMM7h( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(7)); +reg_def XMM7i( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(8)); +reg_def XMM7j( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(9)); +reg_def XMM7k( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(10)); +reg_def XMM7l( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(11)); +reg_def XMM7m( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(12)); +reg_def XMM7n( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(13)); +reg_def XMM7o( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(14)); +reg_def XMM7p( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(15)); #ifdef _LP64 @@ -246,6 +665,14 @@ reg_def XMM8f( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(5)); reg_def XMM8g( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(6)); reg_def XMM8h( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(7)); +reg_def XMM8i( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(8)); +reg_def XMM8j( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(9)); +reg_def XMM8k( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(10)); +reg_def 
XMM8l( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(11)); +reg_def XMM8m( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(12)); +reg_def XMM8n( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(13)); +reg_def XMM8o( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(14)); +reg_def XMM8p( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(15)); reg_def XMM9 ( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()); reg_def XMM9b( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(1)); @@ -255,6 +682,14 @@ reg_def XMM9f( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(5)); reg_def XMM9g( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(6)); reg_def XMM9h( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(7)); +reg_def XMM9i( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(8)); +reg_def XMM9j( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(9)); +reg_def XMM9k( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(10)); +reg_def XMM9l( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(11)); +reg_def XMM9m( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(12)); +reg_def XMM9n( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(13)); +reg_def XMM9o( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(14)); +reg_def XMM9p( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(15)); reg_def XMM10 ( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()); reg_def XMM10b( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(1)); @@ -264,6 +699,14 @@ reg_def XMM10f( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(5)); reg_def XMM10g( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(6)); reg_def XMM10h( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(7)); +reg_def XMM10i( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(8)); +reg_def XMM10j( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(9)); +reg_def XMM10k( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(10)); +reg_def XMM10l( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(11)); +reg_def XMM10m( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(12)); +reg_def XMM10n( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(13)); +reg_def XMM10o( SOC, SOC, 
Op_RegF, 10, xmm10->as_VMReg()->next(14)); +reg_def XMM10p( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(15)); reg_def XMM11 ( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()); reg_def XMM11b( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(1)); @@ -273,6 +716,14 @@ reg_def XMM11f( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(5)); reg_def XMM11g( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(6)); reg_def XMM11h( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(7)); +reg_def XMM11i( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(8)); +reg_def XMM11j( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(9)); +reg_def XMM11k( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(10)); +reg_def XMM11l( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(11)); +reg_def XMM11m( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(12)); +reg_def XMM11n( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(13)); +reg_def XMM11o( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(14)); +reg_def XMM11p( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(15)); reg_def XMM12 ( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()); reg_def XMM12b( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(1)); @@ -282,6 +733,14 @@ reg_def XMM12f( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(5)); reg_def XMM12g( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(6)); reg_def XMM12h( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(7)); +reg_def XMM12i( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(8)); +reg_def XMM12j( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(9)); +reg_def XMM12k( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(10)); +reg_def XMM12l( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(11)); +reg_def XMM12m( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(12)); +reg_def XMM12n( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(13)); +reg_def XMM12o( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(14)); +reg_def XMM12p( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(15)); reg_def XMM13 ( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()); reg_def 
XMM13b( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(1)); @@ -291,6 +750,14 @@ reg_def XMM13f( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(5)); reg_def XMM13g( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(6)); reg_def XMM13h( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(7)); +reg_def XMM13i( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(8)); +reg_def XMM13j( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(9)); +reg_def XMM13k( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(10)); +reg_def XMM13l( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(11)); +reg_def XMM13m( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(12)); +reg_def XMM13n( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(13)); +reg_def XMM13o( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(14)); +reg_def XMM13p( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(15)); reg_def XMM14 ( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()); reg_def XMM14b( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(1)); @@ -300,6 +767,14 @@ reg_def XMM14f( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(5)); reg_def XMM14g( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(6)); reg_def XMM14h( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(7)); +reg_def XMM14i( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(8)); +reg_def XMM14j( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(9)); +reg_def XMM14k( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(10)); +reg_def XMM14l( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(11)); +reg_def XMM14m( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(12)); +reg_def XMM14n( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(13)); +reg_def XMM14o( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(14)); +reg_def XMM14p( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(15)); reg_def XMM15 ( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()); reg_def XMM15b( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(1)); @@ -309,6 +784,286 @@ reg_def XMM15f( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(5)); reg_def XMM15g( SOC, SOC, 
Op_RegF, 15, xmm15->as_VMReg()->next(6)); reg_def XMM15h( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(7)); +reg_def XMM15i( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(8)); +reg_def XMM15j( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(9)); +reg_def XMM15k( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(10)); +reg_def XMM15l( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(11)); +reg_def XMM15m( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(12)); +reg_def XMM15n( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(13)); +reg_def XMM15o( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(14)); +reg_def XMM15p( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(15)); + +reg_def XMM16 ( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()); +reg_def XMM16b( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(1)); +reg_def XMM16c( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(2)); +reg_def XMM16d( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(3)); +reg_def XMM16e( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(4)); +reg_def XMM16f( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(5)); +reg_def XMM16g( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(6)); +reg_def XMM16h( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(7)); +reg_def XMM16i( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(8)); +reg_def XMM16j( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(9)); +reg_def XMM16k( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(10)); +reg_def XMM16l( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(11)); +reg_def XMM16m( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(12)); +reg_def XMM16n( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(13)); +reg_def XMM16o( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(14)); +reg_def XMM16p( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(15)); + +reg_def XMM17 ( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()); +reg_def XMM17b( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(1)); +reg_def XMM17c( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(2)); +reg_def XMM17d( SOC, SOC, 
Op_RegF, 17, xmm17->as_VMReg()->next(3)); +reg_def XMM17e( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(4)); +reg_def XMM17f( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(5)); +reg_def XMM17g( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(6)); +reg_def XMM17h( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(7)); +reg_def XMM17i( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(8)); +reg_def XMM17j( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(9)); +reg_def XMM17k( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(10)); +reg_def XMM17l( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(11)); +reg_def XMM17m( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(12)); +reg_def XMM17n( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(13)); +reg_def XMM17o( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(14)); +reg_def XMM17p( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(15)); + +reg_def XMM18 ( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()); +reg_def XMM18b( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(1)); +reg_def XMM18c( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(2)); +reg_def XMM18d( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(3)); +reg_def XMM18e( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(4)); +reg_def XMM18f( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(5)); +reg_def XMM18g( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(6)); +reg_def XMM18h( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(7)); +reg_def XMM18i( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(8)); +reg_def XMM18j( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(9)); +reg_def XMM18k( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(10)); +reg_def XMM18l( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(11)); +reg_def XMM18m( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(12)); +reg_def XMM18n( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(13)); +reg_def XMM18o( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(14)); +reg_def XMM18p( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(15)); + +reg_def XMM19 ( 
SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()); +reg_def XMM19b( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(1)); +reg_def XMM19c( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(2)); +reg_def XMM19d( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(3)); +reg_def XMM19e( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(4)); +reg_def XMM19f( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(5)); +reg_def XMM19g( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(6)); +reg_def XMM19h( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(7)); +reg_def XMM19i( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(8)); +reg_def XMM19j( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(9)); +reg_def XMM19k( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(10)); +reg_def XMM19l( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(11)); +reg_def XMM19m( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(12)); +reg_def XMM19n( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(13)); +reg_def XMM19o( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(14)); +reg_def XMM19p( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(15)); + +reg_def XMM20 ( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()); +reg_def XMM20b( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(1)); +reg_def XMM20c( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(2)); +reg_def XMM20d( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(3)); +reg_def XMM20e( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(4)); +reg_def XMM20f( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(5)); +reg_def XMM20g( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(6)); +reg_def XMM20h( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(7)); +reg_def XMM20i( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(8)); +reg_def XMM20j( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(9)); +reg_def XMM20k( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(10)); +reg_def XMM20l( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(11)); +reg_def XMM20m( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(12)); +reg_def XMM20n( SOC, 
SOC, Op_RegF, 20, xmm20->as_VMReg()->next(13)); +reg_def XMM20o( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(14)); +reg_def XMM20p( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(15)); + +reg_def XMM21 ( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()); +reg_def XMM21b( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(1)); +reg_def XMM21c( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(2)); +reg_def XMM21d( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(3)); +reg_def XMM21e( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(4)); +reg_def XMM21f( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(5)); +reg_def XMM21g( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(6)); +reg_def XMM21h( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(7)); +reg_def XMM21i( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(8)); +reg_def XMM21j( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(9)); +reg_def XMM21k( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(10)); +reg_def XMM21l( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(11)); +reg_def XMM21m( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(12)); +reg_def XMM21n( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(13)); +reg_def XMM21o( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(14)); +reg_def XMM21p( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(15)); + +reg_def XMM22 ( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()); +reg_def XMM22b( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(1)); +reg_def XMM22c( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(2)); +reg_def XMM22d( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(3)); +reg_def XMM22e( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(4)); +reg_def XMM22f( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(5)); +reg_def XMM22g( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(6)); +reg_def XMM22h( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(7)); +reg_def XMM22i( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(8)); +reg_def XMM22j( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(9)); +reg_def XMM22k( SOC, 
SOC, Op_RegF, 22, xmm22->as_VMReg()->next(10)); +reg_def XMM22l( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(11)); +reg_def XMM22m( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(12)); +reg_def XMM22n( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(13)); +reg_def XMM22o( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(14)); +reg_def XMM22p( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(15)); + +reg_def XMM23 ( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()); +reg_def XMM23b( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(1)); +reg_def XMM23c( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(2)); +reg_def XMM23d( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(3)); +reg_def XMM23e( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(4)); +reg_def XMM23f( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(5)); +reg_def XMM23g( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(6)); +reg_def XMM23h( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(7)); +reg_def XMM23i( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(8)); +reg_def XMM23j( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(9)); +reg_def XMM23k( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(10)); +reg_def XMM23l( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(11)); +reg_def XMM23m( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(12)); +reg_def XMM23n( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(13)); +reg_def XMM23o( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(14)); +reg_def XMM23p( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(15)); + +reg_def XMM24 ( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()); +reg_def XMM24b( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(1)); +reg_def XMM24c( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(2)); +reg_def XMM24d( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(3)); +reg_def XMM24e( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(4)); +reg_def XMM24f( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(5)); +reg_def XMM24g( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(6)); +reg_def XMM24h( SOC, 
SOC, Op_RegF, 24, xmm24->as_VMReg()->next(7)); +reg_def XMM24i( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(8)); +reg_def XMM24j( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(9)); +reg_def XMM24k( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(10)); +reg_def XMM24l( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(11)); +reg_def XMM24m( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(12)); +reg_def XMM24n( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(13)); +reg_def XMM24o( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(14)); +reg_def XMM24p( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(15)); + +reg_def XMM25 ( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()); +reg_def XMM25b( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(1)); +reg_def XMM25c( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(2)); +reg_def XMM25d( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(3)); +reg_def XMM25e( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(4)); +reg_def XMM25f( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(5)); +reg_def XMM25g( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(6)); +reg_def XMM25h( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(7)); +reg_def XMM25i( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(8)); +reg_def XMM25j( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(9)); +reg_def XMM25k( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(10)); +reg_def XMM25l( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(11)); +reg_def XMM25m( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(12)); +reg_def XMM25n( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(13)); +reg_def XMM25o( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(14)); +reg_def XMM25p( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(15)); + +reg_def XMM26 ( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()); +reg_def XMM26b( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(1)); +reg_def XMM26c( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(2)); +reg_def XMM26d( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(3)); +reg_def XMM26e( SOC, 
SOC, Op_RegF, 26, xmm26->as_VMReg()->next(4)); +reg_def XMM26f( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(5)); +reg_def XMM26g( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(6)); +reg_def XMM26h( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(7)); +reg_def XMM26i( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(8)); +reg_def XMM26j( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(9)); +reg_def XMM26k( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(10)); +reg_def XMM26l( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(11)); +reg_def XMM26m( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(12)); +reg_def XMM26n( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(13)); +reg_def XMM26o( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(14)); +reg_def XMM26p( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(15)); + +reg_def XMM27 ( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()); +reg_def XMM27b( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(1)); +reg_def XMM27c( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(2)); +reg_def XMM27d( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(3)); +reg_def XMM27e( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(4)); +reg_def XMM27f( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(5)); +reg_def XMM27g( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(6)); +reg_def XMM27h( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(7)); +reg_def XMM27i( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(8)); +reg_def XMM27j( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(9)); +reg_def XMM27k( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(10)); +reg_def XMM27l( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(11)); +reg_def XMM27m( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(12)); +reg_def XMM27n( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(13)); +reg_def XMM27o( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(14)); +reg_def XMM27p( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(15)); + +reg_def XMM28 ( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()); +reg_def XMM28b( SOC, 
SOC, Op_RegF, 28, xmm28->as_VMReg()->next(1)); +reg_def XMM28c( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(2)); +reg_def XMM28d( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(3)); +reg_def XMM28e( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(4)); +reg_def XMM28f( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(5)); +reg_def XMM28g( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(6)); +reg_def XMM28h( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(7)); +reg_def XMM28i( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(8)); +reg_def XMM28j( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(9)); +reg_def XMM28k( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(10)); +reg_def XMM28l( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(11)); +reg_def XMM28m( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(12)); +reg_def XMM28n( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(13)); +reg_def XMM28o( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(14)); +reg_def XMM28p( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(15)); + +reg_def XMM29 ( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()); +reg_def XMM29b( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(1)); +reg_def XMM29c( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(2)); +reg_def XMM29d( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(3)); +reg_def XMM29e( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(4)); +reg_def XMM29f( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(5)); +reg_def XMM29g( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(6)); +reg_def XMM29h( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(7)); +reg_def XMM29i( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(8)); +reg_def XMM29j( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(9)); +reg_def XMM29k( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(10)); +reg_def XMM29l( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(11)); +reg_def XMM29m( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(12)); +reg_def XMM29n( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(13)); +reg_def XMM29o( 
SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(14)); +reg_def XMM29p( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(15)); + +reg_def XMM30 ( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()); +reg_def XMM30b( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(1)); +reg_def XMM30c( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(2)); +reg_def XMM30d( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(3)); +reg_def XMM30e( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(4)); +reg_def XMM30f( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(5)); +reg_def XMM30g( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(6)); +reg_def XMM30h( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(7)); +reg_def XMM30i( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(8)); +reg_def XMM30j( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(9)); +reg_def XMM30k( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(10)); +reg_def XMM30l( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(11)); +reg_def XMM30m( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(12)); +reg_def XMM30n( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(13)); +reg_def XMM30o( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(14)); +reg_def XMM30p( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(15)); + +reg_def XMM31 ( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()); +reg_def XMM31b( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(1)); +reg_def XMM31c( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(2)); +reg_def XMM31d( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(3)); +reg_def XMM31e( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(4)); +reg_def XMM31f( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(5)); +reg_def XMM31g( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(6)); +reg_def XMM31h( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(7)); +reg_def XMM31i( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(8)); +reg_def XMM31j( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(9)); +reg_def XMM31k( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(10)); +reg_def XMM31l( 
SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(11)); +reg_def XMM31m( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(12)); +reg_def XMM31n( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(13)); +reg_def XMM31o( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(14)); +reg_def XMM31p( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(15)); #endif // _LP64 @@ -320,34 +1075,50 @@ reg_def RFLAGS(SOC, SOC, 0, 8, VMRegImpl::Bad()); #endif // _LP64 -alloc_class chunk1(XMM0, XMM0b, XMM0c, XMM0d, XMM0e, XMM0f, XMM0g, XMM0h, - XMM1, XMM1b, XMM1c, XMM1d, XMM1e, XMM1f, XMM1g, XMM1h, - XMM2, XMM2b, XMM2c, XMM2d, XMM2e, XMM2f, XMM2g, XMM2h, - XMM3, XMM3b, XMM3c, XMM3d, XMM3e, XMM3f, XMM3g, XMM3h, - XMM4, XMM4b, XMM4c, XMM4d, XMM4e, XMM4f, XMM4g, XMM4h, - XMM5, XMM5b, XMM5c, XMM5d, XMM5e, XMM5f, XMM5g, XMM5h, - XMM6, XMM6b, XMM6c, XMM6d, XMM6e, XMM6f, XMM6g, XMM6h, - XMM7, XMM7b, XMM7c, XMM7d, XMM7e, XMM7f, XMM7g, XMM7h +alloc_class chunk1(XMM0, XMM0b, XMM0c, XMM0d, XMM0e, XMM0f, XMM0g, XMM0h, XMM0i, XMM0j, XMM0k, XMM0l, XMM0m, XMM0n, XMM0o, XMM0p, + XMM1, XMM1b, XMM1c, XMM1d, XMM1e, XMM1f, XMM1g, XMM1h, XMM1i, XMM1j, XMM1k, XMM1l, XMM1m, XMM1n, XMM1o, XMM1p, + XMM2, XMM2b, XMM2c, XMM2d, XMM2e, XMM2f, XMM2g, XMM2h, XMM2i, XMM2j, XMM2k, XMM2l, XMM2m, XMM2n, XMM2o, XMM2p, + XMM3, XMM3b, XMM3c, XMM3d, XMM3e, XMM3f, XMM3g, XMM3h, XMM3i, XMM3j, XMM3k, XMM3l, XMM3m, XMM3n, XMM3o, XMM3p, + XMM4, XMM4b, XMM4c, XMM4d, XMM4e, XMM4f, XMM4g, XMM4h, XMM4i, XMM4j, XMM4k, XMM4l, XMM4m, XMM4n, XMM4o, XMM4p, + XMM5, XMM5b, XMM5c, XMM5d, XMM5e, XMM5f, XMM5g, XMM5h, XMM5i, XMM5j, XMM5k, XMM5l, XMM5m, XMM5n, XMM5o, XMM5p, + XMM6, XMM6b, XMM6c, XMM6d, XMM6e, XMM6f, XMM6g, XMM6h, XMM6i, XMM6j, XMM6k, XMM6l, XMM6m, XMM6n, XMM6o, XMM6p, + XMM7, XMM7b, XMM7c, XMM7d, XMM7e, XMM7f, XMM7g, XMM7h, XMM7i, XMM7j, XMM7k, XMM7l, XMM7m, XMM7n, XMM7o, XMM7p #ifdef _LP64 - ,XMM8, XMM8b, XMM8c, XMM8d, XMM8e, XMM8f, XMM8g, XMM8h, - XMM9, XMM9b, XMM9c, XMM9d, XMM9e, XMM9f, XMM9g, XMM9h, - XMM10, XMM10b, XMM10c, XMM10d, 
XMM10e, XMM10f, XMM10g, XMM10h, - XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, - XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, - XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, - XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, - XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h + ,XMM8, XMM8b, XMM8c, XMM8d, XMM8e, XMM8f, XMM8g, XMM8h, XMM8i, XMM8j, XMM8k, XMM8l, XMM8m, XMM8n, XMM8o, XMM8p, + XMM9, XMM9b, XMM9c, XMM9d, XMM9e, XMM9f, XMM9g, XMM9h, XMM9i, XMM9j, XMM9k, XMM9l, XMM9m, XMM9n, XMM9o, XMM9p, + XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p, + XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p, + XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p, + XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p, + XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p, + XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p + ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p, + XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p, + XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p, + XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p, + XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, 
XMM20p, + XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p, + XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p, + XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p, + XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p, + XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p, + XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p, + XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p, + XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p, + XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p, + XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p, + XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p #endif - ); + ); // flags allocation class should be last. 
-alloc_class chunk2(RFLAGS); +alloc_class chunk3(RFLAGS); // Singleton class for condition codes reg_class int_flags(RFLAGS); -// Class for all float registers -reg_class float_reg(XMM0, +// Class for pre evex float registers +reg_class float_reg_legacy(XMM0, XMM1, XMM2, XMM3, @@ -367,8 +1138,47 @@ #endif ); -// Class for all double registers -reg_class double_reg(XMM0, XMM0b, +// Class for evex float registers +reg_class float_reg_evex(XMM0, + XMM1, + XMM2, + XMM3, + XMM4, + XMM5, + XMM6, + XMM7 +#ifdef _LP64 + ,XMM8, + XMM9, + XMM10, + XMM11, + XMM12, + XMM13, + XMM14, + XMM15, + XMM16, + XMM17, + XMM18, + XMM19, + XMM20, + XMM21, + XMM22, + XMM23, + XMM24, + XMM25, + XMM26, + XMM27, + XMM28, + XMM29, + XMM30, + XMM31 +#endif + ); + +reg_class_dynamic float_reg(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() %} ); + +// Class for pre evex double registers +reg_class double_reg_legacy(XMM0, XMM0b, XMM1, XMM1b, XMM2, XMM2b, XMM3, XMM3b, @@ -388,8 +1198,47 @@ #endif ); -// Class for all 32bit vector registers -reg_class vectors_reg(XMM0, +// Class for evex double registers +reg_class double_reg_evex(XMM0, XMM0b, + XMM1, XMM1b, + XMM2, XMM2b, + XMM3, XMM3b, + XMM4, XMM4b, + XMM5, XMM5b, + XMM6, XMM6b, + XMM7, XMM7b +#ifdef _LP64 + ,XMM8, XMM8b, + XMM9, XMM9b, + XMM10, XMM10b, + XMM11, XMM11b, + XMM12, XMM12b, + XMM13, XMM13b, + XMM14, XMM14b, + XMM15, XMM15b, + XMM16, XMM16b, + XMM17, XMM17b, + XMM18, XMM18b, + XMM19, XMM19b, + XMM20, XMM20b, + XMM21, XMM21b, + XMM22, XMM22b, + XMM23, XMM23b, + XMM24, XMM24b, + XMM25, XMM25b, + XMM26, XMM26b, + XMM27, XMM27b, + XMM28, XMM28b, + XMM29, XMM29b, + XMM30, XMM30b, + XMM31, XMM31b +#endif + ); + +reg_class_dynamic double_reg(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() %} ); + +// Class for pre evex 32bit vector registers +reg_class vectors_reg_legacy(XMM0, XMM1, XMM2, XMM3, @@ -409,8 +1258,47 @@ #endif ); +// Class for evex 32bit vector registers +reg_class vectors_reg_evex(XMM0, + 
XMM1, + XMM2, + XMM3, + XMM4, + XMM5, + XMM6, + XMM7 +#ifdef _LP64 + ,XMM8, + XMM9, + XMM10, + XMM11, + XMM12, + XMM13, + XMM14, + XMM15, + XMM16, + XMM17, + XMM18, + XMM19, + XMM20, + XMM21, + XMM22, + XMM23, + XMM24, + XMM25, + XMM26, + XMM27, + XMM28, + XMM29, + XMM30, + XMM31 +#endif + ); + +reg_class_dynamic vectors_reg(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_evex() %} ); + // Class for all 64bit vector registers -reg_class vectord_reg(XMM0, XMM0b, +reg_class vectord_reg_legacy(XMM0, XMM0b, XMM1, XMM1b, XMM2, XMM2b, XMM3, XMM3b, @@ -430,8 +1318,47 @@ #endif ); +// Class for all 64bit vector registers +reg_class vectord_reg_evex(XMM0, XMM0b, + XMM1, XMM1b, + XMM2, XMM2b, + XMM3, XMM3b, + XMM4, XMM4b, + XMM5, XMM5b, + XMM6, XMM6b, + XMM7, XMM7b +#ifdef _LP64 + ,XMM8, XMM8b, + XMM9, XMM9b, + XMM10, XMM10b, + XMM11, XMM11b, + XMM12, XMM12b, + XMM13, XMM13b, + XMM14, XMM14b, + XMM15, XMM15b, + XMM16, XMM16b, + XMM17, XMM17b, + XMM18, XMM18b, + XMM19, XMM19b, + XMM20, XMM20b, + XMM21, XMM21b, + XMM22, XMM22b, + XMM23, XMM23b, + XMM24, XMM24b, + XMM25, XMM25b, + XMM26, XMM26b, + XMM27, XMM27b, + XMM28, XMM28b, + XMM29, XMM29b, + XMM30, XMM30b, + XMM31, XMM31b +#endif + ); + +reg_class_dynamic vectord_reg(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_evex() %} ); + // Class for all 128bit vector registers -reg_class vectorx_reg(XMM0, XMM0b, XMM0c, XMM0d, +reg_class vectorx_reg_legacy(XMM0, XMM0b, XMM0c, XMM0d, XMM1, XMM1b, XMM1c, XMM1d, XMM2, XMM2b, XMM2c, XMM2d, XMM3, XMM3b, XMM3c, XMM3d, @@ -451,8 +1378,47 @@ #endif ); +// Class for all 128bit vector registers +reg_class vectorx_reg_evex(XMM0, XMM0b, XMM0c, XMM0d, + XMM1, XMM1b, XMM1c, XMM1d, + XMM2, XMM2b, XMM2c, XMM2d, + XMM3, XMM3b, XMM3c, XMM3d, + XMM4, XMM4b, XMM4c, XMM4d, + XMM5, XMM5b, XMM5c, XMM5d, + XMM6, XMM6b, XMM6c, XMM6d, + XMM7, XMM7b, XMM7c, XMM7d +#ifdef _LP64 + ,XMM8, XMM8b, XMM8c, XMM8d, + XMM9, XMM9b, XMM9c, XMM9d, + XMM10, XMM10b, XMM10c, XMM10d, + XMM11, 
XMM11b, XMM11c, XMM11d, + XMM12, XMM12b, XMM12c, XMM12d, + XMM13, XMM13b, XMM13c, XMM13d, + XMM14, XMM14b, XMM14c, XMM14d, + XMM15, XMM15b, XMM15c, XMM15d, + XMM16, XMM16b, XMM16c, XMM16d, + XMM17, XMM17b, XMM17c, XMM17d, + XMM18, XMM18b, XMM18c, XMM18d, + XMM19, XMM19b, XMM19c, XMM19d, + XMM20, XMM20b, XMM20c, XMM20d, + XMM21, XMM21b, XMM21c, XMM21d, + XMM22, XMM22b, XMM22c, XMM22d, + XMM23, XMM23b, XMM23c, XMM23d, + XMM24, XMM24b, XMM24c, XMM24d, + XMM25, XMM25b, XMM25c, XMM25d, + XMM26, XMM26b, XMM26c, XMM26d, + XMM27, XMM27b, XMM27c, XMM27d, + XMM28, XMM28b, XMM28c, XMM28d, + XMM29, XMM29b, XMM29c, XMM29d, + XMM30, XMM30b, XMM30c, XMM30d, + XMM31, XMM31b, XMM31c, XMM31d +#endif + ); + +reg_class_dynamic vectorx_reg(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_evex() %} ); + // Class for all 256bit vector registers -reg_class vectory_reg(XMM0, XMM0b, XMM0c, XMM0d, XMM0e, XMM0f, XMM0g, XMM0h, +reg_class vectory_reg_legacy(XMM0, XMM0b, XMM0c, XMM0d, XMM0e, XMM0f, XMM0g, XMM0h, XMM1, XMM1b, XMM1c, XMM1d, XMM1e, XMM1f, XMM1g, XMM1h, XMM2, XMM2b, XMM2c, XMM2d, XMM2e, XMM2f, XMM2g, XMM2h, XMM3, XMM3b, XMM3c, XMM3d, XMM3e, XMM3f, XMM3g, XMM3h, @@ -472,6 +1438,82 @@ #endif ); +// Class for all 256bit vector registers +reg_class vectory_reg_evex(XMM0, XMM0b, XMM0c, XMM0d, XMM0e, XMM0f, XMM0g, XMM0h, + XMM1, XMM1b, XMM1c, XMM1d, XMM1e, XMM1f, XMM1g, XMM1h, + XMM2, XMM2b, XMM2c, XMM2d, XMM2e, XMM2f, XMM2g, XMM2h, + XMM3, XMM3b, XMM3c, XMM3d, XMM3e, XMM3f, XMM3g, XMM3h, + XMM4, XMM4b, XMM4c, XMM4d, XMM4e, XMM4f, XMM4g, XMM4h, + XMM5, XMM5b, XMM5c, XMM5d, XMM5e, XMM5f, XMM5g, XMM5h, + XMM6, XMM6b, XMM6c, XMM6d, XMM6e, XMM6f, XMM6g, XMM6h, + XMM7, XMM7b, XMM7c, XMM7d, XMM7e, XMM7f, XMM7g, XMM7h +#ifdef _LP64 + ,XMM8, XMM8b, XMM8c, XMM8d, XMM8e, XMM8f, XMM8g, XMM8h, + XMM9, XMM9b, XMM9c, XMM9d, XMM9e, XMM9f, XMM9g, XMM9h, + XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, + XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, + XMM12, 
XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, + XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, + XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, + XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, + XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, + XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, + XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, + XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, + XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, + XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, + XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, + XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, + XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, + XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, + XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, + XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, + XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, + XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, + XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, + XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h +#endif + ); + +reg_class_dynamic vectory_reg(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_evex() %} ); + +// Class for all 512bit vector registers +reg_class vectorz_reg(XMM0, XMM0b, XMM0c, XMM0d, XMM0e, XMM0f, XMM0g, XMM0h, XMM0i, XMM0j, XMM0k, XMM0l, XMM0m, XMM0n, XMM0o, XMM0p, + XMM1, XMM1b, XMM1c, XMM1d, XMM1e, XMM1f, XMM1g, XMM1h, XMM1i, XMM1j, XMM1k, XMM1l, XMM1m, XMM1n, XMM1o, XMM1p, + XMM2, XMM2b, XMM2c, XMM2d, XMM2e, XMM2f, XMM2g, XMM2h, XMM2i, XMM2j, XMM2k, XMM2l, XMM2m, XMM2n, XMM2o, XMM2p, + XMM3, XMM3b, XMM3c, XMM3d, XMM3e, XMM3f, XMM3g, XMM3h, XMM3i, XMM3j, XMM3k, XMM3l, XMM3m, XMM3n, XMM3o, XMM3p, + XMM4, XMM4b, XMM4c, XMM4d, XMM4e, XMM4f, XMM4g, XMM4h, XMM4i, XMM4j, 
XMM4k, XMM4l, XMM4m, XMM4n, XMM4o, XMM4p, + XMM5, XMM5b, XMM5c, XMM5d, XMM5e, XMM5f, XMM5g, XMM5h, XMM5i, XMM5j, XMM5k, XMM5l, XMM5m, XMM5n, XMM5o, XMM5p, + XMM6, XMM6b, XMM6c, XMM6d, XMM6e, XMM6f, XMM6g, XMM6h, XMM6i, XMM6j, XMM6k, XMM6l, XMM6m, XMM6n, XMM6o, XMM6p, + XMM7, XMM7b, XMM7c, XMM7d, XMM7e, XMM7f, XMM7g, XMM7h, XMM7i, XMM7j, XMM7k, XMM7l, XMM7m, XMM7n, XMM7o, XMM7p +#ifdef _LP64 + ,XMM8, XMM8b, XMM8c, XMM8d, XMM8e, XMM8f, XMM8g, XMM8h, XMM8i, XMM8j, XMM8k, XMM8l, XMM8m, XMM8n, XMM8o, XMM8p, + XMM9, XMM9b, XMM9c, XMM9d, XMM9e, XMM9f, XMM9g, XMM9h, XMM9i, XMM9j, XMM9k, XMM9l, XMM9m, XMM9n, XMM9o, XMM9p, + XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p, + XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p, + XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p, + XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p, + XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p, + XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p + ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p, + XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p, + XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p, + XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p, + XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, 
XMM20l, XMM20m, XMM20n, XMM20o, XMM20p, + XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p, + XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p, + XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p, + XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p, + XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p, + XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p, + XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p, + XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p, + XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p, + XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p, + XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p +#endif + ); + %} @@ -623,6 +1665,10 @@ if ((UseSSE < 4) && (UseAVX < 1)) // only with SSE4_1 or AVX return false; break; + case Op_MulVL: + case Op_MulReductionVL: + if (VM_Version::supports_avx512dq() == false) + return false; case Op_AddReductionVL: if (UseAVX < 3) // only EVEX : vector connectivity becomes an issue here return false; @@ -657,10 +1703,11 @@ if (UseSSE < 2) return 0; // SSE2 supports 128bit vectors for all types. // AVX2 supports 256bit vectors for all types. - int size = (UseAVX > 1) ? 
32 : 16; + // AVX2/EVEX supports 512bit vectors for all types. + int size = (UseAVX > 1) ? (1 << UseAVX) * 8 : 16; // AVX1 supports 256bit vectors only for FLOAT and DOUBLE. if (UseAVX > 0 && (bt == T_FLOAT || bt == T_DOUBLE)) - size = 32; + size = (UseAVX > 2) ? 64 : 32; // Use flag to limit vector size. size = MIN2(size,(int)MaxVectorSize); // Minimum 2 values in vector (or 4 for bytes). @@ -702,6 +1749,7 @@ case 8: return Op_VecD; case 16: return Op_VecX; case 32: return Op_VecY; + case 64: return Op_VecZ; } ShouldNotReachHere(); return 0; @@ -745,6 +1793,9 @@ case Op_VecY: __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo])); break; + case Op_VecZ: + __ evmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2); + break; default: ShouldNotReachHere(); } @@ -763,6 +1814,7 @@ st->print("movdqu %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]); break; case Op_VecY: + case Op_VecZ: st->print("vmovdqu %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]); break; default: @@ -771,7 +1823,7 @@ #endif } // VEX_2bytes prefix is used if UseAVX > 0, and it takes the same 2 bytes as SIMD prefix. - return 4; + return (UseAVX > 2) ? 6 : 4; } static int vec_spill_helper(CodeBuffer *cbuf, bool do_size, bool is_load, @@ -796,6 +1848,9 @@ case Op_VecY: __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset)); break; + case Op_VecZ: + __ evmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2); + break; default: ShouldNotReachHere(); } @@ -813,13 +1868,16 @@ case Op_VecY: __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg])); break; + case Op_VecZ: + __ evmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2); + break; default: ShouldNotReachHere(); } } int size = __ offset() - offset; #ifdef ASSERT - int offset_size = (stack_offset == 0) ? 
0 : ((stack_offset < 0x80) ? 1 : 4); + int offset_size = (stack_offset == 0) ? 0 : ((stack_offset < 0x80) ? 1 : (UseAVX > 2) ? 6 : 4); // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix. assert(!do_size || size == (5+offset_size), "incorrect size calculattion"); #endif @@ -838,6 +1896,7 @@ st->print("movdqu %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset); break; case Op_VecY: + case Op_VecZ: st->print("vmovdqu %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset); break; default: @@ -855,6 +1914,7 @@ st->print("movdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]); break; case Op_VecY: + case Op_VecZ: st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]); break; default: @@ -863,7 +1923,7 @@ } #endif } - int offset_size = (stack_offset == 0) ? 0 : ((stack_offset < 0x80) ? 1 : 4); + int offset_size = (stack_offset == 0) ? 0 : ((stack_offset < 0x80) ? 1 : (UseAVX > 2) ? 6 : 4); // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix. return 5+offset_size; } @@ -967,40 +2027,15 @@ // in the ADLC because operands constitute user defined types which are used in // instruction definitions. 
-// Vectors -operand vecS() %{ - constraint(ALLOC_IN_RC(vectors_reg)); - match(VecS); - - format %{ %} - interface(REG_INTER); -%} - -operand vecD() %{ - constraint(ALLOC_IN_RC(vectord_reg)); - match(VecD); - - format %{ %} - interface(REG_INTER); -%} - -operand vecX() %{ - constraint(ALLOC_IN_RC(vectorx_reg)); - match(VecX); - - format %{ %} - interface(REG_INTER); -%} - -operand vecY() %{ - constraint(ALLOC_IN_RC(vectory_reg)); - match(VecY); +// This one generically applies only for evex, so only one version +operand vecZ() %{ + constraint(ALLOC_IN_RC(vectorz_reg)); + match(VecZ); format %{ %} interface(REG_INTER); %} - // INSTRUCTIONS -- Platform independent definitions (same for 32- and 64-bit) // ============================================================================ @@ -1601,9 +2636,9 @@ ins_cost(150); format %{ "vandps $dst, $src, [0x7fffffff]\t# abs float by sign masking" %} ins_encode %{ - bool vector256 = false; + int vector_len = 0; __ vandps($dst$$XMMRegister, $src$$XMMRegister, - ExternalAddress(float_signmask()), vector256); + ExternalAddress(float_signmask()), vector_len); %} ins_pipe(pipe_slow); %} @@ -1627,9 +2662,9 @@ format %{ "vandpd $dst, $src, [0x7fffffffffffffff]\t" "# abs double by sign masking" %} ins_encode %{ - bool vector256 = false; + int vector_len = 0; __ vandpd($dst$$XMMRegister, $src$$XMMRegister, - ExternalAddress(double_signmask()), vector256); + ExternalAddress(double_signmask()), vector_len); %} ins_pipe(pipe_slow); %} @@ -1651,9 +2686,9 @@ ins_cost(150); format %{ "vxorps $dst, $src, [0x80000000]\t# neg float by sign flipping" %} ins_encode %{ - bool vector256 = false; + int vector_len = 0; __ vxorps($dst$$XMMRegister, $src$$XMMRegister, - ExternalAddress(float_signflip()), vector256); + ExternalAddress(float_signflip()), vector_len); %} ins_pipe(pipe_slow); %} @@ -1677,9 +2712,9 @@ format %{ "vxorpd $dst, $src, [0x8000000000000000]\t" "# neg double by sign flipping" %} ins_encode %{ - bool vector256 = false; + int 
vector_len = 0; __ vxorpd($dst$$XMMRegister, $src$$XMMRegister, - ExternalAddress(double_signflip()), vector256); + ExternalAddress(double_signflip()), vector_len); %} ins_pipe(pipe_slow); %} @@ -1754,7 +2789,6 @@ ins_pipe(pipe_slow); %} - // ====================VECTOR INSTRUCTIONS===================================== // Load vectors (4 bytes long) @@ -1805,6 +2839,19 @@ ins_pipe( pipe_slow ); %} +// Load vectors (64 bytes long) +instruct loadV64(vecZ dst, memory mem) %{ + predicate(n->as_LoadVector()->memory_size() == 64); + match(Set dst (LoadVector mem)); + ins_cost(125); + format %{ "vmovdqu $dst k0,$mem\t! load vector (64 bytes)" %} + ins_encode %{ + int vector_len = 2; + __ evmovdqu($dst$$XMMRegister, $mem$$Address, vector_len); + %} + ins_pipe( pipe_slow ); +%} + // Store vectors instruct storeV4(memory mem, vecS src) %{ predicate(n->as_StoreVector()->memory_size() == 4); @@ -1850,6 +2897,18 @@ ins_pipe( pipe_slow ); %} +instruct storeV64(memory mem, vecZ src) %{ + predicate(n->as_StoreVector()->memory_size() == 64); + match(Set mem (StoreVector mem src)); + ins_cost(145); + format %{ "vmovdqu $mem k0,$src\t! store vector (64 bytes)" %} + ins_encode %{ + int vector_len = 2; + __ evmovdqu($mem$$Address, $src$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + // Replicate byte scalar to be vector instruct Repl4B(vecS dst, rRegI src) %{ predicate(n->as_Vector()->length() == 4); @@ -1913,6 +2972,26 @@ ins_pipe( pipe_slow ); %} +instruct Repl64B(vecZ dst, rRegI src) %{ + predicate(n->as_Vector()->length() == 64); + match(Set dst (ReplicateB src)); + format %{ "movd $dst,$src\n\t" + "punpcklbw $dst,$dst\n\t" + "pshuflw $dst,$dst,0x00\n\t" + "punpcklqdq $dst,$dst\n\t" + "vinserti128h $dst,$dst,$dst\t! lower replicate32B\n\t" + "vinserti64x4h $dst k0,$dst,$dst\t! 
upper replicate32B" %} + ins_encode %{ + __ movdl($dst$$XMMRegister, $src$$Register); + __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister); + __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00); + __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister); + __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister); + __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + // Replicate byte scalar immediate to be vector by loading from const table. instruct Repl4B_imm(vecS dst, immI con) %{ predicate(n->as_Vector()->length() == 4); @@ -1960,6 +3039,22 @@ ins_pipe( pipe_slow ); %} +instruct Repl64B_imm(vecZ dst, immI con) %{ + predicate(n->as_Vector()->length() == 64); + match(Set dst (ReplicateB con)); + format %{ "movq $dst,[$constantaddress]\n\t" + "punpcklqdq $dst,$dst\n\t" + "vinserti128h $dst,$dst,$dst\t! lower replicate32B($con)\n\t" + "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate32B($con)" %} + ins_encode %{ + __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1))); + __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister); + __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister); + __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + // Replicate byte scalar zero to be vector instruct Repl4B_zero(vecS dst, immI0 zero) %{ predicate(n->as_Vector()->length() == 4); @@ -1997,8 +3092,20 @@ format %{ "vpxor $dst,$dst,$dst\t! replicate32B zero" %} ins_encode %{ // Use vxorpd since AVX does not have vpxor for 256-bit (AVX2 will have it). 
- bool vector256 = true; - __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256); + int vector_len = 1; + __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len); + %} + ins_pipe( fpu_reg_reg ); +%} + +instruct Repl64B_zero(vecZ dst, immI0 zero) %{ + predicate(n->as_Vector()->length() == 64); + match(Set dst (ReplicateB zero)); + format %{ "vpxor $dst k0,$dst,$dst\t! replicate64B zero" %} + ins_encode %{ + // Use vxorpd since AVX does not have vpxor for 512-bit (EVEX will have it). + int vector_len = 2; + __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len); %} ins_pipe( fpu_reg_reg ); %} @@ -2058,6 +3165,24 @@ ins_pipe( pipe_slow ); %} +instruct Repl32S(vecZ dst, rRegI src) %{ + predicate(n->as_Vector()->length() == 32); + match(Set dst (ReplicateS src)); + format %{ "movd $dst,$src\n\t" + "pshuflw $dst,$dst,0x00\n\t" + "punpcklqdq $dst,$dst\n\t" + "vinserti128h $dst,$dst,$dst\t! lower replicate16S\n\t" + "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate16S" %} + ins_encode %{ + __ movdl($dst$$XMMRegister, $src$$Register); + __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00); + __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister); + __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister); + __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + // Replicate char/short (2 byte) scalar immediate to be vector by loading from const table. instruct Repl2S_imm(vecS dst, immI con) %{ predicate(n->as_Vector()->length() == 2); @@ -2105,6 +3230,22 @@ ins_pipe( pipe_slow ); %} +instruct Repl32S_imm(vecZ dst, immI con) %{ + predicate(n->as_Vector()->length() == 32); + match(Set dst (ReplicateS con)); + format %{ "movq $dst,[$constantaddress]\n\t" + "punpcklqdq $dst,$dst\n\t" + "vinserti128h $dst,$dst,$dst\t! lower replicate16S($con)\n\t" + "vinserti64x4h $dst k0,$dst,$dst\t! 
upper replicate16S($con)" %} + ins_encode %{ + __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2))); + __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister); + __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister); + __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + // Replicate char/short (2 byte) scalar zero to be vector instruct Repl2S_zero(vecS dst, immI0 zero) %{ predicate(n->as_Vector()->length() == 2); @@ -2142,8 +3283,20 @@ format %{ "vpxor $dst,$dst,$dst\t! replicate16S zero" %} ins_encode %{ // Use vxorpd since AVX does not have vpxor for 256-bit (AVX2 will have it). - bool vector256 = true; - __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256); + int vector_len = 1; + __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len); + %} + ins_pipe( fpu_reg_reg ); +%} + +instruct Repl32S_zero(vecZ dst, immI0 zero) %{ + predicate(n->as_Vector()->length() == 32); + match(Set dst (ReplicateS zero)); + format %{ "vpxor $dst k0,$dst,$dst\t! replicate32S zero" %} + ins_encode %{ + // Use vxorpd since AVX does not have vpxor for 512-bit (EVEX will have it). + int vector_len = 2; + __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len); %} ins_pipe( fpu_reg_reg ); %} @@ -2187,6 +3340,22 @@ ins_pipe( pipe_slow ); %} +instruct Repl16I(vecZ dst, rRegI src) %{ + predicate(n->as_Vector()->length() == 16); + match(Set dst (ReplicateI src)); + format %{ "movd $dst,$src\n\t" + "pshufd $dst,$dst,0x00\n\t" + "vinserti128h $dst,$dst,$dst\t! lower replicate8I\n\t" + "vinserti64x4h $dst k0,$dst,$dst\t! 
upper replicate8I" %} + ins_encode %{ + __ movdl($dst$$XMMRegister, $src$$Register); + __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00); + __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister); + __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + // Replicate integer (4 byte) scalar immediate to be vector by loading from const table. instruct Repl2I_imm(vecD dst, immI con) %{ predicate(n->as_Vector()->length() == 2); @@ -2224,6 +3393,22 @@ ins_pipe( pipe_slow ); %} +instruct Repl16I_imm(vecZ dst, immI con) %{ + predicate(n->as_Vector()->length() == 16); + match(Set dst (ReplicateI con)); + format %{ "movq $dst,[$constantaddress]\t! replicate16I($con)\n\t" + "punpcklqdq $dst,$dst\n\t" + "vinserti128h $dst,$dst,$dst\n\t" + "vinserti64x4h $dst k0,$dst,$dst" %} + ins_encode %{ + __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4))); + __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister); + __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister); + __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + // Integer could be loaded into xmm register directly from memory. instruct Repl2I_mem(vecD dst, memory mem) %{ predicate(n->as_Vector()->length() == 2); @@ -2263,13 +3448,29 @@ ins_pipe( pipe_slow ); %} -// Replicate integer (4 byte) scalar zero to be vector -instruct Repl2I_zero(vecD dst, immI0 zero) %{ - predicate(n->as_Vector()->length() == 2); - match(Set dst (ReplicateI zero)); - format %{ "pxor $dst,$dst\t! replicate2I" %} +instruct Repl16I_mem(vecZ dst, memory mem) %{ + predicate(n->as_Vector()->length() == 16); + match(Set dst (ReplicateI (LoadI mem))); + format %{ "movd $dst,$mem\n\t" + "pshufd $dst,$dst,0x00\n\t" + "vinserti128h $dst,$dst,$dst\t! lower replicate8I\n\t" + "vinserti64x4h $dst k0,$dst,$dst\t! 
upper replicate8I" %} ins_encode %{ - __ pxor($dst$$XMMRegister, $dst$$XMMRegister); + __ movdl($dst$$XMMRegister, $mem$$Address); + __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00); + __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister); + __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +// Replicate integer (4 byte) scalar zero to be vector +instruct Repl2I_zero(vecD dst, immI0 zero) %{ + predicate(n->as_Vector()->length() == 2); + match(Set dst (ReplicateI zero)); + format %{ "pxor $dst,$dst\t! replicate2I" %} + ins_encode %{ + __ pxor($dst$$XMMRegister, $dst$$XMMRegister); %} ins_pipe( fpu_reg_reg ); %} @@ -2290,8 +3491,20 @@ format %{ "vpxor $dst,$dst,$dst\t! replicate8I zero" %} ins_encode %{ // Use vxorpd since AVX does not have vpxor for 256-bit (AVX2 will have it). - bool vector256 = true; - __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256); + int vector_len = 1; + __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len); + %} + ins_pipe( fpu_reg_reg ); +%} + +instruct Repl16I_zero(vecZ dst, immI0 zero) %{ + predicate(n->as_Vector()->length() == 16); + match(Set dst (ReplicateI zero)); + format %{ "vpxor $dst k0,$dst,$dst\t! replicate16I zero" %} + ins_encode %{ + // Use vxorpd since AVX does not have vpxor for 512-bit (AVX2 will have it). + int vector_len = 2; + __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len); %} ins_pipe( fpu_reg_reg ); %} @@ -2323,6 +3536,22 @@ %} ins_pipe( pipe_slow ); %} + +instruct Repl8L(vecZ dst, rRegL src) %{ + predicate(n->as_Vector()->length() == 8); + match(Set dst (ReplicateL src)); + format %{ "movdq $dst,$src\n\t" + "punpcklqdq $dst,$dst\n\t" + "vinserti128h $dst,$dst,$dst\t! lower replicate4L\n\t" + "vinserti64x4h $dst k0,$dst,$dst\t! 
upper replicate4L" %} + ins_encode %{ + __ movdq($dst$$XMMRegister, $src$$Register); + __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister); + __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister); + __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} #else // _LP64 instruct Repl2L(vecX dst, eRegL src, regD tmp) %{ predicate(n->as_Vector()->length() == 2); @@ -2359,6 +3588,26 @@ %} ins_pipe( pipe_slow ); %} + +instruct Repl8L(vecZ dst, eRegL src, regD tmp) %{ + predicate(n->as_Vector()->length() == 8); + match(Set dst (ReplicateL src)); + effect(TEMP dst, USE src, TEMP tmp); + format %{ "movdl $dst,$src.lo\n\t" + "movdl $tmp,$src.hi\n\t" + "punpckldq $dst,$tmp\n\t" + "vinserti128h $dst,$dst,$dst\t! lower replicate4L\n\t" + "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate4L" %} + ins_encode %{ + __ movdl($dst$$XMMRegister, $src$$Register); + __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register)); + __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister); + __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister); + __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister); + __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} #endif // _LP64 // Replicate long (8 byte) scalar immediate to be vector by loading from const table. @@ -2388,6 +3637,22 @@ ins_pipe( pipe_slow ); %} +instruct Repl8L_imm(vecZ dst, immL con) %{ + predicate(n->as_Vector()->length() == 8); + match(Set dst (ReplicateL con)); + format %{ "movq $dst,[$constantaddress]\n\t" + "punpcklqdq $dst,$dst\n\t" + "vinserti128h $dst,$dst,$dst\t! lower replicate4L($con)\n\t" + "vinserti64x4h $dst k0,$dst,$dst\t! 
upper replicate4L($con)" %} + ins_encode %{ + __ movq($dst$$XMMRegister, $constantaddress($con)); + __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister); + __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister); + __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + // Long could be loaded into xmm register directly from memory. instruct Repl2L_mem(vecX dst, memory mem) %{ predicate(n->as_Vector()->length() == 2); @@ -2415,6 +3680,22 @@ ins_pipe( pipe_slow ); %} +instruct Repl8L_mem(vecZ dst, memory mem) %{ + predicate(n->as_Vector()->length() == 8); + match(Set dst (ReplicateL (LoadL mem))); + format %{ "movq $dst,$mem\n\t" + "punpcklqdq $dst,$dst\n\t" + "vinserti128h $dst,$dst,$dst\t! lower replicate4L\n\t" + "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate4L" %} + ins_encode %{ + __ movq($dst$$XMMRegister, $mem$$Address); + __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister); + __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister); + __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + // Replicate long (8 byte) scalar zero to be vector instruct Repl2L_zero(vecX dst, immL0 zero) %{ predicate(n->as_Vector()->length() == 2); @@ -2432,8 +3713,20 @@ format %{ "vpxor $dst,$dst,$dst\t! replicate4L zero" %} ins_encode %{ // Use vxorpd since AVX does not have vpxor for 256-bit (AVX2 will have it). - bool vector256 = true; - __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256); + int vector_len = 1; + __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len); + %} + ins_pipe( fpu_reg_reg ); +%} + +instruct Repl8L_zero(vecZ dst, immL0 zero) %{ + predicate(n->as_Vector()->length() == 8); + match(Set dst (ReplicateL zero)); + format %{ "vpxor $dst k0,$dst,$dst\t! replicate8L zero" %} + ins_encode %{ + // Use vxorpd since AVX does not have vpxor for 512-bit (EVEX will have it). 
+ int vector_len = 2; + __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len); %} ins_pipe( fpu_reg_reg ); %} @@ -2471,6 +3764,20 @@ ins_pipe( pipe_slow ); %} +instruct Repl16F(vecZ dst, regF src) %{ + predicate(n->as_Vector()->length() == 16); + match(Set dst (ReplicateF src)); + format %{ "pshufd $dst,$src,0x00\n\t" + "vinsertf128h $dst,$dst,$dst\t! lower replicate8F\n\t" + "vinsertf64x4h $dst k0,$dst,$dst\t! upper replicate8F" %} + ins_encode %{ + __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00); + __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister); + __ vinsertf64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + // Replicate float (4 byte) scalar zero to be vector instruct Repl2F_zero(vecD dst, immF0 zero) %{ predicate(n->as_Vector()->length() == 2); @@ -2497,8 +3804,19 @@ match(Set dst (ReplicateF zero)); format %{ "vxorps $dst,$dst,$dst\t! replicate8F zero" %} ins_encode %{ - bool vector256 = true; - __ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256); + int vector_len = 1; + __ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len); + %} + ins_pipe( fpu_reg_reg ); +%} + +instruct Repl16F_zero(vecZ dst, immF0 zero) %{ + predicate(n->as_Vector()->length() == 16); + match(Set dst (ReplicateF zero)); + format %{ "vxorps $dst k0,$dst,$dst\t! replicate16F zero" %} + ins_encode %{ + int vector_len = 2; + __ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len); %} ins_pipe( fpu_reg_reg ); %} @@ -2526,6 +3844,20 @@ ins_pipe( pipe_slow ); %} +instruct Repl8D(vecZ dst, regD src) %{ + predicate(n->as_Vector()->length() == 8); + match(Set dst (ReplicateD src)); + format %{ "pshufd $dst,$src,0x44\n\t" + "vinsertf128h $dst,$dst,$dst\t! lower replicate4D\n\t" + "vinsertf64x4h $dst k0,$dst,$dst\t! 
upper replicate4D" %} + ins_encode %{ + __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44); + __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister); + __ vinsertf64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + // Replicate double (8 byte) scalar zero to be vector instruct Repl2D_zero(vecX dst, immD0 zero) %{ predicate(n->as_Vector()->length() == 2); @@ -2542,8 +3874,19 @@ match(Set dst (ReplicateD zero)); format %{ "vxorpd $dst,$dst,$dst,vect256\t! replicate4D zero" %} ins_encode %{ - bool vector256 = true; - __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256); + int vector_len = 1; + __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len); + %} + ins_pipe( fpu_reg_reg ); +%} + +instruct Repl8D_zero(vecZ dst, immD0 zero) %{ + predicate(n->as_Vector()->length() == 8); + match(Set dst (ReplicateD zero)); + format %{ "vxorpd $dst k0,$dst,$dst,vect512\t! replicate8D zero" %} + ins_encode %{ + int vector_len = 2; + __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len); %} ins_pipe( fpu_reg_reg ); %} @@ -2570,17 +3913,38 @@ %} instruct rvadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{ - predicate(UseAVX > 0); + predicate(UseAVX > 0 && UseAVX < 3); + match(Set dst (AddReductionVI src1 src2)); + effect(TEMP tmp, TEMP tmp2); + format %{ "vphaddd $tmp,$src2,$src2\n\t" + "movd $tmp2,$src1\n\t" + "vpaddd $tmp2,$tmp2,$tmp\n\t" + "movd $dst,$tmp2\t! 
add reduction2I" %} + ins_encode %{ + int vector_len = 0; + __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len); + __ movdl($tmp2$$XMMRegister, $src1$$Register); + __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len); + __ movdl($dst$$Register, $tmp2$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct rvadd2I_reduction_reg_evex(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{ + predicate(UseAVX > 2); match(Set dst (AddReductionVI src1 src2)); effect(TEMP tmp, TEMP tmp2); - format %{ "vphaddd $tmp,$src2,$src2\n\t" + format %{ "pshufd $tmp2,$src2,0x1\n\t" + "vpaddd $tmp,$src2,$tmp2\n\t" "movd $tmp2,$src1\n\t" - "vpaddd $tmp2,$tmp2,$tmp\n\t" + "vpaddd $tmp2,$tmp,$tmp2\n\t" "movd $dst,$tmp2\t! add reduction2I" %} ins_encode %{ - __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, false); + int vector_len = 0; + __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1); + __ vpaddd($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len); __ movdl($tmp2$$XMMRegister, $src1$$Register); - __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, false); + __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len); __ movdl($dst$$Register, $tmp2$$XMMRegister); %} ins_pipe( pipe_slow ); @@ -2608,47 +3972,203 @@ %} instruct rvadd4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{ - predicate(UseAVX > 0); + predicate(UseAVX > 0 && UseAVX < 3); match(Set dst (AddReductionVI src1 src2)); effect(TEMP tmp, TEMP tmp2); - format %{ "vphaddd $tmp,$src2,$src2\n\t" - "vphaddd $tmp,$tmp,$tmp2\n\t" + format %{ "vphaddd $tmp,$src2,$src2\n\t" + "vphaddd $tmp,$tmp,$tmp2\n\t" + "movd $tmp2,$src1\n\t" + "vpaddd $tmp2,$tmp2,$tmp\n\t" + "movd $dst,$tmp2\t! 
add reduction4I" %} + ins_encode %{ + int vector_len = 0; + __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len); + __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len); + __ movdl($tmp2$$XMMRegister, $src1$$Register); + __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len); + __ movdl($dst$$Register, $tmp2$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct rvadd4I_reduction_reg_evex(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{ + predicate(UseAVX > 2); + match(Set dst (AddReductionVI src1 src2)); + effect(TEMP tmp, TEMP tmp2); + format %{ "pshufd $tmp2,$src2,0xE\n\t" + "vpaddd $tmp,$src2,$tmp2\n\t" + "pshufd $tmp2,$tmp,0x1\n\t" + "vpaddd $tmp,$tmp,$tmp2\n\t" "movd $tmp2,$src1\n\t" - "vpaddd $tmp2,$tmp2,$tmp\n\t" + "vpaddd $tmp2,$tmp,$tmp2\n\t" "movd $dst,$tmp2\t! add reduction4I" %} ins_encode %{ - __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, false); - __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, false); + int vector_len = 0; + __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE); + __ vpaddd($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len); + __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1); + __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len); __ movdl($tmp2$$XMMRegister, $src1$$Register); - __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, false); + __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len); __ movdl($dst$$Register, $tmp2$$XMMRegister); %} ins_pipe( pipe_slow ); %} instruct rvadd8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{ - predicate(UseAVX > 0); + predicate(UseAVX > 0 && UseAVX < 3); match(Set dst (AddReductionVI src1 src2)); effect(TEMP tmp, TEMP tmp2); - format %{ "vphaddd $tmp,$src2,$src2\n\t" - "vphaddd $tmp,$tmp,$tmp2\n\t" - "vextractf128 $tmp2,$tmp\n\t" + 
format %{ "vphaddd $tmp,$src2,$src2\n\t" + "vphaddd $tmp,$tmp,$tmp2\n\t" + "vextracti128 $tmp2,$tmp\n\t" + "vpaddd $tmp,$tmp,$tmp2\n\t" + "movd $tmp2,$src1\n\t" + "vpaddd $tmp2,$tmp2,$tmp\n\t" + "movd $dst,$tmp2\t! add reduction8I" %} + ins_encode %{ + int vector_len = 1; + __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len); + __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len); + __ vextracti128h($tmp2$$XMMRegister, $tmp$$XMMRegister); + __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0); + __ movdl($tmp2$$XMMRegister, $src1$$Register); + __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0); + __ movdl($dst$$Register, $tmp2$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct rvadd8I_reduction_reg_evex(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{ + predicate(UseAVX > 2); + match(Set dst (AddReductionVI src1 src2)); + effect(TEMP tmp, TEMP tmp2); + format %{ "vextracti128 $tmp,$src2\n\t" + "vpaddd $tmp,$tmp,$src2\n\t" + "pshufd $tmp2,$tmp,0xE\n\t" + "vpaddd $tmp,$tmp,$tmp2\n\t" + "pshufd $tmp2,$tmp,0x1\n\t" "vpaddd $tmp,$tmp,$tmp2\n\t" "movd $tmp2,$src1\n\t" - "vpaddd $tmp2,$tmp2,$tmp\n\t" + "vpaddd $tmp2,$tmp,$tmp2\n\t" "movd $dst,$tmp2\t! 
add reduction8I" %} ins_encode %{ - __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, true); - __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, true); - __ vextractf128h($tmp2$$XMMRegister, $tmp$$XMMRegister); - __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, false); + int vector_len = 0; + __ vextracti128h($tmp$$XMMRegister, $src2$$XMMRegister); + __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len); + __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE); + __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len); + __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1); + __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len); + __ movdl($tmp2$$XMMRegister, $src1$$Register); + __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len); + __ movdl($dst$$Register, $tmp2$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct rvadd16I_reduction_reg_evex(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, regF tmp3) %{ + predicate(UseAVX > 2); + match(Set dst (AddReductionVI src1 src2)); + effect(TEMP tmp, TEMP tmp2, TEMP tmp3); + format %{ "vextracti64x4 $tmp3,$src2\n\t" + "vpaddd $tmp3,$tmp3,$src2\n\t" + "vextracti128 $tmp,$tmp3\n\t" + "vpaddd $tmp,$tmp,$tmp3\n\t" + "pshufd $tmp2,$tmp,0xE\n\t" + "vpaddd $tmp,$tmp,$tmp2\n\t" + "pshufd $tmp2,$tmp,0x1\n\t" + "vpaddd $tmp,$tmp,$tmp2\n\t" + "movd $tmp2,$src1\n\t" + "vpaddd $tmp2,$tmp,$tmp2\n\t" + "movd $dst,$tmp2\t! 
mul reduction16I" %} + ins_encode %{ + __ vextracti64x4h($tmp3$$XMMRegister, $src2$$XMMRegister); + __ vpaddd($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1); + __ vextracti128h($tmp$$XMMRegister, $tmp3$$XMMRegister); + __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0); + __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE); + __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0); + __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1); + __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0); __ movdl($tmp2$$XMMRegister, $src1$$Register); - __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, false); + __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0); __ movdl($dst$$Register, $tmp2$$XMMRegister); %} ins_pipe( pipe_slow ); %} +#ifdef _LP64 +instruct rvadd2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, regF tmp, regF tmp2) %{ + predicate(UseAVX > 2); + match(Set dst (AddReductionVL src1 src2)); + effect(TEMP tmp, TEMP tmp2); + format %{ "pshufd $tmp2,$src2,0xE\n\t" + "vpaddq $tmp,$src2,$tmp2\n\t" + "movdq $tmp2,$src1\n\t" + "vpaddq $tmp2,$tmp,$tmp2\n\t" + "movdq $dst,$tmp2\t! add reduction2L" %} + ins_encode %{ + __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE); + __ vpaddq($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, 0); + __ movdq($tmp2$$XMMRegister, $src1$$Register); + __ vpaddq($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0); + __ movdq($dst$$Register, $tmp2$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct rvadd4L_reduction_reg(rRegL dst, rRegL src1, vecY src2, regF tmp, regF tmp2) %{ + predicate(UseAVX > 2); + match(Set dst (AddReductionVL src1 src2)); + effect(TEMP tmp, TEMP tmp2); + format %{ "vextracti64x2 $tmp,$src2, 0x1\n\t" + "vpaddq $tmp2,$tmp,$src2\n\t" + "pshufd $tmp,$tmp2,0xE\n\t" + "vpaddq $tmp2,$tmp2,$tmp\n\t" + "movdq $tmp,$src1\n\t" + "vpaddq $tmp2,$tmp2,$tmp\n\t" + "movdq $dst,$tmp2\t! 
add reduction4L" %} + ins_encode %{ + __ vextracti64x2h($tmp$$XMMRegister, $src2$$XMMRegister, 0x1); + __ vpaddq($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0); + __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE); + __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0); + __ movdq($tmp$$XMMRegister, $src1$$Register); + __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0); + __ movdq($dst$$Register, $tmp2$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct rvadd8L_reduction_reg(rRegL dst, rRegL src1, vecZ src2, regF tmp, regF tmp2) %{ + predicate(UseAVX > 2); + match(Set dst (AddReductionVL src1 src2)); + effect(TEMP tmp, TEMP tmp2); + format %{ "vextracti64x4 $tmp2,$src2\n\t" + "vpaddq $tmp2,$tmp2,$src2\n\t" + "vextracti128 $tmp,$tmp2\n\t" + "vpaddq $tmp2,$tmp2,$tmp\n\t" + "pshufd $tmp,$tmp2,0xE\n\t" + "vpaddq $tmp2,$tmp2,$tmp\n\t" + "movdq $tmp,$src1\n\t" + "vpaddq $tmp2,$tmp2,$tmp\n\t" + "movdq $dst,$tmp2\t! add reduction8L" %} + ins_encode %{ + __ vextracti64x4h($tmp2$$XMMRegister, $src2$$XMMRegister); + __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1); + __ vextracti128h($tmp$$XMMRegister, $tmp2$$XMMRegister); + __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0); + __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE); + __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0); + __ movdq($tmp$$XMMRegister, $src1$$Register); + __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0); + __ movdq($dst$$Register, $tmp2$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} +#endif + instruct rsadd2F_reduction_reg(regF dst, regF src1, vecD src2, regF tmp, regF tmp2) %{ predicate(UseSSE >= 1 && UseAVX == 0); match(Set dst (AddReductionVF src1 src2)); @@ -2772,6 +4292,77 @@ ins_pipe( pipe_slow ); %} +instruct radd16F_reduction_reg(regF dst, regF src1, vecZ src2, regF tmp, regF tmp2, regF tmp3) %{ + predicate(UseAVX > 2); + match(Set 
dst (AddReductionVF src1 src2)); + effect(TEMP tmp, TEMP tmp2, TEMP tmp3); + format %{ "vaddss $tmp2,$src1,$src2\n\t" + "pshufd $tmp,$src2,0x01\n\t" + "vaddss $tmp2,$tmp2,$tmp\n\t" + "pshufd $tmp,$src2,0x02\n\t" + "vaddss $tmp2,$tmp2,$tmp\n\t" + "pshufd $tmp,$src2,0x03\n\t" + "vaddss $tmp2,$tmp2,$tmp\n\t" + "vextractf64x2 $tmp3,$src2, 0x1\n\t" + "vaddss $tmp2,$tmp2,$tmp3\n\t" + "pshufd $tmp,$tmp3,0x01\n\t" + "vaddss $tmp2,$tmp2,$tmp\n\t" + "pshufd $tmp,$tmp3,0x02\n\t" + "vaddss $tmp2,$tmp2,$tmp\n\t" + "pshufd $tmp,$tmp3,0x03\n\t" + "vaddss $tmp2,$tmp2,$tmp\n\t" + "vextractf64x2 $tmp3,$src2, 0x2\n\t" + "vaddss $tmp2,$tmp2,$tmp3\n\t" + "pshufd $tmp,$tmp3,0x01\n\t" + "vaddss $tmp2,$tmp2,$tmp\n\t" + "pshufd $tmp,$tmp3,0x02\n\t" + "vaddss $tmp2,$tmp2,$tmp\n\t" + "pshufd $tmp,$tmp3,0x03\n\t" + "vaddss $tmp2,$tmp2,$tmp\n\t" + "vextractf64x2 $tmp3,$src2, 0x3\n\t" + "vaddss $tmp2,$tmp2,$tmp3\n\t" + "pshufd $tmp,$tmp3,0x01\n\t" + "vaddss $tmp2,$tmp2,$tmp\n\t" + "pshufd $tmp,$tmp3,0x02\n\t" + "vaddss $tmp2,$tmp2,$tmp\n\t" + "pshufd $tmp,$tmp3,0x03\n\t" + "vaddss $dst,$tmp2,$tmp\t! 
add reduction16F" %} + ins_encode %{ + __ vaddss($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister); + __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01); + __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); + __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02); + __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); + __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03); + __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); + __ vextractf32x4h($tmp3$$XMMRegister, $src2$$XMMRegister, 0x1); + __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister); + __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x01); + __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); + __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x02); + __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); + __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x03); + __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); + __ vextractf32x4h($tmp3$$XMMRegister, $src2$$XMMRegister, 0x2); + __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister); + __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x01); + __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); + __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x02); + __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); + __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x03); + __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); + __ vextractf32x4h($tmp3$$XMMRegister, $src2$$XMMRegister, 0x3); + __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister); + __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x01); + __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); + __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x02); + __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); + __ pshufd($tmp$$XMMRegister, 
$tmp3$$XMMRegister, 0x03); + __ vaddss($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + instruct rsadd2D_reduction_reg(regD dst, regD src1, vecX src2, regD tmp) %{ predicate(UseSSE >= 1 && UseAVX == 0); match(Set dst (AddReductionVD src1 src2)); @@ -2819,7 +4410,46 @@ __ vaddsd($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister); __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE); __ vaddsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); - __ vextractf128h($tmp3$$XMMRegister, $src2$$XMMRegister); + __ vextractf128h($tmp3$$XMMRegister, $src2$$XMMRegister); + __ vaddsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister); + __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0xE); + __ vaddsd($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct rvadd8D_reduction_reg(regD dst, regD src1, vecZ src2, regD tmp, regD tmp2, regD tmp3) %{ + predicate(UseAVX > 2); + match(Set dst (AddReductionVD src1 src2)); + effect(TEMP tmp, TEMP tmp2, TEMP tmp3); + format %{ "vaddsd $tmp2,$src1,$src2\n\t" + "pshufd $tmp,$src2,0xE\n\t" + "vaddsd $tmp2,$tmp2,$tmp\n\t" + "vextractf64x2 $tmp3,$src2, 0x1\n\t" + "vaddsd $tmp2,$tmp2,$tmp3\n\t" + "pshufd $tmp,$tmp3,0xE\n\t" + "vaddsd $tmp2,$tmp2,$tmp\n\t" + "vextractf64x2 $tmp3,$src2, 0x2\n\t" + "vaddsd $tmp2,$tmp2,$tmp3\n\t" + "pshufd $tmp,$tmp3,0xE\n\t" + "vaddsd $tmp2,$tmp2,$tmp\n\t" + "vextractf64x2 $tmp3,$src2, 0x3\n\t" + "vaddsd $tmp2,$tmp2,$tmp3\n\t" + "pshufd $tmp,$tmp3,0xE\n\t" + "vaddsd $dst,$tmp2,$tmp\t! 
add reduction8D" %} + ins_encode %{ + __ vaddsd($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister); + __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE); + __ vaddsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); + __ vextractf64x2h($tmp3$$XMMRegister, $src2$$XMMRegister, 0x1); + __ vaddsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister); + __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0xE); + __ vaddsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); + __ vextractf64x2h($tmp3$$XMMRegister, $src2$$XMMRegister, 0x2); + __ vaddsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister); + __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0xE); + __ vaddsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); + __ vextractf64x2h($tmp3$$XMMRegister, $src2$$XMMRegister, 0x3); __ vaddsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister); __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0xE); __ vaddsd($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); @@ -2850,16 +4480,17 @@ predicate(UseAVX > 0); match(Set dst (MulReductionVI src1 src2)); effect(TEMP tmp, TEMP tmp2); - format %{ "pshufd $tmp2,$src2,0x1\n\t" - "vpmulld $tmp,$src2,$tmp2\n\t" - "movd $tmp2,$src1\n\t" - "vpmulld $tmp2,$tmp,$tmp2\n\t" - "movd $dst,$tmp2\t! mul reduction2I" %} + format %{ "pshufd $tmp2,$src2,0x1\n\t" + "vpmulld $tmp,$src2,$tmp2\n\t" + "movd $tmp2,$src1\n\t" + "vpmulld $tmp2,$tmp,$tmp2\n\t" + "movd $dst,$tmp2\t! 
mul reduction2I" %} ins_encode %{ + int vector_len = 0; __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1); - __ vpmulld($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, false); + __ vpmulld($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len); __ movdl($tmp2$$XMMRegister, $src1$$Register); - __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, false); + __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len); __ movdl($dst$$Register, $tmp2$$XMMRegister); %} ins_pipe( pipe_slow ); @@ -2892,20 +4523,21 @@ predicate(UseAVX > 0); match(Set dst (MulReductionVI src1 src2)); effect(TEMP tmp, TEMP tmp2); - format %{ "pshufd $tmp2,$src2,0xE\n\t" - "vpmulld $tmp,$src2,$tmp2\n\t" - "pshufd $tmp2,$tmp,0x1\n\t" - "vpmulld $tmp,$tmp,$tmp2\n\t" - "movd $tmp2,$src1\n\t" - "vpmulld $tmp2,$tmp,$tmp2\n\t" - "movd $dst,$tmp2\t! mul reduction4I" %} + format %{ "pshufd $tmp2,$src2,0xE\n\t" + "vpmulld $tmp,$src2,$tmp2\n\t" + "pshufd $tmp2,$tmp,0x1\n\t" + "vpmulld $tmp,$tmp,$tmp2\n\t" + "movd $tmp2,$src1\n\t" + "vpmulld $tmp2,$tmp,$tmp2\n\t" + "movd $dst,$tmp2\t! 
mul reduction4I" %} ins_encode %{ + int vector_len = 0; __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE); - __ vpmulld($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, false); + __ vpmulld($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len); __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1); - __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, false); + __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len); __ movdl($tmp2$$XMMRegister, $src1$$Register); - __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, false); + __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len); __ movdl($dst$$Register, $tmp2$$XMMRegister); %} ins_pipe( pipe_slow ); @@ -2915,30 +4547,133 @@ predicate(UseAVX > 0); match(Set dst (MulReductionVI src1 src2)); effect(TEMP tmp, TEMP tmp2); - format %{ "vextractf128 $tmp,$src2\n\t" - "vpmulld $tmp,$tmp,$src2\n\t" - "pshufd $tmp2,$tmp,0xE\n\t" - "vpmulld $tmp,$tmp,$tmp2\n\t" - "pshufd $tmp2,$tmp,0x1\n\t" - "vpmulld $tmp,$tmp,$tmp2\n\t" - "movd $tmp2,$src1\n\t" - "vpmulld $tmp2,$tmp,$tmp2\n\t" - "movd $dst,$tmp2\t! mul reduction8I" %} - ins_encode %{ - __ vextractf128h($tmp$$XMMRegister, $src2$$XMMRegister); - __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, false); + format %{ "vextracti128 $tmp,$src2\n\t" + "vpmulld $tmp,$tmp,$src2\n\t" + "pshufd $tmp2,$tmp,0xE\n\t" + "vpmulld $tmp,$tmp,$tmp2\n\t" + "pshufd $tmp2,$tmp,0x1\n\t" + "vpmulld $tmp,$tmp,$tmp2\n\t" + "movd $tmp2,$src1\n\t" + "vpmulld $tmp2,$tmp,$tmp2\n\t" + "movd $dst,$tmp2\t! 
mul reduction8I" %} + ins_encode %{ + int vector_len = 0; + __ vextracti128h($tmp$$XMMRegister, $src2$$XMMRegister); + __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len); + __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE); + __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len); + __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1); + __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len); + __ movdl($tmp2$$XMMRegister, $src1$$Register); + __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len); + __ movdl($dst$$Register, $tmp2$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct rvmul16I_reduction_reg(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, regF tmp3) %{ + predicate(UseAVX > 2); + match(Set dst (MulReductionVI src1 src2)); + effect(TEMP tmp, TEMP tmp2, TEMP tmp3); + format %{ "vextracti64x4 $tmp3,$src2\n\t" + "vpmulld $tmp3,$tmp3,$src2\n\t" + "vextracti128 $tmp,$tmp3\n\t" + "vpmulld $tmp,$tmp,$src2\n\t" + "pshufd $tmp2,$tmp,0xE\n\t" + "vpmulld $tmp,$tmp,$tmp2\n\t" + "pshufd $tmp2,$tmp,0x1\n\t" + "vpmulld $tmp,$tmp,$tmp2\n\t" + "movd $tmp2,$src1\n\t" + "vpmulld $tmp2,$tmp,$tmp2\n\t" + "movd $dst,$tmp2\t! 
mul reduction16I" %} + ins_encode %{ + __ vextracti64x4h($tmp3$$XMMRegister, $src2$$XMMRegister); + __ vpmulld($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1); + __ vextracti128h($tmp$$XMMRegister, $tmp3$$XMMRegister); + __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0); __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE); - __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, false); + __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0); __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1); - __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, false); + __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0); __ movdl($tmp2$$XMMRegister, $src1$$Register); - __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, false); + __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0); __ movdl($dst$$Register, $tmp2$$XMMRegister); %} ins_pipe( pipe_slow ); %} -instruct rsmul2F_reduction_reg(regF dst, regF src1, vecD src2, regF tmp, regF tmp2) %{ +#ifdef _LP64 +instruct rvmul2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, regF tmp, regF tmp2) %{ + predicate(UseAVX > 2 && VM_Version::supports_avx512dq()); + match(Set dst (MulReductionVL src1 src2)); + effect(TEMP tmp, TEMP tmp2); + format %{ "pshufd $tmp2,$src2,0xE\n\t" + "vpmullq $tmp,$src2,$tmp2\n\t" + "movdq $tmp2,$src1\n\t" + "vpmullq $tmp2,$tmp,$tmp2\n\t" + "movdq $dst,$tmp2\t! 
mul reduction2L" %} + ins_encode %{ + __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE); + __ vpmullq($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, 0); + __ movdq($tmp2$$XMMRegister, $src1$$Register); + __ vpmullq($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0); + __ movdq($dst$$Register, $tmp2$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct rvmul4L_reduction_reg(rRegL dst, rRegL src1, vecY src2, regF tmp, regF tmp2) %{ + predicate(UseAVX > 2 && VM_Version::supports_avx512dq()); + match(Set dst (MulReductionVL src1 src2)); + effect(TEMP tmp, TEMP tmp2); + format %{ "vextracti64x2 $tmp,$src2, 0x1\n\t" + "vpmullq $tmp2,$tmp,$src2\n\t" + "pshufd $tmp,$tmp2,0xE\n\t" + "vpmullq $tmp2,$tmp2,$tmp\n\t" + "movdq $tmp,$src1\n\t" + "vpmullq $tmp2,$tmp2,$tmp\n\t" + "movdq $dst,$tmp2\t! mul reduction4L" %} + ins_encode %{ + __ vextracti64x2h($tmp$$XMMRegister, $src2$$XMMRegister, 0x1); + __ vpmullq($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0); + __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE); + __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0); + __ movdq($tmp$$XMMRegister, $src1$$Register); + __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0); + __ movdq($dst$$Register, $tmp2$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct rvmul8L_reduction_reg(rRegL dst, rRegL src1, vecZ src2, regF tmp, regF tmp2) %{ + predicate(UseAVX > 2 && VM_Version::supports_avx512dq()); + match(Set dst (MulReductionVL src1 src2)); + effect(TEMP tmp, TEMP tmp2); + format %{ "vextracti64x4 $tmp2,$src2\n\t" + "vpmullq $tmp2,$tmp2,$src2\n\t" + "vextracti128 $tmp,$tmp2\n\t" + "vpmullq $tmp2,$tmp2,$tmp\n\t" + "pshufd $tmp,$tmp2,0xE\n\t" + "vpmullq $tmp2,$tmp2,$tmp\n\t" + "movdq $tmp,$src1\n\t" + "vpmullq $tmp2,$tmp2,$tmp\n\t" + "movdq $dst,$tmp2\t! 
mul reduction8L" %} + ins_encode %{ + __ vextracti64x4h($tmp2$$XMMRegister, $src2$$XMMRegister); + __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1); + __ vextracti128h($tmp$$XMMRegister, $tmp2$$XMMRegister); + __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0); + __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE); + __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0); + __ movdq($tmp$$XMMRegister, $src1$$Register); + __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0); + __ movdq($dst$$Register, $tmp2$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} +#endif + +instruct rsmul2F_reduction(regF dst, regF src1, vecD src2, regF tmp, regF tmp2) %{ predicate(UseSSE >= 1 && UseAVX == 0); match(Set dst (MulReductionVF src1 src2)); effect(TEMP tmp, TEMP tmp2); @@ -2946,7 +4681,7 @@ "mulss $tmp,$src2\n\t" "pshufd $tmp2,$src2,0x01\n\t" "mulss $tmp,$tmp2\n\t" - "movdqu $dst,$tmp\t! add reduction2F" %} + "movdqu $dst,$tmp\t! mul reduction2F" %} ins_encode %{ __ movdqu($tmp$$XMMRegister, $src1$$XMMRegister); __ mulss($tmp$$XMMRegister, $src2$$XMMRegister); @@ -2963,7 +4698,7 @@ effect(TEMP tmp, TEMP tmp2); format %{ "vmulss $tmp2,$src1,$src2\n\t" "pshufd $tmp,$src2,0x01\n\t" - "vmulss $dst,$tmp2,$tmp\t! add reduction2F" %} + "vmulss $dst,$tmp2,$tmp\t! mul reduction2F" %} ins_encode %{ __ vmulss($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister); __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01); @@ -2984,7 +4719,7 @@ "mulss $tmp,$tmp2\n\t" "pshufd $tmp2,$src2,0x03\n\t" "mulss $tmp,$tmp2\n\t" - "movdqu $dst,$tmp\t! add reduction4F" %} + "movdqu $dst,$tmp\t! mul reduction4F" %} ins_encode %{ __ movdqu($tmp$$XMMRegister, $src1$$XMMRegister); __ mulss($tmp$$XMMRegister, $src2$$XMMRegister); @@ -3009,7 +4744,7 @@ "pshufd $tmp,$src2,0x02\n\t" "vmulss $tmp2,$tmp2,$tmp\n\t" "pshufd $tmp,$src2,0x03\n\t" - "vmulss $dst,$tmp2,$tmp\t! add reduction4F" %} + "vmulss $dst,$tmp2,$tmp\t! 
mul reduction4F" %} ins_encode %{ __ vmulss($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister); __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01); @@ -3061,6 +4796,77 @@ ins_pipe( pipe_slow ); %} +instruct rvmul16F_reduction_reg(regF dst, regF src1, vecZ src2, regF tmp, regF tmp2, regF tmp3) %{ + predicate(UseAVX > 2); + match(Set dst (MulReductionVF src1 src2)); + effect(TEMP tmp, TEMP tmp2, TEMP tmp3); + format %{ "vmulss $tmp2,$src1,$src2\n\t" + "pshufd $tmp,$src2,0x01\n\t" + "vmulss $tmp2,$tmp2,$tmp\n\t" + "pshufd $tmp,$src2,0x02\n\t" + "vmulss $tmp2,$tmp2,$tmp\n\t" + "pshufd $tmp,$src2,0x03\n\t" + "vmulss $tmp2,$tmp2,$tmp\n\t" + "vextractf32x4 $tmp3,$src2, 0x1\n\t" + "vmulss $tmp2,$tmp2,$tmp3\n\t" + "pshufd $tmp,$tmp3,0x01\n\t" + "vmulss $tmp2,$tmp2,$tmp\n\t" + "pshufd $tmp,$tmp3,0x02\n\t" + "vmulss $tmp2,$tmp2,$tmp\n\t" + "pshufd $tmp,$tmp3,0x03\n\t" + "vmulss $tmp2,$tmp2,$tmp\n\t" + "vextractf32x4 $tmp3,$src2, 0x2\n\t" + "vmulss $tmp2,$tmp2,$tmp3\n\t" + "pshufd $tmp,$tmp3,0x01\n\t" + "vmulss $tmp2,$tmp2,$tmp\n\t" + "pshufd $tmp,$tmp3,0x02\n\t" + "vmulss $tmp2,$tmp2,$tmp\n\t" + "pshufd $tmp,$tmp3,0x03\n\t" + "vmulss $tmp2,$tmp2,$tmp\n\t" + "vextractf32x4 $tmp3,$src2, 0x3\n\t" + "vmulss $tmp2,$tmp2,$tmp3\n\t" + "pshufd $tmp,$tmp3,0x01\n\t" + "vmulss $tmp2,$tmp2,$tmp\n\t" + "pshufd $tmp,$tmp3,0x02\n\t" + "vmulss $tmp2,$tmp2,$tmp\n\t" + "pshufd $tmp,$tmp3,0x03\n\t" + "vmulss $dst,$tmp2,$tmp\t! 
mul reduction16F" %} + ins_encode %{ + __ vmulss($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister); + __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01); + __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); + __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02); + __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); + __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03); + __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); + __ vextractf32x4h($tmp3$$XMMRegister, $src2$$XMMRegister, 0x1); + __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister); + __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x01); + __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); + __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x02); + __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); + __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x03); + __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); + __ vextractf32x4h($tmp3$$XMMRegister, $src2$$XMMRegister, 0x2); + __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister); + __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x01); + __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); + __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x02); + __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); + __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x03); + __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); + __ vextractf32x4h($tmp3$$XMMRegister, $src2$$XMMRegister, 0x3); + __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister); + __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x01); + __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); + __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x02); + __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); + __ pshufd($tmp$$XMMRegister, 
$tmp3$$XMMRegister, 0x03); + __ vmulss($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + instruct rsmul2D_reduction_reg(regD dst, regD src1, vecX src2, regD tmp) %{ predicate(UseSSE >= 1 && UseAVX == 0); match(Set dst (MulReductionVD src1 src2)); @@ -3068,7 +4874,7 @@ format %{ "movdqu $tmp,$src1\n\t" "mulsd $tmp,$src2\n\t" "pshufd $dst,$src2,0xE\n\t" - "mulsd $dst,$tmp\t! add reduction2D" %} + "mulsd $dst,$tmp\t! mul reduction2D" %} ins_encode %{ __ movdqu($tmp$$XMMRegister, $src1$$XMMRegister); __ mulsd($tmp$$XMMRegister, $src2$$XMMRegister); @@ -3116,6 +4922,45 @@ ins_pipe( pipe_slow ); %} +instruct rvmul8D_reduction_reg(regD dst, regD src1, vecZ src2, regD tmp, regD tmp2, regD tmp3) %{ + predicate(UseAVX > 2); + match(Set dst (MulReductionVD src1 src2)); + effect(TEMP tmp, TEMP tmp2, TEMP tmp3); + format %{ "vmulsd $tmp2,$src1,$src2\n\t" + "pshufd $tmp,$src2,0xE\n\t" + "vmulsd $tmp2,$tmp2,$tmp\n\t" + "vextractf64x2 $tmp3,$src2, 0x1\n\t" + "vmulsd $tmp2,$tmp2,$tmp3\n\t" + "pshufd $tmp,$src2,0xE\n\t" + "vmulsd $tmp2,$tmp2,$tmp\n\t" + "vextractf64x2 $tmp3,$src2, 0x2\n\t" + "vmulsd $tmp2,$tmp2,$tmp3\n\t" + "pshufd $tmp,$tmp3,0xE\n\t" + "vmulsd $tmp2,$tmp2,$tmp\n\t" + "vextractf64x2 $tmp3,$src2, 0x3\n\t" + "vmulsd $tmp2,$tmp2,$tmp3\n\t" + "pshufd $tmp,$tmp3,0xE\n\t" + "vmulsd $dst,$tmp2,$tmp\t! 
mul reduction8D" %} + ins_encode %{ + __ vmulsd($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister); + __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE); + __ vmulsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); + __ vextractf64x2h($tmp3$$XMMRegister, $src2$$XMMRegister, 0x1); + __ vmulsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister); + __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0xE); + __ vmulsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); + __ vextractf64x2h($tmp3$$XMMRegister, $src2$$XMMRegister, 0x2); + __ vmulsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister); + __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0xE); + __ vmulsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); + __ vextractf64x2h($tmp3$$XMMRegister, $src2$$XMMRegister, 0x3); + __ vmulsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister); + __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0xE); + __ vmulsd($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + // ====================VECTOR ARITHMETIC======================================= // --------------------------------- ADD -------------------------------------- @@ -3136,8 +4981,8 @@ match(Set dst (AddVB src1 src2)); format %{ "vpaddb $dst,$src1,$src2\t! add packed4B" %} ins_encode %{ - bool vector256 = false; - __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + int vector_len = 0; + __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -3157,8 +5002,8 @@ match(Set dst (AddVB src1 src2)); format %{ "vpaddb $dst,$src1,$src2\t! 
add packed8B" %} ins_encode %{ - bool vector256 = false; - __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + int vector_len = 0; + __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -3178,8 +5023,8 @@ match(Set dst (AddVB src1 src2)); format %{ "vpaddb $dst,$src1,$src2\t! add packed16B" %} ins_encode %{ - bool vector256 = false; - __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + int vector_len = 0; + __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -3189,8 +5034,8 @@ match(Set dst (AddVB src (LoadVector mem))); format %{ "vpaddb $dst,$src,$mem\t! add packed16B" %} ins_encode %{ - bool vector256 = false; - __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + int vector_len = 0; + __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); %} ins_pipe( pipe_slow ); %} @@ -3200,8 +5045,8 @@ match(Set dst (AddVB src1 src2)); format %{ "vpaddb $dst,$src1,$src2\t! add packed32B" %} ins_encode %{ - bool vector256 = true; - __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + int vector_len = 1; + __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -3211,8 +5056,30 @@ match(Set dst (AddVB src (LoadVector mem))); format %{ "vpaddb $dst,$src,$mem\t! add packed32B" %} ins_encode %{ - bool vector256 = true; - __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + int vector_len = 1; + __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vadd64B_reg(vecZ dst, vecZ src1, vecZ src2) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 64); + match(Set dst (AddVB src1 src2)); + format %{ "vpaddb $dst,$src1,$src2\t! 
add packed64B" %} + ins_encode %{ + int vector_len = 2; + __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vadd64B_mem(vecZ dst, vecZ src, memory mem) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 64); + match(Set dst (AddVB src (LoadVector mem))); + format %{ "vpaddb $dst,$src,$mem\t! add packed64B" %} + ins_encode %{ + int vector_len = 2; + __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); %} ins_pipe( pipe_slow ); %} @@ -3233,8 +5100,8 @@ match(Set dst (AddVS src1 src2)); format %{ "vpaddw $dst,$src1,$src2\t! add packed2S" %} ins_encode %{ - bool vector256 = false; - __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + int vector_len = 0; + __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -3254,8 +5121,8 @@ match(Set dst (AddVS src1 src2)); format %{ "vpaddw $dst,$src1,$src2\t! add packed4S" %} ins_encode %{ - bool vector256 = false; - __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + int vector_len = 0; + __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -3275,8 +5142,8 @@ match(Set dst (AddVS src1 src2)); format %{ "vpaddw $dst,$src1,$src2\t! add packed8S" %} ins_encode %{ - bool vector256 = false; - __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + int vector_len = 0; + __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -3286,8 +5153,8 @@ match(Set dst (AddVS src (LoadVector mem))); format %{ "vpaddw $dst,$src,$mem\t! 
add packed8S" %} ins_encode %{ - bool vector256 = false; - __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + int vector_len = 0; + __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); %} ins_pipe( pipe_slow ); %} @@ -3297,8 +5164,8 @@ match(Set dst (AddVS src1 src2)); format %{ "vpaddw $dst,$src1,$src2\t! add packed16S" %} ins_encode %{ - bool vector256 = true; - __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + int vector_len = 1; + __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -3308,8 +5175,30 @@ match(Set dst (AddVS src (LoadVector mem))); format %{ "vpaddw $dst,$src,$mem\t! add packed16S" %} ins_encode %{ - bool vector256 = true; - __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + int vector_len = 1; + __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vadd32S_reg(vecZ dst, vecZ src1, vecZ src2) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 32); + match(Set dst (AddVS src1 src2)); + format %{ "vpaddw $dst,$src1,$src2\t! add packed32S" %} + ins_encode %{ + int vector_len = 2; + __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vadd32S_mem(vecZ dst, vecZ src, memory mem) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 32); + match(Set dst (AddVS src (LoadVector mem))); + format %{ "vpaddw $dst,$src,$mem\t! add packed32S" %} + ins_encode %{ + int vector_len = 2; + __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); %} ins_pipe( pipe_slow ); %} @@ -3330,8 +5219,8 @@ match(Set dst (AddVI src1 src2)); format %{ "vpaddd $dst,$src1,$src2\t! 
add packed2I" %} ins_encode %{ - bool vector256 = false; - __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + int vector_len = 0; + __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -3351,8 +5240,8 @@ match(Set dst (AddVI src1 src2)); format %{ "vpaddd $dst,$src1,$src2\t! add packed4I" %} ins_encode %{ - bool vector256 = false; - __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + int vector_len = 0; + __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -3362,8 +5251,8 @@ match(Set dst (AddVI src (LoadVector mem))); format %{ "vpaddd $dst,$src,$mem\t! add packed4I" %} ins_encode %{ - bool vector256 = false; - __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + int vector_len = 0; + __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); %} ins_pipe( pipe_slow ); %} @@ -3373,8 +5262,8 @@ match(Set dst (AddVI src1 src2)); format %{ "vpaddd $dst,$src1,$src2\t! add packed8I" %} ins_encode %{ - bool vector256 = true; - __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + int vector_len = 1; + __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -3384,8 +5273,30 @@ match(Set dst (AddVI src (LoadVector mem))); format %{ "vpaddd $dst,$src,$mem\t! add packed8I" %} ins_encode %{ - bool vector256 = true; - __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + int vector_len = 1; + __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vadd16I_reg(vecZ dst, vecZ src1, vecZ src2) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 16); + match(Set dst (AddVI src1 src2)); + format %{ "vpaddd $dst,$src1,$src2\t! 
add packed16I" %} + ins_encode %{ + int vector_len = 2; + __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vadd16I_mem(vecZ dst, vecZ src, memory mem) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 16); + match(Set dst (AddVI src (LoadVector mem))); + format %{ "vpaddd $dst,$src,$mem\t! add packed16I" %} + ins_encode %{ + int vector_len = 2; + __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); %} ins_pipe( pipe_slow ); %} @@ -3406,8 +5317,8 @@ match(Set dst (AddVL src1 src2)); format %{ "vpaddq $dst,$src1,$src2\t! add packed2L" %} ins_encode %{ - bool vector256 = false; - __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + int vector_len = 0; + __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -3417,8 +5328,8 @@ match(Set dst (AddVL src (LoadVector mem))); format %{ "vpaddq $dst,$src,$mem\t! add packed2L" %} ins_encode %{ - bool vector256 = false; - __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + int vector_len = 0; + __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); %} ins_pipe( pipe_slow ); %} @@ -3428,8 +5339,8 @@ match(Set dst (AddVL src1 src2)); format %{ "vpaddq $dst,$src1,$src2\t! add packed4L" %} ins_encode %{ - bool vector256 = true; - __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + int vector_len = 1; + __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -3439,8 +5350,30 @@ match(Set dst (AddVL src (LoadVector mem))); format %{ "vpaddq $dst,$src,$mem\t! 
add packed4L" %} ins_encode %{ - bool vector256 = true; - __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + int vector_len = 1; + __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vadd8L_reg(vecZ dst, vecZ src1, vecZ src2) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 8); + match(Set dst (AddVL src1 src2)); + format %{ "vpaddq $dst,$src1,$src2\t! add packed8L" %} + ins_encode %{ + int vector_len = 2; + __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vadd8L_mem(vecZ dst, vecZ src, memory mem) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 8); + match(Set dst (AddVL src (LoadVector mem))); + format %{ "vpaddq $dst,$src,$mem\t! add packed8L" %} + ins_encode %{ + int vector_len = 2; + __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); %} ins_pipe( pipe_slow ); %} @@ -3461,8 +5394,8 @@ match(Set dst (AddVF src1 src2)); format %{ "vaddps $dst,$src1,$src2\t! add packed2F" %} ins_encode %{ - bool vector256 = false; - __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + int vector_len = 0; + __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -3482,8 +5415,8 @@ match(Set dst (AddVF src1 src2)); format %{ "vaddps $dst,$src1,$src2\t! add packed4F" %} ins_encode %{ - bool vector256 = false; - __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + int vector_len = 0; + __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -3493,8 +5426,8 @@ match(Set dst (AddVF src (LoadVector mem))); format %{ "vaddps $dst,$src,$mem\t! 
add packed4F" %} ins_encode %{ - bool vector256 = false; - __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + int vector_len = 0; + __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); %} ins_pipe( pipe_slow ); %} @@ -3504,8 +5437,8 @@ match(Set dst (AddVF src1 src2)); format %{ "vaddps $dst,$src1,$src2\t! add packed8F" %} ins_encode %{ - bool vector256 = true; - __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + int vector_len = 1; + __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -3515,8 +5448,30 @@ match(Set dst (AddVF src (LoadVector mem))); format %{ "vaddps $dst,$src,$mem\t! add packed8F" %} ins_encode %{ - bool vector256 = true; - __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + int vector_len = 1; + __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vadd16F_reg(vecZ dst, vecZ src1, vecZ src2) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 16); + match(Set dst (AddVF src1 src2)); + format %{ "vaddps $dst,$src1,$src2\t! add packed16F" %} + ins_encode %{ + int vector_len = 2; + __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vadd16F_mem(vecZ dst, vecZ src, memory mem) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 16); + match(Set dst (AddVF src (LoadVector mem))); + format %{ "vaddps $dst,$src,$mem\t! add packed16F" %} + ins_encode %{ + int vector_len = 2; + __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); %} ins_pipe( pipe_slow ); %} @@ -3537,8 +5492,8 @@ match(Set dst (AddVD src1 src2)); format %{ "vaddpd $dst,$src1,$src2\t! 
add packed2D" %} ins_encode %{ - bool vector256 = false; - __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + int vector_len = 0; + __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -3548,8 +5503,8 @@ match(Set dst (AddVD src (LoadVector mem))); format %{ "vaddpd $dst,$src,$mem\t! add packed2D" %} ins_encode %{ - bool vector256 = false; - __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + int vector_len = 0; + __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); %} ins_pipe( pipe_slow ); %} @@ -3559,8 +5514,8 @@ match(Set dst (AddVD src1 src2)); format %{ "vaddpd $dst,$src1,$src2\t! add packed4D" %} ins_encode %{ - bool vector256 = true; - __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + int vector_len = 1; + __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -3570,8 +5525,30 @@ match(Set dst (AddVD src (LoadVector mem))); format %{ "vaddpd $dst,$src,$mem\t! add packed4D" %} ins_encode %{ - bool vector256 = true; - __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + int vector_len = 1; + __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vadd8D_reg(vecZ dst, vecZ src1, vecZ src2) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 8); + match(Set dst (AddVD src1 src2)); + format %{ "vaddpd $dst,$src1,$src2\t! add packed8D" %} + ins_encode %{ + int vector_len = 2; + __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vadd8D_mem(vecZ dst, vecZ src, memory mem) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 8); + match(Set dst (AddVD src (LoadVector mem))); + format %{ "vaddpd $dst,$src,$mem\t! 
add packed8D" %} + ins_encode %{ + int vector_len = 2; + __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); %} ins_pipe( pipe_slow ); %} @@ -3594,8 +5571,8 @@ match(Set dst (SubVB src1 src2)); format %{ "vpsubb $dst,$src1,$src2\t! sub packed4B" %} ins_encode %{ - bool vector256 = false; - __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + int vector_len = 0; + __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -3615,8 +5592,8 @@ match(Set dst (SubVB src1 src2)); format %{ "vpsubb $dst,$src1,$src2\t! sub packed8B" %} ins_encode %{ - bool vector256 = false; - __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + int vector_len = 0; + __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -3636,8 +5613,8 @@ match(Set dst (SubVB src1 src2)); format %{ "vpsubb $dst,$src1,$src2\t! sub packed16B" %} ins_encode %{ - bool vector256 = false; - __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + int vector_len = 0; + __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -3647,8 +5624,8 @@ match(Set dst (SubVB src (LoadVector mem))); format %{ "vpsubb $dst,$src,$mem\t! sub packed16B" %} ins_encode %{ - bool vector256 = false; - __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + int vector_len = 0; + __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); %} ins_pipe( pipe_slow ); %} @@ -3658,8 +5635,8 @@ match(Set dst (SubVB src1 src2)); format %{ "vpsubb $dst,$src1,$src2\t! 
sub packed32B" %} ins_encode %{ - bool vector256 = true; - __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + int vector_len = 1; + __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -3669,8 +5646,30 @@ match(Set dst (SubVB src (LoadVector mem))); format %{ "vpsubb $dst,$src,$mem\t! sub packed32B" %} ins_encode %{ - bool vector256 = true; - __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + int vector_len = 1; + __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsub64B_reg(vecZ dst, vecZ src1, vecZ src2) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 64); + match(Set dst (SubVB src1 src2)); + format %{ "vpsubb $dst,$src1,$src2\t! sub packed64B" %} + ins_encode %{ + int vector_len = 2; + __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsub64B_mem(vecZ dst, vecZ src, memory mem) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 64); + match(Set dst (SubVB src (LoadVector mem))); + format %{ "vpsubb $dst,$src,$mem\t! sub packed64B" %} + ins_encode %{ + int vector_len = 2; + __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); %} ins_pipe( pipe_slow ); %} @@ -3691,8 +5690,8 @@ match(Set dst (SubVS src1 src2)); format %{ "vpsubw $dst,$src1,$src2\t! sub packed2S" %} ins_encode %{ - bool vector256 = false; - __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + int vector_len = 0; + __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -3712,8 +5711,8 @@ match(Set dst (SubVS src1 src2)); format %{ "vpsubw $dst,$src1,$src2\t! 
sub packed4S" %} ins_encode %{ - bool vector256 = false; - __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + int vector_len = 0; + __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -3733,8 +5732,8 @@ match(Set dst (SubVS src1 src2)); format %{ "vpsubw $dst,$src1,$src2\t! sub packed8S" %} ins_encode %{ - bool vector256 = false; - __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + int vector_len = 0; + __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -3744,8 +5743,8 @@ match(Set dst (SubVS src (LoadVector mem))); format %{ "vpsubw $dst,$src,$mem\t! sub packed8S" %} ins_encode %{ - bool vector256 = false; - __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + int vector_len = 0; + __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); %} ins_pipe( pipe_slow ); %} @@ -3755,8 +5754,8 @@ match(Set dst (SubVS src1 src2)); format %{ "vpsubw $dst,$src1,$src2\t! sub packed16S" %} ins_encode %{ - bool vector256 = true; - __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + int vector_len = 1; + __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -3766,8 +5765,30 @@ match(Set dst (SubVS src (LoadVector mem))); format %{ "vpsubw $dst,$src,$mem\t! sub packed16S" %} ins_encode %{ - bool vector256 = true; - __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + int vector_len = 1; + __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsub32S_reg(vecZ dst, vecZ src1, vecZ src2) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 32); + match(Set dst (SubVS src1 src2)); + format %{ "vpsubw $dst,$src1,$src2\t! 
sub packed32S" %} + ins_encode %{ + int vector_len = 2; + __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsub32S_mem(vecZ dst, vecZ src, memory mem) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 32); + match(Set dst (SubVS src (LoadVector mem))); + format %{ "vpsubw $dst,$src,$mem\t! sub packed32S" %} + ins_encode %{ + int vector_len = 2; + __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); %} ins_pipe( pipe_slow ); %} @@ -3788,8 +5809,8 @@ match(Set dst (SubVI src1 src2)); format %{ "vpsubd $dst,$src1,$src2\t! sub packed2I" %} ins_encode %{ - bool vector256 = false; - __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + int vector_len = 0; + __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -3809,8 +5830,8 @@ match(Set dst (SubVI src1 src2)); format %{ "vpsubd $dst,$src1,$src2\t! sub packed4I" %} ins_encode %{ - bool vector256 = false; - __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + int vector_len = 0; + __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -3820,8 +5841,8 @@ match(Set dst (SubVI src (LoadVector mem))); format %{ "vpsubd $dst,$src,$mem\t! sub packed4I" %} ins_encode %{ - bool vector256 = false; - __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + int vector_len = 0; + __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); %} ins_pipe( pipe_slow ); %} @@ -3831,8 +5852,8 @@ match(Set dst (SubVI src1 src2)); format %{ "vpsubd $dst,$src1,$src2\t! 
sub packed8I" %} ins_encode %{ - bool vector256 = true; - __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + int vector_len = 1; + __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -3842,8 +5863,30 @@ match(Set dst (SubVI src (LoadVector mem))); format %{ "vpsubd $dst,$src,$mem\t! sub packed8I" %} ins_encode %{ - bool vector256 = true; - __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + int vector_len = 1; + __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsub16I_reg(vecZ dst, vecZ src1, vecZ src2) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 16); + match(Set dst (SubVI src1 src2)); + format %{ "vpsubd $dst,$src1,$src2\t! sub packed16I" %} + ins_encode %{ + int vector_len = 2; + __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsub16I_mem(vecZ dst, vecZ src, memory mem) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 16); + match(Set dst (SubVI src (LoadVector mem))); + format %{ "vpsubd $dst,$src,$mem\t! sub packed16I" %} + ins_encode %{ + int vector_len = 2; + __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); %} ins_pipe( pipe_slow ); %} @@ -3864,8 +5907,8 @@ match(Set dst (SubVL src1 src2)); format %{ "vpsubq $dst,$src1,$src2\t! sub packed2L" %} ins_encode %{ - bool vector256 = false; - __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + int vector_len = 0; + __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -3875,8 +5918,8 @@ match(Set dst (SubVL src (LoadVector mem))); format %{ "vpsubq $dst,$src,$mem\t! 
sub packed2L" %} ins_encode %{ - bool vector256 = false; - __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + int vector_len = 0; + __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); %} ins_pipe( pipe_slow ); %} @@ -3886,8 +5929,8 @@ match(Set dst (SubVL src1 src2)); format %{ "vpsubq $dst,$src1,$src2\t! sub packed4L" %} ins_encode %{ - bool vector256 = true; - __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + int vector_len = 1; + __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -3895,10 +5938,32 @@ instruct vsub4L_mem(vecY dst, vecY src, memory mem) %{ predicate(UseAVX > 1 && n->as_Vector()->length() == 4); match(Set dst (SubVL src (LoadVector mem))); - format %{ "vpsubq $dst,$src,$mem\t! sub packed4L" %} + format %{ "vpsubq $dst,$src,$mem\t! sub packed4L" %} + ins_encode %{ + int vector_len = 1; + __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsub8L_reg(vecZ dst, vecZ src1, vecZ src2) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 8); + match(Set dst (SubVL src1 src2)); + format %{ "vpsubq $dst,$src1,$src2\t! sub packed8L" %} + ins_encode %{ + int vector_len = 2; + __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsub8L_mem(vecZ dst, vecZ src, memory mem) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 8); + match(Set dst (SubVL src (LoadVector mem))); + format %{ "vpsubq $dst,$src,$mem\t! sub packed8L" %} ins_encode %{ - bool vector256 = true; - __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + int vector_len = 2; + __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); %} ins_pipe( pipe_slow ); %} @@ -3919,8 +5984,8 @@ match(Set dst (SubVF src1 src2)); format %{ "vsubps $dst,$src1,$src2\t! 
sub packed2F" %} ins_encode %{ - bool vector256 = false; - __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + int vector_len = 0; + __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -3940,8 +6005,8 @@ match(Set dst (SubVF src1 src2)); format %{ "vsubps $dst,$src1,$src2\t! sub packed4F" %} ins_encode %{ - bool vector256 = false; - __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + int vector_len = 0; + __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -3951,8 +6016,8 @@ match(Set dst (SubVF src (LoadVector mem))); format %{ "vsubps $dst,$src,$mem\t! sub packed4F" %} ins_encode %{ - bool vector256 = false; - __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + int vector_len = 0; + __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); %} ins_pipe( pipe_slow ); %} @@ -3962,8 +6027,8 @@ match(Set dst (SubVF src1 src2)); format %{ "vsubps $dst,$src1,$src2\t! sub packed8F" %} ins_encode %{ - bool vector256 = true; - __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + int vector_len = 1; + __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -3973,8 +6038,30 @@ match(Set dst (SubVF src (LoadVector mem))); format %{ "vsubps $dst,$src,$mem\t! sub packed8F" %} ins_encode %{ - bool vector256 = true; - __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + int vector_len = 1; + __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsub16F_reg(vecZ dst, vecZ src1, vecZ src2) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 16); + match(Set dst (SubVF src1 src2)); + format %{ "vsubps $dst,$src1,$src2\t! 
sub packed16F" %} + ins_encode %{ + int vector_len = 2; + __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsub16F_mem(vecZ dst, vecZ src, memory mem) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 16); + match(Set dst (SubVF src (LoadVector mem))); + format %{ "vsubps $dst,$src,$mem\t! sub packed16F" %} + ins_encode %{ + int vector_len = 2; + __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); %} ins_pipe( pipe_slow ); %} @@ -3995,8 +6082,8 @@ match(Set dst (SubVD src1 src2)); format %{ "vsubpd $dst,$src1,$src2\t! sub packed2D" %} ins_encode %{ - bool vector256 = false; - __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + int vector_len = 0; + __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -4006,8 +6093,8 @@ match(Set dst (SubVD src (LoadVector mem))); format %{ "vsubpd $dst,$src,$mem\t! sub packed2D" %} ins_encode %{ - bool vector256 = false; - __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + int vector_len = 0; + __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); %} ins_pipe( pipe_slow ); %} @@ -4017,8 +6104,8 @@ match(Set dst (SubVD src1 src2)); format %{ "vsubpd $dst,$src1,$src2\t! sub packed4D" %} ins_encode %{ - bool vector256 = true; - __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + int vector_len = 1; + __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -4028,8 +6115,30 @@ match(Set dst (SubVD src (LoadVector mem))); format %{ "vsubpd $dst,$src,$mem\t! 
sub packed4D" %} ins_encode %{ - bool vector256 = true; - __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + int vector_len = 1; + __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsub8D_reg(vecZ dst, vecZ src1, vecZ src2) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 8); + match(Set dst (SubVD src1 src2)); + format %{ "vsubpd $dst,$src1,$src2\t! sub packed8D" %} + ins_encode %{ + int vector_len = 2; + __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsub8D_mem(vecZ dst, vecZ src, memory mem) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 8); + match(Set dst (SubVD src (LoadVector mem))); + format %{ "vsubpd $dst,$src,$mem\t! sub packed8D" %} + ins_encode %{ + int vector_len = 2; + __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); %} ins_pipe( pipe_slow ); %} @@ -4052,8 +6161,8 @@ match(Set dst (MulVS src1 src2)); format %{ "vpmullw $dst,$src1,$src2\t! mul packed2S" %} ins_encode %{ - bool vector256 = false; - __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + int vector_len = 0; + __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -4073,8 +6182,8 @@ match(Set dst (MulVS src1 src2)); format %{ "vpmullw $dst,$src1,$src2\t! mul packed4S" %} ins_encode %{ - bool vector256 = false; - __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + int vector_len = 0; + __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -4094,8 +6203,8 @@ match(Set dst (MulVS src1 src2)); format %{ "vpmullw $dst,$src1,$src2\t! 
mul packed8S" %} ins_encode %{ - bool vector256 = false; - __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + int vector_len = 0; + __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -4105,8 +6214,8 @@ match(Set dst (MulVS src (LoadVector mem))); format %{ "vpmullw $dst,$src,$mem\t! mul packed8S" %} ins_encode %{ - bool vector256 = false; - __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + int vector_len = 0; + __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); %} ins_pipe( pipe_slow ); %} @@ -4116,8 +6225,8 @@ match(Set dst (MulVS src1 src2)); format %{ "vpmullw $dst,$src1,$src2\t! mul packed16S" %} ins_encode %{ - bool vector256 = true; - __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + int vector_len = 1; + __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -4127,8 +6236,30 @@ match(Set dst (MulVS src (LoadVector mem))); format %{ "vpmullw $dst,$src,$mem\t! mul packed16S" %} ins_encode %{ - bool vector256 = true; - __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + int vector_len = 1; + __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vmul32S_reg(vecZ dst, vecZ src1, vecZ src2) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 32); + match(Set dst (MulVS src1 src2)); + format %{ "vpmullw $dst,$src1,$src2\t! mul packed32S" %} + ins_encode %{ + int vector_len = 2; + __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vmul32S_mem(vecZ dst, vecZ src, memory mem) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 32); + match(Set dst (MulVS src (LoadVector mem))); + format %{ "vpmullw $dst,$src,$mem\t! 
mul packed32S" %} + ins_encode %{ + int vector_len = 2; + __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); %} ins_pipe( pipe_slow ); %} @@ -4149,8 +6280,19 @@ match(Set dst (MulVI src1 src2)); format %{ "vpmulld $dst,$src1,$src2\t! mul packed2I" %} ins_encode %{ - bool vector256 = false; - __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + int vector_len = 0; + __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vmul2L_reg(vecX dst, vecX src1, vecX src2) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 2 && VM_Version::supports_avx512dq()); + match(Set dst (MulVL src1 src2)); + format %{ "vpmullq $dst,$src1,$src2\t! mul packed2L" %} + ins_encode %{ + int vector_len = 0; + __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -4170,8 +6312,8 @@ match(Set dst (MulVI src1 src2)); format %{ "vpmulld $dst,$src1,$src2\t! mul packed4I" %} ins_encode %{ - bool vector256 = false; - __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + int vector_len = 0; + __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -4181,8 +6323,30 @@ match(Set dst (MulVI src (LoadVector mem))); format %{ "vpmulld $dst,$src,$mem\t! mul packed4I" %} ins_encode %{ - bool vector256 = false; - __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + int vector_len = 0; + __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vmul4L_reg(vecY dst, vecY src1, vecY src2) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 4 && VM_Version::supports_avx512dq()); + match(Set dst (MulVL src1 src2)); + format %{ "vpmullq $dst,$src1,$src2\t! 
mul packed4L" %} + ins_encode %{ + int vector_len = 1; + __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vmul4L_mem(vecY dst, vecY src, memory mem) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 4 && VM_Version::supports_avx512dq()); + match(Set dst (MulVL src (LoadVector mem))); + format %{ "vpmullq $dst,$src,$mem\t! mul packed4L" %} + ins_encode %{ + int vector_len = 1; + __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); %} ins_pipe( pipe_slow ); %} @@ -4192,8 +6356,30 @@ match(Set dst (MulVI src1 src2)); format %{ "vpmulld $dst,$src1,$src2\t! mul packed8I" %} ins_encode %{ - bool vector256 = true; - __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + int vector_len = 1; + __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vmul8L_reg(vecZ dst, vecZ src1, vecZ src2) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && VM_Version::supports_avx512dq()); + match(Set dst (MulVL src1 src2)); + format %{ "vpmullq $dst,$src1,$src2\t! mul packed8L" %} + ins_encode %{ + int vector_len = 2; + __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vmul16I_reg(vecZ dst, vecZ src1, vecZ src2) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 16); + match(Set dst (MulVI src1 src2)); + format %{ "vpmulld $dst,$src1,$src2\t! mul packed16I" %} + ins_encode %{ + int vector_len = 2; + __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -4203,8 +6389,30 @@ match(Set dst (MulVI src (LoadVector mem))); format %{ "vpmulld $dst,$src,$mem\t! 
mul packed8I" %} ins_encode %{ - bool vector256 = true; - __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + int vector_len = 1; + __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vmul8L_mem(vecZ dst, vecZ src, memory mem) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && VM_Version::supports_avx512dq()); + match(Set dst (MulVL src (LoadVector mem))); + format %{ "vpmullq $dst,$src,$mem\t! mul packed8L" %} + ins_encode %{ + int vector_len = 2; + __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vmul16I_mem(vecZ dst, vecZ src, memory mem) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 16); + match(Set dst (MulVI src (LoadVector mem))); + format %{ "vpmulld $dst,$src,$mem\t! mul packed16I" %} + ins_encode %{ + int vector_len = 2; + __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); %} ins_pipe( pipe_slow ); %} @@ -4225,8 +6433,8 @@ match(Set dst (MulVF src1 src2)); format %{ "vmulps $dst,$src1,$src2\t! mul packed2F" %} ins_encode %{ - bool vector256 = false; - __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + int vector_len = 0; + __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -4246,8 +6454,8 @@ match(Set dst (MulVF src1 src2)); format %{ "vmulps $dst,$src1,$src2\t! mul packed4F" %} ins_encode %{ - bool vector256 = false; - __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + int vector_len = 0; + __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -4257,8 +6465,8 @@ match(Set dst (MulVF src (LoadVector mem))); format %{ "vmulps $dst,$src,$mem\t! 
mul packed4F" %} ins_encode %{ - bool vector256 = false; - __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + int vector_len = 0; + __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); %} ins_pipe( pipe_slow ); %} @@ -4268,8 +6476,8 @@ match(Set dst (MulVF src1 src2)); format %{ "vmulps $dst,$src1,$src2\t! mul packed8F" %} ins_encode %{ - bool vector256 = true; - __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + int vector_len = 1; + __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -4279,8 +6487,30 @@ match(Set dst (MulVF src (LoadVector mem))); format %{ "vmulps $dst,$src,$mem\t! mul packed8F" %} ins_encode %{ - bool vector256 = true; - __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + int vector_len = 1; + __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vmul16F_reg(vecZ dst, vecZ src1, vecZ src2) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 16); + match(Set dst (MulVF src1 src2)); + format %{ "vmulps $dst,$src1,$src2\t! mul packed16F" %} + ins_encode %{ + int vector_len = 2; + __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vmul16F_mem(vecZ dst, vecZ src, memory mem) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 16); + match(Set dst (MulVF src (LoadVector mem))); + format %{ "vmulps $dst,$src,$mem\t! mul packed16F" %} + ins_encode %{ + int vector_len = 2; + __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); %} ins_pipe( pipe_slow ); %} @@ -4301,8 +6531,8 @@ match(Set dst (MulVD src1 src2)); format %{ "vmulpd $dst,$src1,$src2\t! 
mul packed2D" %} ins_encode %{ - bool vector256 = false; - __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + int vector_len = 0; + __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -4312,8 +6542,8 @@ match(Set dst (MulVD src (LoadVector mem))); format %{ "vmulpd $dst,$src,$mem\t! mul packed2D" %} ins_encode %{ - bool vector256 = false; - __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + int vector_len = 0; + __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); %} ins_pipe( pipe_slow ); %} @@ -4323,8 +6553,8 @@ match(Set dst (MulVD src1 src2)); format %{ "vmulpd $dst,$src1,$src2\t! mul packed4D" %} ins_encode %{ - bool vector256 = true; - __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + int vector_len = 1; + __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -4334,8 +6564,30 @@ match(Set dst (MulVD src (LoadVector mem))); format %{ "vmulpd $dst,$src,$mem\t! mul packed4D" %} ins_encode %{ - bool vector256 = true; - __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + int vector_len = 1; + __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vmul8D_reg(vecZ dst, vecZ src1, vecZ src2) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 8); + match(Set dst (MulVD src1 src2)); + format %{ "vmulpd $dst k0,$src1,$src2\t! mul packed8D" %} + ins_encode %{ + int vector_len = 2; + __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vmul8D_mem(vecZ dst, vecZ src, memory mem) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 8); + match(Set dst (MulVD src (LoadVector mem))); + format %{ "vmulpd $dst k0,$src,$mem\t! 
mul packed8D" %} + ins_encode %{ + int vector_len = 2; + __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); %} ins_pipe( pipe_slow ); %} @@ -4358,8 +6610,8 @@ match(Set dst (DivVF src1 src2)); format %{ "vdivps $dst,$src1,$src2\t! div packed2F" %} ins_encode %{ - bool vector256 = false; - __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + int vector_len = 0; + __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -4379,8 +6631,8 @@ match(Set dst (DivVF src1 src2)); format %{ "vdivps $dst,$src1,$src2\t! div packed4F" %} ins_encode %{ - bool vector256 = false; - __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + int vector_len = 0; + __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -4390,8 +6642,8 @@ match(Set dst (DivVF src (LoadVector mem))); format %{ "vdivps $dst,$src,$mem\t! div packed4F" %} ins_encode %{ - bool vector256 = false; - __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + int vector_len = 0; + __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); %} ins_pipe( pipe_slow ); %} @@ -4401,8 +6653,8 @@ match(Set dst (DivVF src1 src2)); format %{ "vdivps $dst,$src1,$src2\t! div packed8F" %} ins_encode %{ - bool vector256 = true; - __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + int vector_len = 1; + __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -4412,8 +6664,30 @@ match(Set dst (DivVF src (LoadVector mem))); format %{ "vdivps $dst,$src,$mem\t! 
div packed8F" %} ins_encode %{ - bool vector256 = true; - __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + int vector_len = 1; + __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vdiv16F_reg(vecZ dst, vecZ src1, vecZ src2) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 16); + match(Set dst (DivVF src1 src2)); + format %{ "vdivps $dst,$src1,$src2\t! div packed16F" %} + ins_encode %{ + int vector_len = 2; + __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vdiv16F_mem(vecZ dst, vecZ src, memory mem) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 16); + match(Set dst (DivVF src (LoadVector mem))); + format %{ "vdivps $dst,$src,$mem\t! div packed16F" %} + ins_encode %{ + int vector_len = 2; + __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); %} ins_pipe( pipe_slow ); %} @@ -4434,8 +6708,8 @@ match(Set dst (DivVD src1 src2)); format %{ "vdivpd $dst,$src1,$src2\t! div packed2D" %} ins_encode %{ - bool vector256 = false; - __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + int vector_len = 0; + __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -4445,8 +6719,8 @@ match(Set dst (DivVD src (LoadVector mem))); format %{ "vdivpd $dst,$src,$mem\t! div packed2D" %} ins_encode %{ - bool vector256 = false; - __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + int vector_len = 0; + __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); %} ins_pipe( pipe_slow ); %} @@ -4456,8 +6730,8 @@ match(Set dst (DivVD src1 src2)); format %{ "vdivpd $dst,$src1,$src2\t! 
div packed4D" %} ins_encode %{ - bool vector256 = true; - __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + int vector_len = 1; + __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -4467,8 +6741,30 @@ match(Set dst (DivVD src (LoadVector mem))); format %{ "vdivpd $dst,$src,$mem\t! div packed4D" %} ins_encode %{ - bool vector256 = true; - __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + int vector_len = 1; + __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vdiv8D_reg(vecZ dst, vecZ src1, vecZ src2) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 8); + match(Set dst (DivVD src1 src2)); + format %{ "vdivpd $dst,$src1,$src2\t! div packed8D" %} + ins_encode %{ + int vector_len = 2; + __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vdiv8D_mem(vecZ dst, vecZ src, memory mem) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 8); + match(Set dst (DivVD src (LoadVector mem))); + format %{ "vdivpd $dst,$src,$mem\t! div packed8D" %} + ins_encode %{ + int vector_len = 2; + __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); %} ins_pipe( pipe_slow ); %} @@ -4515,8 +6811,8 @@ match(Set dst (LShiftVS src shift)); format %{ "vpsllw $dst,$src,$shift\t! left shift packed2S" %} ins_encode %{ - bool vector256 = false; - __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256); + int vector_len = 0; + __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -4526,8 +6822,8 @@ match(Set dst (LShiftVS src shift)); format %{ "vpsllw $dst,$src,$shift\t! 
left shift packed2S" %} ins_encode %{ - bool vector256 = false; - __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256); + int vector_len = 0; + __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); %} ins_pipe( pipe_slow ); %} @@ -4557,8 +6853,8 @@ match(Set dst (LShiftVS src shift)); format %{ "vpsllw $dst,$src,$shift\t! left shift packed4S" %} ins_encode %{ - bool vector256 = false; - __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256); + int vector_len = 0; + __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -4568,8 +6864,8 @@ match(Set dst (LShiftVS src shift)); format %{ "vpsllw $dst,$src,$shift\t! left shift packed4S" %} ins_encode %{ - bool vector256 = false; - __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256); + int vector_len = 0; + __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); %} ins_pipe( pipe_slow ); %} @@ -4599,8 +6895,8 @@ match(Set dst (LShiftVS src shift)); format %{ "vpsllw $dst,$src,$shift\t! left shift packed8S" %} ins_encode %{ - bool vector256 = false; - __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256); + int vector_len = 0; + __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -4610,8 +6906,8 @@ match(Set dst (LShiftVS src shift)); format %{ "vpsllw $dst,$src,$shift\t! left shift packed8S" %} ins_encode %{ - bool vector256 = false; - __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256); + int vector_len = 0; + __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); %} ins_pipe( pipe_slow ); %} @@ -4621,8 +6917,8 @@ match(Set dst (LShiftVS src shift)); format %{ "vpsllw $dst,$src,$shift\t! 
left shift packed16S" %} ins_encode %{ - bool vector256 = true; - __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256); + int vector_len = 1; + __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -4632,8 +6928,30 @@ match(Set dst (LShiftVS src shift)); format %{ "vpsllw $dst,$src,$shift\t! left shift packed16S" %} ins_encode %{ - bool vector256 = true; - __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256); + int vector_len = 1; + __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsll32S_reg(vecZ dst, vecZ src, vecS shift) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 32); + match(Set dst (LShiftVS src shift)); + format %{ "vpsllw $dst,$src,$shift\t! left shift packed32S" %} + ins_encode %{ + int vector_len = 2; + __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsll32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 32); + match(Set dst (LShiftVS src shift)); + format %{ "vpsllw $dst,$src,$shift\t! left shift packed32S" %} + ins_encode %{ + int vector_len = 2; + __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); %} ins_pipe( pipe_slow ); %} @@ -4664,8 +6982,8 @@ match(Set dst (LShiftVI src shift)); format %{ "vpslld $dst,$src,$shift\t! left shift packed2I" %} ins_encode %{ - bool vector256 = false; - __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256); + int vector_len = 0; + __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -4675,8 +6993,8 @@ match(Set dst (LShiftVI src shift)); format %{ "vpslld $dst,$src,$shift\t! 
left shift packed2I" %} ins_encode %{ - bool vector256 = false; - __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256); + int vector_len = 0; + __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); %} ins_pipe( pipe_slow ); %} @@ -4706,8 +7024,8 @@ match(Set dst (LShiftVI src shift)); format %{ "vpslld $dst,$src,$shift\t! left shift packed4I" %} ins_encode %{ - bool vector256 = false; - __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256); + int vector_len = 0; + __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -4717,8 +7035,8 @@ match(Set dst (LShiftVI src shift)); format %{ "vpslld $dst,$src,$shift\t! left shift packed4I" %} ins_encode %{ - bool vector256 = false; - __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256); + int vector_len = 0; + __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); %} ins_pipe( pipe_slow ); %} @@ -4728,8 +7046,8 @@ match(Set dst (LShiftVI src shift)); format %{ "vpslld $dst,$src,$shift\t! left shift packed8I" %} ins_encode %{ - bool vector256 = true; - __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256); + int vector_len = 1; + __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -4739,8 +7057,30 @@ match(Set dst (LShiftVI src shift)); format %{ "vpslld $dst,$src,$shift\t! 
left shift packed8I" %} ins_encode %{ - bool vector256 = true; - __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256); + int vector_len = 1; + __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsll16I_reg(vecZ dst, vecZ src, vecS shift) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 16); + match(Set dst (LShiftVI src shift)); + format %{ "vpslld $dst,$src,$shift\t! left shift packed16I" %} + ins_encode %{ + int vector_len = 2; + __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsll16I_reg_imm(vecZ dst, vecZ src, immI8 shift) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 16); + match(Set dst (LShiftVI src shift)); + format %{ "vpslld $dst,$src,$shift\t! left shift packed16I" %} + ins_encode %{ + int vector_len = 2; + __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); %} ins_pipe( pipe_slow ); %} @@ -4771,8 +7111,8 @@ match(Set dst (LShiftVL src shift)); format %{ "vpsllq $dst,$src,$shift\t! left shift packed2L" %} ins_encode %{ - bool vector256 = false; - __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256); + int vector_len = 0; + __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -4782,8 +7122,8 @@ match(Set dst (LShiftVL src shift)); format %{ "vpsllq $dst,$src,$shift\t! left shift packed2L" %} ins_encode %{ - bool vector256 = false; - __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256); + int vector_len = 0; + __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); %} ins_pipe( pipe_slow ); %} @@ -4793,8 +7133,8 @@ match(Set dst (LShiftVL src shift)); format %{ "vpsllq $dst,$src,$shift\t! 
left shift packed4L" %} ins_encode %{ - bool vector256 = true; - __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256); + int vector_len = 1; + __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -4804,8 +7144,30 @@ match(Set dst (LShiftVL src shift)); format %{ "vpsllq $dst,$src,$shift\t! left shift packed4L" %} ins_encode %{ - bool vector256 = true; - __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256); + int vector_len = 1; + __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsll8L_reg(vecZ dst, vecZ src, vecS shift) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 8); + match(Set dst (LShiftVL src shift)); + format %{ "vpsllq $dst,$src,$shift\t! left shift packed8L" %} + ins_encode %{ + int vector_len = 2; + __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsll8L_reg_imm(vecZ dst, vecZ src, immI8 shift) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 8); + match(Set dst (LShiftVL src shift)); + format %{ "vpsllq $dst,$src,$shift\t! left shift packed8L" %} + ins_encode %{ + int vector_len = 2; + __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); %} ins_pipe( pipe_slow ); %} @@ -4842,8 +7204,8 @@ match(Set dst (URShiftVS src shift)); format %{ "vpsrlw $dst,$src,$shift\t! logical right shift packed2S" %} ins_encode %{ - bool vector256 = false; - __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256); + int vector_len = 0; + __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -4853,8 +7215,8 @@ match(Set dst (URShiftVS src shift)); format %{ "vpsrlw $dst,$src,$shift\t! 
logical right shift packed2S" %} ins_encode %{ - bool vector256 = false; - __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256); + int vector_len = 0; + __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); %} ins_pipe( pipe_slow ); %} @@ -4884,8 +7246,8 @@ match(Set dst (URShiftVS src shift)); format %{ "vpsrlw $dst,$src,$shift\t! logical right shift packed4S" %} ins_encode %{ - bool vector256 = false; - __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256); + int vector_len = 0; + __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -4895,8 +7257,8 @@ match(Set dst (URShiftVS src shift)); format %{ "vpsrlw $dst,$src,$shift\t! logical right shift packed4S" %} ins_encode %{ - bool vector256 = false; - __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256); + int vector_len = 0; + __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); %} ins_pipe( pipe_slow ); %} @@ -4926,8 +7288,8 @@ match(Set dst (URShiftVS src shift)); format %{ "vpsrlw $dst,$src,$shift\t! logical right shift packed8S" %} ins_encode %{ - bool vector256 = false; - __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256); + int vector_len = 0; + __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -4937,8 +7299,8 @@ match(Set dst (URShiftVS src shift)); format %{ "vpsrlw $dst,$src,$shift\t! logical right shift packed8S" %} ins_encode %{ - bool vector256 = false; - __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256); + int vector_len = 0; + __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); %} ins_pipe( pipe_slow ); %} @@ -4948,8 +7310,8 @@ match(Set dst (URShiftVS src shift)); format %{ "vpsrlw $dst,$src,$shift\t! 
logical right shift packed16S" %} ins_encode %{ - bool vector256 = true; - __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256); + int vector_len = 1; + __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -4959,8 +7321,30 @@ match(Set dst (URShiftVS src shift)); format %{ "vpsrlw $dst,$src,$shift\t! logical right shift packed16S" %} ins_encode %{ - bool vector256 = true; - __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256); + int vector_len = 1; + __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsrl32S_reg(vecZ dst, vecZ src, vecS shift) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 32); + match(Set dst (URShiftVS src shift)); + format %{ "vpsrlw $dst,$src,$shift\t! logical right shift packed32S" %} + ins_encode %{ + int vector_len = 2; + __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsrl32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 32); + match(Set dst (URShiftVS src shift)); + format %{ "vpsrlw $dst,$src,$shift\t! logical right shift packed32S" %} + ins_encode %{ + int vector_len = 2; + __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); %} ins_pipe( pipe_slow ); %} @@ -4991,8 +7375,8 @@ match(Set dst (URShiftVI src shift)); format %{ "vpsrld $dst,$src,$shift\t! logical right shift packed2I" %} ins_encode %{ - bool vector256 = false; - __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256); + int vector_len = 0; + __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -5002,8 +7386,8 @@ match(Set dst (URShiftVI src shift)); format %{ "vpsrld $dst,$src,$shift\t! 
logical right shift packed2I" %} ins_encode %{ - bool vector256 = false; - __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256); + int vector_len = 0; + __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); %} ins_pipe( pipe_slow ); %} @@ -5033,8 +7417,8 @@ match(Set dst (URShiftVI src shift)); format %{ "vpsrld $dst,$src,$shift\t! logical right shift packed4I" %} ins_encode %{ - bool vector256 = false; - __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256); + int vector_len = 0; + __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -5044,8 +7428,8 @@ match(Set dst (URShiftVI src shift)); format %{ "vpsrld $dst,$src,$shift\t! logical right shift packed4I" %} ins_encode %{ - bool vector256 = false; - __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256); + int vector_len = 0; + __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); %} ins_pipe( pipe_slow ); %} @@ -5055,8 +7439,8 @@ match(Set dst (URShiftVI src shift)); format %{ "vpsrld $dst,$src,$shift\t! logical right shift packed8I" %} ins_encode %{ - bool vector256 = true; - __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256); + int vector_len = 1; + __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -5066,8 +7450,30 @@ match(Set dst (URShiftVI src shift)); format %{ "vpsrld $dst,$src,$shift\t! 
logical right shift packed8I" %} ins_encode %{ - bool vector256 = true; - __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256); + int vector_len = 1; + __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsrl16I_reg(vecZ dst, vecZ src, vecS shift) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 16); + match(Set dst (URShiftVI src shift)); + format %{ "vpsrld $dst,$src,$shift\t! logical right shift packed16I" %} + ins_encode %{ + int vector_len = 2; + __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsrl16I_reg_imm(vecZ dst, vecZ src, immI8 shift) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 16); + match(Set dst (URShiftVI src shift)); + format %{ "vpsrld $dst,$src,$shift\t! logical right shift packed16I" %} + ins_encode %{ + int vector_len = 2; + __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); %} ins_pipe( pipe_slow ); %} @@ -5098,8 +7504,8 @@ match(Set dst (URShiftVL src shift)); format %{ "vpsrlq $dst,$src,$shift\t! logical right shift packed2L" %} ins_encode %{ - bool vector256 = false; - __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256); + int vector_len = 0; + __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -5109,8 +7515,8 @@ match(Set dst (URShiftVL src shift)); format %{ "vpsrlq $dst,$src,$shift\t! logical right shift packed2L" %} ins_encode %{ - bool vector256 = false; - __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256); + int vector_len = 0; + __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); %} ins_pipe( pipe_slow ); %} @@ -5120,8 +7526,8 @@ match(Set dst (URShiftVL src shift)); format %{ "vpsrlq $dst,$src,$shift\t! 
logical right shift packed4L" %} ins_encode %{ - bool vector256 = true; - __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256); + int vector_len = 1; + __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -5131,8 +7537,30 @@ match(Set dst (URShiftVL src shift)); format %{ "vpsrlq $dst,$src,$shift\t! logical right shift packed4L" %} ins_encode %{ - bool vector256 = true; - __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256); + int vector_len = 1; + __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsrl8L_reg(vecZ dst, vecZ src, vecS shift) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 8); + match(Set dst (URShiftVL src shift)); + format %{ "vpsrlq $dst,$src,$shift\t! logical right shift packed8L" %} + ins_encode %{ + int vector_len = 2; + __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsrl8L_reg_imm(vecZ dst, vecZ src, immI8 shift) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 8); + match(Set dst (URShiftVL src shift)); + format %{ "vpsrlq $dst,$src,$shift\t! logical right shift packed8L" %} + ins_encode %{ + int vector_len = 2; + __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); %} ins_pipe( pipe_slow ); %} @@ -5165,8 +7593,8 @@ match(Set dst (RShiftVS src shift)); format %{ "vpsraw $dst,$src,$shift\t! arithmetic right shift packed2S" %} ins_encode %{ - bool vector256 = false; - __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256); + int vector_len = 0; + __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -5176,8 +7604,8 @@ match(Set dst (RShiftVS src shift)); format %{ "vpsraw $dst,$src,$shift\t! 
arithmetic right shift packed2S" %} ins_encode %{ - bool vector256 = false; - __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256); + int vector_len = 0; + __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); %} ins_pipe( pipe_slow ); %} @@ -5207,8 +7635,8 @@ match(Set dst (RShiftVS src shift)); format %{ "vpsraw $dst,$src,$shift\t! arithmetic right shift packed4S" %} ins_encode %{ - bool vector256 = false; - __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256); + int vector_len = 0; + __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -5218,8 +7646,8 @@ match(Set dst (RShiftVS src shift)); format %{ "vpsraw $dst,$src,$shift\t! arithmetic right shift packed4S" %} ins_encode %{ - bool vector256 = false; - __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256); + int vector_len = 0; + __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); %} ins_pipe( pipe_slow ); %} @@ -5249,8 +7677,8 @@ match(Set dst (RShiftVS src shift)); format %{ "vpsraw $dst,$src,$shift\t! arithmetic right shift packed8S" %} ins_encode %{ - bool vector256 = false; - __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256); + int vector_len = 0; + __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -5260,8 +7688,8 @@ match(Set dst (RShiftVS src shift)); format %{ "vpsraw $dst,$src,$shift\t! arithmetic right shift packed8S" %} ins_encode %{ - bool vector256 = false; - __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256); + int vector_len = 0; + __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); %} ins_pipe( pipe_slow ); %} @@ -5271,8 +7699,8 @@ match(Set dst (RShiftVS src shift)); format %{ "vpsraw $dst,$src,$shift\t! 
arithmetic right shift packed16S" %} ins_encode %{ - bool vector256 = true; - __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256); + int vector_len = 1; + __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -5282,8 +7710,30 @@ match(Set dst (RShiftVS src shift)); format %{ "vpsraw $dst,$src,$shift\t! arithmetic right shift packed16S" %} ins_encode %{ - bool vector256 = true; - __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256); + int vector_len = 1; + __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsra32S_reg(vecZ dst, vecZ src, vecS shift) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 32); + match(Set dst (RShiftVS src shift)); + format %{ "vpsraw $dst,$src,$shift\t! arithmetic right shift packed32S" %} + ins_encode %{ + int vector_len = 2; + __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsra32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 32); + match(Set dst (RShiftVS src shift)); + format %{ "vpsraw $dst,$src,$shift\t! arithmetic right shift packed32S" %} + ins_encode %{ + int vector_len = 2; + __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); %} ins_pipe( pipe_slow ); %} @@ -5314,8 +7764,8 @@ match(Set dst (RShiftVI src shift)); format %{ "vpsrad $dst,$src,$shift\t! arithmetic right shift packed2I" %} ins_encode %{ - bool vector256 = false; - __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256); + int vector_len = 0; + __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -5325,8 +7775,8 @@ match(Set dst (RShiftVI src shift)); format %{ "vpsrad $dst,$src,$shift\t! 
arithmetic right shift packed2I" %} ins_encode %{ - bool vector256 = false; - __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256); + int vector_len = 0; + __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); %} ins_pipe( pipe_slow ); %} @@ -5356,8 +7806,8 @@ match(Set dst (RShiftVI src shift)); format %{ "vpsrad $dst,$src,$shift\t! arithmetic right shift packed4I" %} ins_encode %{ - bool vector256 = false; - __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256); + int vector_len = 0; + __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -5367,8 +7817,8 @@ match(Set dst (RShiftVI src shift)); format %{ "vpsrad $dst,$src,$shift\t! arithmetic right shift packed4I" %} ins_encode %{ - bool vector256 = false; - __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256); + int vector_len = 0; + __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); %} ins_pipe( pipe_slow ); %} @@ -5378,8 +7828,8 @@ match(Set dst (RShiftVI src shift)); format %{ "vpsrad $dst,$src,$shift\t! arithmetic right shift packed8I" %} ins_encode %{ - bool vector256 = true; - __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256); + int vector_len = 1; + __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -5389,8 +7839,30 @@ match(Set dst (RShiftVI src shift)); format %{ "vpsrad $dst,$src,$shift\t! 
arithmetic right shift packed8I" %} ins_encode %{ - bool vector256 = true; - __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256); + int vector_len = 1; + __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsra16I_reg(vecZ dst, vecZ src, vecS shift) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 16); + match(Set dst (RShiftVI src shift)); + format %{ "vpsrad $dst,$src,$shift\t! arithmetic right shift packed16I" %} + ins_encode %{ + int vector_len = 2; + __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsra16I_reg_imm(vecZ dst, vecZ src, immI8 shift) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 16); + match(Set dst (RShiftVI src shift)); + format %{ "vpsrad $dst,$src,$shift\t! arithmetic right shift packed16I" %} + ins_encode %{ + int vector_len = 2; + __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); %} ins_pipe( pipe_slow ); %} @@ -5415,8 +7887,8 @@ match(Set dst (AndV src1 src2)); format %{ "vpand $dst,$src1,$src2\t! and vectors (4 bytes)" %} ins_encode %{ - bool vector256 = false; - __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + int vector_len = 0; + __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -5436,8 +7908,8 @@ match(Set dst (AndV src1 src2)); format %{ "vpand $dst,$src1,$src2\t! and vectors (8 bytes)" %} ins_encode %{ - bool vector256 = false; - __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + int vector_len = 0; + __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -5457,8 +7929,8 @@ match(Set dst (AndV src1 src2)); format %{ "vpand $dst,$src1,$src2\t! 
and vectors (16 bytes)" %} ins_encode %{ - bool vector256 = false; - __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + int vector_len = 0; + __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -5468,8 +7940,8 @@ match(Set dst (AndV src (LoadVector mem))); format %{ "vpand $dst,$src,$mem\t! and vectors (16 bytes)" %} ins_encode %{ - bool vector256 = false; - __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + int vector_len = 0; + __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); %} ins_pipe( pipe_slow ); %} @@ -5479,8 +7951,8 @@ match(Set dst (AndV src1 src2)); format %{ "vpand $dst,$src1,$src2\t! and vectors (32 bytes)" %} ins_encode %{ - bool vector256 = true; - __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + int vector_len = 1; + __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -5490,8 +7962,30 @@ match(Set dst (AndV src (LoadVector mem))); format %{ "vpand $dst,$src,$mem\t! and vectors (32 bytes)" %} ins_encode %{ - bool vector256 = true; - __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + int vector_len = 1; + __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vand64B_reg(vecZ dst, vecZ src1, vecZ src2) %{ + predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64); + match(Set dst (AndV src1 src2)); + format %{ "vpand $dst,$src1,$src2\t! and vectors (64 bytes)" %} + ins_encode %{ + int vector_len = 2; + __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vand64B_mem(vecZ dst, vecZ src, memory mem) %{ + predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64); + match(Set dst (AndV src (LoadVector mem))); + format %{ "vpand $dst,$src,$mem\t! 
and vectors (64 bytes)" %} + ins_encode %{ + int vector_len = 2; + __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); %} ins_pipe( pipe_slow ); %} @@ -5513,8 +8007,8 @@ match(Set dst (OrV src1 src2)); format %{ "vpor $dst,$src1,$src2\t! or vectors (4 bytes)" %} ins_encode %{ - bool vector256 = false; - __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + int vector_len = 0; + __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -5534,8 +8028,8 @@ match(Set dst (OrV src1 src2)); format %{ "vpor $dst,$src1,$src2\t! or vectors (8 bytes)" %} ins_encode %{ - bool vector256 = false; - __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + int vector_len = 0; + __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -5555,8 +8049,8 @@ match(Set dst (OrV src1 src2)); format %{ "vpor $dst,$src1,$src2\t! or vectors (16 bytes)" %} ins_encode %{ - bool vector256 = false; - __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + int vector_len = 0; + __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -5566,8 +8060,8 @@ match(Set dst (OrV src (LoadVector mem))); format %{ "vpor $dst,$src,$mem\t! or vectors (16 bytes)" %} ins_encode %{ - bool vector256 = false; - __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + int vector_len = 0; + __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); %} ins_pipe( pipe_slow ); %} @@ -5577,8 +8071,8 @@ match(Set dst (OrV src1 src2)); format %{ "vpor $dst,$src1,$src2\t! 
or vectors (32 bytes)" %} ins_encode %{ - bool vector256 = true; - __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + int vector_len = 1; + __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -5588,8 +8082,30 @@ match(Set dst (OrV src (LoadVector mem))); format %{ "vpor $dst,$src,$mem\t! or vectors (32 bytes)" %} ins_encode %{ - bool vector256 = true; - __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + int vector_len = 1; + __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vor64B_reg(vecZ dst, vecZ src1, vecZ src2) %{ + predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64); + match(Set dst (OrV src1 src2)); + format %{ "vpor $dst,$src1,$src2\t! or vectors (64 bytes)" %} + ins_encode %{ + int vector_len = 2; + __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vor64B_mem(vecZ dst, vecZ src, memory mem) %{ + predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64); + match(Set dst (OrV src (LoadVector mem))); + format %{ "vpor $dst,$src,$mem\t! or vectors (64 bytes)" %} + ins_encode %{ + int vector_len = 2; + __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); %} ins_pipe( pipe_slow ); %} @@ -5611,8 +8127,8 @@ match(Set dst (XorV src1 src2)); format %{ "vpxor $dst,$src1,$src2\t! xor vectors (4 bytes)" %} ins_encode %{ - bool vector256 = false; - __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + int vector_len = 0; + __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -5632,8 +8148,8 @@ match(Set dst (XorV src1 src2)); format %{ "vpxor $dst,$src1,$src2\t! 
xor vectors (8 bytes)" %} ins_encode %{ - bool vector256 = false; - __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + int vector_len = 0; + __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -5653,8 +8169,8 @@ match(Set dst (XorV src1 src2)); format %{ "vpxor $dst,$src1,$src2\t! xor vectors (16 bytes)" %} ins_encode %{ - bool vector256 = false; - __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + int vector_len = 0; + __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -5664,8 +8180,8 @@ match(Set dst (XorV src (LoadVector mem))); format %{ "vpxor $dst,$src,$mem\t! xor vectors (16 bytes)" %} ins_encode %{ - bool vector256 = false; - __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + int vector_len = 0; + __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); %} ins_pipe( pipe_slow ); %} @@ -5675,8 +8191,8 @@ match(Set dst (XorV src1 src2)); format %{ "vpxor $dst,$src1,$src2\t! xor vectors (32 bytes)" %} ins_encode %{ - bool vector256 = true; - __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + int vector_len = 1; + __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -5686,8 +8202,30 @@ match(Set dst (XorV src (LoadVector mem))); format %{ "vpxor $dst,$src,$mem\t! xor vectors (32 bytes)" %} ins_encode %{ - bool vector256 = true; - __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + int vector_len = 1; + __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vxor64B_reg(vecZ dst, vecZ src1, vecZ src2) %{ + predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64); + match(Set dst (XorV src1 src2)); + format %{ "vpxor $dst,$src1,$src2\t! 
xor vectors (64 bytes)" %} + ins_encode %{ + int vector_len = 2; + __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vxor64B_mem(vecZ dst, vecZ src, memory mem) %{ + predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64); + match(Set dst (XorV src (LoadVector mem))); + format %{ "vpxor $dst,$src,$mem\t! xor vectors (64 bytes)" %} + ins_encode %{ + int vector_len = 2; + __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); %} ins_pipe( pipe_slow ); %} --- old/src/cpu/x86/vm/x86_32.ad 2015-04-23 08:26:44.657343300 -0700 +++ new/src/cpu/x86/vm/x86_32.ad 2015-04-23 08:26:44.450343300 -0700 @@ -101,6 +101,17 @@ reg_def FPR6H( SOC, SOC, Op_RegF, 6, as_FloatRegister(5)->as_VMReg()->next()); reg_def FPR7L( SOC, SOC, Op_RegF, 7, as_FloatRegister(6)->as_VMReg()); reg_def FPR7H( SOC, SOC, Op_RegF, 7, as_FloatRegister(6)->as_VMReg()->next()); +// +// Empty fill registers, which are never used, but supply alignment to xmm regs +// +reg_def FILL0( SOC, SOC, Op_RegF, 7, as_FloatRegister(6)->as_VMReg()->next(2)); +reg_def FILL1( SOC, SOC, Op_RegF, 7, as_FloatRegister(6)->as_VMReg()->next(3)); +reg_def FILL2( SOC, SOC, Op_RegF, 7, as_FloatRegister(6)->as_VMReg()->next(4)); +reg_def FILL3( SOC, SOC, Op_RegF, 7, as_FloatRegister(6)->as_VMReg()->next(5)); +reg_def FILL4( SOC, SOC, Op_RegF, 7, as_FloatRegister(6)->as_VMReg()->next(6)); +reg_def FILL5( SOC, SOC, Op_RegF, 7, as_FloatRegister(6)->as_VMReg()->next(7)); +reg_def FILL6( SOC, SOC, Op_RegF, 7, as_FloatRegister(6)->as_VMReg()->next(8)); +reg_def FILL7( SOC, SOC, Op_RegF, 7, as_FloatRegister(6)->as_VMReg()->next(9)); // Specify priority of register selection within phases of register // allocation. Highest priority is first. 
A useful heuristic is to @@ -112,7 +123,8 @@ alloc_class chunk0( ECX, EBX, EBP, EDI, EAX, EDX, ESI, ESP, FPR0L, FPR0H, FPR1L, FPR1H, FPR2L, FPR2H, FPR3L, FPR3H, FPR4L, FPR4H, FPR5L, FPR5H, - FPR6L, FPR6H, FPR7L, FPR7H ); + FPR6L, FPR6H, FPR7L, FPR7H, + FILL0, FILL1, FILL2, FILL3, FILL4, FILL5, FILL6, FILL7); //----------Architecture Description Register Classes-------------------------- @@ -235,7 +247,9 @@ size += 6; // fldcw } if (C->max_vector_size() > 16) { - size += 3; // vzeroupper + if(UseAVX <= 2) { + size += 3; // vzeroupper + } } return size; } @@ -731,6 +745,12 @@ // Helper for XMM registers. Extra opcode bits, limited syntax. static int impl_x_helper( CodeBuffer *cbuf, bool do_size, bool is_load, int offset, int reg_lo, int reg_hi, int size, outputStream* st ) { + int in_size_in_bits = Assembler::EVEX_32bit; + int evex_encoding = 0; + if (reg_lo+1 == reg_hi) { + in_size_in_bits = Assembler::EVEX_64bit; + evex_encoding = Assembler::VEX_W; + } if (cbuf) { MacroAssembler _masm(cbuf); if (reg_lo+1 == reg_hi) { // double move? @@ -763,7 +783,17 @@ } #endif } - int offset_size = (offset == 0) ? 0 : ((offset <= 127) ? 1 : 4); + bool is_single_byte = false; + if ((UseAVX > 2) && (offset != 0)) { + is_single_byte = Assembler::query_compressed_disp_byte(offset, true, 0, Assembler::EVEX_T1S, in_size_in_bits, evex_encoding); + } + int offset_size = 0; + if (UseAVX > 2 ) { + offset_size = (offset == 0) ? 0 : ((is_single_byte) ? 1 : 4); + } else { + offset_size = (offset == 0) ? 0 : ((offset <= 127) ? 1 : 4); + } + size += (UseAVX > 2) ? 2 : 0; // Need an additional two bytes for EVEX // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix. return size+5+offset_size; } @@ -799,8 +829,8 @@ #endif } // VEX_2bytes prefix is used if UseAVX > 0, and it takes the same 2 bytes as SIMD prefix. - // Only MOVAPS SSE prefix uses 1 byte. - int sz = 4; + // Only MOVAPS SSE prefix uses 1 byte. EVEX uses an additional 2 bytes. 
+ int sz = (UseAVX > 2) ? 6 : 4; if (!(src_lo+1 == src_hi && dst_lo+1 == dst_hi) && UseXmmRegToRegMoveAll && (UseAVX == 0)) sz = 3; return size + sz; @@ -818,7 +848,7 @@ st->print("movdl %s, %s\t# spill", Matcher::regName[dst_lo], Matcher::regName[src_lo]); #endif } - return 4; + return (UseAVX> 2) ? 6 : 4; } @@ -834,7 +864,7 @@ st->print("movdl %s, %s\t# spill", Matcher::regName[dst_lo], Matcher::regName[src_lo]); #endif } - return 4; + return (UseAVX> 2) ? 6 : 4; } static int impl_mov_helper( CodeBuffer *cbuf, bool do_size, int src, int dst, int size, outputStream* st ) { @@ -905,9 +935,8 @@ calc_size += 3+src_offset_size + 3+dst_offset_size; break; case Op_VecX: - calc_size = 6 + 6 + 5+src_offset_size + 5+dst_offset_size; - break; case Op_VecY: + case Op_VecZ: calc_size = 6 + 6 + 5+src_offset_size + 5+dst_offset_size; break; default: @@ -938,6 +967,11 @@ __ vmovdqu(xmm0, Address(rsp, src_offset)); __ vmovdqu(Address(rsp, dst_offset), xmm0); __ vmovdqu(xmm0, Address(rsp, -32)); + case Op_VecZ: + __ evmovdqu(Address(rsp, -64), xmm0, 2); + __ evmovdqu(xmm0, Address(rsp, src_offset), 2); + __ evmovdqu(Address(rsp, dst_offset), xmm0, 2); + __ evmovdqu(xmm0, Address(rsp, -64), 2); break; default: ShouldNotReachHere(); @@ -973,6 +1007,12 @@ "vmovdqu [rsp + #%d], xmm0\n\t" "vmovdqu xmm0, [rsp - #32]", src_offset, dst_offset); + case Op_VecZ: + st->print("vmovdqu [rsp - #64], xmm0\t# 512-bit mem-mem spill\n\t" + "vmovdqu xmm0, [rsp + #%d]\n\t" + "vmovdqu [rsp + #%d], xmm0\n\t" + "vmovdqu xmm0, [rsp - #64]", + src_offset, dst_offset); break; default: ShouldNotReachHere(); @@ -1006,7 +1046,7 @@ uint ireg = ideal_reg(); assert((src_first_rc != rc_int && dst_first_rc != rc_int), "sanity"); assert((src_first_rc != rc_float && dst_first_rc != rc_float), "sanity"); - assert((ireg == Op_VecS || ireg == Op_VecD || ireg == Op_VecX || ireg == Op_VecY), "sanity"); + assert((ireg == Op_VecS || ireg == Op_VecD || ireg == Op_VecX || ireg == Op_VecY || ireg == Op_VecZ ), "sanity"); if( 
src_first_rc == rc_stack && dst_first_rc == rc_stack ) { // mem -> mem int src_offset = ra_->reg2offset(src_first); @@ -3969,7 +4009,7 @@ // XMM Float register operands operand regF() %{ predicate( UseSSE>=1 ); - constraint(ALLOC_IN_RC(float_reg)); + constraint(ALLOC_IN_RC(float_reg_legacy)); match(RegF); format %{ %} interface(REG_INTER); @@ -3978,12 +4018,44 @@ // XMM Double register operands operand regD() %{ predicate( UseSSE>=2 ); - constraint(ALLOC_IN_RC(double_reg)); + constraint(ALLOC_IN_RC(double_reg_legacy)); match(RegD); format %{ %} interface(REG_INTER); %} +// Vectors +operand vecS() %{ + constraint(ALLOC_IN_RC(vectors_reg_legacy)); + match(VecS); + + format %{ %} + interface(REG_INTER); +%} + +operand vecD() %{ + constraint(ALLOC_IN_RC(vectord_reg_legacy)); + match(VecD); + + format %{ %} + interface(REG_INTER); +%} + +operand vecX() %{ + constraint(ALLOC_IN_RC(vectorx_reg_legacy)); + match(VecX); + + format %{ %} + interface(REG_INTER); +%} + +operand vecY() %{ + constraint(ALLOC_IN_RC(vectory_reg_legacy)); + match(VecY); + + format %{ %} + interface(REG_INTER); +%} //----------Memory Operands---------------------------------------------------- // Direct Memory Operand @@ -4991,11 +5063,11 @@ match(Set dst (ReverseBytesUS dst)); effect(KILL cr); - format %{ "BSWAP $dst\n\t" + format %{ "BSWAP $dst\n\t" "SHR $dst,16\n\t" %} ins_encode %{ __ bswapl($dst$$Register); - __ shrl($dst$$Register, 16); + __ shrl($dst$$Register, 16); %} ins_pipe( ialu_reg ); %} @@ -5004,11 +5076,11 @@ match(Set dst (ReverseBytesS dst)); effect(KILL cr); - format %{ "BSWAP $dst\n\t" + format %{ "BSWAP $dst\n\t" "SAR $dst,16\n\t" %} ins_encode %{ __ bswapl($dst$$Register); - __ sarl($dst$$Register, 16); + __ sarl($dst$$Register, 16); %} ins_pipe( ialu_reg ); %} @@ -6496,7 +6568,7 @@ effect(KILL cr); ins_cost(400); - format %{ + format %{ $$template if (os::is_MP()) { $$emit$$"LOCK ADDL [ESP + #0], 0\t! 
membar_volatile" @@ -8259,10 +8331,10 @@ // Xor Register with Immediate -1 instruct xorI_eReg_im1(rRegI dst, immI_M1 imm) %{ - match(Set dst (XorI dst imm)); + match(Set dst (XorI dst imm)); size(2); - format %{ "NOT $dst" %} + format %{ "NOT $dst" %} ins_encode %{ __ notl($dst$$Register); %} @@ -8910,7 +8982,7 @@ // Xor Long Register with Immediate -1 instruct xorl_eReg_im1(eRegL dst, immL_M1 imm) %{ - match(Set dst (XorL dst imm)); + match(Set dst (XorL dst imm)); format %{ "NOT $dst.lo\n\t" "NOT $dst.hi" %} ins_encode %{ @@ -8965,7 +9037,7 @@ effect(KILL cr); ins_cost(100); format %{ "ADD $dst.lo,$dst.lo\n\t" - "ADC $dst.hi,$dst.hi\n\t" + "ADC $dst.hi,$dst.hi\n\t" "ADD $dst.lo,$dst.lo\n\t" "ADC $dst.hi,$dst.hi" %} ins_encode %{ @@ -8984,9 +9056,9 @@ effect(KILL cr); ins_cost(100); format %{ "ADD $dst.lo,$dst.lo\n\t" - "ADC $dst.hi,$dst.hi\n\t" + "ADC $dst.hi,$dst.hi\n\t" "ADD $dst.lo,$dst.lo\n\t" - "ADC $dst.hi,$dst.hi\n\t" + "ADC $dst.hi,$dst.hi\n\t" "ADD $dst.lo,$dst.lo\n\t" "ADC $dst.hi,$dst.hi" %} ins_encode %{ @@ -11139,7 +11211,6 @@ ins_pipe( ialu_reg_reg ); %} - instruct MoveF2I_stack_reg(rRegI dst, stackSlotF src) %{ match(Set dst (MoveF2I src)); effect( DEF dst, USE src ); @@ -11371,7 +11442,7 @@ format %{ "XOR EAX,EAX\t# ClearArray:\n\t" "SHL ECX,1\t# Convert doublewords to words\n\t" "REP STOS\t# store EAX into [EDI++] while ECX--" %} - ins_encode %{ + ins_encode %{ __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register); %} ins_pipe( pipe_slow ); @@ -11384,7 +11455,7 @@ format %{ "XOR EAX,EAX\t# ClearArray:\n\t" "SHL ECX,3\t# Convert doublewords to bytes\n\t" "REP STOSB\t# store EAX into [EDI++] while ECX--" %} - ins_encode %{ + ins_encode %{ __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register); %} ins_pipe( pipe_slow ); --- old/src/cpu/x86/vm/x86_64.ad 2015-04-23 08:26:49.455343300 -0700 +++ new/src/cpu/x86/vm/x86_64.ad 2015-04-23 08:26:49.260343300 -0700 @@ -400,7 +400,11 @@ return 3; // rex.w, op, rm(reg/reg) } static int 
clear_avx_size() { - return (Compile::current()->max_vector_size() > 16) ? 3 : 0; // vzeroupper + if(UseAVX > 2) { + return 0; // vzeroupper is ignored + } else { + return (Compile::current()->max_vector_size() > 16) ? 3 : 0; // vzeroupper + } } // !!!!! Special hack to get all types of calls to specify the byte offset @@ -938,6 +942,11 @@ __ vmovdqu(xmm0, Address(rsp, src_offset)); __ vmovdqu(Address(rsp, dst_offset), xmm0); __ vmovdqu(xmm0, Address(rsp, -32)); + case Op_VecZ: + __ evmovdqu(Address(rsp, -64), xmm0, 2); + __ evmovdqu(xmm0, Address(rsp, src_offset), 2); + __ evmovdqu(Address(rsp, dst_offset), xmm0, 2); + __ evmovdqu(xmm0, Address(rsp, -64), 2); break; default: ShouldNotReachHere(); @@ -971,6 +980,13 @@ "vmovdqu xmm0, [rsp - #32]", src_offset, dst_offset); break; + case Op_VecZ: + st->print("vmovdqu [rsp - #64], xmm0\t# 512-bit mem-mem spill\n\t" + "vmovdqu xmm0, [rsp + #%d]\n\t" + "vmovdqu [rsp + #%d], xmm0\n\t" + "vmovdqu xmm0, [rsp - #64]", + src_offset, dst_offset); + break; default: ShouldNotReachHere(); } @@ -1004,7 +1020,7 @@ if (bottom_type()->isa_vect() != NULL) { uint ireg = ideal_reg(); assert((src_first_rc != rc_int && dst_first_rc != rc_int), "sanity"); - assert((ireg == Op_VecS || ireg == Op_VecD || ireg == Op_VecX || ireg == Op_VecY), "sanity"); + assert((ireg == Op_VecS || ireg == Op_VecD || ireg == Op_VecX || ireg == Op_VecY || ireg == Op_VecZ ), "sanity"); if( src_first_rc == rc_stack && dst_first_rc == rc_stack ) { // mem -> mem int src_offset = ra_->reg2offset(src_first); @@ -1441,7 +1457,7 @@ return MachNode::size(ra_); // too many variables; just compute it // the hard way } - + //============================================================================= @@ -2699,7 +2715,7 @@ RAX_H_num // Op_RegL }; // Excluded flags and vector registers. 
- assert(ARRAY_SIZE(hi) == _last_machine_leaf - 5, "missing type"); + assert(ARRAY_SIZE(hi) == _last_machine_leaf - 6, "missing type"); return OptoRegPair(hi[ideal_reg], lo[ideal_reg]); %} %} @@ -3460,21 +3476,50 @@ interface(REG_INTER); %} -// Float register operands -operand regF() -%{ - constraint(ALLOC_IN_RC(float_reg)); - match(RegF); +operand regF() %{ + constraint(ALLOC_IN_RC(float_reg)); + match(RegF); + + format %{ %} + interface(REG_INTER); +%} + +operand regD() %{ + constraint(ALLOC_IN_RC(double_reg)); + match(RegD); + + format %{ %} + interface(REG_INTER); +%} + +// Vectors +operand vecS() %{ + constraint(ALLOC_IN_RC(vectors_reg)); + match(VecS); format %{ %} interface(REG_INTER); %} -// Double register operands -operand regD() -%{ - constraint(ALLOC_IN_RC(double_reg)); - match(RegD); +operand vecD() %{ + constraint(ALLOC_IN_RC(vectord_reg)); + match(VecD); + + format %{ %} + interface(REG_INTER); +%} + +operand vecX() %{ + constraint(ALLOC_IN_RC(vectorx_reg)); + match(VecX); + + format %{ %} + interface(REG_INTER); +%} + +operand vecY() %{ + constraint(ALLOC_IN_RC(vectory_reg)); + match(VecY); format %{ %} interface(REG_INTER); @@ -4819,7 +4864,7 @@ %} // Load Unsigned Integer into Long Register -instruct loadUI2L(rRegL dst, memory mem, immL_32bits mask) +instruct loadUI2L(rRegL dst, memory mem, immL_32bits mask) %{ match(Set dst (AndL (ConvI2L (LoadI mem)) mask)); @@ -10246,7 +10291,7 @@ format %{ "xorq rax, rax\t# ClearArray:\n\t" "rep stosq\t# Store rax to *rdi++ while rcx--" %} - ins_encode %{ + ins_encode %{ __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register); %} ins_pipe(pipe_slow); @@ -10261,7 +10306,7 @@ format %{ "xorq rax, rax\t# ClearArray:\n\t" "shlq rcx,3\t# Convert doublewords to bytes\n\t" "rep stosb\t# Store rax to *rdi++ while rcx--" %} - ins_encode %{ + ins_encode %{ __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register); %} ins_pipe( pipe_slow ); --- old/src/share/vm/adlc/archDesc.cpp 2015-04-23 08:26:54.025343300 
-0700 +++ new/src/share/vm/adlc/archDesc.cpp 2015-04-23 08:26:53.834343300 -0700 @@ -929,6 +929,7 @@ case 'D': return "TypeVect::VECTD"; case 'X': return "TypeVect::VECTX"; case 'Y': return "TypeVect::VECTY"; + case 'Z': return "TypeVect::VECTZ"; default: internal_err("Vector type %s with unrecognized type\n",idealOp); } --- old/src/share/vm/adlc/formssel.cpp 2015-04-23 08:26:58.414343300 -0700 +++ new/src/share/vm/adlc/formssel.cpp 2015-04-23 08:26:58.223343300 -0700 @@ -3919,6 +3919,7 @@ strcmp(opType,"VecD")==0 || strcmp(opType,"VecX")==0 || strcmp(opType,"VecY")==0 || + strcmp(opType,"VecZ")==0 || strcmp(opType,"Reg" )==0) ) { return 1; } @@ -4048,6 +4049,7 @@ strcmp(opType,"AddReductionVF")==0 || strcmp(opType,"AddReductionVD")==0 || strcmp(opType,"MulReductionVI")==0 || + strcmp(opType,"MulReductionVL")==0 || strcmp(opType,"MulReductionVF")==0 || strcmp(opType,"MulReductionVD")==0 || 0 /* 0 to line up columns nicely */ ) @@ -4139,12 +4141,12 @@ static const char *vector_list[] = { "AddVB","AddVS","AddVI","AddVL","AddVF","AddVD", "SubVB","SubVS","SubVI","SubVL","SubVF","SubVD", - "MulVS","MulVI","MulVF","MulVD", + "MulVS","MulVI","MulVL","MulVF","MulVD", "DivVF","DivVD", "AndV" ,"XorV" ,"OrV", "AddReductionVI", "AddReductionVL", "AddReductionVF", "AddReductionVD", - "MulReductionVI", + "MulReductionVI", "MulReductionVL", "MulReductionVF", "MulReductionVD", "LShiftCntV","RShiftCntV", "LShiftVB","LShiftVS","LShiftVI","LShiftVL", --- old/src/share/vm/c1/c1_LinearScan.cpp 2015-04-23 08:27:02.884343300 -0700 +++ new/src/share/vm/c1/c1_LinearScan.cpp 2015-04-23 08:27:02.692343300 -0700 @@ -1290,7 +1290,8 @@ #ifdef X86 } if (UseSSE > 0) { - for (i = 0; i < FrameMap::nof_caller_save_xmm_regs; i++) { + int num_caller_save_xmm_regs = FrameMap::get_num_caller_save_xmms(); + for (i = 0; i < num_caller_save_xmm_regs; i ++) { LIR_Opr opr = FrameMap::caller_save_xmm_reg_at(i); assert(opr->is_valid() && opr->is_register(), "FrameMap should not return invalid operands"); 
assert(reg_numHi(opr) == -1, "missing addition of range for hi-register"); @@ -2098,7 +2099,13 @@ case T_FLOAT: { #ifdef X86 if (UseSSE >= 1) { - assert(assigned_reg >= pd_first_xmm_reg && assigned_reg <= pd_last_xmm_reg, "no xmm register"); + int last_xmm_reg = pd_last_xmm_reg; +#ifdef _LP64 + if (UseAVX < 3) { + last_xmm_reg = pd_first_xmm_reg + (pd_nof_xmm_regs_frame_map / 2) - 1; + } +#endif + assert(assigned_reg >= pd_first_xmm_reg && assigned_reg <= last_xmm_reg, "no xmm register"); assert(interval->assigned_regHi() == any_reg, "must not have hi register"); return LIR_OprFact::single_xmm(assigned_reg - pd_first_xmm_reg); } @@ -2112,7 +2119,13 @@ case T_DOUBLE: { #ifdef X86 if (UseSSE >= 2) { - assert(assigned_reg >= pd_first_xmm_reg && assigned_reg <= pd_last_xmm_reg, "no xmm register"); + int last_xmm_reg = pd_last_xmm_reg; +#ifdef _LP64 + if (UseAVX < 3) { + last_xmm_reg = pd_first_xmm_reg + (pd_nof_xmm_regs_frame_map / 2) - 1; + } +#endif + assert(assigned_reg >= pd_first_xmm_reg && assigned_reg <= last_xmm_reg, "no xmm register"); assert(interval->assigned_regHi() == any_reg, "must not have hi register (double xmm values are stored in one register)"); return LIR_OprFact::double_xmm(assigned_reg - pd_first_xmm_reg); } @@ -3600,7 +3613,8 @@ } #ifdef X86 - for (j = 0; j < FrameMap::nof_caller_save_xmm_regs; j++) { + int num_caller_save_xmm_regs = FrameMap::get_num_caller_save_xmms(); + for (j = 0; j < num_caller_save_xmm_regs; j++) { state_put(input_state, reg_num(FrameMap::caller_save_xmm_reg_at(j)), NULL); } #endif @@ -4514,12 +4528,20 @@ if (reg_num() < LIR_OprDesc::vreg_base) { type_name = "fixed"; // need a temporary operand for fixed intervals because type() cannot be called +#ifdef X86 + int last_xmm_reg = pd_last_xmm_reg; +#ifdef _LP64 + if (UseAVX < 3) { + last_xmm_reg = pd_first_xmm_reg + (pd_nof_xmm_regs_frame_map / 2) - 1; + } +#endif +#endif if (assigned_reg() >= pd_first_cpu_reg && assigned_reg() <= pd_last_cpu_reg) { opr = 
LIR_OprFact::single_cpu(assigned_reg()); } else if (assigned_reg() >= pd_first_fpu_reg && assigned_reg() <= pd_last_fpu_reg) { opr = LIR_OprFact::single_fpu(assigned_reg() - pd_first_fpu_reg); #ifdef X86 - } else if (assigned_reg() >= pd_first_xmm_reg && assigned_reg() <= pd_last_xmm_reg) { + } else if (assigned_reg() >= pd_first_xmm_reg && assigned_reg() <= last_xmm_reg) { opr = LIR_OprFact::single_xmm(assigned_reg() - pd_first_xmm_reg); #endif } else { --- old/src/share/vm/opto/c2_globals.hpp 2015-04-23 08:27:07.366343300 -0700 +++ new/src/share/vm/opto/c2_globals.hpp 2015-04-23 08:27:07.169343300 -0700 @@ -96,7 +96,7 @@ product(intx, MaxLoopPad, (OptoLoopAlignment-1), \ "Align a loop if padding size in bytes is less or equal to this value") \ \ - product(intx, MaxVectorSize, 32, \ + product(intx, MaxVectorSize, 64, \ "Max vector size in bytes, " \ "actual size could be less depending on elements type") \ \ --- old/src/share/vm/opto/chaitin.cpp 2015-04-23 08:27:11.720343300 -0700 +++ new/src/share/vm/opto/chaitin.cpp 2015-04-23 08:27:11.527343300 -0700 @@ -907,6 +907,13 @@ lrg.set_num_regs(RegMask::SlotsPerVecY); lrg.set_reg_pressure(1); break; + case Op_VecZ: + assert(Matcher::vector_size_supported(T_FLOAT,RegMask::SlotsPerVecZ), "sanity"); + assert(RegMask::num_registers(Op_VecZ) == RegMask::SlotsPerVecZ, "sanity"); + assert(lrgmask.is_aligned_sets(RegMask::SlotsPerVecZ), "vector should be aligned"); + lrg.set_num_regs(RegMask::SlotsPerVecZ); + lrg.set_reg_pressure(1); + break; default: ShouldNotReachHere(); } @@ -1514,7 +1521,7 @@ int n_regs = lrg->num_regs(); assert(!lrg->_is_vector || !lrg->_fat_proj, "sanity"); if (n_regs == 1 || !lrg->_fat_proj) { - assert(!lrg->_is_vector || n_regs <= RegMask::SlotsPerVecY, "sanity"); + assert(!lrg->_is_vector || n_regs <= RegMask::SlotsPerVecZ, "sanity"); lrg->Clear(); // Clear the mask lrg->Insert(reg); // Set regmask to match selected reg // For vectors and pairs, also insert the low bit of the pair --- 
old/src/share/vm/opto/chaitin.hpp 2015-04-23 08:27:15.872343300 -0700 +++ new/src/share/vm/opto/chaitin.hpp 2015-04-23 08:27:15.673343300 -0700 @@ -141,7 +141,7 @@ // Number of registers this live range uses when it colors private: - uint8_t _num_regs; // 2 for Longs and Doubles, 1 for all else + uint16_t _num_regs; // 2 for Longs and Doubles, 1 for all else // except _num_regs is kill count for fat_proj public: int num_regs() const { return _num_regs; } @@ -150,7 +150,7 @@ private: // Number of physical registers this live range uses when it colors // Architecture and register-set dependent - uint8_t _reg_pressure; + uint16_t _reg_pressure; public: void set_reg_pressure(int i) { _reg_pressure = i; } int reg_pressure() const { return _reg_pressure; } --- old/src/share/vm/opto/classes.hpp 2015-04-23 08:27:19.818343300 -0700 +++ new/src/share/vm/opto/classes.hpp 2015-04-23 08:27:19.529343300 -0700 @@ -282,6 +282,8 @@ macro(MulVS) macro(MulVI) macro(MulReductionVI) +macro(MulVL) +macro(MulReductionVL) macro(MulVF) macro(MulReductionVF) macro(MulVD) --- old/src/share/vm/opto/compile.cpp 2015-04-23 08:27:24.135343300 -0700 +++ new/src/share/vm/opto/compile.cpp 2015-04-23 08:27:23.914343300 -0700 @@ -3054,6 +3054,7 @@ case Op_AddReductionVF: case Op_AddReductionVD: case Op_MulReductionVI: + case Op_MulReductionVL: case Op_MulReductionVF: case Op_MulReductionVD: break; --- old/src/share/vm/opto/matcher.cpp 2015-04-23 08:27:28.624343300 -0700 +++ new/src/share/vm/opto/matcher.cpp 2015-04-23 08:27:28.430343300 -0700 @@ -83,6 +83,7 @@ idealreg2spillmask [Op_VecD] = NULL; idealreg2spillmask [Op_VecX] = NULL; idealreg2spillmask [Op_VecY] = NULL; + idealreg2spillmask [Op_VecZ] = NULL; idealreg2debugmask [Op_RegI] = NULL; idealreg2debugmask [Op_RegN] = NULL; @@ -94,6 +95,7 @@ idealreg2debugmask [Op_VecD] = NULL; idealreg2debugmask [Op_VecX] = NULL; idealreg2debugmask [Op_VecY] = NULL; + idealreg2debugmask [Op_VecZ] = NULL; idealreg2mhdebugmask[Op_RegI] = NULL; 
idealreg2mhdebugmask[Op_RegN] = NULL; @@ -105,6 +107,7 @@ idealreg2mhdebugmask[Op_VecD] = NULL; idealreg2mhdebugmask[Op_VecX] = NULL; idealreg2mhdebugmask[Op_VecY] = NULL; + idealreg2mhdebugmask[Op_VecZ] = NULL; debug_only(_mem_node = NULL;) // Ideal memory node consumed by mach node } @@ -413,7 +416,7 @@ void Matcher::init_first_stack_mask() { // Allocate storage for spill masks as masks for the appropriate load type. - RegMask *rms = (RegMask*)C->comp_arena()->Amalloc_D(sizeof(RegMask) * (3*6+4)); + RegMask *rms = (RegMask*)C->comp_arena()->Amalloc_D(sizeof(RegMask) * (3*6+5)); idealreg2spillmask [Op_RegN] = &rms[0]; idealreg2spillmask [Op_RegI] = &rms[1]; @@ -440,6 +443,7 @@ idealreg2spillmask [Op_VecD] = &rms[19]; idealreg2spillmask [Op_VecX] = &rms[20]; idealreg2spillmask [Op_VecY] = &rms[21]; + idealreg2spillmask [Op_VecZ] = &rms[22]; OptoReg::Name i; @@ -524,6 +528,18 @@ *idealreg2spillmask[Op_VecY] = *idealreg2regmask[Op_VecY]; idealreg2spillmask[Op_VecY]->OR(aligned_stack_mask); } + if (Matcher::vector_size_supported(T_FLOAT,16)) { + // For VecZ we need enough alignment and 64 bytes (16 slots) for spills. + OptoReg::Name in = OptoReg::add(_in_arg_limit, -1); + for (int k = 1; (in >= init_in) && (k < RegMask::SlotsPerVecZ); k++) { + aligned_stack_mask.Remove(in); + in = OptoReg::add(in, -1); + } + aligned_stack_mask.clear_to_sets(RegMask::SlotsPerVecZ); + assert(aligned_stack_mask.is_AllStack(), "should be infinite stack"); + *idealreg2spillmask[Op_VecZ] = *idealreg2regmask[Op_VecZ]; + idealreg2spillmask[Op_VecZ]->OR(aligned_stack_mask); + } if (UseFPUForSpilling) { // This mask logic assumes that the spill operations are // symmetric and that the registers involved are the same size. 
@@ -862,6 +878,10 @@ MachNode *spillVectY = match_tree(new LoadVectorNode(NULL,mem,fp,atp,TypeVect::VECTY)); idealreg2regmask[Op_VecY] = &spillVectY->out_RegMask(); } + if (Matcher::vector_size_supported(T_FLOAT,16)) { + MachNode *spillVectZ = match_tree(new LoadVectorNode(NULL,mem,fp,atp,TypeVect::VECTZ)); + idealreg2regmask[Op_VecZ] = &spillVectZ->out_RegMask(); + } } #ifdef ASSERT --- old/src/share/vm/opto/opcodes.cpp 2015-04-23 08:27:33.104343300 -0700 +++ new/src/share/vm/opto/opcodes.cpp 2015-04-23 08:27:32.916343300 -0700 @@ -42,6 +42,7 @@ "VecD", "VecX", "VecY", + "VecZ", "_last_machine_leaf", #include "classes.hpp" "_last_class_name", --- old/src/share/vm/opto/opcodes.hpp 2015-04-23 08:27:37.396343300 -0700 +++ new/src/share/vm/opto/opcodes.hpp 2015-04-23 08:27:37.206343300 -0700 @@ -40,6 +40,7 @@ macro(VecD) // Machine vectord register macro(VecX) // Machine vectorx register macro(VecY) // Machine vectory register + macro(VecZ) // Machine vectorz register macro(RegFlags) // Machine flags register _last_machine_leaf, // Split between regular opcodes and machine #include "classes.hpp" --- old/src/share/vm/opto/optoreg.hpp 2015-04-23 08:27:41.797343300 -0700 +++ new/src/share/vm/opto/optoreg.hpp 2015-04-23 08:27:41.608343300 -0700 @@ -103,6 +103,10 @@ return r - stack0(); } + static void invalidate(Name n) { + vm2opto[n] = Bad; + } + // convert a stack slot number into an OptoReg::Name static OptoReg::Name stack2reg( int idx) { return Name(stack0() + idx); --- old/src/share/vm/opto/output.cpp 2015-04-23 08:27:46.224343300 -0700 +++ new/src/share/vm/opto/output.cpp 2015-04-23 08:27:46.031343300 -0700 @@ -1880,8 +1880,8 @@ if (!do_scheduling()) return; - // Scheduling code works only with pairs (8 bytes) maximum. - if (max_vector_size() > 8) + // Scheduling code works only with pairs (16 bytes) maximum. 
+ if (max_vector_size() > 16) return; TracePhase tp("isched", &timers[_t_instrSched]); --- old/src/share/vm/opto/regmask.cpp 2015-04-23 08:27:50.431343300 -0700 +++ new/src/share/vm/opto/regmask.cpp 2015-04-23 08:27:50.232343300 -0700 @@ -114,11 +114,14 @@ //============================================================================= bool RegMask::is_vector(uint ireg) { - return (ireg == Op_VecS || ireg == Op_VecD || ireg == Op_VecX || ireg == Op_VecY); + return (ireg == Op_VecS || ireg == Op_VecD || + ireg == Op_VecX || ireg == Op_VecY || ireg == Op_VecZ ); } int RegMask::num_registers(uint ireg) { switch(ireg) { + case Op_VecZ: + return 16; case Op_VecY: return 8; case Op_VecX: @@ -233,7 +236,8 @@ return true; } -static int low_bits[3] = { 0x55555555, 0x11111111, 0x01010101 }; +// Indexed by size>>2 (sizes 2, 4, 8, 16 -> indices 0, 1, 2, 4); index 3 is never accessed and is only a placeholder. +static int low_bits[5] = { 0x55555555, 0x11111111, 0x01010101, 0x00000000, 0x00010001 }; //------------------------------find_first_set--------------------------------- // Find the lowest-numbered register set in the mask. Return the // HIGHEST register number in the set, or BAD if no sets. 
@@ -254,7 +258,7 @@ // Clear out partial bits; leave only aligned adjacent bit pairs void RegMask::clear_to_sets(const int size) { if (size == 1) return; - assert(2 <= size && size <= 8, "update low bits table"); + assert(2 <= size && size <= 16, "update low bits table"); assert(is_power_of_2(size), "sanity"); int low_bits_mask = low_bits[size>>2]; for (int i = 0; i < RM_SIZE; i++) { @@ -268,6 +272,9 @@ sets |= (sets>>2); // Smear 2 hi-bits into a set if (size > 4) { sets |= (sets>>4); // Smear 4 hi-bits into a set + if (size > 8) { + sets |= (sets>>8); // Smear 8 hi-bits into a set + } } } _A[i] = sets; @@ -279,7 +286,7 @@ // Smear out partial bits to aligned adjacent bit sets void RegMask::smear_to_sets(const int size) { if (size == 1) return; - assert(2 <= size && size <= 8, "update low bits table"); + assert(2 <= size && size <= 16, "update low bits table"); assert(is_power_of_2(size), "sanity"); int low_bits_mask = low_bits[size>>2]; for (int i = 0; i < RM_SIZE; i++) { @@ -294,6 +301,9 @@ sets |= (sets<<2); // Smear 2 lo-bits into a set if (size > 4) { sets |= (sets<<4); // Smear 4 lo-bits into a set + if (size > 8) { + sets |= (sets<<8); // Smear 8 lo-bits into a set + } } } _A[i] = sets; @@ -304,7 +314,7 @@ //------------------------------is_aligned_set-------------------------------- bool RegMask::is_aligned_sets(const int size) const { if (size == 1) return true; - assert(2 <= size && size <= 8, "update low bits table"); + assert(2 <= size && size <= 16, "update low bits table"); assert(is_power_of_2(size), "sanity"); int low_bits_mask = low_bits[size>>2]; // Assert that the register mask contains only bit sets. @@ -330,7 +340,7 @@ // Works also for size 1. 
int RegMask::is_bound_set(const int size) const { if( is_AllStack() ) return false; - assert(1 <= size && size <= 8, "update low bits table"); + assert(1 <= size && size <= 16, "update low bits table"); int bit = -1; // Set to hold the one bit allowed for (int i = 0; i < RM_SIZE; i++) { if (_A[i] ) { // Found some bits @@ -346,10 +356,12 @@ if (((-1) & ~(bit-1)) != _A[i]) return false; // Found many bits, so fail i++; // Skip iteration forward and check high part - // The lower 24 bits should be 0 since it is split case and size <= 8. - int set = bit>>24; + // The lower (32-size) bits should be 0 since it is split case. + int clear_bit_size = 32-size; + int shift_back_size = 32-clear_bit_size; + int set = bit>>clear_bit_size; set = set & -set; // Remove sign extension. - set = (((set << size) - 1) >> 8); + set = (((set << size) - 1) >> shift_back_size); if (i >= RM_SIZE || _A[i] != set) return false; // Require expected low bits in next word } @@ -375,7 +387,7 @@ //------------------------------Size------------------------------------------- // Compute size of register mask in bits uint RegMask::Size() const { - extern uint8_t bitsInByte[256]; + extern uint8_t bitsInByte[512]; uint sum = 0; for( int i = 0; i < RM_SIZE; i++ ) sum += --- old/src/share/vm/opto/regmask.hpp 2015-04-23 08:27:54.783343300 -0700 +++ new/src/share/vm/opto/regmask.hpp 2015-04-23 08:27:54.594343300 -0700 @@ -98,7 +98,8 @@ SlotsPerVecS = 1, SlotsPerVecD = 2, SlotsPerVecX = 4, - SlotsPerVecY = 8 }; + SlotsPerVecY = 8, + SlotsPerVecZ = 16 }; // A constructor only used by the ADLC output. All mask fields are filled // in directly. Calls to this look something like RM(1,2,3,4); @@ -299,13 +300,13 @@ static bool can_represent(OptoReg::Name reg) { // NOTE: -1 in computation reflects the usage of the last // bit of the regmask as an infinite stack flag and - // -7 is to keep mask aligned for largest value (VecY). + // -7 is to keep mask aligned for largest value (VecZ). 
return (int)reg < (int)(CHUNK_SIZE-1); } static bool can_represent_arg(OptoReg::Name reg) { - // NOTE: -SlotsPerVecY in computation reflects the need - // to keep mask aligned for largest value (VecY). - return (int)reg < (int)(CHUNK_SIZE-SlotsPerVecY); + // NOTE: -SlotsPerVecZ in computation reflects the need + // to keep mask aligned for largest value (VecZ). + return (int)reg < (int)(CHUNK_SIZE-SlotsPerVecZ); } }; --- old/src/share/vm/opto/type.cpp 2015-04-23 08:27:59.173343300 -0700 +++ new/src/share/vm/opto/type.cpp 2015-04-23 08:27:58.971343300 -0700 @@ -68,16 +68,19 @@ { Bad, T_ILLEGAL, "vectord:", false, Op_RegD, relocInfo::none }, // VectorD { Bad, T_ILLEGAL, "vectorx:", false, 0, relocInfo::none }, // VectorX { Bad, T_ILLEGAL, "vectory:", false, 0, relocInfo::none }, // VectorY + { Bad, T_ILLEGAL, "vectorz:", false, 0, relocInfo::none }, // VectorZ #elif defined(PPC64) { Bad, T_ILLEGAL, "vectors:", false, 0, relocInfo::none }, // VectorS { Bad, T_ILLEGAL, "vectord:", false, Op_RegL, relocInfo::none }, // VectorD { Bad, T_ILLEGAL, "vectorx:", false, 0, relocInfo::none }, // VectorX { Bad, T_ILLEGAL, "vectory:", false, 0, relocInfo::none }, // VectorY + { Bad, T_ILLEGAL, "vectorz:", false, 0, relocInfo::none }, // VectorZ #else // all other { Bad, T_ILLEGAL, "vectors:", false, Op_VecS, relocInfo::none }, // VectorS { Bad, T_ILLEGAL, "vectord:", false, Op_VecD, relocInfo::none }, // VectorD { Bad, T_ILLEGAL, "vectorx:", false, Op_VecX, relocInfo::none }, // VectorX { Bad, T_ILLEGAL, "vectory:", false, Op_VecY, relocInfo::none }, // VectorY + { Bad, T_ILLEGAL, "vectorz:", false, Op_VecZ, relocInfo::none }, // VectorZ #endif { Bad, T_ADDRESS, "anyptr:", false, Op_RegP, relocInfo::none }, // AnyPtr { Bad, T_ADDRESS, "rawptr:", false, Op_RegP, relocInfo::none }, // RawPtr @@ -503,10 +506,14 @@ if (Matcher::vector_size_supported(T_FLOAT,8)) { TypeVect::VECTY = TypeVect::make(T_FLOAT,8); } + if (Matcher::vector_size_supported(T_FLOAT,16)) { + TypeVect::VECTZ = 
TypeVect::make(T_FLOAT,16); + } mreg2type[Op_VecS] = TypeVect::VECTS; mreg2type[Op_VecD] = TypeVect::VECTD; mreg2type[Op_VecX] = TypeVect::VECTX; mreg2type[Op_VecY] = TypeVect::VECTY; + mreg2type[Op_VecZ] = TypeVect::VECTZ; // Restore working type arena. current->set_type_arena(save); @@ -798,6 +805,7 @@ Bad, // VectorD - handled in v-call Bad, // VectorX - handled in v-call Bad, // VectorY - handled in v-call + Bad, // VectorZ - handled in v-call Bad, // AnyPtr - handled in v-call Bad, // RawPtr - handled in v-call @@ -2051,6 +2059,7 @@ const TypeVect *TypeVect::VECTD = NULL; // 64-bit vectors const TypeVect *TypeVect::VECTX = NULL; // 128-bit vectors const TypeVect *TypeVect::VECTY = NULL; // 256-bit vectors +const TypeVect *TypeVect::VECTZ = NULL; // 512-bit vectors //------------------------------make------------------------------------------- const TypeVect* TypeVect::make(const Type *elem, uint length) { @@ -2070,6 +2079,8 @@ return (TypeVect*)(new TypeVectX(elem, length))->hashcons(); case Op_VecY: return (TypeVect*)(new TypeVectY(elem, length))->hashcons(); + case Op_VecZ: + return (TypeVect*)(new TypeVectZ(elem, length))->hashcons(); } ShouldNotReachHere(); return NULL; @@ -2093,7 +2104,8 @@ case VectorS: case VectorD: case VectorX: - case VectorY: { // Meeting 2 vectors? + case VectorY: + case VectorZ: { // Meeting 2 vectors? 
const TypeVect* v = t->is_vect(); assert( base() == v->base(), ""); assert(length() == v->length(), ""); @@ -2151,6 +2163,8 @@ st->print("vectorx["); break; case VectorY: st->print("vectory["); break; + case VectorZ: + st->print("vectorz["); break; default: ShouldNotReachHere(); } --- old/src/share/vm/opto/type.hpp 2015-04-23 08:28:03.761343300 -0700 +++ new/src/share/vm/opto/type.hpp 2015-04-23 08:28:03.570343300 -0700 @@ -57,6 +57,7 @@ class TypeVectD; class TypeVectX; class TypeVectY; +class TypeVectZ; class TypePtr; class TypeRawPtr; class TypeOopPtr; @@ -90,6 +91,7 @@ VectorD, // 64bit Vector types VectorX, // 128bit Vector types VectorY, // 256bit Vector types + VectorZ, // 512bit Vector types AnyPtr, // Any old raw, klass, inst, or array pointer RawPtr, // Raw (non-oop) pointers @@ -729,6 +731,7 @@ static const TypeVect *VECTD; static const TypeVect *VECTX; static const TypeVect *VECTY; + static const TypeVect *VECTZ; #ifndef PRODUCT virtual void dump2(Dict &d, uint, outputStream *st) const; // Specialized per-Type dumping @@ -755,6 +758,11 @@ TypeVectY(const Type* elem, uint length) : TypeVect(VectorY, elem, length) {} }; +class TypeVectZ : public TypeVect { + friend class TypeVect; + TypeVectZ(const Type* elem, uint length) : TypeVect(VectorZ, elem, length) {} +}; + //------------------------------TypePtr---------------------------------------- // Class of machine Pointer Types: raw data, instances or arrays. // If the _base enum is AnyPtr, then this refers to all of the above. @@ -1568,12 +1576,12 @@ } inline const TypeVect *Type::is_vect() const { - assert( _base >= VectorS && _base <= VectorY, "Not a Vector" ); + assert( _base >= VectorS && _base <= VectorZ, "Not a Vector" ); return (TypeVect*)this; } inline const TypeVect *Type::isa_vect() const { - return (_base >= VectorS && _base <= VectorY) ? (TypeVect*)this : NULL; + return (_base >= VectorS && _base <= VectorZ) ? 
(TypeVect*)this : NULL; } inline const TypePtr *Type::is_ptr() const { --- old/src/share/vm/opto/vectornode.cpp 2015-04-23 08:28:08.251343300 -0700 +++ new/src/share/vm/opto/vectornode.cpp 2015-04-23 08:28:08.061343300 -0700 @@ -77,6 +77,9 @@ case T_INT: return Op_MulVI; } ShouldNotReachHere(); + case Op_MulL: + assert(bt == T_LONG, "must be"); + return Op_MulVL; case Op_MulF: assert(bt == T_FLOAT, "must be"); return Op_MulVF; @@ -267,6 +270,7 @@ case Op_MulVS: return new MulVSNode(n1, n2, vt); case Op_MulVI: return new MulVINode(n1, n2, vt); + case Op_MulVL: return new MulVLNode(n1, n2, vt); case Op_MulVF: return new MulVFNode(n1, n2, vt); case Op_MulVD: return new MulVDNode(n1, n2, vt); @@ -463,6 +467,10 @@ assert(bt == T_INT, "must be"); vopc = Op_MulReductionVI; break; + case Op_MulL: + assert(bt == T_LONG, "must be"); + vopc = Op_MulReductionVL; + break; case Op_MulF: assert(bt == T_FLOAT, "must be"); vopc = Op_MulReductionVF; @@ -492,6 +500,7 @@ case Op_AddReductionVF: return new AddReductionVFNode(ctrl, n1, n2); case Op_AddReductionVD: return new AddReductionVDNode(ctrl, n1, n2); case Op_MulReductionVI: return new MulReductionVINode(ctrl, n1, n2); + case Op_MulReductionVL: return new MulReductionVLNode(ctrl, n1, n2); case Op_MulReductionVF: return new MulReductionVFNode(ctrl, n1, n2); case Op_MulReductionVD: return new MulReductionVDNode(ctrl, n1, n2); } --- old/src/share/vm/opto/vectornode.hpp 2015-04-23 08:28:12.352343300 -0700 +++ new/src/share/vm/opto/vectornode.hpp 2015-04-23 08:28:12.136343300 -0700 @@ -90,6 +90,30 @@ virtual int Opcode() const; }; +//------------------------------AddVLNode-------------------------------------- +// Vector add long +class AddVLNode : public VectorNode { +public: + AddVLNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1, in2, vt) {} + virtual int Opcode() const; +}; + +//------------------------------AddVFNode-------------------------------------- +// Vector add float +class AddVFNode : public VectorNode { 
+public: + AddVFNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1, in2, vt) {} + virtual int Opcode() const; +}; + +//------------------------------AddVDNode-------------------------------------- +// Vector add double +class AddVDNode : public VectorNode { +public: + AddVDNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1, in2, vt) {} + virtual int Opcode() const; +}; + //------------------------------ReductionNode------------------------------------ // Perform reduction of a vector class ReductionNode : public Node { @@ -121,22 +145,6 @@ virtual uint ideal_reg() const { return Op_RegL; } }; -//------------------------------AddVLNode-------------------------------------- -// Vector add long -class AddVLNode : public VectorNode { - public: - AddVLNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {} - virtual int Opcode() const; -}; - -//------------------------------AddVFNode-------------------------------------- -// Vector add float -class AddVFNode : public VectorNode { - public: - AddVFNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {} - virtual int Opcode() const; -}; - //------------------------------AddReductionVFNode-------------------------------------- // Vector add float as a reduction class AddReductionVFNode : public ReductionNode { @@ -147,14 +155,6 @@ virtual uint ideal_reg() const { return Op_RegF; } }; -//------------------------------AddVDNode-------------------------------------- -// Vector add double -class AddVDNode : public VectorNode { - public: - AddVDNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {} - virtual int Opcode() const; -}; - //------------------------------AddReductionVDNode-------------------------------------- // Vector add double as a reduction class AddReductionVDNode : public ReductionNode { @@ -229,6 +229,30 @@ virtual int Opcode() const; }; +//------------------------------MulVLNode-------------------------------------- +// 
Vector multiply long +class MulVLNode : public VectorNode { +public: + MulVLNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1, in2, vt) {} + virtual int Opcode() const; +}; + +//------------------------------MulVFNode-------------------------------------- +// Vector multiply float +class MulVFNode : public VectorNode { +public: + MulVFNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1, in2, vt) {} + virtual int Opcode() const; +}; + +//------------------------------MulVDNode-------------------------------------- +// Vector multiply double +class MulVDNode : public VectorNode { +public: + MulVDNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1, in2, vt) {} + virtual int Opcode() const; +}; + //------------------------------MulReductionVINode-------------------------------------- // Vector multiply int as a reduction class MulReductionVINode : public ReductionNode { @@ -239,12 +263,14 @@ virtual uint ideal_reg() const { return Op_RegI; } }; -//------------------------------MulVFNode-------------------------------------- -// Vector multiply float -class MulVFNode : public VectorNode { - public: - MulVFNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {} +//------------------------------MulReductionVLNode-------------------------------------- +// Vector multiply long as a reduction +class MulReductionVLNode : public ReductionNode { +public: + MulReductionVLNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {} virtual int Opcode() const; + virtual const Type* bottom_type() const { return TypeLong::LONG; } + virtual uint ideal_reg() const { return Op_RegL; } }; //------------------------------MulReductionVFNode-------------------------------------- @@ -257,14 +283,6 @@ virtual uint ideal_reg() const { return Op_RegF; } }; -//------------------------------MulVDNode-------------------------------------- -// Vector multiply double -class MulVDNode : public VectorNode { - public: - 
MulVDNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {} - virtual int Opcode() const; -}; - //------------------------------MulReductionVDNode-------------------------------------- // Vector multiply double as a reduction class MulReductionVDNode : public ReductionNode { --- old/src/share/vm/runtime/vmStructs.cpp 2015-04-23 08:28:15.906343300 -0700 +++ new/src/share/vm/runtime/vmStructs.cpp 2015-04-23 08:28:15.648343300 -0700 @@ -2004,6 +2004,8 @@ declare_c2_type(SubVFNode, VectorNode) \ declare_c2_type(SubVDNode, VectorNode) \ declare_c2_type(MulVSNode, VectorNode) \ + declare_c2_type(MulVLNode, VectorNode) \ + declare_c2_type(MulReductionVLNode, ReductionNode) \ declare_c2_type(MulVINode, VectorNode) \ declare_c2_type(MulReductionVINode, ReductionNode) \ declare_c2_type(MulVFNode, VectorNode) \ --- /dev/null 2015-04-23 08:28:20.000000000 -0700 +++ new/test/compiler/loopopts/superword/SumRed_Long.java 2015-04-23 08:28:19.731343300 -0700 @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 
+ * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +/** + * @test + * @bug 8076276 + * @summary Add C2 x86 Superword support for scalar sum reduction optimizations : long test + * + * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=4 -XX:CompileThresholdScaling=0.1 SumRed_Long + * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=4 -XX:CompileThresholdScaling=0.1 SumRed_Long + * + * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=8 -XX:CompileThresholdScaling=0.1 SumRed_Long + * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=8 -XX:CompileThresholdScaling=0.1 SumRed_Long + * + */ + +public class SumRed_Long +{ + public static void main(String[] args) throws Exception { + long[] a = new long[256*1024]; + long[] b = new long[256*1024]; + long[] c = new long[256*1024]; + long[] d = new long[256*1024]; + sumReductionInit(a,b,c); + long total = 0; + long valid = 262144000; + for(int j = 0; j < 2000; j++) { + total = sumReductionImplement(a,b,c,d,total); + } + total = (int)total; + if(total == valid) { + System.out.println("Success"); + } else { + System.out.println("Invalid sum of elements variable in total: " + total); + System.out.println("Expected value = " + valid); + throw new Exception("Failed"); + } + } + + public static void sumReductionInit( + long[] a, + long[] b, + long[] c) + { + for(int j = 0; j < 1; j++) + { + for(int i = 0; i < a.length; i++) + { + a[i] = i * 1 + j; + b[i] = i * 1 - j; + c[i] = i + j; + } + } + } + + public static long sumReductionImplement( + long[] a, + long[] b, + long[] c, + long[] d, + long total) + { + for(int i = 0; i < 
a.length; i++) + { + d[i]= (a[i] * b[i]) + (a[i] * c[i]) + (b[i] * c[i]); + total += d[i]; + } + return total; + } + +}