--- old/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp 2020-04-02 18:03:44.388854098 -0700 +++ new/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp 2020-04-02 18:03:44.220854098 -0700 @@ -33,6 +33,21 @@ #include "runtime/objectMonitor.hpp" #include "runtime/stubRoutines.hpp" +inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) { + switch (vlen_in_bytes) { + case 4: // fall-through + case 8: // fall-through + case 16: return Assembler::AVX_128bit; + case 32: return Assembler::AVX_256bit; + case 64: return Assembler::AVX_512bit; + + default: { + ShouldNotReachHere(); + return Assembler::AVX_NoVec; + } + } +} + void C2_MacroAssembler::setvectmask(Register dst, Register src) { guarantee(PostLoopMultiversioning, "must be"); Assembler::movl(dst, 1); @@ -855,6 +870,167 @@ } } +void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) { + assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity"); + + if (opcode == Op_MinV) { + if (elem_bt == T_BYTE) { + pminsb(dst, src); + } else if (elem_bt == T_SHORT) { + pminsw(dst, src); + } else if (elem_bt == T_INT) { + pminsd(dst, src); + } else { + assert(elem_bt == T_LONG, "required"); + assert(tmp == xmm0, "required"); + movdqu(xmm0, dst); + pcmpgtq(xmm0, src); + blendvpd(dst, src); // xmm0 as mask + } + } else { // opcode == Op_MaxV + if (elem_bt == T_BYTE) { + pmaxsb(dst, src); + } else if (elem_bt == T_SHORT) { + pmaxsw(dst, src); + } else if (elem_bt == T_INT) { + pmaxsd(dst, src); + } else { + assert(elem_bt == T_LONG, "required"); + assert(tmp == xmm0, "required"); + movdqu(xmm0, src); + pcmpgtq(xmm0, dst); + blendvpd(dst, src); // xmm0 as mask + } + } +} + +void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt, + XMMRegister dst, XMMRegister src1, XMMRegister src2, + int vlen_enc) { + assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity"); + + if (opcode == Op_MinV) { + if (elem_bt == T_BYTE) { + vpminsb(dst, src1, src2, 
vlen_enc); + } else if (elem_bt == T_SHORT) { + vpminsw(dst, src1, src2, vlen_enc); + } else if (elem_bt == T_INT) { + vpminsd(dst, src1, src2, vlen_enc); + } else { + assert(elem_bt == T_LONG, "required"); + if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) { + vpminsq(dst, src1, src2, vlen_enc); + } else { + vpcmpgtq(dst, src1, src2, vlen_enc); + vblendvpd(dst, src1, src2, dst, vlen_enc); + } + } + } else { // opcode == Op_MaxV + if (elem_bt == T_BYTE) { + vpmaxsb(dst, src1, src2, vlen_enc); + } else if (elem_bt == T_SHORT) { + vpmaxsw(dst, src1, src2, vlen_enc); + } else if (elem_bt == T_INT) { + vpmaxsd(dst, src1, src2, vlen_enc); + } else { + assert(elem_bt == T_LONG, "required"); + if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) { + vpmaxsq(dst, src1, src2, vlen_enc); + } else { + vpcmpgtq(dst, src1, src2, vlen_enc); + vblendvpd(dst, src2, src1, dst, vlen_enc); + } + } + } +} + +// Float/Double min max + +void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt, + XMMRegister dst, XMMRegister a, XMMRegister b, + XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, + int vlen_enc) { + assert(UseAVX > 0, "required"); + assert(opcode == Op_MinV || opcode == Op_MinReductionV || + opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity"); + assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity"); + + bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV); + bool is_double_word = is_double_word_type(elem_bt); + + if (!is_double_word && is_min) { + vblendvps(atmp, a, b, a, vlen_enc); + vblendvps(btmp, b, a, a, vlen_enc); + vminps(tmp, atmp, btmp, vlen_enc); + vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc); + vblendvps(dst, tmp, atmp, btmp, vlen_enc); + } else if (!is_double_word && !is_min) { + vblendvps(btmp, b, a, b, vlen_enc); + vblendvps(atmp, a, b, b, vlen_enc); + vmaxps(tmp, atmp, btmp, vlen_enc); + vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, 
vlen_enc); + vblendvps(dst, tmp, atmp, btmp, vlen_enc); + } else if (is_double_word && is_min) { + vblendvpd(atmp, a, b, a, vlen_enc); + vblendvpd(btmp, b, a, a, vlen_enc); + vminpd(tmp, atmp, btmp, vlen_enc); + vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc); + vblendvpd(dst, tmp, atmp, btmp, vlen_enc); + } else { + assert(is_double_word && !is_min, "sanity"); + vblendvpd(btmp, b, a, b, vlen_enc); + vblendvpd(atmp, a, b, b, vlen_enc); + vmaxpd(tmp, atmp, btmp, vlen_enc); + vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc); + vblendvpd(dst, tmp, atmp, btmp, vlen_enc); + } +} + +void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt, + XMMRegister dst, XMMRegister a, XMMRegister b, + KRegister ktmp, XMMRegister atmp, XMMRegister btmp, + int vlen_enc) { + assert(UseAVX > 2, "required"); + assert(opcode == Op_MinV || opcode == Op_MinReductionV || + opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity"); + assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity"); + + bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV); + bool is_double_word = is_double_word_type(elem_bt); + bool merge = true; + + if (!is_double_word && is_min) { + evpmovd2m(ktmp, a, vlen_enc); + evblendmps(atmp, ktmp, a, b, merge, vlen_enc); + evblendmps(btmp, ktmp, b, a, merge, vlen_enc); + vminps(dst, atmp, btmp, vlen_enc); + evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); + evmovdqul(dst, ktmp, atmp, merge, vlen_enc); + } else if (!is_double_word && !is_min) { + evpmovd2m(ktmp, b, vlen_enc); + evblendmps(atmp, ktmp, a, b, merge, vlen_enc); + evblendmps(btmp, ktmp, b, a, merge, vlen_enc); + vmaxps(dst, atmp, btmp, vlen_enc); + evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); + evmovdqul(dst, ktmp, atmp, merge, vlen_enc); + } else if (is_double_word && is_min) { + evpmovq2m(ktmp, a, vlen_enc); + evblendmpd(atmp, ktmp, a, b, merge, vlen_enc); + evblendmpd(btmp, ktmp, b, a, merge, vlen_enc); + vminpd(dst, atmp, btmp, vlen_enc); + 
evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); + evmovdquq(dst, ktmp, atmp, merge, vlen_enc); + } else { + assert(is_double_word && !is_min, "sanity"); + evpmovq2m(ktmp, b, vlen_enc); + evblendmpd(atmp, ktmp, a, b, merge, vlen_enc); + evblendmpd(btmp, ktmp, b, a, merge, vlen_enc); + vmaxpd(dst, atmp, btmp, vlen_enc); + evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); + evmovdquq(dst, ktmp, atmp, merge, vlen_enc); + } +} + void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) { if (sign) { pmovsxbw(dst, src); @@ -871,111 +1047,418 @@ } } -void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src) { - if (opcode == Op_RShiftVI) { - psrad(dst, src); - } else if (opcode == Op_LShiftVI) { - pslld(dst, src); +void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { + if (sign) { + vpmovsxbd(dst, src, vector_len); } else { - assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); - psrld(dst, src); + vpmovzxbd(dst, src, vector_len); } } -void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { - if (opcode == Op_RShiftVI) { - vpsrad(dst, nds, src, vector_len); - } else if (opcode == Op_LShiftVI) { - vpslld(dst, nds, src, vector_len); +void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { + if (sign) { + vpmovsxwd(dst, src, vector_len); } else { - assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); - vpsrld(dst, nds, src, vector_len); + vpmovzxwd(dst, src, vector_len); } } -void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src) { - if ((opcode == Op_RShiftVS) || (opcode == Op_RShiftVB)) { - psraw(dst, src); - } else if ((opcode == Op_LShiftVS) || (opcode == Op_LShiftVB)) { - psllw(dst, src); - } else { - assert(((opcode == Op_URShiftVS) || (opcode == Op_URShiftVB)),"opcode should be one of Op_URShiftVS or Op_URShiftVB"); 
- psrlw(dst, src); +void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) { + switch (opcode) { + case Op_RShiftVI: psrad(dst, shift); break; + case Op_LShiftVI: pslld(dst, shift); break; + case Op_URShiftVI: psrld(dst, shift); break; + + default: assert(false, "%s", NodeClassNames[opcode]); } } -void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { - if ((opcode == Op_RShiftVS) || (opcode == Op_RShiftVB)) { - vpsraw(dst, nds, src, vector_len); - } else if ((opcode == Op_LShiftVS) || (opcode == Op_LShiftVB)) { - vpsllw(dst, nds, src, vector_len); - } else { - assert(((opcode == Op_URShiftVS) || (opcode == Op_URShiftVB)),"opcode should be one of Op_URShiftVS or Op_URShiftVB"); - vpsrlw(dst, nds, src, vector_len); +void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { + switch (opcode) { + case Op_RShiftVI: vpsrad(dst, src, shift, vlen_enc); break; + case Op_LShiftVI: vpslld(dst, src, shift, vlen_enc); break; + case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break; + + default: assert(false, "%s", NodeClassNames[opcode]); } } -void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src) { - if (opcode == Op_RShiftVL) { - psrlq(dst, src); // using srl to implement sra on pre-avs512 systems - } else if (opcode == Op_LShiftVL) { - psllq(dst, src); +void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) { + switch (opcode) { + case Op_RShiftVB: // fall-through + case Op_RShiftVS: psraw(dst, shift); break; + + case Op_LShiftVB: // fall-through + case Op_LShiftVS: psllw(dst, shift); break; + + case Op_URShiftVS: // fall-through + case Op_URShiftVB: psrlw(dst, shift); break; + + default: assert(false, "%s", NodeClassNames[opcode]); + } +} + +void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { + switch (opcode) { + case 
Op_RShiftVB: // fall-through + case Op_RShiftVS: vpsraw(dst, src, shift, vlen_enc); break; + + case Op_LShiftVB: // fall-through + case Op_LShiftVS: vpsllw(dst, src, shift, vlen_enc); break; + + case Op_URShiftVS: // fall-through + case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break; + + default: assert(false, "%s", NodeClassNames[opcode]); + } +} + +void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) { + switch (opcode) { + case Op_RShiftVL: psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems + case Op_LShiftVL: psllq(dst, shift); break; + case Op_URShiftVL: psrlq(dst, shift); break; + + default: assert(false, "%s", NodeClassNames[opcode]); + } +} + +void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { + switch (opcode) { + case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break; + case Op_LShiftVL: vpsllq(dst, src, shift, vlen_enc); break; + case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break; + + default: assert(false, "%s", NodeClassNames[opcode]); + } +} + +void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { + switch (opcode) { + case Op_VRShiftV: vpsravd(dst, src, shift, vlen_enc); break; + case Op_VLShiftV: vpsllvd(dst, src, shift, vlen_enc); break; + case Op_VURShiftV: vpsrlvd(dst, src, shift, vlen_enc); break; + + default: assert(false, "%s", NodeClassNames[opcode]); + } +} + +void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { + switch (opcode) { + case Op_VRShiftV: evpsravw(dst, src, shift, vlen_enc); break; + case Op_VLShiftV: evpsllvw(dst, src, shift, vlen_enc); break; + case Op_VURShiftV: evpsrlvw(dst, src, shift, vlen_enc); break; + + default: assert(false, "%s", NodeClassNames[opcode]); + } +} + +void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, 
int vlen_enc, XMMRegister tmp) { + assert(UseAVX >= 2, "required"); + switch (opcode) { + case Op_VRShiftV: { + if (UseAVX > 2) { + assert(tmp == xnoreg, "not used"); + if (!VM_Version::supports_avx512vl()) { + vlen_enc = Assembler::AVX_512bit; + } + evpsravq(dst, src, shift, vlen_enc); + } else { + vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask())); + vpsrlvq(dst, src, shift, vlen_enc); + vpsrlvq(tmp, tmp, shift, vlen_enc); + vpxor(dst, dst, tmp, vlen_enc); + vpsubq(dst, dst, tmp, vlen_enc); + } + break; + } + case Op_VLShiftV: { + assert(tmp == xnoreg, "not used"); + vpsllvq(dst, src, shift, vlen_enc); + break; + } + case Op_VURShiftV: { + assert(tmp == xnoreg, "not used"); + vpsrlvq(dst, src, shift, vlen_enc); + break; + } + default: assert(false, "%s", NodeClassNames[opcode]); + } +} + +// Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst +void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) { + bool sign = (opcode == Op_VURShiftV) ? false : true; + assert(vector_len == 0, "required"); + vextendbd(sign, dst, src, 1); + vpmovzxbd(vtmp, shift, 1); + varshiftd(opcode, dst, dst, vtmp, 1); + vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, scratch); + vextracti128_high(vtmp, dst); + vpackusdw(dst, dst, vtmp, 0); +} + +// Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst +void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) { + bool sign = (opcode == Op_VURShiftV) ? 
false : true; + int ext_vector_len = vector_len + 1; + vextendbw(sign, dst, src, ext_vector_len); + vpmovzxbw(vtmp, shift, ext_vector_len); + varshiftw(opcode, dst, dst, vtmp, ext_vector_len); + vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, scratch); + if (vector_len == 0) { + vextracti128_high(vtmp, dst); + vpackuswb(dst, dst, vtmp, vector_len); } else { - assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); - psrlq(dst, src); + vextracti64x4_high(vtmp, dst); + vpackuswb(dst, dst, vtmp, vector_len); + vpermq(dst, dst, 0xD8, vector_len); + } +} + +void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) { + switch(typ) { + case T_BYTE: + pinsrb(dst, val, idx); + break; + case T_SHORT: + pinsrw(dst, val, idx); + break; + case T_INT: + pinsrd(dst, val, idx); + break; + case T_LONG: + pinsrq(dst, val, idx); + break; + default: + assert(false,"Should not reach here."); + break; + } +} + +void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) { + switch(typ) { + case T_BYTE: + vpinsrb(dst, src, val, idx); + break; + case T_SHORT: + vpinsrw(dst, src, val, idx); + break; + case T_INT: + vpinsrd(dst, src, val, idx); + break; + case T_LONG: + vpinsrq(dst, src, val, idx); + break; + default: + assert(false,"Should not reach here."); + break; } } -void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { - if (opcode == Op_RShiftVL) { - evpsraq(dst, nds, src, vector_len); - } else if (opcode == Op_LShiftVL) { - vpsllq(dst, nds, src, vector_len); +void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) { + switch(typ) { + case T_INT: + vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len); + break; + case T_FLOAT: + vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len); + 
break; + case T_LONG: + vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len); + break; + case T_DOUBLE: + vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len); + break; + default: + assert(false,"Should not reach here."); + break; + } +} + +void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) { + switch(typ) { + case T_INT: + evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len); + break; + case T_FLOAT: + evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len); + break; + case T_LONG: + evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len); + break; + case T_DOUBLE: + evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len); + break; + default: + assert(false,"Should not reach here."); + break; + } +} + +void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) { + switch(typ) { + case T_INT: + evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len); + break; + case T_FLOAT: + evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len); + break; + case T_LONG: + evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len); + break; + case T_DOUBLE: + evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len); + break; + default: + assert(false,"Should not reach here."); + break; + } +} + +void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt) { + if (vlen_in_bytes <= 16) { + pxor (dst, dst); + psubb(dst, src); + switch (elem_bt) { + case T_BYTE: /* nothing to do */ break; + case T_SHORT: pmovsxbw(dst, dst); break; + case T_INT: pmovsxbd(dst, dst); break; + case T_FLOAT: pmovsxbd(dst, dst); break; + case T_LONG: pmovsxbq(dst, dst); break; + case T_DOUBLE: pmovsxbq(dst, dst); break; + + default: 
assert(false, "%s", type2name(elem_bt)); + } } else { - assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); - vpsrlq(dst, nds, src, vector_len); + int vlen_enc = vector_length_encoding(vlen_in_bytes); + + vpxor (dst, dst, dst, vlen_enc); + vpsubb(dst, dst, src, vlen_enc); + switch (elem_bt) { + case T_BYTE: /* nothing to do */ break; + case T_SHORT: vpmovsxbw(dst, dst, vlen_enc); break; + case T_INT: vpmovsxbd(dst, dst, vlen_enc); break; + case T_FLOAT: vpmovsxbd(dst, dst, vlen_enc); break; + case T_LONG: vpmovsxbq(dst, dst, vlen_enc); break; + case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break; + + default: assert(false, "%s", type2name(elem_bt)); + } } } -// Reductions for vectors of ints, longs, floats, and doubles. +void C2_MacroAssembler::load_iota_indices(XMMRegister dst, Register scratch, int vlen_in_bytes) { + ExternalAddress addr(StubRoutines::x86::vector_iota_indices()); + if (vlen_in_bytes <= 16) { + movdqu(dst, addr, scratch); + } else if (vlen_in_bytes == 32) { + vmovdqu(dst, addr, scratch); + } else { + assert(vlen_in_bytes == 64, "%d", vlen_in_bytes); + evmovdqub(dst, k0, addr, false /*merge*/, Assembler::AVX_512bit, scratch); + } +} +// Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles. 
-void C2_MacroAssembler::reduce_operation_128(int opcode, XMMRegister dst, XMMRegister src) { +void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { int vector_len = Assembler::AVX_128bit; switch (opcode) { case Op_AndReductionV: pand(dst, src); break; case Op_OrReductionV: por (dst, src); break; case Op_XorReductionV: pxor(dst, src); break; - + case Op_MinReductionV: + switch (typ) { + case T_BYTE: pminsb(dst, src); break; + case T_SHORT: pminsw(dst, src); break; + case T_INT: pminsd(dst, src); break; + case T_LONG: assert(UseAVX > 2, "required"); + vpminsq(dst, dst, src, Assembler::AVX_128bit); break; + default: assert(false, "wrong type"); + } + break; + case Op_MaxReductionV: + switch (typ) { + case T_BYTE: pmaxsb(dst, src); break; + case T_SHORT: pmaxsw(dst, src); break; + case T_INT: pmaxsd(dst, src); break; + case T_LONG: assert(UseAVX > 2, "required"); + vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break; + default: assert(false, "wrong type"); + } + break; case Op_AddReductionVF: addss(dst, src); break; case Op_AddReductionVD: addsd(dst, src); break; - case Op_AddReductionVI: paddd(dst, src); break; + case Op_AddReductionVI: + switch (typ) { + case T_BYTE: paddb(dst, src); break; + case T_SHORT: paddw(dst, src); break; + case T_INT: paddd(dst, src); break; + default: assert(false, "wrong type"); + } + break; case Op_AddReductionVL: paddq(dst, src); break; - case Op_MulReductionVF: mulss(dst, src); break; case Op_MulReductionVD: mulsd(dst, src); break; - case Op_MulReductionVI: pmulld(dst, src); break; - case Op_MulReductionVL: vpmullq(dst, dst, src, vector_len); break; - - default: assert(false, "wrong opcode"); + case Op_MulReductionVI: + switch (typ) { + case T_SHORT: pmullw(dst, src); break; + case T_INT: pmulld(dst, src); break; + default: assert(false, "wrong type"); + } + break; + case Op_MulReductionVL: assert(UseAVX > 2, "required"); + vpmullq(dst, dst, src, vector_len); break; + default: 
assert(false, "wrong opcode"); } } -void C2_MacroAssembler::reduce_operation_256(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { +void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { int vector_len = Assembler::AVX_256bit; switch (opcode) { case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break; case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break; case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break; - - case Op_AddReductionVI: vpaddd(dst, src1, src2, vector_len); break; + case Op_MinReductionV: + switch (typ) { + case T_BYTE: vpminsb(dst, src1, src2, vector_len); break; + case T_SHORT: vpminsw(dst, src1, src2, vector_len); break; + case T_INT: vpminsd(dst, src1, src2, vector_len); break; + case T_LONG: assert(UseAVX > 2, "required"); + vpminsq(dst, src1, src2, vector_len); break; + default: assert(false, "wrong type"); + } + break; + case Op_MaxReductionV: + switch (typ) { + case T_BYTE: vpmaxsb(dst, src1, src2, vector_len); break; + case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break; + case T_INT: vpmaxsd(dst, src1, src2, vector_len); break; + case T_LONG: assert(UseAVX > 2, "required"); + vpmaxsq(dst, src1, src2, vector_len); break; + default: assert(false, "wrong type"); + } + break; + case Op_AddReductionVI: + switch (typ) { + case T_BYTE: vpaddb(dst, src1, src2, vector_len); break; + case T_SHORT: vpaddw(dst, src1, src2, vector_len); break; + case T_INT: vpaddd(dst, src1, src2, vector_len); break; + default: assert(false, "wrong type"); + } + break; case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break; - - case Op_MulReductionVI: vpmulld(dst, src1, src2, vector_len); break; + case Op_MulReductionVI: + switch (typ) { + case T_SHORT: vpmullw(dst, src1, src2, vector_len); break; + case T_INT: vpmulld(dst, src1, src2, vector_len); break; + default: assert(false, "wrong type"); + } + break; case Op_MulReductionVL: 
vpmullq(dst, src1, src2, vector_len); break; - - default: assert(false, "wrong opcode"); + default: assert(false, "wrong opcode"); } } @@ -997,9 +1480,48 @@ } } +void C2_MacroAssembler::reduceB(int opcode, int vlen, + Register dst, Register src1, XMMRegister src2, + XMMRegister vtmp1, XMMRegister vtmp2) { + switch (vlen) { + case 8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; + case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; + case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; + case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; + + default: assert(false, "wrong vector length"); + } +} + +void C2_MacroAssembler::mulreduceB(int opcode, int vlen, + Register dst, Register src1, XMMRegister src2, + XMMRegister vtmp1, XMMRegister vtmp2) { + switch (vlen) { + case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; + case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; + case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; + case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; + + default: assert(false, "wrong vector length"); + } +} + +void C2_MacroAssembler::reduceS(int opcode, int vlen, + Register dst, Register src1, XMMRegister src2, + XMMRegister vtmp1, XMMRegister vtmp2) { + switch (vlen) { + case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break; + case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break; + case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break; + case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break; + + default: assert(false, "wrong vector length"); + } +} + void C2_MacroAssembler::reduceI(int opcode, int vlen, - Register dst, Register src1, XMMRegister src2, - XMMRegister vtmp1, XMMRegister vtmp2) { + Register dst, Register src1, XMMRegister src2, + XMMRegister vtmp1, XMMRegister vtmp2) { switch (vlen) { case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break; case 4: reduce4I 
(opcode, dst, src1, src2, vtmp1, vtmp2); break; @@ -1012,8 +1534,8 @@ #ifdef _LP64 void C2_MacroAssembler::reduceL(int opcode, int vlen, - Register dst, Register src1, XMMRegister src2, - XMMRegister vtmp1, XMMRegister vtmp2) { + Register dst, Register src1, XMMRegister src2, + XMMRegister vtmp1, XMMRegister vtmp2) { switch (vlen) { case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break; case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break; @@ -1068,10 +1590,10 @@ phaddd(vtmp1, vtmp1); } else { pshufd(vtmp1, src2, 0x1); - reduce_operation_128(opcode, vtmp1, src2); + reduce_operation_128(T_INT, opcode, vtmp1, src2); } movdl(vtmp2, src1); - reduce_operation_128(opcode, vtmp1, vtmp2); + reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); movdl(dst, vtmp1); } @@ -1084,7 +1606,7 @@ reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); } else { pshufd(vtmp2, src2, 0xE); - reduce_operation_128(opcode, vtmp2, src2); + reduce_operation_128(T_INT, opcode, vtmp2, src2); reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); } } @@ -1097,51 +1619,176 @@ reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); } else { vextracti128_high(vtmp1, src2); - reduce_operation_128(opcode, vtmp1, src2); + reduce_operation_128(T_INT, opcode, vtmp1, src2); reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); } } void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { vextracti64x4_high(vtmp2, src2); - reduce_operation_256(opcode, vtmp2, vtmp2, src2); + reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2); reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); } +void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { + pshufd(vtmp2, src2, 0x1); + reduce_operation_128(T_BYTE, opcode, vtmp2, src2); + movdqu(vtmp1, vtmp2); + psrldq(vtmp1, 2); + reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); + movdqu(vtmp2, vtmp1); + psrldq(vtmp2, 1); + 
reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); + movdl(vtmp2, src1); + pmovsxbd(vtmp1, vtmp1); + reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); + pextrb(dst, vtmp1, 0x0); + movsbl(dst, dst); +} + +void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { + pshufd(vtmp1, src2, 0xE); + reduce_operation_128(T_BYTE, opcode, vtmp1, src2); + reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); +} + +void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { + vextracti128_high(vtmp2, src2); + reduce_operation_128(T_BYTE, opcode, vtmp2, src2); + reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2); +} + +void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { + vextracti64x4_high(vtmp1, src2); + reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2); + reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); +} + +void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { + pmovsxbw(vtmp2, src2); + reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); +} + +void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { + if (UseAVX > 1) { + int vector_len = Assembler::AVX_256bit; + vpmovsxbw(vtmp1, src2, vector_len); + reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); + } else { + pmovsxbw(vtmp2, src2); + reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); + pshufd(vtmp2, src2, 0x1); + pmovsxbw(vtmp2, src2); + reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2); + } +} + +void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { + if (UseAVX > 2 && VM_Version::supports_avx512bw()) { + int vector_len = Assembler::AVX_512bit; + 
vpmovsxbw(vtmp1, src2, vector_len); + reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); + } else { + assert(UseAVX >= 2,"Should not reach here."); + mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); + vextracti128_high(vtmp2, src2); + mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); + } +} + /* mulreduce64B: processes src2 in two steps. First mulreduce32B runs on src2 with src1 as the scalar input; then vextracti64x4_high places a part of src2 in vtmp2 and mulreduce32B runs again with the intermediate dst as the scalar input. vtmp1 and vtmp2 are scratch registers. */ +void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { + mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); + vextracti64x4_high(vtmp2, src2); + mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); +} + /* reduce4S: reduces the low four 16-bit elements of src2 together with the scalar in src1 and leaves the result in dst. For Op_AddReductionVI, src2 is copied to vtmp1 (unless they are the same register) and phaddw is applied twice; for any other opcode the elements are folded with pshufd and psrldq plus reduce_operation_128(T_SHORT, ...). The partial result is then widened with pmovsxwd, combined as T_INT with src1 (moved into vtmp2), and word 0 is extracted into dst and sign-extended with movswl. */ +void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { + if (opcode == Op_AddReductionVI) { + if (vtmp1 != src2) { + movdqu(vtmp1, src2); + } + phaddw(vtmp1, vtmp1); + phaddw(vtmp1, vtmp1); + } else { + pshufd(vtmp2, src2, 0x1); + reduce_operation_128(T_SHORT, opcode, vtmp2, src2); + movdqu(vtmp1, vtmp2); + psrldq(vtmp1, 2); + reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2); + } + movdl(vtmp2, src1); + pmovsxwd(vtmp1, vtmp1); + reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); + pextrw(dst, vtmp1, 0x0); + movswl(dst, dst); +} + /* reduce8S: folds the eight 16-bit elements of src2 into vtmp1 (phaddw for Op_AddReductionVI, otherwise pshufd with 0xE followed by reduce_operation_128(T_SHORT, ...)) and finishes with reduce4S, passing vtmp1 as the vector input. */ +void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { + if (opcode == Op_AddReductionVI) { + if (vtmp1 != src2) { + movdqu(vtmp1, src2); + } + phaddw(vtmp1, src2); + } else { + pshufd(vtmp1, src2, 0xE); + reduce_operation_128(T_SHORT, opcode, vtmp1, src2); + } + reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); +} + /* reduce16S: for Op_AddReductionVI emits vphaddw and vpermq(0xD8) with AVX_256bit; for any other opcode combines the register produced by vextracti128_high with src2 through reduce_operation_128(T_SHORT, ...). In both cases the partial result in vtmp2 is handed to reduce8S. */ +void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { + if (opcode == Op_AddReductionVI) { + int vector_len = Assembler::AVX_256bit; + vphaddw(vtmp2, src2, src2, vector_len); + vpermq(vtmp2, vtmp2, 0xD8, vector_len); + } else { + vextracti128_high(vtmp2, src2); + reduce_operation_128(T_SHORT, opcode, vtmp2, src2); + 
} + reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); +} + /* reduce32S: vextracti64x4_high writes into vtmp1, reduce_operation_256(T_SHORT, ...) combines vtmp1 with src2, and the result is passed to reduce16S. NOTE(review): the local vector_len is declared but not referenced in this function - confirm whether it is needed. */ +void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { + int vector_len = Assembler::AVX_256bit; + vextracti64x4_high(vtmp1, src2); + reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2); + reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); +} + #ifdef _LP64 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { pshufd(vtmp2, src2, 0xE); - reduce_operation_128(opcode, vtmp2, src2); + reduce_operation_128(T_LONG, opcode, vtmp2, src2); /* the element type T_LONG is passed explicitly ahead of opcode */ movdq(vtmp1, src1); - reduce_operation_128(opcode, vtmp1, vtmp2); + reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2); movdq(dst, vtmp1); } void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { vextracti128_high(vtmp1, src2); - reduce_operation_128(opcode, vtmp1, src2); + reduce_operation_128(T_LONG, opcode, vtmp1, src2); reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2); } void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { vextracti64x4_high(vtmp2, src2); - reduce_operation_256(opcode, vtmp2, vtmp2, src2); + reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2); reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2); } #endif // _LP64 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { - reduce_operation_128(opcode, dst, src); + reduce_operation_128(T_FLOAT, opcode, dst, src); pshufd(vtmp, src, 0x1); - reduce_operation_128(opcode, dst, vtmp); + reduce_operation_128(T_FLOAT, opcode, dst, vtmp); } void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { reduce2F(opcode, dst, src, vtmp); pshufd(vtmp, src, 0x2); - reduce_operation_128(opcode, dst, vtmp); + 
reduce_operation_128(T_FLOAT, opcode, dst, vtmp); pshufd(vtmp, src, 0x3); - reduce_operation_128(opcode, dst, vtmp); + reduce_operation_128(T_FLOAT, opcode, dst, vtmp); } void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { @@ -1157,9 +1804,9 @@ } void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { - reduce_operation_128(opcode, dst, src); + reduce_operation_128(T_DOUBLE, opcode, dst, src); /* the element type T_DOUBLE is passed explicitly ahead of opcode */ pshufd(vtmp, src, 0xE); - reduce_operation_128(opcode, dst, vtmp); + reduce_operation_128(T_DOUBLE, opcode, dst, vtmp); } void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { @@ -1174,6 +1821,207 @@ reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2); } +/* reduceFloatMinMax: min/max reduction over the float elements of src. The loop runs i from log2(vlen) - 1 down to 0; each step builds wtmp from wsrc (vextracti64x4_high when i == 3, vextracti128_high when i == 2, vpermilps with permconst[i] otherwise) and combines wtmp and wsrc with vminmax_fp. Intermediate results are written to xmm_0; wtmp is xmm_1, or xmm_0 when xmm_1 is xnoreg. When is_dst_valid is false the last step (i == 0) writes dst directly; when it is true a final vminmax_fp combines the reduced value with the existing dst. vlen_enc is AVX_256bit only for the first step when vlen == 16 and AVX_128bit for every later step. */ void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid, + XMMRegister dst, XMMRegister src, + XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, + XMMRegister xmm_0, XMMRegister xmm_1) { + int permconst[] = {1, 14}; + XMMRegister wsrc = src; + XMMRegister wdst = xmm_0; + XMMRegister wtmp = (xmm_1 == xnoreg) ? 
xmm_0: xmm_1; + + int vlen_enc = Assembler::AVX_128bit; + if (vlen == 16) { + vlen_enc = Assembler::AVX_256bit; + } + + for (int i = log2(vlen) - 1; i >=0; i--) { + if (i == 0 && !is_dst_valid) { + wdst = dst; + } + if (i == 3) { + vextracti64x4_high(wtmp, wsrc); + } else if (i == 2) { + vextracti128_high(wtmp, wsrc); + } else { // i = [0,1] + vpermilps(wtmp, wsrc, permconst[i], vlen_enc); + } + vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); + wsrc = wdst; + vlen_enc = Assembler::AVX_128bit; + } + if (is_dst_valid) { + vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); + } +} + /* reduceDoubleMinMax: double counterpart of reduceFloatMinMax. The loop runs i from log2(vlen) - 1 down to 0; i == 2 uses vextracti64x4_high, i == 1 uses vextracti128_high, and i == 0 uses vpermilpd with immediate 1, each followed by vminmax_fp(T_DOUBLE). vlen_enc starts as AVX_256bit when vlen == 8 and is AVX_128bit after the first step. When is_dst_valid is true the reduced value is finally combined with the existing dst; otherwise the i == 0 step writes dst directly. */ +void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src, + XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, + XMMRegister xmm_0, XMMRegister xmm_1) { + XMMRegister wsrc = src; + XMMRegister wdst = xmm_0; + XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; + int vlen_enc = Assembler::AVX_128bit; + if (vlen == 8) { + vlen_enc = Assembler::AVX_256bit; + } + for (int i = log2(vlen) - 1; i >=0; i--) { + if (i == 0 && !is_dst_valid) { + wdst = dst; + } + if (i == 1) { + vextracti128_high(wtmp, wsrc); + } else if (i == 2) { + vextracti64x4_high(wtmp, wsrc); + } else { + assert(i == 0, "%d", i); + vpermilpd(wtmp, wsrc, 1, vlen_enc); + } + vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); + wsrc = wdst; + vlen_enc = Assembler::AVX_128bit; + } + if (is_dst_valid) { + vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); + } +} + /* extract: emits pextrb, pextrw, pextrd or pextrq for T_BYTE, T_SHORT, T_INT or T_LONG respectively, forwarding dst, src and idx; any other type trips assert(false). */ +void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) { + switch (bt) { + case T_BYTE: pextrb(dst, src, idx); break; + case T_SHORT: pextrw(dst, src, idx); break; + case T_INT: pextrd(dst, src, idx); break; + case T_LONG: pextrq(dst, src, idx); break; + + default: + assert(false,"Should not reach here."); + break; + } +} + /* get_lane: returns the register holding the 16-byte lane that contains element elemindex. Lane 0 returns src unchanged; lane 1 emits vextractf128 into dst (asserts UseAVX > 0); lanes 2 and above emit vextractf32x4 into dst with lane & 3 (asserts UseAVX > 2). NOTE(review): eindex is computed but not referenced in this function. */ +XMMRegister 
C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) { + int esize = type2aelembytes(typ); + int elem_per_lane = 16/esize; + int lane = elemindex / elem_per_lane; + int eindex = elemindex % elem_per_lane; + + if (lane >= 2) { + assert(UseAVX > 2, "required"); + vextractf32x4(dst, src, lane & 3); + return dst; + } else if (lane > 0) { + assert(UseAVX > 0, "required"); + vextractf128(dst, src, lane); + return dst; + } else { + return src; + } +} + /* get_elem (general purpose register destination): integral types only, as asserted. Reads element elemindex % (16 / esize) of src into dst. Index 0 uses movq for T_LONG, otherwise movdl followed by movsbl for T_BYTE or movswl for T_SHORT; any other index goes through extract(). */ +void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) { + int esize = type2aelembytes(typ); + int elem_per_lane = 16/esize; + int eindex = elemindex % elem_per_lane; + assert(is_integral_type(typ),"required"); + + if (eindex == 0) { + if (typ == T_LONG) { + movq(dst, src); + } else { + movdl(dst, src); + if (typ == T_BYTE) + movsbl(dst, dst); + else if (typ == T_SHORT) + movswl(dst, dst); + } + } else { + extract(typ, dst, src, eindex); + } +} + /* get_elem (XMM register destination): T_FLOAT or T_DOUBLE only, as asserted. Index 0 is copied with movq. Other float indices use pshufps (UseAVX == 0, after copying src to dst) or vpshufps with eindex as the immediate; other double indices shift right by eindex * esize bytes with psrldq or vpsrldq and then emit movq(dst, dst). For T_FLOAT, dst is finally ANDed with the vector_32_bit_mask stub constant: the UseAVX == 0 path requires both vtmp and tmp, the AVX path requires tmp. */ +void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, Register tmp, XMMRegister vtmp) { + int esize = type2aelembytes(typ); + int elem_per_lane = 16/esize; + int eindex = elemindex % elem_per_lane; + assert((typ == T_FLOAT || typ == T_DOUBLE),"required"); + + if (eindex == 0) { + movq(dst, src); + } else { + if (typ == T_FLOAT) { + if (UseAVX == 0) { + movdqu(dst, src); + pshufps(dst, dst, eindex); + } else { + vpshufps(dst, src, src, eindex, Assembler::AVX_128bit); + } + } else { + if (UseAVX == 0) { + movdqu(dst, src); + psrldq(dst, eindex*esize); + } else { + vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit); + } + movq(dst, dst); + } + } + // Zero upper bits + if (typ == T_FLOAT) { + if (UseAVX == 0) { + assert((vtmp != xnoreg) && (tmp != noreg), "required."); + movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), tmp); + pand(dst, vtmp); + } else { + assert((tmp != noreg), "required."); + vpand(dst, dst, 
ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, tmp); + } + } +} + /* evpcmp: dispatches on typ to evpcmpb (T_BYTE), evpcmpw (T_SHORT), evpcmpd (T_INT and T_FLOAT) or evpcmpq (T_LONG and T_DOUBLE), forwarding all other arguments unchanged; any other type trips assert(false). */ +void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral adr, int comparison, int vector_len, Register scratch) { + switch(typ) { + case T_BYTE: + evpcmpb(kdmask, ksmask, src1, adr, comparison, vector_len, scratch); + break; + case T_SHORT: + evpcmpw(kdmask, ksmask, src1, adr, comparison, vector_len, scratch); + break; + case T_INT: + case T_FLOAT: + evpcmpd(kdmask, ksmask, src1, adr, comparison, vector_len, scratch); + break; + case T_LONG: + case T_DOUBLE: + evpcmpq(kdmask, ksmask, src1, adr, comparison, vector_len, scratch); + break; + default: + assert(false,"Should not reach here."); + break; + } +} + /* evpblend: dispatches on typ to evpblendmb (T_BYTE), evpblendmw (T_SHORT), evpblendmd (T_INT and T_FLOAT) or evpblendmq (T_LONG and T_DOUBLE), forwarding all other arguments unchanged; any other type trips assert(false). */ +void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) { + switch(typ) { + case T_BYTE: + evpblendmb(dst, kmask, src1, src2, merge, vector_len); + break; + case T_SHORT: + evpblendmw(dst, kmask, src1, src2, merge, vector_len); + break; + case T_INT: + case T_FLOAT: + evpblendmd(dst, kmask, src1, src2, merge, vector_len); + break; + case T_LONG: + case T_DOUBLE: + evpblendmq(dst, kmask, src1, src2, merge, vector_len); + break; + default: + assert(false,"Should not reach here."); + break; + } +} + //------------------------------------------------------------------------------------------- // IndexOf for constant substrings with size >= 8 chars