
src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp

rev 61241 : manual merge with vectorIntrinsics

*** 31,40 **** --- 31,55 ----
  #include "opto/opcodes.hpp"
  #include "runtime/biasedLocking.hpp"
  #include "runtime/objectMonitor.hpp"
  #include "runtime/stubRoutines.hpp"
+ inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
+   switch (vlen_in_bytes) {
+     case  4: // fall-through
+     case  8: // fall-through
+     case 16: return Assembler::AVX_128bit;
+     case 32: return Assembler::AVX_256bit;
+     case 64: return Assembler::AVX_512bit;
+ 
+     default: {
+       ShouldNotReachHere();
+       return Assembler::AVX_NoVec;
+     }
+   }
+ }
+ 
  void C2_MacroAssembler::setvectmask(Register dst, Register src) {
    guarantee(PostLoopMultiversioning, "must be");
    Assembler::movl(dst, 1);
    Assembler::shlxl(dst, dst, src);
    Assembler::decl(dst);
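The new vector_length_encoding() helper maps a vector's size in bytes to the VEX/EVEX length encoding used when emitting an instruction. Vectors of 4, 8, or 16 bytes all select the 128-bit encoding, since x86 has no narrower SIMD encoding; only 32- and 64-byte vectors pick the wider lengths. A standalone scalar model of the mapping (a sketch with a simplified enum, not the Assembler declarations):

    #include <cassert>

    enum AvxVectorLen { AVX_128bit, AVX_256bit, AVX_512bit, AVX_NoVec };

    static AvxVectorLen vector_length_encoding_model(int vlen_in_bytes) {
      switch (vlen_in_bytes) {
        case 4: case 8: case 16: return AVX_128bit;  // sub-128-bit vectors use xmm encoding
        case 32:                 return AVX_256bit;  // ymm
        case 64:                 return AVX_512bit;  // zmm
        default: assert(false);  return AVX_NoVec;   // unreachable for valid vector sizes
      }
    }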
*** 852,861 **** --- 867,1037 ----
      assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
      vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, scr);
    }
  }
+ 
+ void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
+   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
+ 
+   if (opcode == Op_MinV) {
+     if (elem_bt == T_BYTE) {
+       pminsb(dst, src);
+     } else if (elem_bt == T_SHORT) {
+       pminsw(dst, src);
+     } else if (elem_bt == T_INT) {
+       pminsd(dst, src);
+     } else {
+       assert(elem_bt == T_LONG, "required");
+       assert(tmp == xmm0, "required");
+       movdqu(xmm0, dst);
+       pcmpgtq(xmm0, src);
+       blendvpd(dst, src); // xmm0 as mask
+     }
+   } else { // opcode == Op_MaxV
+     if (elem_bt == T_BYTE) {
+       pmaxsb(dst, src);
+     } else if (elem_bt == T_SHORT) {
+       pmaxsw(dst, src);
+     } else if (elem_bt == T_INT) {
+       pmaxsd(dst, src);
+     } else {
+       assert(elem_bt == T_LONG, "required");
+       assert(tmp == xmm0, "required");
+       movdqu(xmm0, src);
+       pcmpgtq(xmm0, dst);
+       blendvpd(dst, src); // xmm0 as mask
+     }
+   }
+ }
+ 
+ void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
+                                  XMMRegister dst, XMMRegister src1, XMMRegister src2,
+                                  int vlen_enc) {
+   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
+ 
+   if (opcode == Op_MinV) {
+     if (elem_bt == T_BYTE) {
+       vpminsb(dst, src1, src2, vlen_enc);
+     } else if (elem_bt == T_SHORT) {
+       vpminsw(dst, src1, src2, vlen_enc);
+     } else if (elem_bt == T_INT) {
+       vpminsd(dst, src1, src2, vlen_enc);
+     } else {
+       assert(elem_bt == T_LONG, "required");
+       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
+         vpminsq(dst, src1, src2, vlen_enc);
+       } else {
+         vpcmpgtq(dst, src1, src2, vlen_enc);
+         vblendvpd(dst, src1, src2, dst, vlen_enc);
+       }
+     }
+   } else { // opcode == Op_MaxV
+     if (elem_bt == T_BYTE) {
+       vpmaxsb(dst, src1, src2, vlen_enc);
+     } else if (elem_bt == T_SHORT) {
+       vpmaxsw(dst, src1, src2, vlen_enc);
+     } else if (elem_bt == T_INT) {
+       vpmaxsd(dst, src1, src2, vlen_enc);
+     } else {
+       assert(elem_bt == T_LONG, "required");
+       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
+         vpmaxsq(dst, src1, src2, vlen_enc);
+       } else {
+         vpcmpgtq(dst, src1, src2, vlen_enc);
+         vblendvpd(dst, src2, src1, dst, vlen_enc);
+       }
+     }
+   }
+ }
+ 
+ // Float/Double min max
+ 
+ void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
+                                    XMMRegister dst, XMMRegister a, XMMRegister b,
+                                    XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
+                                    int vlen_enc) {
+   assert(UseAVX > 0, "required");
+   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
+          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
+   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
+ 
+   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
+   bool is_double_word = is_double_word_type(elem_bt);
+ 
+   if (!is_double_word && is_min) {
+     vblendvps(atmp, a, b, a, vlen_enc);
+     vblendvps(btmp, b, a, a, vlen_enc);
+     vminps(tmp, atmp, btmp, vlen_enc);
+     vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
+     vblendvps(dst, tmp, atmp, btmp, vlen_enc);
+   } else if (!is_double_word && !is_min) {
+     vblendvps(btmp, b, a, b, vlen_enc);
+     vblendvps(atmp, a, b, b, vlen_enc);
+     vmaxps(tmp, atmp, btmp, vlen_enc);
+     vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
+     vblendvps(dst, tmp, atmp, btmp, vlen_enc);
+   } else if (is_double_word && is_min) {
+     vblendvpd(atmp, a, b, a, vlen_enc);
+     vblendvpd(btmp, b, a, a, vlen_enc);
+     vminpd(tmp, atmp, btmp, vlen_enc);
+     vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
+     vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
+   } else {
+     assert(is_double_word && !is_min, "sanity");
+     vblendvpd(btmp, b, a, b, vlen_enc);
+     vblendvpd(atmp, a, b, b, vlen_enc);
+     vmaxpd(tmp, atmp, btmp, vlen_enc);
+     vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
+     vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
+   }
+ }
+ 
+ void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
+                                     XMMRegister dst, XMMRegister a, XMMRegister b,
+                                     KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
+                                     int vlen_enc) {
+   assert(UseAVX > 2, "required");
+   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
+          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
+   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
+ 
+   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
+   bool is_double_word = is_double_word_type(elem_bt);
+   bool merge = true;
+ 
+   if (!is_double_word && is_min) {
+     evpmovd2m(ktmp, a, vlen_enc);
+     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
+     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
+     vminps(dst, atmp, btmp, vlen_enc);
+     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
+     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
+   } else if (!is_double_word && !is_min) {
+     evpmovd2m(ktmp, b, vlen_enc);
+     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
+     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
+     vmaxps(dst, atmp, btmp, vlen_enc);
+     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
+     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
+   } else if (is_double_word && is_min) {
+     evpmovq2m(ktmp, a, vlen_enc);
+     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
+     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
+     vminpd(dst, atmp, btmp, vlen_enc);
+     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
+     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
+   } else {
+     assert(is_double_word && !is_min, "sanity");
+     evpmovq2m(ktmp, b, vlen_enc);
+     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
+     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
+     vmaxpd(dst, atmp, btmp, vlen_enc);
+     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
+     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
+   }
+ }
+ 
  void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
    if (sign) {
      pmovsxbw(dst, src);
    } else {
      pmovzxbw(dst, src);
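The four vminmax_fp/evminmax_fp branches above implement Java's Math.min/Math.max contract rather than the raw vminps/vmaxps semantics: a NaN in either input must propagate, and -0.0 must order below +0.0. The sign-driven blends reorder the operands so the min/max instruction resolves the signed-zero case, and the UNORD_Q compare plus final blend patches in the NaN result. A scalar model of the per-lane contract (an illustration, not part of the patch):

    #include <cmath>

    static float java_min(float a, float b) {
      if (std::isnan(a) || std::isnan(b)) {  // vcmpps UNORD_Q + final vblendvps
        return a + b;                        // any NaN operand yields NaN
      }
      if (a == 0.0f && b == 0.0f) {          // +0.0f == -0.0f compares equal,
        return std::signbit(a) ? a : b;      // but min must prefer -0.0f
      }
      return a < b ? a : b;                  // vminps on the reordered operands
    }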
*** 868,981 ****
    } else {
      vpmovzxbw(dst, src, vector_len);
    }
  }
  
! void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src) {
!   if (opcode == Op_RShiftVI) {
!     psrad(dst, src);
!   } else if (opcode == Op_LShiftVI) {
!     pslld(dst, src);
    } else {
!     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
!     psrld(dst, src);
    }
  }
  
! void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
!   if (opcode == Op_RShiftVI) {
!     vpsrad(dst, nds, src, vector_len);
!   } else if (opcode == Op_LShiftVI) {
!     vpslld(dst, nds, src, vector_len);
    } else {
!     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
!     vpsrld(dst, nds, src, vector_len);
    }
  }
  
! void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src) {
!   if ((opcode == Op_RShiftVS) || (opcode == Op_RShiftVB)) {
!     psraw(dst, src);
!   } else if ((opcode == Op_LShiftVS) || (opcode == Op_LShiftVB)) {
!     psllw(dst, src);
!   } else {
!     assert(((opcode == Op_URShiftVS) || (opcode == Op_URShiftVB)),"opcode should be one of Op_URShiftVS or Op_URShiftVB");
!     psrlw(dst, src);
    }
  }
  
! void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
!   if ((opcode == Op_RShiftVS) || (opcode == Op_RShiftVB)) {
!     vpsraw(dst, nds, src, vector_len);
!   } else if ((opcode == Op_LShiftVS) || (opcode == Op_LShiftVB)) {
!     vpsllw(dst, nds, src, vector_len);
    } else {
!     assert(((opcode == Op_URShiftVS) || (opcode == Op_URShiftVB)),"opcode should be one of Op_URShiftVS or Op_URShiftVB");
!     vpsrlw(dst, nds, src, vector_len);
    }
  }
  
! void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src) {
!   if (opcode == Op_RShiftVL) {
!     psrlq(dst, src); // using srl to implement sra on pre-avx512 systems
!   } else if (opcode == Op_LShiftVL) {
!     psllq(dst, src);
    } else {
!     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
!     psrlq(dst, src);
    }
  }
  
! void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
!   if (opcode == Op_RShiftVL) {
!     evpsraq(dst, nds, src, vector_len);
!   } else if (opcode == Op_LShiftVL) {
!     vpsllq(dst, nds, src, vector_len);
    } else {
!     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
!     vpsrlq(dst, nds, src, vector_len);
    }
  }
  
! // Reductions for vectors of ints, longs, floats, and doubles.
  
! void C2_MacroAssembler::reduce_operation_128(int opcode, XMMRegister dst, XMMRegister src) {
    int vector_len = Assembler::AVX_128bit;
  
    switch (opcode) {
      case Op_AndReductionV:  pand(dst, src); break;
      case Op_OrReductionV:   por (dst, src); break;
      case Op_XorReductionV:  pxor(dst, src); break;
! 
      case Op_AddReductionVF: addss(dst, src); break;
      case Op_AddReductionVD: addsd(dst, src); break;
!     case Op_AddReductionVI: paddd(dst, src); break;
      case Op_AddReductionVL: paddq(dst, src); break;
- 
      case Op_MulReductionVF: mulss(dst, src); break;
      case Op_MulReductionVD: mulsd(dst, src); break;
!     case Op_MulReductionVI: pmulld(dst, src); break;
!     case Op_MulReductionVL: vpmullq(dst, dst, src, vector_len); break;
! 
      default: assert(false, "wrong opcode");
    }
  }
  
! void C2_MacroAssembler::reduce_operation_256(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
    int vector_len = Assembler::AVX_256bit;
  
    switch (opcode) {
      case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
      case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
      case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
! 
!     case Op_AddReductionVI: vpaddd(dst, src1, src2, vector_len); break;
      case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
! 
!     case Op_MulReductionVI: vpmulld(dst, src1, src2, vector_len); break;
      case Op_MulReductionVL: vpmullq(dst, src1, src2, vector_len); break;
- 
      default: assert(false, "wrong opcode");
    }
  }
  
  void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
--- 1044,1483 ----
    } else {
      vpmovzxbw(dst, src, vector_len);
    }
  }
  
! void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
!   if (sign) {
!     vpmovsxbd(dst, src, vector_len);
    } else {
!     vpmovzxbd(dst, src, vector_len);
    }
  }
  
! void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
!   if (sign) {
!     vpmovsxwd(dst, src, vector_len);
    } else {
!     vpmovzxwd(dst, src, vector_len);
    }
  }
  
! void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
!   switch (opcode) {
!     case Op_RShiftVI:  psrad(dst, shift); break;
!     case Op_LShiftVI:  pslld(dst, shift); break;
!     case Op_URShiftVI: psrld(dst, shift); break;
! 
!     default: assert(false, "%s", NodeClassNames[opcode]);
    }
  }
  
! void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
!   switch (opcode) {
!     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
!     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
!     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
! 
!     default: assert(false, "%s", NodeClassNames[opcode]);
!   }
! }
! 
! void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
!   switch (opcode) {
!     case Op_RShiftVB:  // fall-through
!     case Op_RShiftVS:  psraw(dst, shift); break;
! 
!     case Op_LShiftVB:  // fall-through
!     case Op_LShiftVS:  psllw(dst, shift); break;
! 
!     case Op_URShiftVS: // fall-through
!     case Op_URShiftVB: psrlw(dst, shift); break;
! 
!     default: assert(false, "%s", NodeClassNames[opcode]);
!   }
! }
! 
! void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
!   switch (opcode) {
!     case Op_RShiftVB:  // fall-through
!     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
! 
!     case Op_LShiftVB:  // fall-through
!     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
! 
!     case Op_URShiftVS: // fall-through
!     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
! 
!     default: assert(false, "%s", NodeClassNames[opcode]);
!   }
! }
! 
! void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
!   switch (opcode) {
!     case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
!     case Op_LShiftVL:  psllq(dst, shift); break;
!     case Op_URShiftVL: psrlq(dst, shift); break;
! 
!     default: assert(false, "%s", NodeClassNames[opcode]);
!   }
! }
! 
! void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
!   switch (opcode) {
!     case Op_RShiftVL:  evpsraq(dst, src, shift, vlen_enc); break;
!     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
!     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
! 
!     default: assert(false, "%s", NodeClassNames[opcode]);
!   }
! }
! 
! void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
!   switch (opcode) {
!     case Op_RShiftVB:  // fall-through
!     case Op_RShiftVS:  // fall-through
!     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
! 
!     case Op_LShiftVB:  // fall-through
!     case Op_LShiftVS:  // fall-through
!     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
! 
!     case Op_URShiftVB: // fall-through
!     case Op_URShiftVS: // fall-through
!     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
! 
!     default: assert(false, "%s", NodeClassNames[opcode]);
!   }
! }
! 
! void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
!   switch (opcode) {
!     case Op_RShiftVB:  // fall-through
!     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
! 
!     case Op_LShiftVB:  // fall-through
!     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
! 
!     case Op_URShiftVB: // fall-through
!     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
! 
!     default: assert(false, "%s", NodeClassNames[opcode]);
!   }
! }
! 
! void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
!   assert(UseAVX >= 2, "required");
!   switch (opcode) {
!     case Op_RShiftVL: {
!       if (UseAVX > 2) {
!         assert(tmp == xnoreg, "not used");
!         if (!VM_Version::supports_avx512vl()) {
!           vlen_enc = Assembler::AVX_512bit;
!         }
!         evpsravq(dst, src, shift, vlen_enc);
        } else {
!         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
!         vpsrlvq(dst, src, shift, vlen_enc);
!         vpsrlvq(tmp, tmp, shift, vlen_enc);
!         vpxor(dst, dst, tmp, vlen_enc);
!         vpsubq(dst, dst, tmp, vlen_enc);
!       }
!       break;
!     }
!     case Op_LShiftVL: {
!       assert(tmp == xnoreg, "not used");
!       vpsllvq(dst, src, shift, vlen_enc);
!       break;
!     }
!     case Op_URShiftVL: {
!       assert(tmp == xnoreg, "not used");
!       vpsrlvq(dst, src, shift, vlen_enc);
!       break;
!     }
!     default: assert(false, "%s", NodeClassNames[opcode]);
    }
  }
  
! // Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst
! void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) {
!   assert(opcode == Op_LShiftVB ||
!          opcode == Op_RShiftVB ||
!          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
!   bool sign = (opcode != Op_URShiftVB);
!   assert(vector_len == 0, "required");
!   vextendbd(sign, dst, src, 1);
!   vpmovzxbd(vtmp, shift, 1);
!   varshiftd(opcode, dst, dst, vtmp, 1);
!   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, scratch);
!   vextracti128_high(vtmp, dst);
!   vpackusdw(dst, dst, vtmp, 0);
! }
! 
! // Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst
! void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) {
!   assert(opcode == Op_LShiftVB ||
!          opcode == Op_RShiftVB ||
!          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
!   bool sign = (opcode != Op_URShiftVB);
!   int ext_vector_len = vector_len + 1;
!   vextendbw(sign, dst, src, ext_vector_len);
!   vpmovzxbw(vtmp, shift, ext_vector_len);
!   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
!   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, scratch);
!   if (vector_len == 0) {
!     vextracti128_high(vtmp, dst);
!     vpackuswb(dst, dst, vtmp, vector_len);
    } else {
!     vextracti64x4_high(vtmp, dst);
!     vpackuswb(dst, dst, vtmp, vector_len);
!     vpermq(dst, dst, 0xD8, vector_len);
!   }
! }
! 
! void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
!   switch(typ) {
!     case T_BYTE:
!       pinsrb(dst, val, idx);
!       break;
!     case T_SHORT:
!       pinsrw(dst, val, idx);
!       break;
!     case T_INT:
!       pinsrd(dst, val, idx);
!       break;
!     case T_LONG:
!       pinsrq(dst, val, idx);
!       break;
!     default:
!       assert(false,"Should not reach here.");
!       break;
!   }
! }
! 
! void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
!   switch(typ) {
!     case T_BYTE:
!       vpinsrb(dst, src, val, idx);
!       break;
!     case T_SHORT:
!       vpinsrw(dst, src, val, idx);
!       break;
!     case T_INT:
!       vpinsrd(dst, src, val, idx);
!       break;
!     case T_LONG:
!       vpinsrq(dst, src, val, idx);
!       break;
!     default:
!       assert(false,"Should not reach here.");
!       break;
!   }
! }
! 
! void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
!   switch(typ) {
!     case T_INT:
!       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
!       break;
!     case T_FLOAT:
!       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
!       break;
!     case T_LONG:
!       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
!       break;
!     case T_DOUBLE:
!       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
!       break;
!     default:
!       assert(false,"Should not reach here.");
!       break;
!   }
! }
! 
! void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
!   switch(typ) {
!     case T_INT:
!       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
!       break;
!     case T_FLOAT:
!       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
!       break;
!     case T_LONG:
!       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
!       break;
!     case T_DOUBLE:
!       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
!       break;
!     default:
!       assert(false,"Should not reach here.");
!       break;
!   }
! }
! 
! void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
!   switch(typ) {
!     case T_INT:
!       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
!       break;
!     case T_FLOAT:
!       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
!       break;
!     case T_LONG:
!       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
!       break;
!     case T_DOUBLE:
!       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
!       break;
!     default:
!       assert(false,"Should not reach here.");
!       break;
    }
  }
  
! void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt) {
!   if (vlen_in_bytes <= 16) {
!     pxor (dst, dst);
!     psubb(dst, src);
!     switch (elem_bt) {
!       case T_BYTE:   /* nothing to do */ break;
!       case T_SHORT:  pmovsxbw(dst, dst); break;
!       case T_INT:    pmovsxbd(dst, dst); break;
!       case T_FLOAT:  pmovsxbd(dst, dst); break;
!       case T_LONG:   pmovsxbq(dst, dst); break;
!       case T_DOUBLE: pmovsxbq(dst, dst); break;
! 
!       default: assert(false, "%s", type2name(elem_bt));
!     }
    } else {
!     int vlen_enc = vector_length_encoding(vlen_in_bytes);
! 
!     vpxor (dst, dst, dst, vlen_enc);
!     vpsubb(dst, dst, src, vlen_enc);
!     switch (elem_bt) {
!       case T_BYTE:   /* nothing to do */ break;
!       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
!       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
!       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
!       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
!       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
! 
!       default: assert(false, "%s", type2name(elem_bt));
!     }
    }
  }
  
! void C2_MacroAssembler::load_iota_indices(XMMRegister dst, Register scratch, int vlen_in_bytes) {
!   ExternalAddress addr(StubRoutines::x86::vector_iota_indices());
!   if (vlen_in_bytes <= 16) {
!     movdqu(dst, addr, scratch);
!   } else if (vlen_in_bytes == 32) {
!     vmovdqu(dst, addr, scratch);
!   } else {
!     assert(vlen_in_bytes == 64, "%d", vlen_in_bytes);
!     evmovdqub(dst, k0, addr, false /*merge*/, Assembler::AVX_512bit, scratch);
!   }
! }
! 
! // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
  
! void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
    int vector_len = Assembler::AVX_128bit;
  
    switch (opcode) {
      case Op_AndReductionV:  pand(dst, src); break;
      case Op_OrReductionV:   por (dst, src); break;
      case Op_XorReductionV:  pxor(dst, src); break;
!     case Op_MinReductionV:
!       switch (typ) {
!         case T_BYTE:  pminsb(dst, src); break;
!         case T_SHORT: pminsw(dst, src); break;
!         case T_INT:   pminsd(dst, src); break;
!         case T_LONG:  assert(UseAVX > 2, "required");
!                       vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
!         default:      assert(false, "wrong type");
!       }
!       break;
!     case Op_MaxReductionV:
!       switch (typ) {
!         case T_BYTE:  pmaxsb(dst, src); break;
!         case T_SHORT: pmaxsw(dst, src); break;
!         case T_INT:   pmaxsd(dst, src); break;
!         case T_LONG:  assert(UseAVX > 2, "required");
!                       vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
!         default:      assert(false, "wrong type");
!       }
!       break;
      case Op_AddReductionVF: addss(dst, src); break;
      case Op_AddReductionVD: addsd(dst, src); break;
!     case Op_AddReductionVI:
!       switch (typ) {
!         case T_BYTE:  paddb(dst, src); break;
!         case T_SHORT: paddw(dst, src); break;
!         case T_INT:   paddd(dst, src); break;
!         default:      assert(false, "wrong type");
!       }
!       break;
      case Op_AddReductionVL: paddq(dst, src); break;
      case Op_MulReductionVF: mulss(dst, src); break;
      case Op_MulReductionVD: mulsd(dst, src); break;
!     case Op_MulReductionVI:
!       switch (typ) {
!         case T_SHORT: pmullw(dst, src); break;
!         case T_INT:   pmulld(dst, src); break;
!         default:      assert(false, "wrong type");
!       }
!       break;
!     case Op_MulReductionVL: assert(UseAVX > 2, "required");
!                             vpmullq(dst, dst, src, vector_len); break;
  
      default: assert(false, "wrong opcode");
    }
  }
  
! void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
    int vector_len = Assembler::AVX_256bit;
  
    switch (opcode) {
      case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
      case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
      case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
!     case Op_MinReductionV:
!       switch (typ) {
!         case T_BYTE:  vpminsb(dst, src1, src2, vector_len); break;
!         case T_SHORT: vpminsw(dst, src1, src2, vector_len); break;
!         case T_INT:   vpminsd(dst, src1, src2, vector_len); break;
!         case T_LONG:  assert(UseAVX > 2, "required");
!                       vpminsq(dst, src1, src2, vector_len); break;
!         default:      assert(false, "wrong type");
!       }
!       break;
!     case Op_MaxReductionV:
!       switch (typ) {
!         case T_BYTE:  vpmaxsb(dst, src1, src2, vector_len); break;
!         case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break;
!         case T_INT:   vpmaxsd(dst, src1, src2, vector_len); break;
!         case T_LONG:  assert(UseAVX > 2, "required");
!                       vpmaxsq(dst, src1, src2, vector_len); break;
!         default:      assert(false, "wrong type");
!       }
!       break;
!     case Op_AddReductionVI:
!       switch (typ) {
!         case T_BYTE:  vpaddb(dst, src1, src2, vector_len); break;
!         case T_SHORT: vpaddw(dst, src1, src2, vector_len); break;
!         case T_INT:   vpaddd(dst, src1, src2, vector_len); break;
!         default:      assert(false, "wrong type");
!       }
!       break;
      case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
!     case Op_MulReductionVI:
!       switch (typ) {
!         case T_SHORT: vpmullw(dst, src1, src2, vector_len); break;
!         case T_INT:   vpmulld(dst, src1, src2, vector_len); break;
!         default:      assert(false, "wrong type");
!       }
!       break;
      case Op_MulReductionVL: vpmullq(dst, src1, src2, vector_len); break;
  
      default: assert(false, "wrong opcode");
    }
  }
  
  void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
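In varshiftq above, the AVX2 fallback for Op_RShiftVL synthesizes a variable arithmetic right shift, which has no instruction below AVX-512 (vpsravq): shift the data and the per-lane sign-mask constant logically by the same amounts, then xor and subtract to re-extend the sign. A scalar model of one lane (an illustration, not part of the patch):

    #include <cstdint>

    static int64_t sra_via_srl(int64_t x, unsigned s) {  // s in [0, 63]
      uint64_t m = UINT64_C(0x8000000000000000) >> s;    // vpsrlvq on vector_long_sign_mask()
      uint64_t u = (uint64_t)x >> s;                     // vpsrlvq on the data
      return (int64_t)((u ^ m) - m);                     // vpxor + vpsubq restores the sign bits
    }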
*** 994,1003 **** --- 1496,1544 ----
      default: assert(false, "wrong opcode");
    }
  }
+ 
+ void C2_MacroAssembler::reduceB(int opcode, int vlen,
+                                 Register dst, Register src1, XMMRegister src2,
+                                 XMMRegister vtmp1, XMMRegister vtmp2) {
+   switch (vlen) {
+     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
+     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
+     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
+     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
+ 
+     default: assert(false, "wrong vector length");
+   }
+ }
+ 
+ void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
+                                    Register dst, Register src1, XMMRegister src2,
+                                    XMMRegister vtmp1, XMMRegister vtmp2) {
+   switch (vlen) {
+     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
+     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
+     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
+     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
+ 
+     default: assert(false, "wrong vector length");
+   }
+ }
+ 
+ void C2_MacroAssembler::reduceS(int opcode, int vlen,
+                                 Register dst, Register src1, XMMRegister src2,
+                                 XMMRegister vtmp1, XMMRegister vtmp2) {
+   switch (vlen) {
+     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
+     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
+     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
+     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
+ 
+     default: assert(false, "wrong vector length");
+   }
+ }
+ 
  void C2_MacroAssembler::reduceI(int opcode, int vlen,
                                  Register dst, Register src1, XMMRegister src2,
                                  XMMRegister vtmp1, XMMRegister vtmp2) {
    switch (vlen) {
      case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
*** 1065,1078 ****
        movdqu(vtmp1, src2);
      }
      phaddd(vtmp1, vtmp1);
    } else {
      pshufd(vtmp1, src2, 0x1);
!     reduce_operation_128(opcode, vtmp1, src2);
    }
    movdl(vtmp2, src1);
!   reduce_operation_128(opcode, vtmp1, vtmp2);
    movdl(dst, vtmp1);
  }
  
  void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
    if (opcode == Op_AddReductionVI) {
--- 1606,1619 ----
        movdqu(vtmp1, src2);
      }
      phaddd(vtmp1, vtmp1);
    } else {
      pshufd(vtmp1, src2, 0x1);
!     reduce_operation_128(T_INT, opcode, vtmp1, src2);
    }
    movdl(vtmp2, src1);
!   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
    movdl(dst, vtmp1);
  }
  
  void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
    if (opcode == Op_AddReductionVI) {
*** 1081,1091 ****
      }
      phaddd(vtmp1, src2);
      reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
    } else {
      pshufd(vtmp2, src2, 0xE);
!     reduce_operation_128(opcode, vtmp2, src2);
      reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
    }
  }
  
  void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
--- 1622,1632 ----
      }
      phaddd(vtmp1, src2);
      reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
    } else {
      pshufd(vtmp2, src2, 0xE);
!     reduce_operation_128(T_INT, opcode, vtmp2, src2);
      reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
    }
  }
  
  void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
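For Op_AddReductionVI, reduce2I/reduce4I above take the phaddd path instead of the generic shuffle-and-combine: each horizontal add halves the number of live lanes, and the scalar input is folded in last. A scalar sketch of the resulting dataflow (illustrative only):

    #include <cstdint>

    // Four-lane add reduction as performed by reduce4I -> reduce2I above.
    static int32_t add_reduce4(const int32_t v[4], int32_t src1) {
      int32_t p0 = v[0] + v[1];  // phaddd, first pair
      int32_t p1 = v[2] + v[3];  // phaddd, second pair
      int32_t s  = p0 + p1;      // phaddd again inside reduce2I
      return src1 + s;           // movdl + paddd folds in the scalar input
    }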
*** 1094,1148 ****
      vextracti128_high(vtmp2, vtmp1);
      vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
      reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
    } else {
      vextracti128_high(vtmp1, src2);
!     reduce_operation_128(opcode, vtmp1, src2);
      reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
    }
  }
  
  void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
    vextracti64x4_high(vtmp2, src2);
!   reduce_operation_256(opcode, vtmp2, vtmp2, src2);
    reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
  }
  
  #ifdef _LP64
  void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
    pshufd(vtmp2, src2, 0xE);
!   reduce_operation_128(opcode, vtmp2, src2);
    movdq(vtmp1, src1);
!   reduce_operation_128(opcode, vtmp1, vtmp2);
    movdq(dst, vtmp1);
  }
  
  void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
    vextracti128_high(vtmp1, src2);
!   reduce_operation_128(opcode, vtmp1, src2);
    reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  }
  
  void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
    vextracti64x4_high(vtmp2, src2);
!   reduce_operation_256(opcode, vtmp2, vtmp2, src2);
    reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
  }
  #endif // _LP64
  
  void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
!   reduce_operation_128(opcode, dst, src);
    pshufd(vtmp, src, 0x1);
!   reduce_operation_128(opcode, dst, vtmp);
  }
  
  void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
    reduce2F(opcode, dst, src, vtmp);
    pshufd(vtmp, src, 0x2);
!   reduce_operation_128(opcode, dst, vtmp);
    pshufd(vtmp, src, 0x3);
!   reduce_operation_128(opcode, dst, vtmp);
  }
  
  void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
    reduce4F(opcode, dst, src, vtmp2);
    vextractf128_high(vtmp2, src);
--- 1635,1814 ----
      vextracti128_high(vtmp2, vtmp1);
      vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
      reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
    } else {
      vextracti128_high(vtmp1, src2);
!     reduce_operation_128(T_INT, opcode, vtmp1, src2);
      reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
    }
  }
  
  void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
    vextracti64x4_high(vtmp2, src2);
!   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
    reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
  }
  
+ void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
+   pshufd(vtmp2, src2, 0x1);
+   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
+   movdqu(vtmp1, vtmp2);
+   psrldq(vtmp1, 2);
+   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
+   movdqu(vtmp2, vtmp1);
+   psrldq(vtmp2, 1);
+   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
+   movdl(vtmp2, src1);
+   pmovsxbd(vtmp1, vtmp1);
+   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
+   pextrb(dst, vtmp1, 0x0);
+   movsbl(dst, dst);
+ }
+ 
+ void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
+   pshufd(vtmp1, src2, 0xE);
+   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
+   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
+ }
+ 
+ void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
+   vextracti128_high(vtmp2, src2);
+   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
+   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
+ }
+ 
+ void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
+   vextracti64x4_high(vtmp1, src2);
+   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
+   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
+ }
+ 
+ void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
+   pmovsxbw(vtmp2, src2);
+   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
+ }
+ 
+ void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
+   if (UseAVX > 1) {
+     int vector_len = Assembler::AVX_256bit;
+     vpmovsxbw(vtmp1, src2, vector_len);
+     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
+   } else {
+     pmovsxbw(vtmp2, src2);
+     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
+     pshufd(vtmp2, src2, 0x1);
+     pmovsxbw(vtmp2, vtmp2);
+     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
+   }
+ }
+ 
+ void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
+   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
+     int vector_len = Assembler::AVX_512bit;
+     vpmovsxbw(vtmp1, src2, vector_len);
+     reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
+   } else {
+     assert(UseAVX >= 2,"Should not reach here.");
+     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
+     vextracti128_high(vtmp2, src2);
+     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
+   }
+ }
+ 
+ void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
+   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
+   vextracti64x4_high(vtmp2, src2);
+   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
+ }
+ 
+ void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
+   if (opcode == Op_AddReductionVI) {
+     if (vtmp1 != src2) {
+       movdqu(vtmp1, src2);
+     }
+     phaddw(vtmp1, vtmp1);
+     phaddw(vtmp1, vtmp1);
+   } else {
+     pshufd(vtmp2, src2, 0x1);
+     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
+     movdqu(vtmp1, vtmp2);
+     psrldq(vtmp1, 2);
+     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
+   }
+   movdl(vtmp2, src1);
+   pmovsxwd(vtmp1, vtmp1);
+   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
+   pextrw(dst, vtmp1, 0x0);
+   movswl(dst, dst);
+ }
+ 
+ void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
+   if (opcode == Op_AddReductionVI) {
+     if (vtmp1 != src2) {
+       movdqu(vtmp1, src2);
+     }
+     phaddw(vtmp1, src2);
+   } else {
+     pshufd(vtmp1, src2, 0xE);
+     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
+   }
+   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
+ }
+ 
+ void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
+   if (opcode == Op_AddReductionVI) {
+     int vector_len = Assembler::AVX_256bit;
+     vphaddw(vtmp2, src2, src2, vector_len);
+     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
+   } else {
+     vextracti128_high(vtmp2, src2);
+     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
+   }
+   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
+ }
+ 
+ void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
+   int vector_len = Assembler::AVX_256bit;
+   vextracti64x4_high(vtmp1, src2);
+   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
+   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
+ }
+ 
  #ifdef _LP64
  void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
    pshufd(vtmp2, src2, 0xE);
!   reduce_operation_128(T_LONG, opcode, vtmp2, src2);
    movdq(vtmp1, src1);
!   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
    movdq(dst, vtmp1);
  }
  
  void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
    vextracti128_high(vtmp1, src2);
!   reduce_operation_128(T_LONG, opcode, vtmp1, src2);
    reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  }
  
  void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
    vextracti64x4_high(vtmp2, src2);
!   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
    reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
  }
  #endif // _LP64
  
  void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
!   reduce_operation_128(T_FLOAT, opcode, dst, src);
    pshufd(vtmp, src, 0x1);
!   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
  }
  
  void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
    reduce2F(opcode, dst, src, vtmp);
    pshufd(vtmp, src, 0x2);
!   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
    pshufd(vtmp, src, 0x3);
!   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
  }
  
  void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
    reduce4F(opcode, dst, src, vtmp2);
    vextractf128_high(vtmp2, src);
*** 1154,1166 ****
    vextracti64x4_high(vtmp1, src);
    reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
  }
  
  void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
!   reduce_operation_128(opcode, dst, src);
    pshufd(vtmp, src, 0xE);
!   reduce_operation_128(opcode, dst, vtmp);
  }
  
  void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
    reduce2D(opcode, dst, src, vtmp2);
    vextractf128_high(vtmp2, src);
--- 1820,1832 ----
    vextracti64x4_high(vtmp1, src);
    reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
  }
  
  void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
!   reduce_operation_128(T_DOUBLE, opcode, dst, src);
    pshufd(vtmp, src, 0xE);
!   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
  }
  
  void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
    reduce2D(opcode, dst, src, vtmp2);
    vextractf128_high(vtmp2, src);
*** 1171,1180 **** --- 1837,2047 ----
    reduce4D(opcode, dst, src, vtmp1, vtmp2);
    vextracti64x4_high(vtmp1, src);
    reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
  }
+ 
+ void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
+                                           XMMRegister dst, XMMRegister src,
+                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
+                                           XMMRegister xmm_0, XMMRegister xmm_1) {
+   int permconst[] = {1, 14};
+   XMMRegister wsrc = src;
+   XMMRegister wdst = xmm_0;
+   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0 : xmm_1;
+ 
+   int vlen_enc = Assembler::AVX_128bit;
+   if (vlen == 16) {
+     vlen_enc = Assembler::AVX_256bit;
+   }
+ 
+   for (int i = log2(vlen) - 1; i >= 0; i--) {
+     if (i == 0 && !is_dst_valid) {
+       wdst = dst;
+     }
+     if (i == 3) {
+       vextracti64x4_high(wtmp, wsrc);
+     } else if (i == 2) {
+       vextracti128_high(wtmp, wsrc);
+     } else { // i = [0,1]
+       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
+     }
+     vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
+     wsrc = wdst;
+     vlen_enc = Assembler::AVX_128bit;
+   }
+   if (is_dst_valid) {
+     vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
+   }
+ }
+ 
+ void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
+                                            XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
+                                            XMMRegister xmm_0, XMMRegister xmm_1) {
+   XMMRegister wsrc = src;
+   XMMRegister wdst = xmm_0;
+   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0 : xmm_1;
+   int vlen_enc = Assembler::AVX_128bit;
+   if (vlen == 8) {
+     vlen_enc = Assembler::AVX_256bit;
+   }
+   for (int i = log2(vlen) - 1; i >= 0; i--) {
+     if (i == 0 && !is_dst_valid) {
+       wdst = dst;
+     }
+     if (i == 1) {
+       vextracti128_high(wtmp, wsrc);
+     } else if (i == 2) {
+       vextracti64x4_high(wtmp, wsrc);
+     } else {
+       assert(i == 0, "%d", i);
+       vpermilpd(wtmp, wsrc, 1, vlen_enc);
+     }
+     vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
+     wsrc = wdst;
+     vlen_enc = Assembler::AVX_128bit;
+   }
+   if (is_dst_valid) {
+     vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
+   }
+ }
+ 
+ void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
+   switch (bt) {
+     case T_BYTE:  pextrb(dst, src, idx); break;
+     case T_SHORT: pextrw(dst, src, idx); break;
+     case T_INT:   pextrd(dst, src, idx); break;
+     case T_LONG:  pextrq(dst, src, idx); break;
+ 
+     default:
+       assert(false,"Should not reach here.");
+       break;
+   }
+ }
+ 
+ XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
+   int esize = type2aelembytes(typ);
+   int elem_per_lane = 16/esize;
+   int lane = elemindex / elem_per_lane;
+   int eindex = elemindex % elem_per_lane;
+ 
+   if (lane >= 2) {
+     assert(UseAVX > 2, "required");
+     vextractf32x4(dst, src, lane & 3);
+     return dst;
+   } else if (lane > 0) {
+     assert(UseAVX > 0, "required");
+     vextractf128(dst, src, lane);
+     return dst;
+   } else {
+     return src;
+   }
+ }
+ 
+ void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
+   int esize = type2aelembytes(typ);
+   int elem_per_lane = 16/esize;
+   int eindex = elemindex % elem_per_lane;
+   assert(is_integral_type(typ),"required");
+ 
+   if (eindex == 0) {
+     if (typ == T_LONG) {
+       movq(dst, src);
+     } else {
+       movdl(dst, src);
+       if (typ == T_BYTE)
+         movsbl(dst, dst);
+       else if (typ == T_SHORT)
+         movswl(dst, dst);
+     }
+   } else {
+     extract(typ, dst, src, eindex);
+   }
+ }
+ 
+ void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, Register tmp, XMMRegister vtmp) {
+   int esize = type2aelembytes(typ);
+   int elem_per_lane = 16/esize;
+   int eindex = elemindex % elem_per_lane;
+   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
+ 
+   if (eindex == 0) {
+     movq(dst, src);
+   } else {
+     if (typ == T_FLOAT) {
+       if (UseAVX == 0) {
+         movdqu(dst, src);
+         pshufps(dst, dst, eindex);
+       } else {
+         vpshufps(dst, src, src, eindex, Assembler::AVX_128bit);
+       }
+     } else {
+       if (UseAVX == 0) {
+         movdqu(dst, src);
+         psrldq(dst, eindex*esize);
+       } else {
+         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
+       }
+       movq(dst, dst);
+     }
+   }
+   // Zero upper bits
+   if (typ == T_FLOAT) {
+     if (UseAVX == 0) {
+       assert((vtmp != xnoreg) && (tmp != noreg), "required.");
+       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), tmp);
+       pand(dst, vtmp);
+     } else {
+       assert((tmp != noreg), "required.");
+       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, tmp);
+     }
+   }
+ }
+ 
+ void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral adr, int comparison, int vector_len, Register scratch) {
+   switch(typ) {
+     case T_BYTE:
+       evpcmpb(kdmask, ksmask, src1, adr, comparison, vector_len, scratch);
+       break;
+     case T_SHORT:
+       evpcmpw(kdmask, ksmask, src1, adr, comparison, vector_len, scratch);
+       break;
+     case T_INT:
+     case T_FLOAT:
+       evpcmpd(kdmask, ksmask, src1, adr, comparison, vector_len, scratch);
+       break;
+     case T_LONG:
+     case T_DOUBLE:
+       evpcmpq(kdmask, ksmask, src1, adr, comparison, vector_len, scratch);
+       break;
+     default:
+       assert(false,"Should not reach here.");
+       break;
+   }
+ }
+ 
+ void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
+   switch(typ) {
+     case T_BYTE:
+       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
+       break;
+     case T_SHORT:
+       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
+       break;
+     case T_INT:
+     case T_FLOAT:
+       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
+       break;
+     case T_LONG:
+     case T_DOUBLE:
+       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
+       break;
+     default:
+       assert(false,"Should not reach here.");
+       break;
+   }
+ }
+ 
  //-------------------------------------------------------------------------------------------
  
  // IndexOf for constant substrings with size >= 8 chars
  // which don't need to be loaded through stack.
  void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
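reduceFloatMinMax and reduceDoubleMinMax above fold the vector in half log2(vlen) times: each step extracts the high half (vextracti*) or swaps within a lane (vpermilps/vpermilpd) and combines it with the running value via vminmax_fp, until one lane remains. A scalar sketch of the folding shape, with std::min standing in for the Java-semantics combine (illustrative only):

    #include <algorithm>
    #include <vector>

    static float fold_reduce_min(std::vector<float> v) {        // v.size() is a power of two
      for (size_t half = v.size() / 2; half >= 1; half /= 2) {  // i = log2(vlen)-1 .. 0
        for (size_t j = 0; j < half; j++) {
          v[j] = std::min(v[j], v[j + half]);                   // vminmax_fp(wdst, wtmp, wsrc, ...)
        }
      }
      return v[0];                                              // result lands in lane 0
    }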