1080 XMM3, XMM3b, XMM3c, XMM3d, XMM3e, XMM3f, XMM3g, XMM3h, XMM3i, XMM3j, XMM3k, XMM3l, XMM3m, XMM3n, XMM3o, XMM3p,
1081 XMM4, XMM4b, XMM4c, XMM4d, XMM4e, XMM4f, XMM4g, XMM4h, XMM4i, XMM4j, XMM4k, XMM4l, XMM4m, XMM4n, XMM4o, XMM4p,
1082 XMM5, XMM5b, XMM5c, XMM5d, XMM5e, XMM5f, XMM5g, XMM5h, XMM5i, XMM5j, XMM5k, XMM5l, XMM5m, XMM5n, XMM5o, XMM5p,
1083 XMM6, XMM6b, XMM6c, XMM6d, XMM6e, XMM6f, XMM6g, XMM6h, XMM6i, XMM6j, XMM6k, XMM6l, XMM6m, XMM6n, XMM6o, XMM6p,
1084 XMM7, XMM7b, XMM7c, XMM7d, XMM7e, XMM7f, XMM7g, XMM7h, XMM7i, XMM7j, XMM7k, XMM7l, XMM7m, XMM7n, XMM7o, XMM7p
1085 #ifdef _LP64
1086 ,XMM8, XMM8b, XMM8c, XMM8d, XMM8e, XMM8f, XMM8g, XMM8h, XMM8i, XMM8j, XMM8k, XMM8l, XMM8m, XMM8n, XMM8o, XMM8p,
1087 XMM9, XMM9b, XMM9c, XMM9d, XMM9e, XMM9f, XMM9g, XMM9h, XMM9i, XMM9j, XMM9k, XMM9l, XMM9m, XMM9n, XMM9o, XMM9p,
1088 XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
1089 XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
1090 XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
1091 XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
1092 XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
1093 XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
1094 #endif
1095 );
1096
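// A reg_class_dynamic picks between its two underlying classes at runtime:
// vectorz_reg uses the EVEX class (all 32 ZMM registers) when the CPU supports
// EVEX encoding and falls back to the legacy class (limited to the first 16
// registers, as enumerated above) otherwise; the _vl variant additionally
// requires AVX512VL so that sub-512-bit operands may use the extended registers.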
1097 reg_class_dynamic vectorz_reg (vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() %} );
1098 reg_class_dynamic vectorz_reg_vl(vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
1099
1100 %}
1101
1102
1103 //----------SOURCE BLOCK-------------------------------------------------------
1104 // This is a block of C++ code which provides values, functions, and
1105 // definitions necessary in the rest of the architecture description
1106
1107 source_hpp %{
1108 // Header information of the source block.
1109 // Method declarations/definitions which are used outside
1110 // the ad-scope can conveniently be defined here.
1111 //
1112 // To keep related declarations/definitions/uses close together,
1113 // we switch between source %{ ... %} and source_hpp %{ ... %} freely as needed.
1114
1115 class NativeJump;
1116
1117 class CallStubImpl {
1118
1119 //--------------------------------------------------------------
1148 return NativeJump::instruction_size;
1149 }
1150
1151 #ifdef _LP64
1152 static uint size_deopt_handler() {
1153 // three 5-byte instructions (15 bytes) plus one 3-byte move for the unreachable address.
1154 return 15+3;
1155 }
1156 #else
1157 static uint size_deopt_handler() {
1158 // NativeCall instruction size is the same as NativeJump.
1159 // The deopt handler starts out as a jump and can be patched to
1160 // a call by deoptimization. (4932387)
1161 // Note that this value is also credited (in output.cpp) to
1162 // the size of the code section.
1163 return 5 + NativeJump::instruction_size; // pushl(); jmp;
1164 }
1165 #endif
1166 };
1167
1168 %} // end source_hpp
1169
1170 source %{
1171
1172 #include "opto/addnode.hpp"
1173
1174 // Emit exception handler code.
1175 // The handler simply jumps to the pre-generated exception blob.
1176 int HandlerImpl::emit_exception_handler(CodeBuffer& cbuf) {
1177
1178 // Note that the code buffer's insts_mark is always relative to insts.
1179 // That's why we must use the macroassembler to generate a handler.
1180 C2_MacroAssembler _masm(&cbuf);
1181 address base = __ start_a_stub(size_exception_handler());
1182 if (base == NULL) {
1183 ciEnv::current()->record_failure("CodeCache is full");
1184 return 0; // CodeBuffer::expand failed
1185 }
1186 int offset = __ offset();
1187 __ jump(RuntimeAddress(OptoRuntime::exception_blob()->entry_point()));
1208 Label next;
1209 // Push "the_pc" on the stack without destroying any registers,
1210 // since they may all be live.
1211
1212 // push address of "next"
1213 __ call(next, relocInfo::none); // reloc none is fine since it is a disp32
1214 __ bind(next);
1215 // adjust it so it matches "the_pc"
1216 __ subptr(Address(rsp, 0), __ offset() - offset);
1217 #else
1218 InternalAddress here(__ pc());
1219 __ pushptr(here.addr());
1220 #endif
1221
1222 __ jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
1223 assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow %d", (__ offset() - offset));
1224 __ end_a_stub();
1225 return offset;
1226 }
1227
1228
1229 //=============================================================================
1230
1231 // Float masks come from different places depending on platform.
1232 #ifdef _LP64
1233 static address float_signmask() { return StubRoutines::x86::float_sign_mask(); }
1234 static address float_signflip() { return StubRoutines::x86::float_sign_flip(); }
1235 static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
1236 static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
1237 #else
1238 static address float_signmask() { return (address)float_signmask_pool; }
1239 static address float_signflip() { return (address)float_signflip_pool; }
1240 static address double_signmask() { return (address)double_signmask_pool; }
1241 static address double_signflip() { return (address)double_signflip_pool; }
1242 #endif
1243 static address vector_short_to_byte_mask() { return StubRoutines::x86::vector_short_to_byte_mask(); }
1244 static address vector_byte_perm_mask() { return StubRoutines::x86::vector_byte_perm_mask(); }
1245 static address vector_long_sign_mask() { return StubRoutines::x86::vector_long_sign_mask(); }
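// As the instruction formats further below show, the float/double helpers
// return the IEEE-754 sign-bit patterns: float_signmask is the 0x7FFFFFFF
// pattern (clears the sign bit for abs), float_signflip is 0x80000000
// (toggles the sign bit for neg), and the double variants are the analogous
// 64-bit patterns.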
1246
1247 //=============================================================================
1248 const bool Matcher::match_rule_supported(int opcode) {
1249 if (!has_match_rule(opcode)) {
1250 return false; // no match rule present
1251 }
1252 switch (opcode) {
1253 case Op_AbsVL:
1254 if (UseAVX < 3) {
1255 return false;
1256 }
1257 break;
1258 case Op_PopCountI:
1259 case Op_PopCountL:
1260 if (!UsePopCountInstruction) {
1261 return false;
1262 }
1263 break;
1264 case Op_PopCountVI:
1265 if (!UsePopCountInstruction || !VM_Version::supports_avx512_vpopcntdq()) {
1266 return false;
1267 }
1268 break;
1269 case Op_MulVI:
1270 if ((UseSSE < 4) && (UseAVX < 1)) { // only with SSE4.1 or AVX
1271 return false;
1272 }
1273 break;
1274 case Op_MulVL:
1275 case Op_MulReductionVL:
1276 if (!VM_Version::supports_avx512dq()) {
1277 return false;
1278 }
1279 break;
1280 case Op_AbsVB:
1281 case Op_AbsVS:
1282 case Op_AbsVI:
1283 case Op_AddReductionVI:
1284 case Op_AndReductionV:
1285 case Op_OrReductionV:
1286 case Op_XorReductionV:
1287 if (UseSSE < 3) { // requires at least SSSE3
1288 return false;
1289 }
1290 break;
1291 case Op_MulReductionVI:
1292 if (UseSSE < 4) { // requires at least SSE4
1293 return false;
1294 }
1295 break;
1296 case Op_SqrtVD:
1297 case Op_SqrtVF:
1298 if (UseAVX < 1) { // enabled for AVX only
1299 return false;
1300 }
1301 break;
1302 case Op_CompareAndSwapL:
1303 #ifdef _LP64
1304 case Op_CompareAndSwapP:
1305 #endif
1306 if (!VM_Version::supports_cx8()) {
1307 return false;
1308 }
1309 break;
1310 case Op_CMoveVF:
1311 case Op_CMoveVD:
1312 if (UseAVX < 1 || UseAVX > 2) {
1313 return false;
1314 }
1315 break;
1316 case Op_StrIndexOf:
1317 if (!UseSSE42Intrinsics) {
1318 return false;
1319 }
1320 break;
1321 case Op_StrIndexOfChar:
1322 if (!UseSSE42Intrinsics) {
1323 return false;
1324 }
1325 break;
1326 case Op_OnSpinWait:
1327 if (!VM_Version::supports_on_spin_wait()) {
1328 return false;
1329 }
1330 break;
1331 case Op_MulVB:
1332 case Op_LShiftVB:
1333 case Op_RShiftVB:
1334 case Op_URShiftVB:
1335 if (UseSSE < 4) {
1336 return false;
1337 }
1338 break;
1339 #ifdef _LP64
1340 case Op_MaxD:
1341 case Op_MaxF:
1342 case Op_MinD:
1343 case Op_MinF:
1344 if (UseAVX < 1) { // enabled for AVX only
1345 return false;
1346 }
1347 break;
1348 #endif
1349 case Op_CacheWB:
1350 case Op_CacheWBPreSync:
1351 case Op_CacheWBPostSync:
1352 if (!VM_Version::supports_data_cache_line_flush()) {
1353 return false;
1354 }
1355 break;
1356 case Op_RoundDoubleMode:
1357 if (UseSSE < 4) {
1358 return false;
1359 }
1360 break;
1361 case Op_RoundDoubleModeV:
1362 if (!VM_Version::supports_avx()) {
1363 return false; // 128bit vroundpd is not available
1364 }
1365 break;
1366 #ifndef _LP64
1367 case Op_AddReductionVF:
1368 case Op_AddReductionVD:
1369 case Op_MulReductionVF:
1370 case Op_MulReductionVD:
1371 if (UseSSE < 1) { // requires at least SSE
1372 return false;
1373 }
1374 break;
1375 case Op_MulAddVS2VI:
1376 case Op_RShiftVL:
1377 case Op_AbsVD:
1378 case Op_NegVD:
1379 if (UseSSE < 2) {
1380 return false;
1381 }
1382 break;
1383 #endif // !LP64
1384 }
1385 return true; // Match rules are supported by default.
1386 }
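// For example, when AVX-512 is unavailable (UseAVX < 3), Op_AbsVL is rejected
// above, so the matcher never forms an AbsVL vector node and C2 keeps the
// scalar code for long absolute values instead.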
1387
1388 //------------------------------------------------------------------------
1389
1390 // Identify extra cases for which we might want to provide match rules, i.e. vector nodes
1391 // and other intrinsics guarded by vector length (vlen) and element type (bt).
1392 const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) {
1393 if (!match_rule_supported(opcode)) {
1394 return false;
1395 }
1396 // Matcher::vector_size_supported() restricts vector sizes in the following way (see Matcher::vector_width_in_bytes):
1397 // * SSE2 supports 128bit vectors for all types;
1398 // * AVX1 supports 256bit vectors only for FLOAT and DOUBLE types;
1399 // * AVX2 supports 256bit vectors for all types;
1400 // * AVX512F supports 512bit vectors only for INT, FLOAT, and DOUBLE types;
1401 // * AVX512BW supports 512bit vectors for BYTE, SHORT, and CHAR types.
1402 // There's also a limit on minimum vector size supported: 2 elements (or 4 bytes for BYTE).
1403 // And MaxVectorSize is taken into account as well.
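// For example: an 8-element INT vector (32 bytes) needs at least AVX2, while a
// 64-element BYTE vector (64 bytes) needs AVX512BW on top of AVX512F.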
1404 if (!vector_size_supported(bt, vlen)) {
1405 return false;
1406 }
1407 // Special cases which require vector length follow:
1408 // * implementation limitations
1409 // * some 512bit vector operations on FLOAT and DOUBLE types require AVX512DQ
1410 // * 128bit vroundpd instruction is present only in AVX1
1411 switch (opcode) {
1412 case Op_AbsVF:
1413 case Op_NegVF:
1414 if ((vlen == 16) && !VM_Version::supports_avx512dq()) {
1415 return false; // 512bit vandps and vxorps are not available
1416 }
1417 break;
1418 case Op_AbsVD:
1419 case Op_NegVD:
1420 if ((vlen == 8) && !VM_Version::supports_avx512dq()) {
1421 return false; // 512bit vandpd and vxorpd are not available
1422 }
1423 break;
1424 case Op_CMoveVF:
1425 if (vlen != 8) {
1426 return false; // implementation limitation (only vcmov8F_reg is present)
1427 }
1428 break;
1429 case Op_CMoveVD:
1430 if (vlen != 4) {
1431 return false; // implementation limitation (only vcmov4D_reg is present)
1432 }
1433 break;
1434 }
1435 return true; // Match rules are supported by default.
1436 }
1437
1438 // x86 supports generic vector operands: vec and legVec.
1439 const bool Matcher::supports_generic_vector_operands = true;
1440
1441 MachOper* Matcher::specialize_generic_vector_operand(MachOper* generic_opnd, uint ideal_reg, bool is_temp) {
1442 assert(Matcher::is_generic_vector(generic_opnd), "not generic");
1443 bool legacy = (generic_opnd->opcode() == LEGVEC);
1444 if (!VM_Version::supports_avx512vlbwdq() && // KNL
1445 is_temp && !legacy && (ideal_reg == Op_VecZ)) {
1446 // Conservatively specialize 512bit vec TEMP operands to legVecZ (zmm0-15) on KNL.
1447 return new legVecZOper();
1448 }
1449 if (legacy) {
1450 switch (ideal_reg) {
1451 case Op_VecS: return new legVecSOper();
1452 case Op_VecD: return new legVecDOper();
1453 case Op_VecX: return new legVecXOper();
1651 } else {
1652 mstack.push(adr, Pre_Visit);
1653 }
1654
1655 // Clone X+offset as it also folds into most addressing expressions
1656 mstack.push(off, Visit);
1657 mstack.push(m->in(AddPNode::Base), Pre_Visit);
1658 return true;
1659 } else if (clone_shift(off, this, mstack, address_visited)) {
1660 address_visited.test_set(m->_idx); // Flag as address_visited
1661 mstack.push(m->in(AddPNode::Address), Pre_Visit);
1662 mstack.push(m->in(AddPNode::Base), Pre_Visit);
1663 return true;
1664 }
1665 return false;
1666 }
1667
1668 void Compile::reshape_address(AddPNode* addp) {
1669 }
1670
1671 static inline uint vector_length(const MachNode* n) {
1672 const TypeVect* vt = n->bottom_type()->is_vect();
1673 return vt->length();
1674 }
1675
1676 static inline uint vector_length(const MachNode* use, MachOper* opnd) {
1677 uint def_idx = use->operand_index(opnd);
1678 Node* def = use->in(def_idx);
1679 return def->bottom_type()->is_vect()->length();
1680 }
1681
1682 static inline uint vector_length_in_bytes(const MachNode* n) {
1683 const TypeVect* vt = n->bottom_type()->is_vect();
1684 return vt->length_in_bytes();
1685 }
1686
1687 static inline uint vector_length_in_bytes(const MachNode* use, MachOper* opnd) {
1688 uint def_idx = use->operand_index(opnd);
1689 Node* def = use->in(def_idx);
1690 return def->bottom_type()->is_vect()->length_in_bytes();
1691 }
1692
1693 static inline Assembler::AvxVectorLen vector_length_encoding(const MachNode* n) {
1694 switch(vector_length_in_bytes(n)) {
1695 case 4: // fall-through
1696 case 8: // fall-through
1697 case 16: return Assembler::AVX_128bit;
1698 case 32: return Assembler::AVX_256bit;
1699 case 64: return Assembler::AVX_512bit;
1700
1701 default: {
1702 ShouldNotReachHere();
1703 return Assembler::AVX_NoVec;
1704 }
1705 }
1706 }
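// Example: a 32-byte vector (e.g. 8 ints) maps to Assembler::AVX_256bit, which
// is the value passed as 'vector_len' to the AVX/EVEX emitters throughout this
// file. Sub-16-byte vectors share the AVX_128bit encoding since they occupy a
// single xmm register.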
1707
1708 // Helper methods for MachSpillCopyNode::implementation().
1709 static int vec_mov_helper(CodeBuffer *cbuf, bool do_size, int src_lo, int dst_lo,
1710 int src_hi, int dst_hi, uint ireg, outputStream* st) {
1711 // In the 64-bit VM the size calculation is very complex, so sizes are obtained
1712 // by emitting instructions into a scratch buffer; do_size is only used in the 32-bit VM.
1713 LP64_ONLY( assert(!do_size, "this method calculates size only for 32-bit VM"); )
1714 assert(ireg == Op_VecS || // 32bit vector
1715        ((src_lo & 1) == 0 && (src_lo + 1) == src_hi &&
1716         (dst_lo & 1) == 0 && (dst_lo + 1) == dst_hi),
1717        "no non-adjacent vector moves" );
1718 if (cbuf) {
1719 C2_MacroAssembler _masm(cbuf);
1720 int offset = __ offset();
1721 switch (ireg) {
1722 case Op_VecS: // copy whole register
1723 case Op_VecD:
1724 case Op_VecX:
2011 %}
2012
2013 encode %{
2014
2015 enc_class call_epilog %{
2016 if (VerifyStackAtCalls) {
2017 // Check that stack depth is unchanged: find magic cookie on stack
2018 int framesize = ra_->reg2offset_unchecked(OptoReg::add(ra_->_matcher._old_SP, -3*VMRegImpl::slots_per_word));
2019 C2_MacroAssembler _masm(&cbuf);
2020 Label L;
2021 __ cmpptr(Address(rsp, framesize), (int32_t)0xbadb100d);
2022 __ jccb(Assembler::equal, L);
2023 // Die if stack mismatch
2024 __ int3();
2025 __ bind(L);
2026 }
2027 %}
2028
2029 %}
2030
2031
2032 //----------OPERANDS-----------------------------------------------------------
2033 // Operand definitions must precede instruction definitions for correct parsing
2034 // in the ADLC because operands constitute user defined types which are used in
2035 // instruction definitions.
2036
2037 // Vectors
2038
2039 // Dummy generic vector operand. Should be used for all vector operands.
2040 // It is replaced with a concrete vec[SDXYZ] operand during the post-selection pass.
2041 operand vec() %{
2042 constraint(ALLOC_IN_RC(dynamic));
2043 match(VecX);
2044 match(VecY);
2045 match(VecZ);
2046 match(VecS);
2047 match(VecD);
2048
2049 format %{ %}
2050 interface(REG_INTER);
2775 ins_pipe(pipe_slow);
2776 %}
2777
2778 instruct absF_reg(regF dst) %{
2779 predicate((UseSSE>=1) && (UseAVX == 0));
2780 match(Set dst (AbsF dst));
2781 ins_cost(150);
2782 format %{ "andps $dst, [0x7fffffff]\t# abs float by sign masking" %}
2783 ins_encode %{
2784 __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
2785 %}
2786 ins_pipe(pipe_slow);
2787 %}
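// Worked example of the sign-mask trick used by the abs/neg instructs here:
//   -1.0f = 0xBF800000;  0xBF800000 & 0x7FFFFFFF = 0x3F800000 =  1.0f   (abs)
//    1.0f = 0x3F800000;  0x3F800000 ^ 0x80000000 = 0xBF800000 = -1.0f   (neg)
// The same holds for doubles with the corresponding 64-bit masks.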
2788
2789 instruct absF_reg_reg(vlRegF dst, vlRegF src) %{
2790 predicate(UseAVX > 0);
2791 match(Set dst (AbsF src));
2792 ins_cost(150);
2793 format %{ "vandps $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
2794 ins_encode %{
2795 int vector_len = Assembler::AVX_128bit;
2796 __ vandps($dst$$XMMRegister, $src$$XMMRegister,
2797 ExternalAddress(float_signmask()), vector_len);
2798 %}
2799 ins_pipe(pipe_slow);
2800 %}
2801
2802 instruct absD_reg(regD dst) %{
2803 predicate((UseSSE>=2) && (UseAVX == 0));
2804 match(Set dst (AbsD dst));
2805 ins_cost(150);
2806 format %{ "andpd $dst, [0x7fffffffffffffff]\t"
2807 "# abs double by sign masking" %}
2808 ins_encode %{
2809 __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
2810 %}
2811 ins_pipe(pipe_slow);
2812 %}
2813
2814 instruct absD_reg_reg(vlRegD dst, vlRegD src) %{
2815 predicate(UseAVX > 0);
2816 match(Set dst (AbsD src));
2817 ins_cost(150);
2818 format %{ "vandpd $dst, $src, [0x7fffffffffffffff]\t"
2819 "# abs double by sign masking" %}
2820 ins_encode %{
2821 int vector_len = Assembler::AVX_128bit;
2822 __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
2823 ExternalAddress(double_signmask()), vector_len);
2824 %}
2825 ins_pipe(pipe_slow);
2826 %}
2827
2828 instruct negF_reg(regF dst) %{
2829 predicate((UseSSE>=1) && (UseAVX == 0));
2830 match(Set dst (NegF dst));
2831 ins_cost(150);
2832 format %{ "xorps $dst, [0x80000000]\t# neg float by sign flipping" %}
2833 ins_encode %{
2834 __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
2835 %}
2836 ins_pipe(pipe_slow);
2837 %}
2838
2839 instruct negF_reg_reg(vlRegF dst, vlRegF src) %{
2840 predicate(UseAVX > 0);
2841 match(Set dst (NegF src));
2842 ins_cost(150);
2843 format %{ "vnegatess $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
2927
2928 format %{ "sqrtsd $dst, $src" %}
2929 ins_cost(150);
2930 ins_encode %{
2931 __ sqrtsd($dst$$XMMRegister, $src$$Address);
2932 %}
2933 ins_pipe(pipe_slow);
2934 %}
2935
2936 instruct sqrtD_imm(regD dst, immD con) %{
2937 predicate(UseSSE>=2);
2938 match(Set dst (SqrtD con));
2939 format %{ "sqrtsd $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2940 ins_cost(150);
2941 ins_encode %{
2942 __ sqrtsd($dst$$XMMRegister, $constantaddress($con));
2943 %}
2944 ins_pipe(pipe_slow);
2945 %}
2946
2947
2948 #ifdef _LP64
2949 instruct roundD_reg(legRegD dst, legRegD src, immU8 rmode) %{
2950 match(Set dst (RoundDoubleMode src rmode));
2951 format %{ "roundsd $dst,$src" %}
2952 ins_cost(150);
2953 ins_encode %{
2954 assert(UseSSE >= 4, "required");
2955 __ roundsd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant);
2956 %}
2957 ins_pipe(pipe_slow);
2958 %}
2959
2960 instruct roundD_mem(legRegD dst, memory src, immU8 rmode) %{
2961 match(Set dst (RoundDoubleMode (LoadD src) rmode));
2962 format %{ "roundsd $dst,$src" %}
2963 ins_cost(150);
2964 ins_encode %{
2965 assert(UseSSE >= 4, "required");
2966 __ roundsd($dst$$XMMRegister, $src$$Address, $rmode$$constant);
2967 %}
2968 ins_pipe(pipe_slow);
2969 %}
2970
2971 instruct roundD_imm(legRegD dst, immD con, immU8 rmode, rRegI scratch_reg) %{
2972 match(Set dst (RoundDoubleMode con rmode));
2973 effect(TEMP scratch_reg);
2974 format %{ "roundsd $dst,[$constantaddress]\t# load from constant table: double=$con" %}
2975 ins_cost(150);
2976 ins_encode %{
2977 assert(UseSSE >= 4, "required");
2978 __ roundsd($dst$$XMMRegister, $constantaddress($con), $rmode$$constant, $scratch_reg$$Register);
2979 %}
2980 ins_pipe(pipe_slow);
2981 %}
2982
2983 instruct vroundD_reg(legVec dst, legVec src, immU8 rmode) %{
2984 predicate(n->as_Vector()->length() < 8);
2985 match(Set dst (RoundDoubleModeV src rmode));
2986 format %{ "vroundpd $dst,$src,$rmode\t! round packedD" %}
2987 ins_encode %{
2988 assert(UseAVX > 0, "required");
2989 int vector_len = vector_length_encoding(this);
2990 __ vroundpd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, vector_len);
2991 %}
2992 ins_pipe( pipe_slow );
2993 %}
2994
2995 instruct vround8D_reg(vec dst, vec src, immU8 rmode) %{
2996 predicate(n->as_Vector()->length() == 8);
2997 match(Set dst (RoundDoubleModeV src rmode));
2998 format %{ "vrndscalepd $dst,$src,$rmode\t! round packed8D" %}
2999 ins_encode %{
3000 assert(UseAVX > 2, "required");
3001 __ vrndscalepd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, Assembler::AVX_512bit);
3002 %}
3003 ins_pipe( pipe_slow );
3004 %}
3005
3006 instruct vroundD_mem(legVec dst, memory mem, immU8 rmode) %{
3007 predicate(n->as_Vector()->length() < 8);
3008 match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
3009 format %{ "vroundpd $dst, $mem, $rmode\t! round packedD" %}
3010 ins_encode %{
3011 assert(UseAVX > 0, "required");
3012 int vector_len = vector_length_encoding(this);
3013 __ vroundpd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, vector_len);
3014 %}
3015 ins_pipe( pipe_slow );
3016 %}
3017
3018 instruct vround8D_mem(vec dst, memory mem, immU8 rmode) %{
3019 predicate(n->as_Vector()->length() == 8);
3020 match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
3021 format %{ "vrndscalepd $dst,$mem,$rmode\t! round packed8D" %}
3022 ins_encode %{
3023 assert(UseAVX > 2, "required");
3024 __ vrndscalepd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, Assembler::AVX_512bit);
3025 %}
3026 ins_pipe( pipe_slow );
3027 %}
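// Note on the rmode immediate: it is passed straight through to
// roundsd/vroundpd/vrndscalepd, where (in the SSE4.1/AVX ROUND* encoding) the
// low two bits select the rounding direction: 00 = nearest-even, 01 = toward
// -inf (floor), 10 = toward +inf (ceil), 11 = truncate toward zero.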
3028 #endif // _LP64
3029
3030 instruct onspinwait() %{
3031 match(OnSpinWait);
3032 ins_cost(200);
3033
3034 format %{
3035 $$template
3036 $$emit$$"pause\t! membar_onspinwait"
3037 %}
3038 ins_encode %{
3039 __ pause();
3071 instruct MoveVec2Leg(legVec dst, vec src) %{
3072 match(Set dst src);
3073 format %{ "" %}
3074 ins_encode %{
3075 ShouldNotReachHere();
3076 %}
3077 ins_pipe( fpu_reg_reg );
3078 %}
3079
3080 instruct MoveLeg2Vec(vec dst, legVec src) %{
3081 match(Set dst src);
3082 format %{ "" %}
3083 ins_encode %{
3084 ShouldNotReachHere();
3085 %}
3086 ins_pipe( fpu_reg_reg );
3087 %}
3088
3089 // ============================================================================
3090
3091 // Load vectors
3092 instruct loadV(vec dst, memory mem) %{
3093 match(Set dst (LoadVector mem));
3094 ins_cost(125);
3095 format %{ "load_vector $dst,$mem" %}
3096 ins_encode %{
3097 switch (vector_length_in_bytes(this)) {
3098 case 4: __ movdl ($dst$$XMMRegister, $mem$$Address); break;
3099 case 8: __ movq ($dst$$XMMRegister, $mem$$Address); break;
3100 case 16: __ movdqu ($dst$$XMMRegister, $mem$$Address); break;
3101 case 32: __ vmovdqu ($dst$$XMMRegister, $mem$$Address); break;
3102 case 64: __ evmovdqul($dst$$XMMRegister, $mem$$Address, Assembler::AVX_512bit); break;
3103 default: ShouldNotReachHere();
3104 }
3105 %}
3106 ins_pipe( pipe_slow );
3107 %}
3108
3109 // Store vectors generic operand pattern.
3110 instruct storeV(memory mem, vec src) %{
3111 match(Set mem (StoreVector mem src));
3112 ins_cost(145);
3113 format %{ "store_vector $mem,$src\n\t" %}
3114 ins_encode %{
3115 switch (vector_length_in_bytes(this, $src)) {
3116 case 4: __ movdl ($mem$$Address, $src$$XMMRegister); break;
3117 case 8: __ movq ($mem$$Address, $src$$XMMRegister); break;
3118 case 16: __ movdqu ($mem$$Address, $src$$XMMRegister); break;
3119 case 32: __ vmovdqu ($mem$$Address, $src$$XMMRegister); break;
3120 case 64: __ evmovdqul($mem$$Address, $src$$XMMRegister, Assembler::AVX_512bit); break;
3121 default: ShouldNotReachHere();
3122 }
3123 %}
3124 ins_pipe( pipe_slow );
3125 %}
3126
3127 // ====================REPLICATE=======================================
3128
3129 // Replicate byte scalar to be vector
3130 instruct ReplB_reg(vec dst, rRegI src) %{
3131 match(Set dst (ReplicateB src));
3132 format %{ "replicateB $dst,$src" %}
3133 ins_encode %{
3134 uint vlen = vector_length(this);
3135 if (vlen == 64 || VM_Version::supports_avx512vlbw()) { // AVX512VL for <512bit operands
3136 assert(VM_Version::supports_avx512bw(), "required"); // 512-bit byte vectors assume AVX512BW
3137 int vlen_enc = vector_length_encoding(this);
3138 __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vlen_enc);
3139 } else {
3140 __ movdl($dst$$XMMRegister, $src$$Register);
3141 __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3142 __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3143 if (vlen >= 16) {
3144 __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3145 if (vlen >= 32) {
3146 assert(vlen == 32, "sanity");
3147 __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3148 }
3149 }
3150 }
3151 %}
3152 ins_pipe( pipe_slow );
3153 %}
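// The pre-AVX512 sequence above builds the broadcast stepwise: movdl puts the
// byte in lane 0, punpcklbw duplicates it into a 16-bit lane, pshuflw 0x00
// copies that word across the low 8 bytes, punpcklqdq fills all 16 bytes, and
// vinserti128_high mirrors the xmm into the high ymm half for 32-byte vectors.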
3154
3155 instruct ReplB_mem(vec dst, memory mem) %{
3156 predicate(VM_Version::supports_avx2());
3157 match(Set dst (ReplicateB (LoadB mem)));
3158 format %{ "replicateB $dst,$mem" %}
3159 ins_encode %{
3160 int vector_len = vector_length_encoding(this);
3161 __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
3162 %}
3163 ins_pipe( pipe_slow );
3164 %}
3165
3166 instruct ReplB_imm(vec dst, immI con) %{
3167 match(Set dst (ReplicateB con));
3168 format %{ "replicateB $dst,$con" %}
3169 ins_encode %{
3170 uint vlen = vector_length(this);
3171 InternalAddress const_addr = $constantaddress(replicate8_imm($con$$constant, 1));
3172 if (vlen == 4) {
3173 __ movdl($dst$$XMMRegister, const_addr);
3174 } else {
3175 __ movq($dst$$XMMRegister, const_addr);
3176 if (vlen >= 16) {
3177 if (VM_Version::supports_avx2()) {
3178 int vlen_enc = vector_length_encoding(this);
3179 __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3180 } else {
3181 assert(vlen == 16, "sanity");
3182 __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3183 }
3184 }
3185 }
3186 %}
3187 ins_pipe( pipe_slow );
3188 %}
3189
3190 // Replicate byte scalar zero to be vector
3191 instruct ReplB_zero(vec dst, immI0 zero) %{
3192 match(Set dst (ReplicateB zero));
3193 format %{ "replicateB $dst,$zero" %}
3194 ins_encode %{
3195 uint vlen = vector_length(this);
3196 if (vlen <= 16) {
3197 __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3198 } else {
3199 // Use vpxor since AVX512F does not have 512bit vxorpd (requires AVX512DQ).
3200 int vlen_enc = vector_length_encoding(this);
3201 __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3202 }
3203 %}
3204 ins_pipe( fpu_reg_reg );
3205 %}
3206
3207 // ====================ReplicateS=======================================
3208
3209 instruct ReplS_reg(vec dst, rRegI src) %{
3210 match(Set dst (ReplicateS src));
3211 format %{ "replicateS $dst,$src" %}
3248 uint vlen = vector_length(this);
3249 InternalAddress const_addr = $constantaddress(replicate8_imm($con$$constant, 2));
3250 if (vlen == 2) {
3251 __ movdl($dst$$XMMRegister, const_addr);
3252 } else {
3253 __ movq($dst$$XMMRegister, const_addr);
3254 if (vlen >= 8) {
3255 if (VM_Version::supports_avx2()) {
3256 int vlen_enc = vector_length_encoding(this);
3257 __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3258 } else {
3259 assert(vlen == 8, "sanity");
3260 __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3261 }
3262 }
3263 }
3264 %}
3265 ins_pipe( fpu_reg_reg );
3266 %}
3267
3268 instruct ReplS_zero(vec dst, immI0 zero) %{
3269 match(Set dst (ReplicateS zero));
3270 format %{ "replicateS $dst,$zero" %}
3271 ins_encode %{
3272 uint vlen = vector_length(this);
3273 if (vlen <= 8) {
3274 __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3275 } else {
3276 int vlen_enc = vector_length_encoding(this);
3277 __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3278 }
3279 %}
3280 ins_pipe( fpu_reg_reg );
3281 %}
3282
3283 // ====================ReplicateI=======================================
3284
3285 instruct ReplI_reg(vec dst, rRegI src) %{
3286 match(Set dst (ReplicateI src));
3287 format %{ "replicateI $dst,$src" %}
3288 ins_encode %{
3295 __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3296 if (vlen >= 8) {
3297 assert(vlen == 8, "sanity");
3298 __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3299 }
3300 }
3301 %}
3302 ins_pipe( pipe_slow );
3303 %}
3304
3305 instruct ReplI_mem(vec dst, memory mem) %{
3306 match(Set dst (ReplicateI (LoadI mem)));
3307 format %{ "replicateI $dst,$mem" %}
3308 ins_encode %{
3309 uint vlen = vector_length(this);
3310 if (vlen <= 4) {
3311 __ movdl($dst$$XMMRegister, $mem$$Address);
3312 __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3313 } else {
3314 assert(VM_Version::supports_avx2(), "sanity");
3315 int vector_len = vector_length_encoding(this);
3316 __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
3317 }
3318 %}
3319 ins_pipe( pipe_slow );
3320 %}
3321
3322 instruct ReplI_imm(vec dst, immI con) %{
3323 match(Set dst (ReplicateI con));
3324 format %{ "replicateI $dst,$con" %}
3325 ins_encode %{
3326 uint vlen = vector_length(this);
3327 InternalAddress const_addr = $constantaddress(replicate8_imm($con$$constant, 4));
3328 if (vlen <= 4) {
3329 __ movq($dst$$XMMRegister, const_addr);
3330 if (vlen == 4) {
3331 __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3332 }
3333 } else {
3334 assert(VM_Version::supports_avx2(), "sanity");
3335 int vector_len = vector_length_encoding(this);
3336 __ movq($dst$$XMMRegister, const_addr);
3337 __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3338 }
3339 %}
3340 ins_pipe( pipe_slow );
3341 %}
3342
3343 // Replicate integer (4 byte) scalar zero to be vector
3344 instruct ReplI_zero(vec dst, immI0 zero) %{
3345 match(Set dst (ReplicateI zero));
3346 format %{ "replicateI $dst,$zero" %}
3347 ins_encode %{
3348 uint vlen = vector_length(this);
3349 if (vlen <= 4) {
3350 __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3351 } else {
3352 int vlen_enc = vector_length_encoding(this);
3353 __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3354 }
3355 %}
3356 ins_pipe( fpu_reg_reg );
3357 %}
3358
3359 // ====================ReplicateL=======================================
3360
3361 #ifdef _LP64
3362 // Replicate long (8 byte) scalar to be vector
3363 instruct ReplL_reg(vec dst, rRegL src) %{
3364 match(Set dst (ReplicateL src));
3366 ins_encode %{
3367 uint vlen = vector_length(this);
3368 if (vlen == 2) {
3369 __ movdq($dst$$XMMRegister, $src$$Register);
3370 __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3371 } else if (vlen == 8 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
3372 int vlen_enc = vector_length_encoding(this);
3373 __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vlen_enc);
3374 } else {
3375 assert(vlen == 4, "sanity");
3376 __ movdq($dst$$XMMRegister, $src$$Register);
3377 __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3378 __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3379 }
3380 %}
3381 ins_pipe( pipe_slow );
3382 %}
3383 #else // _LP64
3384 // Replicate long (8 byte) scalar to be vector
3385 instruct ReplL_reg(vec dst, eRegL src, vec tmp) %{
3386 predicate(n->as_Vector()->length() <= 4);
3387 match(Set dst (ReplicateL src));
3388 effect(TEMP dst, USE src, TEMP tmp);
3389 format %{ "replicateL $dst,$src" %}
3390 ins_encode %{
3391 uint vlen = vector_length(this);
3392 if (vlen == 2) {
3393 __ movdl($dst$$XMMRegister, $src$$Register);
3394 __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3395 __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
3396 __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3397 } else if (VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
3398 int vector_len = Assembler::AVX_256bit;
3399 __ movdl($dst$$XMMRegister, $src$$Register);
3400 __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3401 __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
3402 __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3403 } else {
3404 __ movdl($dst$$XMMRegister, $src$$Register);
3405 __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3406 __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
3407 __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3408 __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3409 }
3410 %}
3411 ins_pipe( pipe_slow );
3412 %}
3413
3414 instruct ReplL_reg_leg(legVec dst, eRegL src, legVec tmp) %{
3415 predicate(n->as_Vector()->length() == 8);
3416 match(Set dst (ReplicateL src));
3417 effect(TEMP dst, USE src, TEMP tmp);
3418 format %{ "replicateL $dst,$src" %}
3419 ins_encode %{
3420 if (VM_Version::supports_avx512vl()) {
3421 __ movdl($dst$$XMMRegister, $src$$Register);
3422 __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3423 __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
3424 __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3425 __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3426 __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3427 } else {
3428 int vector_len = Assembler::AVX_512bit;
3429 __ movdl($dst$$XMMRegister, $src$$Register);
3430 __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3431 __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
3432 __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3433 }
3434 %}
3435 ins_pipe( pipe_slow );
3436 %}
3437 #endif // _LP64
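// On 32-bit, a long lives in a register pair, so the patterns above assemble
// the 64-bit lane manually: movdl loads each 32-bit half (HIGH_FROM_LOW names
// the register holding the upper half of the pair), punpckldq glues them into
// one 64-bit lane, and the lane is then broadcast with punpcklqdq/vpbroadcastq.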
3438
3439 instruct ReplL_mem(vec dst, memory mem) %{
3440 match(Set dst (ReplicateL (LoadL mem)));
3441 format %{ "replicateL $dst,$mem" %}
3442 ins_encode %{
3443 uint vlen = vector_length(this);
3444 if (vlen == 2) {
3445 __ movq($dst$$XMMRegister, $mem$$Address);
3446 __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3447 } else {
3448 assert(VM_Version::supports_avx2(), "sanity");
3449 int vlen_enc = vector_length_encoding(this);
3450 __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vlen_enc);
3451 }
3452 %}
3481 if (vlen == 2) {
3482 __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3483 } else {
3484 int vlen_enc = vector_length_encoding(this);
3485 __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3486 }
3487 %}
3488 ins_pipe( fpu_reg_reg );
3489 %}
3490
3491 // ====================ReplicateF=======================================
3492
3493 instruct ReplF_reg(vec dst, vlRegF src) %{
3494 match(Set dst (ReplicateF src));
3495 format %{ "replicateF $dst,$src" %}
3496 ins_encode %{
3497 uint vlen = vector_length(this);
3498 if (vlen <= 4) {
3499 __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
3500 } else if (VM_Version::supports_avx2()) {
3501 int vector_len = vector_length_encoding(this);
3502 __ vbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vector_len); // reg-to-reg variant requires AVX2
3503 } else {
3504 assert(vlen == 8, "sanity");
3505 __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
3506 __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3507 }
3508 %}
3509 ins_pipe( pipe_slow );
3510 %}
3511
3512 instruct ReplF_mem(vec dst, memory mem) %{
3513 match(Set dst (ReplicateF (LoadF mem)));
3514 format %{ "replicateF $dst,$mem" %}
3515 ins_encode %{
3516 uint vlen = vector_length(this);
3517 if (vlen <= 4) {
3518 __ movdl($dst$$XMMRegister, $mem$$Address);
3519 __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3520 } else {
3521 assert(VM_Version::supports_avx(), "sanity");
3522 int vector_len = vector_length_encoding(this);
3523 __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vector_len);
3524 }
3525 %}
3526 ins_pipe( pipe_slow );
3527 %}
3528
3529 instruct ReplF_zero(vec dst, immF0 zero) %{
3530 match(Set dst (ReplicateF zero));
3531 format %{ "replicateF $dst,$zero" %}
3532 ins_encode %{
3533 uint vlen = vector_length(this);
3534 if (vlen <= 4) {
3535 __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
3536 } else {
3537 int vlen_enc = vector_length_encoding(this);
3538 __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); // 512bit vxorps requires AVX512DQ
3539 }
3540 %}
3541 ins_pipe( fpu_reg_reg );
3542 %}
3543
3544 // ====================ReplicateD=======================================
3545
3546 // Replicate double (8 bytes) scalar to be vector
3547 instruct ReplD_reg(vec dst, vlRegD src) %{
3548 match(Set dst (ReplicateD src));
3549 format %{ "replicateD $dst,$src" %}
3550 ins_encode %{
3551 uint vlen = vector_length(this);
3552 if (vlen == 2) {
3553 __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
3554 } else if (VM_Version::supports_avx2()) {
3555 int vector_len = vector_length_encoding(this);
3556 __ vbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vector_len); // reg-to-reg variant requires AVX2
3557 } else {
3558 assert(vlen == 4, "sanity");
3559 __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
3560 __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3561 }
3562 %}
3563 ins_pipe( pipe_slow );
3564 %}
3565
3566 instruct ReplD_mem(vec dst, memory mem) %{
3567 match(Set dst (ReplicateD (LoadD mem)));
3568 format %{ "replicateD $dst,$mem" %}
3569 ins_encode %{
3570 uint vlen = vector_length(this);
3571 if (vlen == 2) {
3572 __ movq($dst$$XMMRegister, $mem$$Address);
3573 __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x44);
3574 } else {
3575 assert(VM_Version::supports_avx(), "sanity");
3576 int vector_len = vector_length_encoding(this);
3577 __ vbroadcastsd($dst$$XMMRegister, $mem$$Address, vector_len);
3578 }
3579 %}
3580 ins_pipe( pipe_slow );
3581 %}
3582
3583 instruct ReplD_zero(vec dst, immD0 zero) %{
3584 match(Set dst (ReplicateD zero));
3585 format %{ "replicateD $dst,$zero" %}
3586 ins_encode %{
3587 uint vlen = vector_length(this);
3588 if (vlen == 2) {
3589 __ xorpd($dst$$XMMRegister, $dst$$XMMRegister);
3590 } else {
3591 int vlen_enc = vector_length_encoding(this);
3592 __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); // 512bit vxorpd requires AVX512DQ
3593 }
3594 %}
3595 ins_pipe( fpu_reg_reg );
3596 %}
3597
3598 // ====================REDUCTION ARITHMETIC=======================================
3599 // =======================Int Reduction==========================================
3600
3601 instruct reductionI(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
3602 predicate(n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT &&
3603 n->in(2)->bottom_type()->is_vect()->length() < 16);
3604 match(Set dst (AddReductionVI src1 src2));
3605 match(Set dst (MulReductionVI src1 src2));
3606 match(Set dst (AndReductionV src1 src2));
3607 match(Set dst ( OrReductionV src1 src2));
3608 match(Set dst (XorReductionV src1 src2));
3609 effect(TEMP vtmp1, TEMP vtmp2);
3610 format %{ "vector_reduction_int $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
3611 ins_encode %{
3612 int opcode = this->ideal_Opcode();
3613 int vlen = vector_length(this, $src2);
3614 __ reduceI(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
3615 %}
3616 ins_pipe( pipe_slow );
3617 %}
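// Reduction semantics: src1 is the scalar accumulator and src2 the vector, so
// e.g. AddReductionVI computes dst = src1 + src2[0] + ... + src2[n-1]; the
// same shape holds for the Mul/And/Or/Xor variants handled by reduceI().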
3618
3619 instruct reduction16I(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
3620 predicate(n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT &&
3621 n->in(2)->bottom_type()->is_vect()->length() == 16);
3622 match(Set dst (AddReductionVI src1 src2));
3623 match(Set dst (MulReductionVI src1 src2));
3624 match(Set dst (AndReductionV src1 src2));
3625 match(Set dst ( OrReductionV src1 src2));
3626 match(Set dst (XorReductionV src1 src2));
3627 effect(TEMP vtmp1, TEMP vtmp2);
3628 format %{ "vector_reduction_int $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
3629 ins_encode %{
3630 int opcode = this->ideal_Opcode();
3631 int vlen = vector_length(this, $src2);
3632 __ reduceI(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
3633 %}
3634 ins_pipe( pipe_slow );
3635 %}
3636
3637 // =======================Long Reduction==========================================
3638
3639 #ifdef _LP64
3640 instruct reductionL(rRegL dst, rRegL src1, vec src2, vec vtmp1, vec vtmp2) %{
3641 predicate(n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG &&
3642 n->in(2)->bottom_type()->is_vect()->length() < 8);
3643 match(Set dst (AddReductionVL src1 src2));
3644 match(Set dst (MulReductionVL src1 src2));
3645 match(Set dst (AndReductionV src1 src2));
3646 match(Set dst ( OrReductionV src1 src2));
3647 match(Set dst (XorReductionV src1 src2));
3648 effect(TEMP vtmp1, TEMP vtmp2);
3649 format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
3650 ins_encode %{
3651 int opcode = this->ideal_Opcode();
3652 int vlen = vector_length(this, $src2);
3653 __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
3654 %}
3655 ins_pipe( pipe_slow );
3656 %}
3657
3658 instruct reduction8L(rRegL dst, rRegL src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
3659 predicate(n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG &&
3660 n->in(2)->bottom_type()->is_vect()->length() == 8);
3661 match(Set dst (AddReductionVL src1 src2));
3662 match(Set dst (MulReductionVL src1 src2));
3663 match(Set dst (AndReductionV src1 src2));
3664 match(Set dst ( OrReductionV src1 src2));
3665 match(Set dst (XorReductionV src1 src2));
3666 effect(TEMP vtmp1, TEMP vtmp2);
3667 format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
3668 ins_encode %{
3669 int opcode = this->ideal_Opcode();
3670 int vlen = vector_length(this, $src2);
3671 __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
3672 %}
3673 ins_pipe( pipe_slow );
3674 %}
3675 #endif // _LP64
3676
3677 // =======================Float Reduction==========================================
3678
3679 instruct reductionF128(regF dst, vec src, vec vtmp) %{
3680 predicate(n->in(2)->bottom_type()->is_vect()->length() <= 4);
3681 match(Set dst (AddReductionVF dst src));
3682 match(Set dst (MulReductionVF dst src));
3683 effect(TEMP dst, TEMP vtmp);
3684 format %{ "vector_reduction_fp $dst,$src ; using $vtmp as TEMP" %}
3685 ins_encode %{
3686 int opcode = this->ideal_Opcode();
3687 int vlen = vector_length(this, $src);
3688 __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
3689 %}
3690 ins_pipe( pipe_slow );
3691 %}
3692
3693 instruct reduction8F(regF dst, vec src, vec vtmp1, vec vtmp2) %{
3694 predicate(n->in(2)->bottom_type()->is_vect()->length() == 8);
3695 match(Set dst (AddReductionVF dst src));
3696 match(Set dst (MulReductionVF dst src));
3697 effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
3698 format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
3699 ins_encode %{
3700 int opcode = this->ideal_Opcode();
3701 int vlen = vector_length(this, $src);
3702 __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
3703 %}
3704 ins_pipe( pipe_slow );
3705 %}
3706
3707 instruct reduction16F(regF dst, legVec src, legVec vtmp1, legVec vtmp2) %{
3708 predicate(n->in(2)->bottom_type()->is_vect()->length() == 16);
3709 match(Set dst (AddReductionVF dst src));
3710 match(Set dst (MulReductionVF dst src));
3711 effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
3712 format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
3713 ins_encode %{
3714 int opcode = this->ideal_Opcode();
3715 int vlen = vector_length(this, $src);
3716 __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
3717 %}
3718 ins_pipe( pipe_slow );
3719 %}
3720
3721 // =======================Double Reduction==========================================
3722
3723 instruct reduction2D(regD dst, vec src, vec vtmp) %{
3724 predicate(n->in(2)->bottom_type()->is_vect()->length() == 2);
3725 match(Set dst (AddReductionVD dst src));
3726 match(Set dst (MulReductionVD dst src));
3727 effect(TEMP dst, TEMP vtmp);
3728 format %{ "vector_reduction_double $dst,$src ; using $vtmp as TEMP" %}
3729 ins_encode %{
3730 int opcode = this->ideal_Opcode();
3731 int vlen = vector_length(this, $src);
3732 __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
3733 %}
3734 ins_pipe( pipe_slow );
3735 %}
3736
3737 instruct reduction4D(regD dst, vec src, vec vtmp1, vec vtmp2) %{
3738 predicate(n->in(2)->bottom_type()->is_vect()->length() == 4);
3739 match(Set dst (AddReductionVD dst src));
3740 match(Set dst (MulReductionVD dst src));
3741 effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
3742 format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
3743 ins_encode %{
3744 int opcode = this->ideal_Opcode();
3745 int vlen = vector_length(this, $src);
3746 __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
3747 %}
3748 ins_pipe( pipe_slow );
3749 %}
3750
3751 instruct reduction8D(regD dst, legVec src, legVec vtmp1, legVec vtmp2) %{
3752 predicate(n->in(2)->bottom_type()->is_vect()->length() == 8);
3753 match(Set dst (AddReductionVD dst src));
3754 match(Set dst (MulReductionVD dst src));
3755 effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
3756 format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
3757 ins_encode %{
3758 int opcode = this->ideal_Opcode();
3759 int vlen = vector_length(this, $src);
3760 __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
3761 %}
3762 ins_pipe( pipe_slow );
3763 %}
3764
3765 // ====================VECTOR ARITHMETIC=======================================
3766
3767 // --------------------------------- ADD --------------------------------------
3768
3769 // Bytes vector add
3770 instruct vaddB(vec dst, vec src) %{
3771 predicate(UseAVX == 0);
3772 match(Set dst (AddVB dst src));
3773 format %{ "paddb $dst,$src\t! add packedB" %}
3774 ins_encode %{
3775 __ paddb($dst$$XMMRegister, $src$$XMMRegister);
3776 %}
3777 ins_pipe( pipe_slow );
3778 %}
3779
3780 instruct vaddB_reg(vec dst, vec src1, vec src2) %{
3781 predicate(UseAVX > 0);
3782 match(Set dst (AddVB src1 src2));
3783 format %{ "vpaddb $dst,$src1,$src2\t! add packedB" %}
3784 ins_encode %{
3785 int vector_len = vector_length_encoding(this);
3786 __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
3787 %}
3788 ins_pipe( pipe_slow );
3789 %}
3790
3791 instruct vaddB_mem(vec dst, vec src, memory mem) %{
3792 predicate(UseAVX > 0);
3793 match(Set dst (AddVB src (LoadVector mem)));
3794 format %{ "vpaddb $dst,$src,$mem\t! add packedB" %}
3795 ins_encode %{
3796 int vector_len = vector_length_encoding(this);
3797 __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
3798 %}
3799 ins_pipe( pipe_slow );
3800 %}
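// The three instructs above show the pattern repeated for every arithmetic op
// in this section: a two-operand SSE form that updates dst in place, an AVX
// three-operand register form, and an AVX form that folds a LoadVector of the
// second operand directly into the instruction.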
3801
3802 // Shorts/Chars vector add
3803 instruct vaddS(vec dst, vec src) %{
3804 predicate(UseAVX == 0);
3805 match(Set dst (AddVS dst src));
3806 format %{ "paddw $dst,$src\t! add packedS" %}
3807 ins_encode %{
3808 __ paddw($dst$$XMMRegister, $src$$XMMRegister);
3809 %}
3810 ins_pipe( pipe_slow );
3811 %}
3812
3813 instruct vaddS_reg(vec dst, vec src1, vec src2) %{
3814 predicate(UseAVX > 0);
3815 match(Set dst (AddVS src1 src2));
3816 format %{ "vpaddw $dst,$src1,$src2\t! add packedS" %}
3817 ins_encode %{
3818 int vector_len = vector_length_encoding(this);
3819 __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
3820 %}
3821 ins_pipe( pipe_slow );
3822 %}
3823
3824 instruct vaddS_mem(vec dst, vec src, memory mem) %{
3825 predicate(UseAVX > 0);
3826 match(Set dst (AddVS src (LoadVector mem)));
3827 format %{ "vpaddw $dst,$src,$mem\t! add packedS" %}
3828 ins_encode %{
3829 int vector_len = vector_length_encoding(this);
3830 __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
3831 %}
3832 ins_pipe( pipe_slow );
3833 %}
3834
3835 // Integers vector add
3836 instruct vaddI(vec dst, vec src) %{
3837 predicate(UseAVX == 0);
3838 match(Set dst (AddVI dst src));
3839 format %{ "paddd $dst,$src\t! add packedI" %}
3840 ins_encode %{
3841 __ paddd($dst$$XMMRegister, $src$$XMMRegister);
3842 %}
3843 ins_pipe( pipe_slow );
3844 %}
3845
3846 instruct vaddI_reg(vec dst, vec src1, vec src2) %{
3847 predicate(UseAVX > 0);
3848 match(Set dst (AddVI src1 src2));
3849 format %{ "vpaddd $dst,$src1,$src2\t! add packedI" %}
3850 ins_encode %{
3851 int vector_len = vector_length_encoding(this);
3852 __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
3853 %}
3854 ins_pipe( pipe_slow );
3855 %}
3856
3857
3858 instruct vaddI_mem(vec dst, vec src, memory mem) %{
3859 predicate(UseAVX > 0);
3860 match(Set dst (AddVI src (LoadVector mem)));
3861 format %{ "vpaddd $dst,$src,$mem\t! add packedI" %}
3862 ins_encode %{
3863 int vector_len = vector_length_encoding(this);
3864 __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
3865 %}
3866 ins_pipe( pipe_slow );
3867 %}
3868
3869 // Longs vector add
3870 instruct vaddL(vec dst, vec src) %{
3871 predicate(UseAVX == 0);
3872 match(Set dst (AddVL dst src));
3873 format %{ "paddq $dst,$src\t! add packedL" %}
3874 ins_encode %{
3875 __ paddq($dst$$XMMRegister, $src$$XMMRegister);
3876 %}
3877 ins_pipe( pipe_slow );
3878 %}
3879
3880 instruct vaddL_reg(vec dst, vec src1, vec src2) %{
3881 predicate(UseAVX > 0);
3882 match(Set dst (AddVL src1 src2));
3883 format %{ "vpaddq $dst,$src1,$src2\t! add packedL" %}
3884 ins_encode %{
3885 int vector_len = vector_length_encoding(this);
3886 __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
3887 %}
3888 ins_pipe( pipe_slow );
3889 %}
3890
3891 instruct vaddL_mem(vec dst, vec src, memory mem) %{
3892 predicate(UseAVX > 0);
3893 match(Set dst (AddVL src (LoadVector mem)));
3894 format %{ "vpaddq $dst,$src,$mem\t! add packedL" %}
3895 ins_encode %{
3896 int vector_len = vector_length_encoding(this);
3897 __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
3898 %}
3899 ins_pipe( pipe_slow );
3900 %}
3901
3902 // Floats vector add
3903 instruct vaddF(vec dst, vec src) %{
3904 predicate(UseAVX == 0);
3905 match(Set dst (AddVF dst src));
3906 format %{ "addps $dst,$src\t! add packedF" %}
3907 ins_encode %{
3908 __ addps($dst$$XMMRegister, $src$$XMMRegister);
3909 %}
3910 ins_pipe( pipe_slow );
3911 %}
3912
3913 instruct vaddF_reg(vec dst, vec src1, vec src2) %{
3914 predicate(UseAVX > 0);
3915 match(Set dst (AddVF src1 src2));
3916 format %{ "vaddps $dst,$src1,$src2\t! add packedF" %}
3917 ins_encode %{
3918 int vector_len = vector_length_encoding(this);
3919 __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
3920 %}
3921 ins_pipe( pipe_slow );
3922 %}
3923
3924 instruct vaddF_mem(vec dst, vec src, memory mem) %{
3925 predicate(UseAVX > 0);
3926 match(Set dst (AddVF src (LoadVector mem)));
3927 format %{ "vaddps $dst,$src,$mem\t! add packedF" %}
3928 ins_encode %{
3929 int vector_len = vector_length_encoding(this);
3930 __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
3931 %}
3932 ins_pipe( pipe_slow );
3933 %}
3934
3935 // Doubles vector add
3936 instruct vaddD(vec dst, vec src) %{
3937 predicate(UseAVX == 0);
3938 match(Set dst (AddVD dst src));
3939 format %{ "addpd $dst,$src\t! add packedD" %}
3940 ins_encode %{
3941 __ addpd($dst$$XMMRegister, $src$$XMMRegister);
3942 %}
3943 ins_pipe( pipe_slow );
3944 %}
3945
3946 instruct vaddD_reg(vec dst, vec src1, vec src2) %{
3947 predicate(UseAVX > 0);
3948 match(Set dst (AddVD src1 src2));
3949 format %{ "vaddpd $dst,$src1,$src2\t! add packedD" %}
3950 ins_encode %{
3951 int vector_len = vector_length_encoding(this);
3952 __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
3953 %}
3954 ins_pipe( pipe_slow );
3955 %}
3956
3957 instruct vaddD_mem(vec dst, vec src, memory mem) %{
3958 predicate(UseAVX > 0);
3959 match(Set dst (AddVD src (LoadVector mem)));
3960 format %{ "vaddpd $dst,$src,$mem\t! add packedD" %}
3961 ins_encode %{
3962 int vector_len = vector_length_encoding(this);
3963 __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
3964 %}
3965 ins_pipe( pipe_slow );
3966 %}
3967
3968 // --------------------------------- SUB --------------------------------------
3969
3970 // Bytes vector sub
3971 instruct vsubB(vec dst, vec src) %{
3972 predicate(UseAVX == 0);
3973 match(Set dst (SubVB dst src));
3974 format %{ "psubb $dst,$src\t! sub packedB" %}
3975 ins_encode %{
3976 __ psubb($dst$$XMMRegister, $src$$XMMRegister);
3977 %}
3978 ins_pipe( pipe_slow );
3979 %}
3980
3981 instruct vsubB_reg(vec dst, vec src1, vec src2) %{
3982 predicate(UseAVX > 0);
3983 match(Set dst (SubVB src1 src2));
3984 format %{ "vpsubb $dst,$src1,$src2\t! sub packedB" %}
3985 ins_encode %{
3986 int vector_len = vector_length_encoding(this);
3987 __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
3988 %}
3989 ins_pipe( pipe_slow );
3990 %}
3991
3992 instruct vsubB_mem(vec dst, vec src, memory mem) %{
3993 predicate(UseAVX > 0);
3994 match(Set dst (SubVB src (LoadVector mem)));
3995 format %{ "vpsubb $dst,$src,$mem\t! sub packedB" %}
3996 ins_encode %{
3997 int vector_len = vector_length_encoding(this);
3998 __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
3999 %}
4000 ins_pipe( pipe_slow );
4001 %}
4002
4003 // Shorts/Chars vector sub
4004 instruct vsubS(vec dst, vec src) %{
4005 predicate(UseAVX == 0);
4006 match(Set dst (SubVS dst src));
4007 format %{ "psubw $dst,$src\t! sub packedS" %}
4008 ins_encode %{
4009 __ psubw($dst$$XMMRegister, $src$$XMMRegister);
4010 %}
4011 ins_pipe( pipe_slow );
4012 %}
4013
4014
4015 instruct vsubS_reg(vec dst, vec src1, vec src2) %{
4016 predicate(UseAVX > 0);
4017 match(Set dst (SubVS src1 src2));
4018 format %{ "vpsubw $dst,$src1,$src2\t! sub packedS" %}
4019 ins_encode %{
4020 int vector_len = vector_length_encoding(this);
4021 __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
4022 %}
4023 ins_pipe( pipe_slow );
4024 %}
4025
4026 instruct vsubS_mem(vec dst, vec src, memory mem) %{
4027 predicate(UseAVX > 0);
4028 match(Set dst (SubVS src (LoadVector mem)));
4029 format %{ "vpsubw $dst,$src,$mem\t! sub packedS" %}
4030 ins_encode %{
4031 int vector_len = vector_length_encoding(this);
4032 __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
4033 %}
4034 ins_pipe( pipe_slow );
4035 %}
4036
4037 // Integers vector sub
4038 instruct vsubI(vec dst, vec src) %{
4039 predicate(UseAVX == 0);
4040 match(Set dst (SubVI dst src));
4041 format %{ "psubd $dst,$src\t! sub packedI" %}
4042 ins_encode %{
4043 __ psubd($dst$$XMMRegister, $src$$XMMRegister);
4044 %}
4045 ins_pipe( pipe_slow );
4046 %}
4047
4048 instruct vsubI_reg(vec dst, vec src1, vec src2) %{
4049 predicate(UseAVX > 0);
4050 match(Set dst (SubVI src1 src2));
4051 format %{ "vpsubd $dst,$src1,$src2\t! sub packedI" %}
4052 ins_encode %{
4053 int vector_len = vector_length_encoding(this);
4054 __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
4055 %}
4056 ins_pipe( pipe_slow );
4057 %}
4058
4059 instruct vsubI_mem(vec dst, vec src, memory mem) %{
4060 predicate(UseAVX > 0);
4061 match(Set dst (SubVI src (LoadVector mem)));
4062 format %{ "vpsubd $dst,$src,$mem\t! sub packedI" %}
4063 ins_encode %{
4064 int vector_len = vector_length_encoding(this);
4065 __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
4066 %}
4067 ins_pipe( pipe_slow );
4068 %}
4069
4070 // Longs vector sub
4071 instruct vsubL(vec dst, vec src) %{
4072 predicate(UseAVX == 0);
4073 match(Set dst (SubVL dst src));
4074 format %{ "psubq $dst,$src\t! sub packedL" %}
4075 ins_encode %{
4076 __ psubq($dst$$XMMRegister, $src$$XMMRegister);
4077 %}
4078 ins_pipe( pipe_slow );
4079 %}
4080
4081 instruct vsubL_reg(vec dst, vec src1, vec src2) %{
4082 predicate(UseAVX > 0);
4083 match(Set dst (SubVL src1 src2));
4084 format %{ "vpsubq $dst,$src1,$src2\t! sub packedL" %}
4085 ins_encode %{
4086 int vector_len = vector_length_encoding(this);
4087 __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
4088 %}
4089 ins_pipe( pipe_slow );
4090 %}
4091
4092
4093 instruct vsubL_mem(vec dst, vec src, memory mem) %{
4094 predicate(UseAVX > 0);
4095 match(Set dst (SubVL src (LoadVector mem)));
4096 format %{ "vpsubq $dst,$src,$mem\t! sub packedL" %}
4097 ins_encode %{
4098 int vector_len = vector_length_encoding(this);
4099 __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
4100 %}
4101 ins_pipe( pipe_slow );
4102 %}
4103
4104 // Floats vector sub
4105 instruct vsubF(vec dst, vec src) %{
4106 predicate(UseAVX == 0);
4107 match(Set dst (SubVF dst src));
4108 format %{ "subps $dst,$src\t! sub packedF" %}
4109 ins_encode %{
4110 __ subps($dst$$XMMRegister, $src$$XMMRegister);
4111 %}
4112 ins_pipe( pipe_slow );
4113 %}
4114
4115 instruct vsubF_reg(vec dst, vec src1, vec src2) %{
4116 predicate(UseAVX > 0);
4117 match(Set dst (SubVF src1 src2));
4118 format %{ "vsubps $dst,$src1,$src2\t! sub packedF" %}
4119 ins_encode %{
4120 int vector_len = vector_length_encoding(this);
4121 __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
4122 %}
4123 ins_pipe( pipe_slow );
4124 %}
4125
4126 instruct vsubF_mem(vec dst, vec src, memory mem) %{
4127 predicate(UseAVX > 0);
4128 match(Set dst (SubVF src (LoadVector mem)));
4129 format %{ "vsubps $dst,$src,$mem\t! sub packedF" %}
4130 ins_encode %{
4131 int vector_len = vector_length_encoding(this);
4132 __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
4133 %}
4134 ins_pipe( pipe_slow );
4135 %}
4136
4137 // Doubles vector sub
4138 instruct vsubD(vec dst, vec src) %{
4139 predicate(UseAVX == 0);
4140 match(Set dst (SubVD dst src));
4141 format %{ "subpd $dst,$src\t! sub packedD" %}
4142 ins_encode %{
4143 __ subpd($dst$$XMMRegister, $src$$XMMRegister);
4144 %}
4145 ins_pipe( pipe_slow );
4146 %}
4147
4148 instruct vsubD_reg(vec dst, vec src1, vec src2) %{
4149 predicate(UseAVX > 0);
4150 match(Set dst (SubVD src1 src2));
4151 format %{ "vsubpd $dst,$src1,$src2\t! sub packedD" %}
4152 ins_encode %{
4153 int vector_len = vector_length_encoding(this);
4154 __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
4155 %}
4156 ins_pipe( pipe_slow );
4157 %}
4158
4159 instruct vsubD_mem(vec dst, vec src, memory mem) %{
4160 predicate(UseAVX > 0);
4161 match(Set dst (SubVD src (LoadVector mem)));
4162 format %{ "vsubpd $dst,$src,$mem\t! sub packedD" %}
4163 ins_encode %{
4164 int vector_len = vector_length_encoding(this);
4165 __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
4166 %}
4167 ins_pipe( pipe_slow );
4168 %}
4169
4170 // --------------------------------- MUL --------------------------------------
4171
4172 // Byte vector mul
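// x86 has no byte multiply instruction, so byte multiplication is emulated
// through 16-bit lanes: sign-extend both operands to shorts (pmovsxbw),
// multiply with pmullw, then mask off the low byte of each product and pack
// the bytes back together (pand + packuswb). Per lane this computes the Java
// semantics (byte)(b1 * b2), i.e. the low 8 bits of the product.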
4173 instruct mulB_reg(vec dst, vec src1, vec src2, vec tmp, rRegI scratch) %{
4174 predicate(n->as_Vector()->length() == 4 ||
4175 n->as_Vector()->length() == 8);
4176 match(Set dst (MulVB src1 src2));
4177 effect(TEMP dst, TEMP tmp, TEMP scratch);
4178 format %{"vector_mulB $dst,$src1,$src2" %}
4179 ins_encode %{
4180 assert(UseSSE > 3, "required");
4181 __ pmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister);
4182 __ pmovsxbw($dst$$XMMRegister, $src2$$XMMRegister);
4183 __ pmullw($tmp$$XMMRegister, $dst$$XMMRegister);
4184 __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
4185 __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
4186 __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
4187 %}
4188 ins_pipe( pipe_slow );
4189 %}
4190
4191 instruct mul16B_reg(vec dst, vec src1, vec src2, vec tmp1, vec tmp2, rRegI scratch) %{
4192 predicate(n->as_Vector()->length() == 16 && UseAVX <= 1);
4193 match(Set dst (MulVB src1 src2));
4194 effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
4195 format %{"vector_mulB $dst,$src1,$src2" %}
4196 ins_encode %{
4197 assert(UseSSE > 3, "required");
4198 __ pmovsxbw($tmp1$$XMMRegister, $src1$$XMMRegister);
4199 __ pmovsxbw($tmp2$$XMMRegister, $src2$$XMMRegister);
4200 __ pmullw($tmp1$$XMMRegister, $tmp2$$XMMRegister);
4201 __ pshufd($tmp2$$XMMRegister, $src1$$XMMRegister, 0xEE);
4202 __ pshufd($dst$$XMMRegister, $src2$$XMMRegister, 0xEE);
4203 __ pmovsxbw($tmp2$$XMMRegister, $tmp2$$XMMRegister);
4204 __ pmovsxbw($dst$$XMMRegister, $dst$$XMMRegister);
4205 __ pmullw($tmp2$$XMMRegister, $dst$$XMMRegister);
4206 __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
4207 __ pand($tmp2$$XMMRegister, $dst$$XMMRegister);
4208 __ pand($dst$$XMMRegister, $tmp1$$XMMRegister);
4209 __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister);
4210 %}
4211 ins_pipe( pipe_slow );
4212 %}
4213
4214 instruct vmul16B_reg_avx(vec dst, vec src1, vec src2, vec tmp, rRegI scratch) %{
4215 predicate(n->as_Vector()->length() == 16 && UseAVX > 1);
4216 match(Set dst (MulVB src1 src2));
4217 effect(TEMP dst, TEMP tmp, TEMP scratch);
4218 format %{"vector_mulB $dst,$src1,$src2" %}
4219 ins_encode %{
4220 int vector_len = Assembler::AVX_256bit;
4221 __ vpmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister, vector_len);
4222 __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vector_len);
4223 __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, vector_len);
4224 __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
4225 __ vpand($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
4226 __ vextracti128_high($tmp$$XMMRegister, $dst$$XMMRegister);
4227 __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, 0);
4228 %}
4229 ins_pipe( pipe_slow );
4230 %}
4231
4232 instruct vmul32B_reg_avx(vec dst, vec src1, vec src2, vec tmp1, vec tmp2, rRegI scratch) %{
4233 predicate(n->as_Vector()->length() == 32);
4234 match(Set dst (MulVB src1 src2));
4235 effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
4236 format %{"vector_mulB $dst,$src1,$src2" %}
4237 ins_encode %{
4238 assert(UseAVX > 1, "required");
4239 int vector_len = Assembler::AVX_256bit;
4240 __ vextracti128_high($tmp1$$XMMRegister, $src1$$XMMRegister);
4241 __ vextracti128_high($dst$$XMMRegister, $src2$$XMMRegister);
4242 __ vpmovsxbw($tmp1$$XMMRegister, $tmp1$$XMMRegister, vector_len);
4243 __ vpmovsxbw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4244 __ vpmullw($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vector_len);
4245 __ vpmovsxbw($tmp2$$XMMRegister, $src1$$XMMRegister, vector_len);
4246 __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vector_len);
4247 __ vpmullw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vector_len);
4248 __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
4249 __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4250 __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vector_len);
4251 __ vpand($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4252 __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp1$$XMMRegister, vector_len);
4253 __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vector_len);
4254 %}
4255 ins_pipe( pipe_slow );
4256 %}
4257
4258 instruct vmul64B_reg_avx(vec dst, vec src1, vec src2, vec tmp1, vec tmp2, rRegI scratch) %{
4259 predicate(n->as_Vector()->length() == 64);
4260 match(Set dst (MulVB src1 src2));
4261 effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
4262 format %{"vector_mulB $dst,$src1,$src2" %}
4263 ins_encode %{
4264 assert(UseAVX > 2, "required");
4265 int vector_len = Assembler::AVX_512bit;
4266 __ vextracti64x4_high($tmp1$$XMMRegister, $src1$$XMMRegister);
4267 __ vextracti64x4_high($dst$$XMMRegister, $src2$$XMMRegister);
4268 __ vpmovsxbw($tmp1$$XMMRegister, $tmp1$$XMMRegister, vector_len);
4269 __ vpmovsxbw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4270 __ vpmullw($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vector_len);
4271 __ vpmovsxbw($tmp2$$XMMRegister, $src1$$XMMRegister, vector_len);
4272 __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vector_len);
4273 __ vpmullw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vector_len);
4274 __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
4275 __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4276 __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vector_len);
4277 __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vector_len);
4278 __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4279 __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vector_len, $scratch$$Register);
4280 __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vector_len);
4281 %}
4282 ins_pipe( pipe_slow );
4283 %}
4284
4285 // Shorts/Chars vector mul
4286 instruct vmulS(vec dst, vec src) %{
4287 predicate(UseAVX == 0);
4288 match(Set dst (MulVS dst src));
4289 format %{ "pmullw $dst,$src\t! mul packedS" %}
4290 ins_encode %{
4291 __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
4292 %}
4293 ins_pipe( pipe_slow );
4294 %}
4295
4296 instruct vmulS_reg(vec dst, vec src1, vec src2) %{
4297 predicate(UseAVX > 0);
4298 match(Set dst (MulVS src1 src2));
4299 format %{ "vpmullw $dst,$src1,$src2\t! mul packedS" %}
4300 ins_encode %{
4301 int vector_len = vector_length_encoding(this);
4302 __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
4303 %}
4304 ins_pipe( pipe_slow );
4305 %}
4306
4307 instruct vmulS_mem(vec dst, vec src, memory mem) %{
4308 predicate(UseAVX > 0);
4309 match(Set dst (MulVS src (LoadVector mem)));
4310 format %{ "vpmullw $dst,$src,$mem\t! mul packedS" %}
4311 ins_encode %{
4312 int vector_len = vector_length_encoding(this);
4313 __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
4314 %}
4315 ins_pipe( pipe_slow );
4316 %}
4317
4318 // Integers vector mul
4319 instruct vmulI(vec dst, vec src) %{
4320 predicate(UseAVX == 0);
4321 match(Set dst (MulVI dst src));
4322 format %{ "pmulld $dst,$src\t! mul packedI" %}
4323 ins_encode %{
4324 assert(UseSSE > 3, "required");
4325 __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
4326 %}
4327 ins_pipe( pipe_slow );
4328 %}
4329
4330 instruct vmulI_reg(vec dst, vec src1, vec src2) %{
4331 predicate(UseAVX > 0);
4332 match(Set dst (MulVI src1 src2));
4333 format %{ "vpmulld $dst,$src1,$src2\t! mul packedI" %}
4334 ins_encode %{
4335 int vector_len = vector_length_encoding(this);
4336 __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
4337 %}
4338 ins_pipe( pipe_slow );
4339 %}
4340
4341 instruct vmulI_mem(vec dst, vec src, memory mem) %{
4342 predicate(UseAVX > 0);
4343 match(Set dst (MulVI src (LoadVector mem)));
4344 format %{ "vpmulld $dst,$src,$mem\t! mul packedI" %}
4345 ins_encode %{
4346 int vector_len = vector_length_encoding(this);
4347 __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
4348 %}
4349 ins_pipe( pipe_slow );
4350 %}
4351
4352 // Longs vector mul
4353 instruct vmulL_reg(vec dst, vec src1, vec src2) %{
4354 match(Set dst (MulVL src1 src2));
4355 format %{ "vpmullq $dst,$src1,$src2\t! mul packedL" %}
4356 ins_encode %{
4357 assert(UseAVX > 2, "required");
4358 int vector_len = vector_length_encoding(this);
4359 __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
4360 %}
4361 ins_pipe( pipe_slow );
4362 %}
4363
4364 instruct vmulL_mem(vec dst, vec src, memory mem) %{
4365 match(Set dst (MulVL src (LoadVector mem)));
4366 format %{ "vpmullq $dst,$src,$mem\t! mul packedL" %}
4367 ins_encode %{
4368 assert(UseAVX > 2, "required");
4369 int vector_len = vector_length_encoding(this);
4370 __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
4371 %}
4372 ins_pipe( pipe_slow );
4373 %}
4374
4375 // Floats vector mul
4376 instruct vmulF(vec dst, vec src) %{
4377 predicate(UseAVX == 0);
4378 match(Set dst (MulVF dst src));
4379 format %{ "mulps $dst,$src\t! mul packedF" %}
4380 ins_encode %{
4381 __ mulps($dst$$XMMRegister, $src$$XMMRegister);
4382 %}
4383 ins_pipe( pipe_slow );
4384 %}
4385
4386 instruct vmulF_reg(vec dst, vec src1, vec src2) %{
4387 predicate(UseAVX > 0);
4388 match(Set dst (MulVF src1 src2));
4389 format %{ "vmulps $dst,$src1,$src2\t! mul packedF" %}
4390 ins_encode %{
4391 int vector_len = vector_length_encoding(this);
4392 __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
4393 %}
4394 ins_pipe( pipe_slow );
4395 %}
4396
4397 instruct vmulF_mem(vec dst, vec src, memory mem) %{
4398 predicate(UseAVX > 0);
4399 match(Set dst (MulVF src (LoadVector mem)));
4400 format %{ "vmulps $dst,$src,$mem\t! mul packedF" %}
4401 ins_encode %{
4402 int vector_len = vector_length_encoding(this);
4403 __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
4404 %}
4405 ins_pipe( pipe_slow );
4406 %}
4407
4408 // Doubles vector mul
4409 instruct vmulD(vec dst, vec src) %{
4410 predicate(UseAVX == 0);
4411 match(Set dst (MulVD dst src));
4412 format %{ "mulpd $dst,$src\t! mul packedD" %}
4413 ins_encode %{
4414 __ mulpd($dst$$XMMRegister, $src$$XMMRegister);
4415 %}
4416 ins_pipe( pipe_slow );
4417 %}
4418
4419 instruct vmulD_reg(vec dst, vec src1, vec src2) %{
4420 predicate(UseAVX > 0);
4421 match(Set dst (MulVD src1 src2));
4422 format %{ "vmulpd $dst,$src1,$src2\t! mul packedD" %}
4423 ins_encode %{
4424 int vector_len = vector_length_encoding(this);
4425 __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
4426 %}
4427 ins_pipe( pipe_slow );
4428 %}
4429
4430 instruct vmulD_mem(vec dst, vec src, memory mem) %{
4431 predicate(UseAVX > 0);
4432 match(Set dst (MulVD src (LoadVector mem)));
4433 format %{ "vmulpd $dst,$src,$mem\t! mul packedD" %}
4434 ins_encode %{
4435 int vector_len = vector_length_encoding(this);
4436 __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
4437 %}
4438 ins_pipe( pipe_slow );
4439 %}
4440
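// Vector conditional move: cmpps/cmppd writes an all-ones or all-zeros mask
// into each lane of $dst, and blendvps/blendvpd then selects $src2 where the
// mask is set and $src1 elsewhere. Per-lane semantics (illustrative):
//   dst[i] = (src1[i] <cond> src2[i]) ? src2[i] : src1[i]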
4441 instruct vcmov8F_reg(legVec dst, legVec src1, legVec src2, immI8 cop, cmpOp_vcmppd copnd) %{
4442 predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
4443 match(Set dst (CMoveVF (Binary copnd cop) (Binary src1 src2)));
4444 effect(TEMP dst, USE src1, USE src2);
4445 format %{ "cmpps.$copnd $dst, $src1, $src2 ! vcmovevf, cond=$cop\n\t"
4446 "blendvps $dst,$src1,$src2,$dst ! vcmovevf\n\t"
4447 %}
4448 ins_encode %{
4449 int vector_len = 1;
4450 int cond = (Assembler::Condition)($copnd$$cmpcode);
4451 __ cmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vector_len);
4452 __ blendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vector_len);
4453 %}
4454 ins_pipe( pipe_slow );
4455 %}
4456
4457 instruct vcmov4D_reg(legVec dst, legVec src1, legVec src2, immI8 cop, cmpOp_vcmppd copnd) %{
4458 predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
4459 match(Set dst (CMoveVD (Binary copnd cop) (Binary src1 src2)));
4460 effect(TEMP dst, USE src1, USE src2);
4461 format %{ "cmppd.$copnd $dst, $src1, $src2 ! vcmovevd, cond=$cop\n\t"
4462 "blendvpd $dst,$src1,$src2,$dst ! vcmovevd\n\t"
4463 %}
4464 ins_encode %{
4465 int vector_len = 1;
4466 int cond = (Assembler::Condition)($copnd$$cmpcode);
4467 __ cmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vector_len);
4468 __ blendvpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vector_len);
4469 %}
4470 ins_pipe( pipe_slow );
4471 %}
4472
4473 // --------------------------------- DIV --------------------------------------
4474
4475 // Floats vector div
4476 instruct vdivF(vec dst, vec src) %{
4477 predicate(UseAVX == 0);
4478 match(Set dst (DivVF dst src));
4479 format %{ "divps $dst,$src\t! div packedF" %}
4480 ins_encode %{
4481 __ divps($dst$$XMMRegister, $src$$XMMRegister);
4482 %}
4483 ins_pipe( pipe_slow );
4484 %}
4485
4486 instruct vdivF_reg(vec dst, vec src1, vec src2) %{
4487 predicate(UseAVX > 0);
4488 match(Set dst (DivVF src1 src2));
4489 format %{ "vdivps $dst,$src1,$src2\t! div packedF" %}
4490 ins_encode %{
4491 int vector_len = vector_length_encoding(this);
4492 __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
4493 %}
4494 ins_pipe( pipe_slow );
4495 %}
4496
4497 instruct vdivF_mem(vec dst, vec src, memory mem) %{
4498 predicate(UseAVX > 0);
4499 match(Set dst (DivVF src (LoadVector mem)));
4500 format %{ "vdivps $dst,$src,$mem\t! div packedF" %}
4501 ins_encode %{
4502 int vector_len = vector_length_encoding(this);
4503 __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
4504 %}
4505 ins_pipe( pipe_slow );
4506 %}
4507
4508 // Doubles vector div
4509 instruct vdivD(vec dst, vec src) %{
4510 predicate(UseAVX == 0);
4511 match(Set dst (DivVD dst src));
4512 format %{ "divpd $dst,$src\t! div packedD" %}
4513 ins_encode %{
4514 __ divpd($dst$$XMMRegister, $src$$XMMRegister);
4515 %}
4516 ins_pipe( pipe_slow );
4517 %}
4518
4519 instruct vdivD_reg(vec dst, vec src1, vec src2) %{
4520 predicate(UseAVX > 0);
4521 match(Set dst (DivVD src1 src2));
4522 format %{ "vdivpd $dst,$src1,$src2\t! div packedD" %}
4523 ins_encode %{
4524 int vector_len = vector_length_encoding(this);
4525 __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
4526 %}
4527 ins_pipe( pipe_slow );
4528 %}
4529
4530 instruct vdivD_mem(vec dst, vec src, memory mem) %{
4531 predicate(UseAVX > 0);
4532 match(Set dst (DivVD src (LoadVector mem)));
4533 format %{ "vdivpd $dst,$src,$mem\t! div packedD" %}
4534 ins_encode %{
4535 int vector_len = vector_length_encoding(this);
4536 __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
4537 %}
4538 ins_pipe( pipe_slow );
4539 %}
4540
4541 // --------------------------------- Sqrt --------------------------------------
4542
4543 instruct vsqrtF_reg(vec dst, vec src) %{
4544 match(Set dst (SqrtVF src));
4545 format %{ "vsqrtps $dst,$src\t! sqrt packedF" %}
4546 ins_encode %{
4547 assert(UseAVX > 0, "required");
4548 int vector_len = vector_length_encoding(this);
4549 __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
4550 %}
4551 ins_pipe( pipe_slow );
4552 %}
4553
4554 instruct vsqrtF_mem(vec dst, memory mem) %{
4555 match(Set dst (SqrtVF (LoadVector mem)));
4556 format %{ "vsqrtps $dst,$mem\t! sqrt packedF" %}
4557 ins_encode %{
4558 assert(UseAVX > 0, "required");
4559 int vector_len = vector_length_encoding(this);
4560 __ vsqrtps($dst$$XMMRegister, $mem$$Address, vector_len);
4561 %}
4562 ins_pipe( pipe_slow );
4563 %}
4564
4565 // Doubles vector sqrt
4566 instruct vsqrtD_reg(vec dst, vec src) %{
4567 match(Set dst (SqrtVD src));
4568 format %{ "vsqrtpd $dst,$src\t! sqrt packedD" %}
4569 ins_encode %{
4570 assert(UseAVX > 0, "required");
4571 int vector_len = vector_length_encoding(this);
4572 __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
4573 %}
4574 ins_pipe( pipe_slow );
4575 %}
4576
4577 instruct vsqrtD_mem(vec dst, memory mem) %{
4578 match(Set dst (SqrtVD (LoadVector mem)));
4579 format %{ "vsqrtpd $dst,$mem\t! sqrt packedD" %}
4580 ins_encode %{
4581 assert(UseAVX > 0, "required");
4582 int vector_len = vector_length_encoding(this);
4583 __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
4584 %}
4585 ins_pipe( pipe_slow );
4586 %}
4587
4588 // ------------------------------ Shift ---------------------------------------
4589
4590 // Left and right shift count vectors are the same on x86
4591 // (only the low 64 bits of the xmm register are used for the count).
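// For example (illustrative), with a shift count of 3 in ecx:
//   movdl xmm1, ecx    ; xmm1 = {3, 0, 0, 0} - movdl zero-extends the count
//   psllw xmm0, xmm1   ; every 16-bit lane of xmm0 is shifted left by 3
// so one loaded count serves psllw/psrlw/psraw (and the d/q forms) alike.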
4592 instruct vshiftcnt(vec dst, rRegI cnt) %{
4593 match(Set dst (LShiftCntV cnt));
4594 match(Set dst (RShiftCntV cnt));
4595 format %{ "movdl $dst,$cnt\t! load shift count" %}
4596 ins_encode %{
4597 __ movdl($dst$$XMMRegister, $cnt$$Register);
4598 %}
4599 ins_pipe( pipe_slow );
4600 %}
4601
4602 // Byte vector shift
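// As with byte multiply above, there is no variable byte shift in SSE/AVX,
// so byte shifts are emulated through 16-bit lanes: extend to words
// (vextendbw chooses sign or zero extension from the ideal opcode), shift
// with the word shift (vshiftw), then mask the low bytes and pack back.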
4603 instruct vshiftB(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
4604 predicate(n->as_Vector()->length() <= 8);
4605 match(Set dst (LShiftVB src shift));
4606 match(Set dst (RShiftVB src shift));
4607 match(Set dst (URShiftVB src shift));
4608 effect(TEMP dst, USE src, USE shift, TEMP tmp, TEMP scratch);
4609 format %{"vector_byte_shift $dst,$src,$shift" %}
4610 ins_encode %{
4611 assert(UseSSE > 3, "required");
4612 int opcode = this->ideal_Opcode();
4613 __ vextendbw(opcode, $tmp$$XMMRegister, $src$$XMMRegister);
4614 __ vshiftw(opcode, $tmp$$XMMRegister, $shift$$XMMRegister);
4615 __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
4616 __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
4617 __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
4618 %}
4619 ins_pipe( pipe_slow );
4620 %}
4621
4622 instruct vshift16B(vec dst, vec src, vec shift, vec tmp1, vec tmp2, rRegI scratch) %{
4623 predicate(n->as_Vector()->length() == 16 && UseAVX <= 1);
4624 match(Set dst (LShiftVB src shift));
4625 match(Set dst (RShiftVB src shift));
4626 match(Set dst (URShiftVB src shift));
4627 effect(TEMP dst, USE src, USE shift, TEMP tmp1, TEMP tmp2, TEMP scratch);
4628 format %{"vector_byte_shift $dst,$src,$shift" %}
4629 ins_encode %{
4630 assert(UseSSE > 3, "required");
4631 int opcode = this->ideal_Opcode();
4632
4633 __ vextendbw(opcode, $tmp1$$XMMRegister, $src$$XMMRegister);
4634 __ vshiftw(opcode, $tmp1$$XMMRegister, $shift$$XMMRegister);
4635 __ pshufd($tmp2$$XMMRegister, $src$$XMMRegister, 0xE);
4636 __ vextendbw(opcode, $tmp2$$XMMRegister, $tmp2$$XMMRegister);
4637 __ vshiftw(opcode, $tmp2$$XMMRegister, $shift$$XMMRegister);
4638 __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
4639 __ pand($tmp2$$XMMRegister, $dst$$XMMRegister);
4640 __ pand($dst$$XMMRegister, $tmp1$$XMMRegister);
4641 __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister);
4642 %}
4643 ins_pipe( pipe_slow );
4644 %}
4645
4646 instruct vshift16B_avx(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
4647 predicate(n->as_Vector()->length() == 16 && UseAVX > 1);
4648 match(Set dst (LShiftVB src shift));
4649 match(Set dst (RShiftVB src shift));
4650 match(Set dst (URShiftVB src shift));
4651 effect(TEMP dst, TEMP tmp, TEMP scratch);
4652 format %{"vector_byte_shift $dst,$src,$shift" %}
4653 ins_encode %{
4654 int opcode = this->ideal_Opcode();
4655 int vector_len = Assembler::AVX_256bit;
4656 __ vextendbw(opcode, $tmp$$XMMRegister, $src$$XMMRegister, vector_len);
4657 __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vector_len);
4658 __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vector_len, $scratch$$Register);
4659 __ vextracti128_high($dst$$XMMRegister, $tmp$$XMMRegister);
4660 __ vpackuswb($dst$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, 0);
4661 %}
4662 ins_pipe( pipe_slow );
4663 %}
4664
4665 instruct vshift32B_avx(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
4666 predicate(n->as_Vector()->length() == 32);
4667 match(Set dst (LShiftVB src shift));
4668 match(Set dst (RShiftVB src shift));
4669 match(Set dst (URShiftVB src shift));
4670 effect(TEMP dst, TEMP tmp, TEMP scratch);
4671 format %{"vector_byte_shift $dst,$src,$shift" %}
4672 ins_encode %{
4673 assert(UseAVX > 1, "required");
4674 int opcode = this->ideal_Opcode();
4675 int vector_len = Assembler::AVX_256bit;
4676 __ vextracti128_high($tmp$$XMMRegister, $src$$XMMRegister);
4677 __ vextendbw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
4678 __ vextendbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, vector_len);
4679 __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vector_len);
4680 __ vshiftw(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $shift$$XMMRegister, vector_len);
4681 __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vector_len, $scratch$$Register);
4682 __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vector_len, $scratch$$Register);
4683 __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
4684 __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vector_len);
4685 %}
4686 ins_pipe( pipe_slow );
4687 %}
4688
4689 instruct vshift64B_avx(vec dst, vec src, vec shift, vec tmp1, vec tmp2, rRegI scratch) %{
4690 predicate(n->as_Vector()->length() == 64);
4691 match(Set dst (LShiftVB src shift));
4692 match(Set dst (RShiftVB src shift));
4693 match(Set dst (URShiftVB src shift));
4694 effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
4695 format %{"vector_byte_shift $dst,$src,$shift" %}
4696 ins_encode %{
4697 assert(UseAVX > 2, "required");
4698 int opcode = this->ideal_Opcode();
4699 int vector_len = Assembler::AVX_512bit;
4700 __ vextracti64x4($tmp1$$XMMRegister, $src$$XMMRegister, 1);
4701 __ vextendbw(opcode, $tmp1$$XMMRegister, $tmp1$$XMMRegister, vector_len);
4702 __ vextendbw(opcode, $tmp2$$XMMRegister, $src$$XMMRegister, vector_len);
4703 __ vshiftw(opcode, $tmp1$$XMMRegister, $tmp1$$XMMRegister, $shift$$XMMRegister, vector_len);
4704 __ vshiftw(opcode, $tmp2$$XMMRegister, $tmp2$$XMMRegister, $shift$$XMMRegister, vector_len);
4705 __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
4706 __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4707 __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vector_len);
4708 __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vector_len);
4709 __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4710 __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vector_len, $scratch$$Register);
4711 __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vector_len);
4712 %}
4713 ins_pipe( pipe_slow );
4714 %}
4715
4716 // Shorts vector logical right shift produces an incorrect Java result
4717 // for negative data because Java code converts short values into ints with
4718 // sign extension before the shift. But char vectors are fine since chars are
4719 // unsigned values.
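// Illustrative example: for short s = -1, Java evaluates (short)(s >>> 3) as
// (short)(0xFFFFFFFF >>> 3) = (short)0x1FFFFFFF = -1, whereas a 16-bit psrlw
// gives 0xFFFF >>> 3 = 0x1FFF = 8191; hence only char data, which
// zero-extends, agrees with the vector instruction.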
4720 // Shorts/Chars vector shift
4721 instruct vshiftS(vec dst, vec src, vec shift) %{
4722 match(Set dst (LShiftVS src shift));
4723 match(Set dst (RShiftVS src shift));
4724 match(Set dst (URShiftVS src shift));
4725 effect(TEMP dst, USE src, USE shift);
4726 format %{ "vshiftw $dst,$src,$shift\t! shift packedS" %}
4727 ins_encode %{
4728 int opcode = this->ideal_Opcode();
4729 if (UseAVX > 0) {
4730 int vlen_enc = vector_length_encoding(this);
4731 __ vshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
4732 } else {
4733 int vlen = vector_length(this);
4734 if (vlen == 2) {
4735 __ movflt($dst$$XMMRegister, $src$$XMMRegister);
4736 __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
4737 } else if (vlen == 4) {
4738 __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
4739 __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
4740 } else {
4741 assert(vlen == 8, "sanity");
4742 __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
4743 __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
4744 }
4745 }
4746 %}
4747 ins_pipe( pipe_slow );
4748 %}
4749
4750 // Integers vector shift
4751 instruct vshiftI(vec dst, vec src, vec shift) %{
4752 match(Set dst (LShiftVI src shift));
4753 match(Set dst (RShiftVI src shift));
4754 match(Set dst (URShiftVI src shift));
4755 effect(TEMP dst, USE src, USE shift);
4756 format %{ "vshiftd $dst,$src,$shift\t! shift packedI" %}
4757 ins_encode %{
4758 int opcode = this->ideal_Opcode();
4759 if (UseAVX > 0) {
4760 int vector_len = vector_length_encoding(this);
4761 __ vshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
4762 } else {
4763 int vlen = vector_length(this);
4764 if (vlen == 2) {
4765 __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
4766 __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
4767 } else {
4768 assert(vlen == 4, "sanity");
4769 __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
4770 __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
4771 }
4772 }
4773 %}
4774 ins_pipe( pipe_slow );
4775 %}
4776
4777 // Longs vector shift
4778 instruct vshiftL(vec dst, vec src, vec shift) %{
4779 match(Set dst (LShiftVL src shift));
4780 match(Set dst (URShiftVL src shift));
4781 effect(TEMP dst, USE src, USE shift);
4782 format %{ "vshiftq $dst,$src,$shift\t! shift packedL" %}
4783 ins_encode %{
4784 int opcode = this->ideal_Opcode();
4785 if (UseAVX > 0) {
4786 int vector_len = vector_length_encoding(this);
4787 __ vshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
4788 } else {
4789 assert(vector_length(this) == 2, "sanity");
4790 __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
4791 __ vshiftq(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
4792 }
4793 %}
4794 ins_pipe( pipe_slow );
4795 %}
4796
4797 // -------------------ArithmeticRightShift -----------------------------------
4798 // Long vector arithmetic right shift
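// evpsraq (64-bit arithmetic right shift) only exists in AVX-512, so older
// targets emulate it from the logical shift and a shifted sign mask
// m = 0x8000000000000000 >>> s:
//   ((x >>> s) ^ m) - m  ==  x >> s   (two's-complement sign extension)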
4799 instruct vshiftL_arith_reg(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
4800 predicate(UseAVX <= 2);
4801 match(Set dst (RShiftVL src shift));
4802 effect(TEMP dst, TEMP tmp, TEMP scratch);
4803 format %{ "vshiftq $dst,$src,$shift" %}
4804 ins_encode %{
4805 uint vlen = vector_length(this);
4806 if (vlen == 2) {
4807 assert(UseSSE >= 2, "required");
4808 __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
4809 __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
4810 __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), $scratch$$Register);
4811 __ psrlq($tmp$$XMMRegister, $shift$$XMMRegister);
4812 __ pxor($dst$$XMMRegister, $tmp$$XMMRegister);
4813 __ psubq($dst$$XMMRegister, $tmp$$XMMRegister);
4814 } else {
4815 assert(vlen == 4, "sanity");
4816 assert(UseAVX > 1, "required");
4817 int vector_len = Assembler::AVX_256bit;
4818 __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
4819 __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), $scratch$$Register);
4820 __ vpsrlq($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vector_len);
4821 __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
4822 __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
4823 }
4824 %}
4825 ins_pipe( pipe_slow );
4826 %}
4827
4828 instruct vshiftL_arith_reg_evex(vec dst, vec src, vec shift) %{
4829 predicate(UseAVX > 2);
4830 match(Set dst (RShiftVL src shift));
4831 format %{ "vshiftq $dst,$src,$shift" %}
4832 ins_encode %{
4833 int vector_len = vector_length_encoding(this);
4834 __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
4835 %}
4836 ins_pipe( pipe_slow );
4837 %}
4838
4839 // --------------------------------- AND --------------------------------------
4840
4841 instruct vand(vec dst, vec src) %{
4842 predicate(UseAVX == 0);
4843 match(Set dst (AndV dst src));
4844 format %{ "pand $dst,$src\t! and vectors" %}
4845 ins_encode %{
4846 __ pand($dst$$XMMRegister, $src$$XMMRegister);
4847 %}
4848 ins_pipe( pipe_slow );
4849 %}
4850
4851 instruct vand_reg(vec dst, vec src1, vec src2) %{
4852 predicate(UseAVX > 0);
4853 match(Set dst (AndV src1 src2));
4854 format %{ "vpand $dst,$src1,$src2\t! and vectors" %}
4855 ins_encode %{
4856 int vector_len = vector_length_encoding(this);
4857 __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
4858 %}
4859 ins_pipe( pipe_slow );
4860 %}
4861
4862 instruct vand_mem(vec dst, vec src, memory mem) %{
4863 predicate(UseAVX > 0);
4864 match(Set dst (AndV src (LoadVector mem)));
4865 format %{ "vpand $dst,$src,$mem\t! and vectors" %}
4866 ins_encode %{
4867 int vector_len = vector_length_encoding(this);
4868 __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
4869 %}
4870 ins_pipe( pipe_slow );
4871 %}
4872
4873 // --------------------------------- OR ---------------------------------------
4874
4875 instruct vor(vec dst, vec src) %{
4876 predicate(UseAVX == 0);
4877 match(Set dst (OrV dst src));
4878 format %{ "por $dst,$src\t! or vectors" %}
4879 ins_encode %{
4880 __ por($dst$$XMMRegister, $src$$XMMRegister);
4881 %}
4882 ins_pipe( pipe_slow );
4883 %}
4884
4885 instruct vor_reg(vec dst, vec src1, vec src2) %{
4886 predicate(UseAVX > 0);
4887 match(Set dst (OrV src1 src2));
4888 format %{ "vpor $dst,$src1,$src2\t! or vectors" %}
4889 ins_encode %{
4890 int vector_len = vector_length_encoding(this);
4891 __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
4892 %}
4893 ins_pipe( pipe_slow );
4894 %}
4895
4896 instruct vor_mem(vec dst, vec src, memory mem) %{
4897 predicate(UseAVX > 0);
4898 match(Set dst (OrV src (LoadVector mem)));
4899 format %{ "vpor $dst,$src,$mem\t! or vectors" %}
4900 ins_encode %{
4901 int vector_len = vector_length_encoding(this);
4902 __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
4903 %}
4904 ins_pipe( pipe_slow );
4905 %}
4906
4907 // --------------------------------- XOR --------------------------------------
4908
4909 instruct vxor(vec dst, vec src) %{
4910 predicate(UseAVX == 0);
4911 match(Set dst (XorV dst src));
4912 format %{ "pxor $dst,$src\t! xor vectors" %}
4913 ins_encode %{
4914 __ pxor($dst$$XMMRegister, $src$$XMMRegister);
4915 %}
4916 ins_pipe( pipe_slow );
4917 %}
4918
4919 instruct vxor_reg(vec dst, vec src1, vec src2) %{
4920 predicate(UseAVX > 0);
4921 match(Set dst (XorV src1 src2));
4922 format %{ "vpxor $dst,$src1,$src2\t! xor vectors" %}
4923 ins_encode %{
4924 int vector_len = vector_length_encoding(this);
4925 __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
4926 %}
4927 ins_pipe( pipe_slow );
4928 %}
4929
4930 instruct vxor_mem(vec dst, vec src, memory mem) %{
4931 predicate(UseAVX > 0);
4932 match(Set dst (XorV src (LoadVector mem)));
4933 format %{ "vpxor $dst,$src,$mem\t! xor vectors" %}
4934 ins_encode %{
4935 int vector_len = vector_length_encoding(this);
4936 __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
4937 %}
4938 ins_pipe( pipe_slow );
4939 %}
4940
4941 // --------------------------------- ABS --------------------------------------
4942 // a = |a|
4943 instruct vabsB_reg(vec dst, vec src) %{
4944 match(Set dst (AbsVB src));
4945 format %{ "vabsb $dst,$src\t# $dst = |$src| abs packedB" %}
4946 ins_encode %{
4947 uint vlen = vector_length(this);
4948 if (vlen <= 16) {
4949 __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
4950 } else {
4951 int vlen_enc = vector_length_encoding(this);
4952 __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
4953 }
4954 %}
4955 ins_pipe( pipe_slow );
4956 %}
4957
4958 instruct vabsS_reg(vec dst, vec src) %{
4959 match(Set dst (AbsVS src));
4960 format %{ "vabsw $dst,$src\t# $dst = |$src| abs packedS" %}
4961 ins_encode %{
4962 uint vlen = vector_length(this);
4963 if (vlen <= 8) {
4964 __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
4965 } else {
4966 int vlen_enc = vector_length_encoding(this);
4967 __ vpabsw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
4968 }
4969 %}
4970 ins_pipe( pipe_slow );
4971 %}
4972
4973 instruct vabsI_reg(vec dst, vec src) %{
4974 match(Set dst (AbsVI src));
4975 format %{ "pabsd $dst,$src\t# $dst = |$src| abs packedI" %}
4976 ins_encode %{
4977 uint vlen = vector_length(this);
4978 if (vlen <= 4) {
4979 __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
4980 } else {
4981 int vlen_enc = vector_length_encoding(this);
4982 __ vpabsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
4983 }
4984 %}
4985 ins_pipe( pipe_slow );
4986 %}
4987
4988 instruct vabsL_reg(vec dst, vec src) %{
4989 match(Set dst (AbsVL src));
4990 format %{ "evpabsq $dst,$src\t# $dst = |$src| abs packedL" %}
4991 ins_encode %{
4992 assert(UseAVX > 2, "required");
4993 int vector_len = vector_length_encoding(this);
4994 __ evpabsq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
4995 %}
4996 ins_pipe( pipe_slow );
4997 %}
4998
4999 // --------------------------------- ABSNEG --------------------------------------
5000
5001 instruct vabsnegF(vec dst, vec src, rRegI scratch) %{
5002 predicate(n->as_Vector()->length() != 4); // handled by 1-operand instruction vabsneg4F
5003 match(Set dst (AbsVF src));
5004 match(Set dst (NegVF src));
5005 effect(TEMP scratch);
5006 format %{ "vabsnegf $dst,$src,[mask]\t# absneg packedF" %}
5007 ins_cost(150);
5008 ins_encode %{
5009 int opcode = this->ideal_Opcode();
5010 int vlen = vector_length(this);
5011 if (vlen == 2) {
5012 __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister, $scratch$$Register);
5013 } else {
5014 assert(vlen == 8 || vlen == 16, "required");
5015 int vlen_enc = vector_length_encoding(this);
5016 __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc, $scratch$$Register);
5017 }
5018 %}
5019 ins_pipe( pipe_slow );
5020 %}
5021
5022 instruct vabsneg4F(vec dst, rRegI scratch) %{
5023 predicate(n->as_Vector()->length() == 4);
5024 match(Set dst (AbsVF dst));
5025 match(Set dst (NegVF dst));
5026 effect(TEMP scratch);
5027 format %{ "vabsnegf $dst,[mask]\t# absneg packed4F" %}
5028 ins_cost(150);
5029 ins_encode %{
5030 int opcode = this->ideal_Opcode();
5031 __ vabsnegf(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $scratch$$Register);
5032 %}
5033 ins_pipe( pipe_slow );
5034 %}
5035
5036 instruct vabsnegD(vec dst, vec src, rRegI scratch) %{
5037 match(Set dst (AbsVD src));
5038 match(Set dst (NegVD src));
5039 effect(TEMP scratch);
5040 format %{ "vabsnegd $dst,$src,[mask]\t# absneg packedD" %}
5041 ins_encode %{
5042 int opcode = this->ideal_Opcode();
5043 uint vlen = vector_length(this);
5044 if (vlen == 2) {
5045 assert(UseSSE >= 2, "required");
5046 __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $scratch$$Register);
5047 } else {
5048 int vlen_enc = vector_length_encoding(this);
5049 __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc, $scratch$$Register);
5050 }
5051 %}
5052 ins_pipe( pipe_slow );
5053 %}
5054
5055 // --------------------------------- FMA --------------------------------------
5056 // a * b + c
5057
5058 instruct vfmaF_reg(vec a, vec b, vec c) %{
5059 match(Set c (FmaVF c (Binary a b)));
5060 format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
5061 ins_cost(150);
5062 ins_encode %{
5063 assert(UseFMA, "not enabled");
5064 int vector_len = vector_length_encoding(this);
5065 __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
5066 %}
5067 ins_pipe( pipe_slow );
5068 %}
5069
5070 instruct vfmaF_mem(vec a, memory b, vec c) %{
5071 match(Set c (FmaVF c (Binary a (LoadVector b))));
5072 format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
5073 ins_cost(150);
5074 ins_encode %{
5075 assert(UseFMA, "not enabled");
5076 int vector_len = vector_length_encoding(this);
5077 __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
5078 %}
5079 ins_pipe( pipe_slow );
5080 %}
5081
5082 instruct vfmaD_reg(vec a, vec b, vec c) %{
5083 match(Set c (FmaVD c (Binary a b)));
5084 format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
5085 ins_cost(150);
5086 ins_encode %{
5087 assert(UseFMA, "not enabled");
5088 int vector_len = vector_length_encoding(this);
5089 __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
5090 %}
5091 ins_pipe( pipe_slow );
5092 %}
5093
5094 instruct vfmaD_mem(vec a, memory b, vec c) %{
5095 match(Set c (FmaVD c (Binary a (LoadVector b))));
5096 format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
5097 ins_cost(150);
5098 ins_encode %{
5099 assert(UseFMA, "not enabled");
5100 int vector_len = vector_length_encoding(this);
5101 __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
5102 %}
5103 ins_pipe( pipe_slow );
5104 %}
5105
5106 // --------------------------------- Vector Multiply Add --------------------------------------
5107
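// pmaddwd multiplies vertically adjacent signed 16-bit lanes and adds each
// pair of 32-bit products into a single 32-bit lane:
//   dst[i] = src1[2*i] * src2[2*i] + src1[2*i+1] * src2[2*i+1]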
5108 instruct vmuladdS2I_reg_sse(vec dst, vec src1) %{
5109 predicate(UseAVX == 0);
5110 match(Set dst (MulAddVS2VI dst src1));
5111 format %{ "pmaddwd $dst,$dst,$src1\t! muladd packedStoI" %}
5112 ins_encode %{
5113 __ pmaddwd($dst$$XMMRegister, $src1$$XMMRegister);
5114 %}
5115 ins_pipe( pipe_slow );
5116 %}
5117
5118 instruct vmuladdS2I_reg_avx(vec dst, vec src1, vec src2) %{
5119 predicate(UseAVX > 0);
5120 match(Set dst (MulAddVS2VI src1 src2));
5121 format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packedStoI" %}
5122 ins_encode %{
5123 int vector_len = vector_length_encoding(this);
5124 __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5125 %}
5126 ins_pipe( pipe_slow );
5127 %}
5128
5129 // --------------------------------- Vector Multiply Add Add ----------------------------------
5130
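// On AVX512_VNNI targets the pmaddwd + vpaddd pair above fuses into one
// instruction: evpdpwssd accumulates the dword sums directly into $dst.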
5131 instruct vmuladdaddS2I_reg(vec dst, vec src1, vec src2) %{
5132 predicate(VM_Version::supports_avx512_vnni());
5133 match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
5134 format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packedStoI" %}
5135 ins_encode %{
5136 assert(UseAVX > 2, "required");
5137 int vector_len = vector_length_encoding(this);
5138 __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5139 %}
5140 ins_pipe( pipe_slow );
5141 ins_cost(10);
5142 %}
5143
5144 // --------------------------------- PopCount --------------------------------------
5145
5146 instruct vpopcountI(vec dst, vec src) %{
5147 match(Set dst (PopCountVI src));
5148 format %{ "vpopcntd $dst,$src\t! vector popcount packedI" %}
5149 ins_encode %{
5150 assert(UsePopCountInstruction, "not enabled");
5151
5152 int vector_len = vector_length_encoding(this);
5153 __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
5154 %}
5155 ins_pipe( pipe_slow );
5156 %}
1100 reg_class xmm0_reg(XMM0, XMM0b, XMM0c, XMM0d);
1101 %}

1108 source_hpp %{
1169
1170 inline uint vector_length(const Node* n) {
1171 const TypeVect* vt = n->bottom_type()->is_vect();
1172 return vt->length();
1173 }
1174
1175 inline uint vector_length(const MachNode* use, MachOper* opnd) {
1176 uint def_idx = use->operand_index(opnd);
1177 Node* def = use->in(def_idx);
1178 return def->bottom_type()->is_vect()->length();
1179 }
1180
1181 inline uint vector_length_in_bytes(const Node* n) {
1182 const TypeVect* vt = n->bottom_type()->is_vect();
1183 return vt->length_in_bytes();
1184 }
1185
1186 inline uint vector_length_in_bytes(const MachNode* use, MachOper* opnd) {
1187 uint def_idx = use->operand_index(opnd);
1188 Node* def = use->in(def_idx);
1189 return def->bottom_type()->is_vect()->length_in_bytes();
1190 }
1191
1192 inline BasicType vector_element_basic_type(const Node *n) {
1193 return n->bottom_type()->is_vect()->element_basic_type();
1194 }
1195
1196 inline BasicType vector_element_basic_type(const MachNode *use, MachOper* opnd) {
1197 uint def_idx = use->operand_index(opnd);
1198 Node* def = use->in(def_idx);
1199 return def->bottom_type()->is_vect()->element_basic_type();
1200 }
1201
1202 inline Assembler::AvxVectorLen vector_length_encoding(int bytes) {
1203 switch(bytes) {
1204 case 4: // fall-through
1205 case 8: // fall-through
1206 case 16: return Assembler::AVX_128bit;
1207 case 32: return Assembler::AVX_256bit;
1208 case 64: return Assembler::AVX_512bit;
1209
1210 default: {
1211 ShouldNotReachHere();
1212 return Assembler::AVX_NoVec;
1213 }
1214 }
1215 }
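// For example, an 8-element int vector is 32 bytes and encodes as
// AVX_256bit; 4- and 8-byte vectors still encode as AVX_128bit because the
// 128-bit instructions simply ignore the unused upper lanes.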
1216
1217 static inline Assembler::AvxVectorLen vector_length_encoding(const Node* n) {
1218 return vector_length_encoding(vector_length_in_bytes(n));
1219 }
1220
1221 static inline Assembler::AvxVectorLen vector_length_encoding(const MachNode* use, MachOper* opnd) {
1222 uint def_idx = use->operand_index(opnd);
1223 Node* def = use->in(def_idx);
1224 return vector_length_encoding(def);
1225 }
1226
1227 %} // end source_hpp
1228
1229 source %{
1230
1231 #include "opto/addnode.hpp"
1232
1233 // Emit exception handler code.
1234 // Stuff framesize into a register and call a VM stub routine.
1235 int HandlerImpl::emit_exception_handler(CodeBuffer& cbuf) {
1236
1237 // Note that the code buffer's insts_mark is always relative to insts.
1238 // That's why we must use the macroassembler to generate a handler.
1239 C2_MacroAssembler _masm(&cbuf);
1240 address base = __ start_a_stub(size_exception_handler());
1241 if (base == NULL) {
1242 ciEnv::current()->record_failure("CodeCache is full");
1243 return 0; // CodeBuffer::expand failed
1244 }
1245 int offset = __ offset();
1246 __ jump(RuntimeAddress(OptoRuntime::exception_blob()->entry_point()));
1267 Label next;
1268 // push a "the_pc" on the stack without destroying any registers
1269 // as they all may be live.
1270
1271 // push address of "next"
1272 __ call(next, relocInfo::none); // reloc none is fine since it is a disp32
1273 __ bind(next);
1274 // adjust it so it matches "the_pc"
1275 __ subptr(Address(rsp, 0), __ offset() - offset);
1276 #else
1277 InternalAddress here(__ pc());
1278 __ pushptr(here.addr());
1279 #endif
1280
1281 __ jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
1282 assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow %d", (__ offset() - offset));
1283 __ end_a_stub();
1284 return offset;
1285 }
1286
1287 Assembler::Width widthForType(BasicType bt) {
1288 if (bt == T_BYTE) {
1289 return Assembler::B;
1290 } else if (bt == T_SHORT) {
1291 return Assembler::W;
1292 } else if (bt == T_INT) {
1293 return Assembler::D;
1294 } else {
1295 assert(bt == T_LONG, "not a long: %s", type2name(bt));
1296 return Assembler::Q;
1297 }
1298 }
1299
1300 //=============================================================================
1301
1302 // Float masks come from different places depending on platform.
1303 #ifdef _LP64
1304 static address float_signmask() { return StubRoutines::x86::float_sign_mask(); }
1305 static address float_signflip() { return StubRoutines::x86::float_sign_flip(); }
1306 static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
1307 static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
1308 #else
1309 static address float_signmask() { return (address)float_signmask_pool; }
1310 static address float_signflip() { return (address)float_signflip_pool; }
1311 static address double_signmask() { return (address)double_signmask_pool; }
1312 static address double_signflip() { return (address)double_signflip_pool; }
1313 #endif
1314 static address vector_short_to_byte_mask() { return StubRoutines::x86::vector_short_to_byte_mask(); }
1315 static address vector_int_to_byte_mask() { return StubRoutines::x86::vector_int_to_byte_mask(); }
1316 static address vector_byte_perm_mask() { return StubRoutines::x86::vector_byte_perm_mask(); }
1317 static address vector_long_sign_mask() { return StubRoutines::x86::vector_long_sign_mask(); }
1318 static address vector_all_bits_set() { return StubRoutines::x86::vector_all_bits_set(); }
1319 static address vector_int_to_short_mask() { return StubRoutines::x86::vector_int_to_short_mask(); }
1320 static address vector_short_shufflemask() { return StubRoutines::x86::vector_short_shuffle_mask(); }
1321 static address vector_int_shufflemask() { return StubRoutines::x86::vector_int_shuffle_mask(); }
1322 static address vector_long_shufflemask() { return StubRoutines::x86::vector_long_shuffle_mask(); }
1323 static address vector_32_bit_mask() { return StubRoutines::x86::vector_32_bit_mask(); }
1324 static address vector_64_bit_mask() { return StubRoutines::x86::vector_64_bit_mask(); }
1325
1326 //=============================================================================
1327 const bool Matcher::match_rule_supported(int opcode) {
1328 if (!has_match_rule(opcode)) {
1329 return false; // no match rule present
1330 }
1331 switch (opcode) {
1332 case Op_AbsVL:
1333 case Op_StoreVectorScatter:
1334 if (UseAVX < 3) {
1335 return false;
1336 }
1337 break;
1338 case Op_PopCountI:
1339 case Op_PopCountL:
1340 if (!UsePopCountInstruction) {
1341 return false;
1342 }
1343 break;
1344 case Op_PopCountVI:
1345 if (!UsePopCountInstruction || !VM_Version::supports_avx512_vpopcntdq()) {
1346 return false;
1347 }
1348 break;
1349 case Op_MulVI:
1350 if ((UseSSE < 4) && (UseAVX < 1)) { // only with SSE4_1 or AVX
1351 return false;
1352 }
1353 break;
1354 case Op_MulVL:
1355 if (UseSSE < 4) { // only with SSE4_1 or AVX
1356 return false;
1357 }
1358 break;
1359 case Op_MulReductionVL:
1360 if (VM_Version::supports_avx512dq() == false) {
1361 return false;
1362 }
1363 break;
1364 case Op_AddReductionVL:
1365 if (UseSSE < 2) { // requires at least SSE2
1366 return false;
1367 }
1368 break;
1369 case Op_AbsVB:
1370 case Op_AbsVS:
1371 case Op_AbsVI:
1372 case Op_AddReductionVI:
1373 case Op_AndReductionV:
1374 case Op_OrReductionV:
1375 case Op_XorReductionV:
1376 if (UseSSE < 3) { // requires at least SSSE3
1377 return false;
1378 }
1379 break;
1380 case Op_VectorLoadShuffle:
1381 case Op_VectorRearrange:
1382 case Op_MulReductionVI:
1383 if (UseSSE < 4) { // requires at least SSE4
1384 return false;
1385 }
1386 break;
1387 case Op_SqrtVD:
1388 case Op_SqrtVF:
1389 case Op_VectorMaskCmp:
1390 case Op_VectorCastB2X:
1391 case Op_VectorCastS2X:
1392 case Op_VectorCastI2X:
1393 case Op_VectorCastL2X:
1394 case Op_VectorCastF2X:
1395 case Op_VectorCastD2X:
1396 if (UseAVX < 1) { // enabled for AVX only
1397 return false;
1398 }
1399 break;
1400 case Op_CompareAndSwapL:
1401 #ifdef _LP64
1402 case Op_CompareAndSwapP:
1403 #endif
1404 if (!VM_Version::supports_cx8()) {
1405 return false;
1406 }
1407 break;
1408 case Op_CMoveVF:
1409 case Op_CMoveVD:
1410 if (UseAVX < 1) { // enabled for AVX only
1411 return false;
1412 }
1413 break;
1414 case Op_StrIndexOf:
1415 if (!UseSSE42Intrinsics) {
1416 return false;
1417 }
1418 break;
1419 case Op_StrIndexOfChar:
1420 if (!UseSSE42Intrinsics) {
1421 return false;
1422 }
1423 break;
1424 case Op_OnSpinWait:
1425 if (VM_Version::supports_on_spin_wait() == false) {
1426 return false;
1427 }
1428 break;
1429 case Op_MulVB:
1430 case Op_LShiftVB:
1431 case Op_RShiftVB:
1432 case Op_URShiftVB:
1433 case Op_VectorInsert:
1434 case Op_VectorLoadMask:
1435 case Op_VectorStoreMask:
1436 case Op_VectorBlend:
1437 if (UseSSE < 4) {
1438 return false;
1439 }
1440 break;
1441 #ifdef _LP64
1442 case Op_MaxD:
1443 case Op_MaxF:
1444 case Op_MinD:
1445 case Op_MinF:
1446 if (UseAVX < 1) { // enabled for AVX only
1447 return false;
1448 }
1449 break;
1450 #endif
1451 case Op_CacheWB:
1452 case Op_CacheWBPreSync:
1453 case Op_CacheWBPostSync:
1454 if (!VM_Version::supports_data_cache_line_flush()) {
1455 return false;
1456 }
1457 break;
1458 case Op_ExtractB:
1459 case Op_ExtractL:
1460 case Op_ExtractI:
1461 case Op_RoundDoubleMode:
1462 if (UseSSE < 4) {
1463 return false;
1464 }
1465 break;
1466 case Op_RoundDoubleModeV:
1467 if (VM_Version::supports_avx() == false) {
1468 return false; // 128bit vroundpd is not available
1469 }
1470 break;
1471 case Op_VLShiftV:
1472 case Op_VRShiftV:
1473 case Op_VURShiftV:
1474 case Op_LoadVectorGather:
1475 if (UseAVX < 2) {
1476 return false;
1477 }
1478 break;
1479 case Op_FmaVD:
1480 case Op_FmaVF:
1481 if (!UseFMA) {
1482 return false;
1483 }
1484 break;
1485 #ifndef _LP64
1486 case Op_AddReductionVF:
1487 case Op_AddReductionVD:
1488 case Op_MulReductionVF:
1489 case Op_MulReductionVD:
1490 if (UseSSE < 1) { // requires at least SSE
1491 return false;
1492 }
1493 break;
1494 case Op_MulAddVS2VI:
1495 case Op_RShiftVL:
1496 case Op_AbsVD:
1497 case Op_NegVD:
1498 if (UseSSE < 2) {
1499 return false;
1500 }
1501 break;
1502 #endif // !LP64
1503 }
1504 return true; // Match rules are supported by default.
1505 }
1506
1507 //------------------------------------------------------------------------
1508
1509 // Identify extra cases that we might want to provide match rules for vector nodes and
1510 // other intrinsics guarded with vector length (vlen) and element type (bt).
1511 const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) {
1512 if (!match_rule_supported(opcode)) {
1513 return false;
1514 }
1515 // Matcher::vector_size_supported() restricts vector sizes in the following way (see Matcher::vector_width_in_bytes):
1516 // * SSE2 supports 128bit vectors for all types;
1517 // * AVX1 supports 256bit vectors only for FLOAT and DOUBLE types;
1518 // * AVX2 supports 256bit vectors for all types;
1519 // * AVX512F supports 512bit vectors only for INT, FLOAT, and DOUBLE types;
1520 // * AVX512BW supports 512bit vectors for BYTE, SHORT, and CHAR types.
1521 // There's also a limit on minimum vector size supported: 2 elements (or 4 bytes for BYTE).
1522 // And MaxVectorSize is taken into account as well.
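// For example (illustrative): MulVL with vlen == 8 is a 512-bit vector, so it
// needs AVX512F to pass vector_size_supported() and, per the Op_MulVL case
// below, AVX512DQ as well because 512-bit vpmullq requires it.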
1523
1524 int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
1525 if (!vector_size_supported(bt, vlen)) {
1526 return false;
1527 }
1528 // Special cases which require vector length follow:
1529 // * implementation limitations
1530 // * some 512bit vector operations on FLOAT and DOUBLE types require AVX512DQ
1531 // * 128bit vroundpd instruction is present only in AVX1
1532 switch (opcode) {
1533 case Op_AbsVF:
1534 case Op_NegVF:
1535 if ((vlen == 16) && (VM_Version::supports_avx512dq() == false)) {
1536 return false; // 512bit vandps and vxorps are not available
1537 }
1538 break;
1539 case Op_AbsVD:
1540 case Op_NegVD:
1541 case Op_MulVL:
1542 if ((vlen == 8) && (VM_Version::supports_avx512dq() == false)) {
1543 return false; // 512bit vpmullq, vandpd and vxorpd are not available
1544 }
1545 break;
1546 case Op_CMoveVF:
1547 if (vlen != 8) {
1548 return false; // implementation limitation (only vcmov8F_reg is present)
1549 }
1550 break;
1551 case Op_CMoveVD:
1552 if (vlen != 4) {
1553 return false; // implementation limitation (only vcmov4D_reg is present)
1554 }
1555 break;
1556 case Op_MaxV:
1557 case Op_MinV:
1558 if (UseSSE < 4 && is_integral_type(bt)) {
1559 return false;
1560 }
1561 if ((bt == T_FLOAT || bt == T_DOUBLE)) {
1562 // Float/Double intrinsics are enabled for AVX family currently.
1563 if (UseAVX == 0) {
1564 return false;
1565 }
1566 if (UseAVX > 2 && (!VM_Version::supports_avx512dq() && size_in_bits == 512)) { // 512 bit Float/Double intrinsics need AVX512DQ
1567 return false;
1568 }
1569 }
1570 break;
1571 case Op_AddReductionVI:
1572 if (bt == T_INT && (UseSSE < 3 || !VM_Version::supports_ssse3())) {
1573 return false;
1574 }
1575 // fallthrough
1576 case Op_AndReductionV:
1577 case Op_OrReductionV:
1578 case Op_XorReductionV:
1579 if (is_subword_type(bt) && (UseSSE < 4)) {
1580 return false;
1581 }
1582 #ifndef _LP64
1583 if (bt == T_BYTE || bt == T_LONG) {
1584 return false;
1585 }
1586 #endif
1587 break;
1588 #ifndef _LP64
1589 case Op_VectorInsert:
1590 if (bt == T_LONG || bt == T_DOUBLE) {
1591 return false;
1592 }
1593 break;
1594 #endif
1595 case Op_MinReductionV:
1596 case Op_MaxReductionV:
1597 if ((bt == T_INT || is_subword_type(bt)) && UseSSE < 4) {
1598 return false;
1599 } else if (bt == T_LONG && (UseAVX < 3 || !VM_Version::supports_avx512vlbwdq())) {
1600 return false;
1601 }
1602 // Float/Double intrinsics enabled for AVX family.
1603 if (UseAVX == 0 && (bt == T_FLOAT || bt == T_DOUBLE)) {
1604 return false;
1605 }
1606     if (UseAVX > 2 && !VM_Version::supports_avx512dq() && size_in_bits == 512) {
1607 return false;
1608 }
1609 #ifndef _LP64
1610 if (bt == T_BYTE || bt == T_LONG) {
1611 return false;
1612 }
1613 #endif
1614 break;
1615 case Op_VectorTest:
1616 if (UseSSE < 4) {
1617 return false; // Implementation limitation
1618 } else if (size_in_bits < 128) {
1619 return false; // Implementation limitation
1620       } else if (size_in_bits == 512 && !VM_Version::supports_avx512bw()) {
1621 return false; // Implementation limitation
1622 }
1623 break;
1624 case Op_VectorLoadShuffle:
1625 case Op_VectorRearrange:
1626       if (vlen == 2) {
1627 return false; // Implementation limitation due to how shuffle is loaded
1628 } else if (size_in_bits == 256 && UseAVX < 2) {
1629 return false; // Implementation limitation
1630 } else if (bt == T_BYTE && size_in_bits >= 256 && !VM_Version::supports_avx512_vbmi()) {
1631 return false; // Implementation limitation
1632 } else if (bt == T_SHORT && size_in_bits >= 256 && !VM_Version::supports_avx512bw()) {
1633 return false; // Implementation limitation
1634 }
1635 break;
1636 case Op_VectorLoadMask:
1637 if (size_in_bits == 256 && UseAVX < 2) {
1638 return false; // Implementation limitation
1639 }
1640 // fallthrough
1641 case Op_VectorStoreMask:
1642 if (vlen == 2) {
1643 return false; // Implementation limitation
1644 }
1645 break;
1646 case Op_VectorCastB2X:
1647 if (size_in_bits == 256 && UseAVX < 2) {
1648 return false; // Implementation limitation
1649 }
1650 break;
1651 case Op_VectorCastS2X:
1652 if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) {
1653 return false;
1654 }
1655 break;
1656 case Op_VectorCastI2X:
1657 if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) {
1658 return false;
1659 }
1660 break;
1661 case Op_VectorCastL2X:
1662 if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) {
1663 return false;
1664 } else if (!is_integral_type(bt) && !VM_Version::supports_avx512dq()) {
1665 return false;
1666 }
1667 break;
1668 case Op_VectorCastF2X:
1669 case Op_VectorCastD2X:
1670 if (is_integral_type(bt)) {
1671 // Casts from FP to integral types require special fixup logic not easily
1672 // implementable with vectors.
1673 return false; // Implementation limitation
1674       } // fallthrough
1675 case Op_MulReductionVI:
1676 if (bt == T_BYTE && size_in_bits == 512 && !VM_Version::supports_avx512bw()) {
1677 return false;
1678 }
1679 break;
1680 case Op_StoreVectorScatter:
1681       if (bt == T_BYTE || bt == T_SHORT) {
1682 return false;
1683 } else if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
1684 return false;
1685 }
1686 // fallthrough
1687 case Op_LoadVectorGather:
1688       if (size_in_bits == 64) {
1689 return false;
1690 }
1691 break;
1692 }
1693   return true; // Match rules are supported by default.
1694 }
1695
1696 // x86 supports generic vector operands: vec and legVec.
1697 const bool Matcher::supports_generic_vector_operands = true;
1698
1699 MachOper* Matcher::specialize_generic_vector_operand(MachOper* generic_opnd, uint ideal_reg, bool is_temp) {
1700 assert(Matcher::is_generic_vector(generic_opnd), "not generic");
1701 bool legacy = (generic_opnd->opcode() == LEGVEC);
1702 if (!VM_Version::supports_avx512vlbwdq() && // KNL
1703 is_temp && !legacy && (ideal_reg == Op_VecZ)) {
1704 // Conservatively specialize 512bit vec TEMP operands to legVecZ (zmm0-15) on KNL.
1705 return new legVecZOper();
1706 }
1707 if (legacy) {
1708 switch (ideal_reg) {
1709 case Op_VecS: return new legVecSOper();
1710 case Op_VecD: return new legVecDOper();
1711 case Op_VecX: return new legVecXOper();
1909 } else {
1910 mstack.push(adr, Pre_Visit);
1911 }
1912
1913 // Clone X+offset as it also folds into most addressing expressions
1914 mstack.push(off, Visit);
1915 mstack.push(m->in(AddPNode::Base), Pre_Visit);
1916 return true;
1917 } else if (clone_shift(off, this, mstack, address_visited)) {
1918 address_visited.test_set(m->_idx); // Flag as address_visited
1919 mstack.push(m->in(AddPNode::Address), Pre_Visit);
1920 mstack.push(m->in(AddPNode::Base), Pre_Visit);
1921 return true;
1922 }
1923 return false;
1924 }
1925
1926 void Compile::reshape_address(AddPNode* addp) {
1927 }
1928
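// Map a BoolTest condition code to the corresponding AVX512 integer comparison
// predicate. ge and gt have no direct encodings in this enumeration and are
// expressed through their negations nlt (not-less-than) and nle (not-less-or-equal).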
1929 static inline Assembler::ComparisonPredicate booltest_pred_to_comparison_pred(int bt) {
1930 switch (bt) {
1931 case BoolTest::eq: return Assembler::eq;
1932 case BoolTest::ne: return Assembler::neq;
1933 case BoolTest::le: return Assembler::le;
1934 case BoolTest::ge: return Assembler::nlt;
1935 case BoolTest::lt: return Assembler::lt;
1936 case BoolTest::gt: return Assembler::nle;
1937     default: ShouldNotReachHere(); return Assembler::_false;
1938 }
1939 }
1940
1941 static inline Assembler::ComparisonPredicateFP booltest_pred_to_comparison_pred_fp(int bt) {
1942 switch (bt) {
1943 case BoolTest::eq: return Assembler::EQ_OQ; // ordered non-signaling
1944 // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare.
1945 case BoolTest::ne: return Assembler::NEQ_UQ; // unordered non-signaling
1946 case BoolTest::le: return Assembler::LE_OQ; // ordered non-signaling
1947 case BoolTest::ge: return Assembler::GE_OQ; // ordered non-signaling
1948 case BoolTest::lt: return Assembler::LT_OQ; // ordered non-signaling
1949 case BoolTest::gt: return Assembler::GT_OQ; // ordered non-signaling
1950 default: ShouldNotReachHere(); return Assembler::FALSE_OS;
1951 }
1952 }
1953
1954 // Helper methods for MachSpillCopyNode::implementation().
1955 static int vec_mov_helper(CodeBuffer *cbuf, bool do_size, int src_lo, int dst_lo,
1956 int src_hi, int dst_hi, uint ireg, outputStream* st) {
1957   // In the 64-bit VM size calculation is very complex, so the size is
1958   // determined by emitting the instructions into a scratch buffer.
1959 LP64_ONLY( assert(!do_size, "this method calculates size only for 32-bit VM"); )
1960   assert(ireg == Op_VecS || // 32bit vector
1961          ((src_lo & 1) == 0 && (src_lo + 1) == src_hi &&
1962           (dst_lo & 1) == 0 && (dst_lo + 1) == dst_hi),
1963          "no non-adjacent vector moves");
1964 if (cbuf) {
1965 C2_MacroAssembler _masm(cbuf);
1966 int offset = __ offset();
1967 switch (ireg) {
1968 case Op_VecS: // copy whole register
1969 case Op_VecD:
1970 case Op_VecX:
2257 %}
2258
2259 encode %{
2260
2261 enc_class call_epilog %{
2262 if (VerifyStackAtCalls) {
2263       // Check that stack depth is unchanged: find magic cookie on stack
2264 int framesize = ra_->reg2offset_unchecked(OptoReg::add(ra_->_matcher._old_SP, -3*VMRegImpl::slots_per_word));
2265 C2_MacroAssembler _masm(&cbuf);
2266 Label L;
2267 __ cmpptr(Address(rsp, framesize), (int32_t)0xbadb100d);
2268 __ jccb(Assembler::equal, L);
2269 // Die if stack mismatch
2270 __ int3();
2271 __ bind(L);
2272 }
2273 %}
2274
2275 %}
2276
2277 // Operands for bound floating-point register arguments
2278 operand rxmm0() %{
2279 constraint(ALLOC_IN_RC(xmm0_reg));
2280 match(VecX);
2281   format %{ %}
2282 interface(REG_INTER);
2283 %}
2284
2285 //----------OPERANDS-----------------------------------------------------------
2286 // Operand definitions must precede instruction definitions for correct parsing
2287 // in the ADLC because operands constitute user defined types which are used in
2288 // instruction definitions.
2289
2290 // Vectors
2291
2292 // Dummy generic vector class. Should be used for all vector operands.
2293 // Replaced with vec[SDXYZ] during post-selection pass.
2294 operand vec() %{
2295 constraint(ALLOC_IN_RC(dynamic));
2296 match(VecX);
2297 match(VecY);
2298 match(VecZ);
2299 match(VecS);
2300 match(VecD);
2301
2302 format %{ %}
2303 interface(REG_INTER);
3028 ins_pipe(pipe_slow);
3029 %}
3030
3031 instruct absF_reg(regF dst) %{
3032 predicate((UseSSE>=1) && (UseAVX == 0));
3033 match(Set dst (AbsF dst));
3034 ins_cost(150);
3035 format %{ "andps $dst, [0x7fffffff]\t# abs float by sign masking" %}
3036 ins_encode %{
3037 __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
3038 %}
3039 ins_pipe(pipe_slow);
3040 %}
3041
3042 instruct absF_reg_reg(vlRegF dst, vlRegF src) %{
3043 predicate(UseAVX > 0);
3044 match(Set dst (AbsF src));
3045 ins_cost(150);
3046 format %{ "vandps $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
3047 ins_encode %{
3048 int vlen_enc = Assembler::AVX_128bit;
3049 __ vandps($dst$$XMMRegister, $src$$XMMRegister,
3050 ExternalAddress(float_signmask()), vlen_enc);
3051 %}
3052 ins_pipe(pipe_slow);
3053 %}
3054
3055 instruct absD_reg(regD dst) %{
3056 predicate((UseSSE>=2) && (UseAVX == 0));
3057 match(Set dst (AbsD dst));
3058 ins_cost(150);
3059 format %{ "andpd $dst, [0x7fffffffffffffff]\t"
3060 "# abs double by sign masking" %}
3061 ins_encode %{
3062 __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
3063 %}
3064 ins_pipe(pipe_slow);
3065 %}
3066
3067 instruct absD_reg_reg(vlRegD dst, vlRegD src) %{
3068 predicate(UseAVX > 0);
3069 match(Set dst (AbsD src));
3070 ins_cost(150);
3071 format %{ "vandpd $dst, $src, [0x7fffffffffffffff]\t"
3072 "# abs double by sign masking" %}
3073 ins_encode %{
3074 int vlen_enc = Assembler::AVX_128bit;
3075 __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
3076 ExternalAddress(double_signmask()), vlen_enc);
3077 %}
3078 ins_pipe(pipe_slow);
3079 %}
3080
3081 instruct negF_reg(regF dst) %{
3082 predicate((UseSSE>=1) && (UseAVX == 0));
3083 match(Set dst (NegF dst));
3084 ins_cost(150);
3085 format %{ "xorps $dst, [0x80000000]\t# neg float by sign flipping" %}
3086 ins_encode %{
3087 __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
3088 %}
3089 ins_pipe(pipe_slow);
3090 %}
3091
3092 instruct negF_reg_reg(vlRegF dst, vlRegF src) %{
3093 predicate(UseAVX > 0);
3094 match(Set dst (NegF src));
3095 ins_cost(150);
3096 format %{ "vnegatess $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
3180
3181 format %{ "sqrtsd $dst, $src" %}
3182 ins_cost(150);
3183 ins_encode %{
3184 __ sqrtsd($dst$$XMMRegister, $src$$Address);
3185 %}
3186 ins_pipe(pipe_slow);
3187 %}
3188
3189 instruct sqrtD_imm(regD dst, immD con) %{
3190 predicate(UseSSE>=2);
3191 match(Set dst (SqrtD con));
3192 format %{ "sqrtsd $dst, [$constantaddress]\t# load from constant table: double=$con" %}
3193 ins_cost(150);
3194 ins_encode %{
3195 __ sqrtsd($dst$$XMMRegister, $constantaddress($con));
3196 %}
3197 ins_pipe(pipe_slow);
3198 %}
3199
3200 // ---------------------------------------- VectorReinterpret ------------------------------------
3201
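// VectorReinterpret is a no-op when src and dst have the same size. When the
// destination is wider, the value is zero-extended (by masking, or by a move
// that clears the upper bits); when it is narrower, only the low part of the
// source is moved.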
3202 instruct reinterpret(vec dst) %{
3203 predicate(vector_length_in_bytes(n) == vector_length_in_bytes(n->in(1))); // dst == src
3204 match(Set dst (VectorReinterpret dst));
3205 ins_cost(125);
3206 format %{ "vector_reinterpret $dst\t!" %}
3207 ins_encode %{
3208 // empty
3209 %}
3210 ins_pipe( pipe_slow );
3211 %}
3212
3213 instruct reinterpret_expand(vec dst, vec src, rRegP scratch) %{
3214 predicate(UseAVX == 0 &&
3215 (vector_length_in_bytes(n->in(1)) < vector_length_in_bytes(n))); // src < dst
3216 match(Set dst (VectorReinterpret src));
3217 ins_cost(125);
3218 effect(TEMP dst, TEMP scratch);
3219 format %{ "vector_reinterpret_expand $dst,$src\t! using $scratch as TEMP" %}
3220 ins_encode %{
3221 assert(vector_length_in_bytes(this) <= 16, "required");
3222 assert(vector_length_in_bytes(this, $src) <= 8, "required");
3223
3224 int src_vlen_in_bytes = vector_length_in_bytes(this, $src);
3225 if (src_vlen_in_bytes == 4) {
3226 __ movdqu($dst$$XMMRegister, ExternalAddress(vector_32_bit_mask()), $scratch$$Register);
3227 } else {
3228 assert(src_vlen_in_bytes == 8, "");
3229 __ movdqu($dst$$XMMRegister, ExternalAddress(vector_64_bit_mask()), $scratch$$Register);
3230 }
3231 __ pand($dst$$XMMRegister, $src$$XMMRegister);
3232 %}
3233 ins_pipe( pipe_slow );
3234 %}
3235
3236 instruct vreinterpret_expand4(legVec dst, vec src, rRegP scratch) %{
3237 predicate(UseAVX > 0 &&
3238 (vector_length_in_bytes(n->in(1)) == 4) && // src
3239 (vector_length_in_bytes(n->in(1)) < vector_length_in_bytes(n))); // src < dst
3240 match(Set dst (VectorReinterpret src));
3241 ins_cost(125);
3242 effect(TEMP scratch);
3243 format %{ "vector_reinterpret_expand $dst,$src\t! using $scratch as TEMP" %}
3244 ins_encode %{
3245 __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_32_bit_mask()), 0, $scratch$$Register);
3246 %}
3247 ins_pipe( pipe_slow );
3248 %}
3249
3250
3251 instruct vreinterpret_expand(legVec dst, vec src) %{
3252 predicate(UseAVX > 0 &&
3253 (vector_length_in_bytes(n->in(1)) > 4) && // src
3254 (vector_length_in_bytes(n->in(1)) < vector_length_in_bytes(n))); // src < dst
3255 match(Set dst (VectorReinterpret src));
3256 ins_cost(125);
3257 format %{ "vector_reinterpret_expand $dst,$src\t!" %}
3258 ins_encode %{
3259 switch (vector_length_in_bytes(this, $src)) {
3260 case 8: __ movq ($dst$$XMMRegister, $src$$XMMRegister); break;
3261 case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
3262 case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
3263 default: ShouldNotReachHere();
3264 }
3265 %}
3266 ins_pipe( pipe_slow );
3267 %}
3268
3269 instruct reinterpret_shrink(vec dst, legVec src) %{
3270 predicate(vector_length_in_bytes(n->in(1)) > vector_length_in_bytes(n)); // src > dst
3271 match(Set dst (VectorReinterpret src));
3272 ins_cost(125);
3273 format %{ "vector_reinterpret_shrink $dst,$src\t!" %}
3274 ins_encode %{
3275 switch (vector_length_in_bytes(this)) {
3276 case 4: __ movflt ($dst$$XMMRegister, $src$$XMMRegister); break;
3277 case 8: __ movq ($dst$$XMMRegister, $src$$XMMRegister); break;
3278 case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
3279 case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
3280 default: ShouldNotReachHere();
3281 }
3282 %}
3283 ins_pipe( pipe_slow );
3284 %}
3285
3286 // ----------------------------------------------------------------------------------------------------
3287
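// RoundDoubleMode: scalar rounding uses roundsd (SSE4.1+); vectors up to
// 256bit use vroundpd (AVX), while 512bit vectors use vrndscalepd (AVX512).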
3288 #ifdef _LP64
3289 instruct roundD_reg(legRegD dst, legRegD src, immU8 rmode) %{
3290 match(Set dst (RoundDoubleMode src rmode));
3291 format %{ "roundsd $dst,$src" %}
3292 ins_cost(150);
3293 ins_encode %{
3294 assert(UseSSE >= 4, "required");
3295 __ roundsd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant);
3296 %}
3297 ins_pipe(pipe_slow);
3298 %}
3299
3300 instruct roundD_mem(legRegD dst, memory src, immU8 rmode) %{
3301 match(Set dst (RoundDoubleMode (LoadD src) rmode));
3302 format %{ "roundsd $dst,$src" %}
3303 ins_cost(150);
3304 ins_encode %{
3305 assert(UseSSE >= 4, "required");
3306 __ roundsd($dst$$XMMRegister, $src$$Address, $rmode$$constant);
3307 %}
3308 ins_pipe(pipe_slow);
3309 %}
3310
3311 instruct roundD_imm(legRegD dst, immD con, immU8 rmode, rRegI scratch_reg) %{
3312 match(Set dst (RoundDoubleMode con rmode));
3313 effect(TEMP scratch_reg);
3314 format %{ "roundsd $dst,[$constantaddress]\t# load from constant table: double=$con" %}
3315 ins_cost(150);
3316 ins_encode %{
3317 assert(UseSSE >= 4, "required");
3318 __ roundsd($dst$$XMMRegister, $constantaddress($con), $rmode$$constant, $scratch_reg$$Register);
3319 %}
3320 ins_pipe(pipe_slow);
3321 %}
3322
3323 instruct vroundD_reg(legVec dst, legVec src, immU8 rmode) %{
3324 predicate(vector_length(n) < 8);
3325 match(Set dst (RoundDoubleModeV src rmode));
3326 format %{ "vroundpd $dst,$src,$rmode\t! round packedD" %}
3327 ins_encode %{
3328 assert(UseAVX > 0, "required");
3329 int vlen_enc = vector_length_encoding(this);
3330 __ vroundpd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, vlen_enc);
3331 %}
3332 ins_pipe( pipe_slow );
3333 %}
3334
3335 instruct vround8D_reg(vec dst, vec src, immU8 rmode) %{
3336 predicate(vector_length(n) == 8);
3337 match(Set dst (RoundDoubleModeV src rmode));
3338 format %{ "vrndscalepd $dst,$src,$rmode\t! round packed8D" %}
3339 ins_encode %{
3340 assert(UseAVX > 2, "required");
3341 __ vrndscalepd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, Assembler::AVX_512bit);
3342 %}
3343 ins_pipe( pipe_slow );
3344 %}
3345
3346 instruct vroundD_mem(legVec dst, memory mem, immU8 rmode) %{
3347 predicate(vector_length(n) < 8);
3348 match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
3349 format %{ "vroundpd $dst, $mem, $rmode\t! round packedD" %}
3350 ins_encode %{
3351 assert(UseAVX > 0, "required");
3352 int vlen_enc = vector_length_encoding(this);
3353 __ vroundpd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, vlen_enc);
3354 %}
3355 ins_pipe( pipe_slow );
3356 %}
3357
3358 instruct vround8D_mem(vec dst, memory mem, immU8 rmode) %{
3359 predicate(vector_length(n) == 8);
3360 match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
3361 format %{ "vrndscalepd $dst,$mem,$rmode\t! round packed8D" %}
3362 ins_encode %{
3363 assert(UseAVX > 2, "required");
3364 __ vrndscalepd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, Assembler::AVX_512bit);
3365 %}
3366 ins_pipe( pipe_slow );
3367 %}
3368 #endif // _LP64
3369
3370 instruct onspinwait() %{
3371 match(OnSpinWait);
3372 ins_cost(200);
3373
3374 format %{
3375 $$template
3376 $$emit$$"pause\t! membar_onspinwait"
3377 %}
3378 ins_encode %{
3379 __ pause();
3411 instruct MoveVec2Leg(legVec dst, vec src) %{
3412 match(Set dst src);
3413 format %{ "" %}
3414 ins_encode %{
3415 ShouldNotReachHere();
3416 %}
3417 ins_pipe( fpu_reg_reg );
3418 %}
3419
3420 instruct MoveLeg2Vec(vec dst, legVec src) %{
3421 match(Set dst src);
3422 format %{ "" %}
3423 ins_encode %{
3424 ShouldNotReachHere();
3425 %}
3426 ins_pipe( fpu_reg_reg );
3427 %}
3428
3429 // ============================================================================
3430
3431 // Load vector (generic operand pattern)
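// The emitted move depends only on the vector size in bytes: 4 and 8 byte
// vectors use movdl/movq, 16 and 32 byte vectors use (v)movdqu, and 64 byte
// vectors use evmovdqul.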
3432 instruct loadV(vec dst, memory mem) %{
3433 match(Set dst (LoadVector mem));
3434 ins_cost(125);
3435 format %{ "load_vector $dst,$mem" %}
3436 ins_encode %{
3437 switch (vector_length_in_bytes(this)) {
3438 case 4: __ movdl ($dst$$XMMRegister, $mem$$Address); break;
3439 case 8: __ movq ($dst$$XMMRegister, $mem$$Address); break;
3440 case 16: __ movdqu ($dst$$XMMRegister, $mem$$Address); break;
3441 case 32: __ vmovdqu ($dst$$XMMRegister, $mem$$Address); break;
3442 case 64: __ evmovdqul($dst$$XMMRegister, $mem$$Address, Assembler::AVX_512bit); break;
3443 default: ShouldNotReachHere();
3444 }
3445 %}
3446 ins_pipe( pipe_slow );
3447 %}
3448
3449 // Store vector (generic operand pattern).
3450 instruct storeV(memory mem, vec src) %{
3451 match(Set mem (StoreVector mem src));
3452 ins_cost(145);
3453   format %{ "store_vector $mem,$src" %}
3454 ins_encode %{
3455 switch (vector_length_in_bytes(this, $src)) {
3456 case 4: __ movdl ($mem$$Address, $src$$XMMRegister); break;
3457 case 8: __ movq ($mem$$Address, $src$$XMMRegister); break;
3458 case 16: __ movdqu ($mem$$Address, $src$$XMMRegister); break;
3459 case 32: __ vmovdqu ($mem$$Address, $src$$XMMRegister); break;
3460 case 64: __ evmovdqul($mem$$Address, $src$$XMMRegister, Assembler::AVX_512bit); break;
3461 default: ShouldNotReachHere();
3462 }
3463 %}
3464 ins_pipe( pipe_slow );
3465 %}
3466
3467 // ---------------------------------------- Gather ------------------------------------
3468
3469 // Gather INT, LONG, FLOAT, DOUBLE
3470
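// Both gather flavors first materialize an all-ones mask (an XMM mask for the
// AVX2 path, a k-mask for AVX512) and the base address in a GP temp, then
// issue the hardware gather with that mask.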
3471 instruct gather(legVec dst, memory mem, legVec idx, rRegP tmp, legVec mask) %{
3472 predicate(vector_length_in_bytes(n) <= 32);
3473 match(Set dst (LoadVectorGather mem idx));
3474 effect(TEMP dst, TEMP tmp, TEMP mask);
3475 format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and $mask as TEMP" %}
3476 ins_encode %{
3477 assert(UseAVX >= 2, "sanity");
3478
3479 int vlen_enc = vector_length_encoding(this);
3480 BasicType elem_bt = vector_element_basic_type(this);
3481
3482 assert(vector_length_in_bytes(this) >= 16, "sanity");
3483 assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
3484
3485 if (vlen_enc == Assembler::AVX_128bit) {
3486 __ movdqu($mask$$XMMRegister, ExternalAddress(vector_all_bits_set()));
3487 } else {
3488 __ vmovdqu($mask$$XMMRegister, ExternalAddress(vector_all_bits_set()));
3489 }
3490 __ lea($tmp$$Register, $mem$$Address);
3491 __ vgather(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx$$XMMRegister, $mask$$XMMRegister, vlen_enc);
3492 %}
3493 ins_pipe( pipe_slow );
3494 %}
3495
3496 instruct evgather(vec dst, memory mem, vec idx, rRegP tmp) %{
3497 predicate(vector_length_in_bytes(n) == 64);
3498 match(Set dst (LoadVectorGather mem idx));
3499 effect(TEMP dst, TEMP tmp);
3500 format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and k2 as TEMP" %}
3501 ins_encode %{
3502 assert(UseAVX > 2, "sanity");
3503
3504 int vlen_enc = vector_length_encoding(this);
3505 BasicType elem_bt = vector_element_basic_type(this);
3506
3507 assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
3508
3509     KRegister ktmp = k2;
3510     __ kmovwl(ktmp, ExternalAddress(vector_all_bits_set()), $tmp$$Register);
3511 __ lea($tmp$$Register, $mem$$Address);
3512 __ evgather(elem_bt, $dst$$XMMRegister, ktmp, $tmp$$Register, $idx$$XMMRegister, vlen_enc);
3513 %}
3514 ins_pipe( pipe_slow );
3515 %}
3516
3517 // ====================Scatter=======================================
3518
3519 // Scatter INT, LONG, FLOAT, DOUBLE
3520
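// Scatter is AVX512-only: an all-ones k-mask is loaded into k2 and the base
// address into a GP temp before issuing evscatter.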
3521 instruct scatter(memory mem, vec src, vec idx, rRegP tmp) %{
3522 match(Set mem (StoreVectorScatter mem (Binary src idx)));
3523 effect(TEMP tmp);
3524 format %{ "store_vector_scatter $mem, $idx, $src\t! using k2 and $tmp as TEMP" %}
3525 ins_encode %{
3526 assert(UseAVX > 2, "sanity");
3527
3528 int vlen_enc = vector_length_encoding(this, $src);
3529 BasicType elem_bt = vector_element_basic_type(this, $src);
3530
3531 assert(vector_length_in_bytes(this, $src) >= 16, "sanity");
3532 assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
3533
3534     KRegister ktmp = k2;
3535     __ kmovwl(ktmp, ExternalAddress(vector_all_bits_set()), $tmp$$Register);
3536 __ lea($tmp$$Register, $mem$$Address);
3537 __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, ktmp, $src$$XMMRegister, vlen_enc);
3538 %}
3539 ins_pipe( pipe_slow );
3540 %}
3541
3542 // ====================REPLICATE=======================================
3543
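// Replication broadcasts a scalar into every vector lane. With AVX512 the
// (e)vpbroadcast* forms do this in one instruction; older targets build the
// vector with shuffles (punpck*/pshufd/pshuflw) and widen it with vinsert*128.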
3544 // Replicate byte scalar to be vector
3545 instruct ReplB_reg(vec dst, rRegI src) %{
3546 match(Set dst (ReplicateB src));
3547 format %{ "replicateB $dst,$src" %}
3548 ins_encode %{
3549 uint vlen = vector_length(this);
3550     if (vlen == 64 || VM_Version::supports_avx512vlbw()) { // AVX512VL+BW for <512bit operands
3551 assert(VM_Version::supports_avx512bw(), "required"); // 512-bit byte vectors assume AVX512BW
3552 int vlen_enc = vector_length_encoding(this);
3553 __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vlen_enc);
3554 } else {
3555 __ movdl($dst$$XMMRegister, $src$$Register);
3556 __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3557 __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3558 if (vlen >= 16) {
3559 __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3560 if (vlen >= 32) {
3561 assert(vlen == 32, "sanity");
3562 __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3563 }
3564 }
3565 }
3566 %}
3567 ins_pipe( pipe_slow );
3568 %}
3569
3570 instruct ReplB_mem(vec dst, memory mem) %{
3571 predicate(VM_Version::supports_avx2());
3572 match(Set dst (ReplicateB (LoadB mem)));
3573 format %{ "replicateB $dst,$mem" %}
3574 ins_encode %{
3575 int vlen_enc = vector_length_encoding(this);
3576 __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vlen_enc);
3577 %}
3578 ins_pipe( pipe_slow );
3579 %}
3580
3581 instruct ReplB_imm(vec dst, immI con) %{
3582 match(Set dst (ReplicateB con));
3583 format %{ "replicateB $dst,$con" %}
3584 ins_encode %{
3585 uint vlen = vector_length(this);
3586 InternalAddress const_addr = $constantaddress(replicate8_imm($con$$constant, 1));
3587 if (vlen == 4) {
3588 __ movdl($dst$$XMMRegister, const_addr);
3589 } else {
3590 __ movq($dst$$XMMRegister, const_addr);
3591 if (vlen >= 16) {
3592 if (VM_Version::supports_avx2()) {
3593 int vlen_enc = vector_length_encoding(this);
3594 __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3595 } else {
3596 assert(vlen == 16, "sanity");
3597 __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3598 }
3599 }
3600 }
3601 %}
3602 ins_pipe( pipe_slow );
3603 %}
3604
3605 // Replicate byte scalar zero to be vector
3606 instruct ReplB_zero(vec dst, immI_0 zero) %{
3607 match(Set dst (ReplicateB zero));
3608 format %{ "replicateB $dst,$zero" %}
3609 ins_encode %{
3610 uint vlen = vector_length(this);
3611 if (vlen <= 16) {
3612 __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3613 } else {
3614 // Use vpxor since AVX512F does not have 512bit vxorpd (requires AVX512DQ).
3615 int vlen_enc = vector_length_encoding(this);
3616 __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3617 }
3618 %}
3619 ins_pipe( fpu_reg_reg );
3620 %}
3621
3622 // ====================ReplicateS=======================================
3623
3624 instruct ReplS_reg(vec dst, rRegI src) %{
3625 match(Set dst (ReplicateS src));
3626 format %{ "replicateS $dst,$src" %}
3663 uint vlen = vector_length(this);
3664 InternalAddress const_addr = $constantaddress(replicate8_imm($con$$constant, 2));
3665 if (vlen == 2) {
3666 __ movdl($dst$$XMMRegister, const_addr);
3667 } else {
3668 __ movq($dst$$XMMRegister, const_addr);
3669 if (vlen >= 8) {
3670 if (VM_Version::supports_avx2()) {
3671 int vlen_enc = vector_length_encoding(this);
3672 __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3673 } else {
3674 assert(vlen == 8, "sanity");
3675 __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3676 }
3677 }
3678 }
3679 %}
3680 ins_pipe( fpu_reg_reg );
3681 %}
3682
3683 instruct ReplS_zero(vec dst, immI_0 zero) %{
3684 match(Set dst (ReplicateS zero));
3685 format %{ "replicateS $dst,$zero" %}
3686 ins_encode %{
3687 uint vlen = vector_length(this);
3688 if (vlen <= 8) {
3689 __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3690 } else {
3691 int vlen_enc = vector_length_encoding(this);
3692 __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3693 }
3694 %}
3695 ins_pipe( fpu_reg_reg );
3696 %}
3697
3698 // ====================ReplicateI=======================================
3699
3700 instruct ReplI_reg(vec dst, rRegI src) %{
3701 match(Set dst (ReplicateI src));
3702 format %{ "replicateI $dst,$src" %}
3703 ins_encode %{
3710 __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3711 if (vlen >= 8) {
3712 assert(vlen == 8, "sanity");
3713 __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3714 }
3715 }
3716 %}
3717 ins_pipe( pipe_slow );
3718 %}
3719
3720 instruct ReplI_mem(vec dst, memory mem) %{
3721 match(Set dst (ReplicateI (LoadI mem)));
3722 format %{ "replicateI $dst,$mem" %}
3723 ins_encode %{
3724 uint vlen = vector_length(this);
3725 if (vlen <= 4) {
3726 __ movdl($dst$$XMMRegister, $mem$$Address);
3727 __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3728 } else {
3729 assert(VM_Version::supports_avx2(), "sanity");
3730 int vlen_enc = vector_length_encoding(this);
3731 __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vlen_enc);
3732 }
3733 %}
3734 ins_pipe( pipe_slow );
3735 %}
3736
3737 instruct ReplI_imm(vec dst, immI con) %{
3738 match(Set dst (ReplicateI con));
3739 format %{ "replicateI $dst,$con" %}
3740 ins_encode %{
3741 uint vlen = vector_length(this);
3742 InternalAddress const_addr = $constantaddress(replicate8_imm($con$$constant, 4));
3743 if (vlen <= 4) {
3744 __ movq($dst$$XMMRegister, const_addr);
3745 if (vlen == 4) {
3746 __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3747 }
3748 } else {
3749 assert(VM_Version::supports_avx2(), "sanity");
3750 int vlen_enc = vector_length_encoding(this);
3751 __ movq($dst$$XMMRegister, const_addr);
3752 __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3753 }
3754 %}
3755 ins_pipe( pipe_slow );
3756 %}
3757
3758 // Replicate integer (4 byte) scalar zero to be vector
3759 instruct ReplI_zero(vec dst, immI_0 zero) %{
3760 match(Set dst (ReplicateI zero));
3761 format %{ "replicateI $dst,$zero" %}
3762 ins_encode %{
3763 uint vlen = vector_length(this);
3764 if (vlen <= 4) {
3765 __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3766 } else {
3767 int vlen_enc = vector_length_encoding(this);
3768 __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3769 }
3770 %}
3771 ins_pipe( fpu_reg_reg );
3772 %}
3773
3774 // ====================ReplicateL=======================================
3775
3776 #ifdef _LP64
3777 // Replicate long (8 byte) scalar to be vector
3778 instruct ReplL_reg(vec dst, rRegL src) %{
3779 match(Set dst (ReplicateL src));
3781 ins_encode %{
3782 uint vlen = vector_length(this);
3783 if (vlen == 2) {
3784 __ movdq($dst$$XMMRegister, $src$$Register);
3785 __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3786 } else if (vlen == 8 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
3787 int vlen_enc = vector_length_encoding(this);
3788 __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vlen_enc);
3789 } else {
3790 assert(vlen == 4, "sanity");
3791 __ movdq($dst$$XMMRegister, $src$$Register);
3792 __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3793 __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3794 }
3795 %}
3796 ins_pipe( pipe_slow );
3797 %}
3798 #else // _LP64
3799 // Replicate long (8 byte) scalar to be vector
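// On 32-bit, a long lives in a register pair: both halves are moved into XMM
// with movdl and combined with punpckldq before the 64bit value is broadcast.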
3800 instruct ReplL_reg(vec dst, eRegL src, vec tmp) %{
3801 predicate(vector_length(n) <= 4);
3802 match(Set dst (ReplicateL src));
3803 effect(TEMP dst, USE src, TEMP tmp);
3804 format %{ "replicateL $dst,$src" %}
3805 ins_encode %{
3806 uint vlen = vector_length(this);
3807 if (vlen == 2) {
3808 __ movdl($dst$$XMMRegister, $src$$Register);
3809 __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3810 __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
3811 __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3812 } else if (VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
3813 int vlen_enc = Assembler::AVX_256bit;
3814 __ movdl($dst$$XMMRegister, $src$$Register);
3815 __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3816 __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
3817 __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3818 } else {
3819 __ movdl($dst$$XMMRegister, $src$$Register);
3820 __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3821 __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
3822 __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3823 __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3824 }
3825 %}
3826 ins_pipe( pipe_slow );
3827 %}
3828
3829 instruct ReplL_reg_leg(legVec dst, eRegL src, legVec tmp) %{
3830 predicate(vector_length(n) == 8);
3831 match(Set dst (ReplicateL src));
3832 effect(TEMP dst, USE src, TEMP tmp);
3833 format %{ "replicateL $dst,$src" %}
3834 ins_encode %{
3835 if (VM_Version::supports_avx512vl()) {
3836 __ movdl($dst$$XMMRegister, $src$$Register);
3837 __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3838 __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
3839 __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3840 __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3841 __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3842 } else {
3843 int vlen_enc = Assembler::AVX_512bit;
3844 __ movdl($dst$$XMMRegister, $src$$Register);
3845 __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3846 __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
3847 __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3848 }
3849 %}
3850 ins_pipe( pipe_slow );
3851 %}
3852 #endif // _LP64
3853
3854 instruct ReplL_mem(vec dst, memory mem) %{
3855 match(Set dst (ReplicateL (LoadL mem)));
3856 format %{ "replicateL $dst,$mem" %}
3857 ins_encode %{
3858 uint vlen = vector_length(this);
3859 if (vlen == 2) {
3860 __ movq($dst$$XMMRegister, $mem$$Address);
3861 __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3862 } else {
3863 assert(VM_Version::supports_avx2(), "sanity");
3864 int vlen_enc = vector_length_encoding(this);
3865 __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vlen_enc);
3866 }
3867 %}
3896 if (vlen == 2) {
3897 __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3898 } else {
3899 int vlen_enc = vector_length_encoding(this);
3900 __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3901 }
3902 %}
3903 ins_pipe( fpu_reg_reg );
3904 %}
3905
3906 // ====================ReplicateF=======================================
3907
3908 instruct ReplF_reg(vec dst, vlRegF src) %{
3909 match(Set dst (ReplicateF src));
3910 format %{ "replicateF $dst,$src" %}
3911 ins_encode %{
3912 uint vlen = vector_length(this);
3913 if (vlen <= 4) {
3914 __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
3915 } else if (VM_Version::supports_avx2()) {
3916 int vlen_enc = vector_length_encoding(this);
3917 __ vbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
3918 } else {
3919 assert(vlen == 8, "sanity");
3920 __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
3921 __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3922 }
3923 %}
3924 ins_pipe( pipe_slow );
3925 %}
3926
3927 instruct ReplF_mem(vec dst, memory mem) %{
3928 match(Set dst (ReplicateF (LoadF mem)));
3929 format %{ "replicateF $dst,$mem" %}
3930 ins_encode %{
3931 uint vlen = vector_length(this);
3932 if (vlen <= 4) {
3933 __ movdl($dst$$XMMRegister, $mem$$Address);
3934 __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3935 } else {
3936 assert(VM_Version::supports_avx(), "sanity");
3937 int vlen_enc = vector_length_encoding(this);
3938 __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vlen_enc);
3939 }
3940 %}
3941 ins_pipe( pipe_slow );
3942 %}
3943
3944 instruct ReplF_zero(vec dst, immF0 zero) %{
3945 match(Set dst (ReplicateF zero));
3946 format %{ "replicateF $dst,$zero" %}
3947 ins_encode %{
3948 uint vlen = vector_length(this);
3949 if (vlen <= 4) {
3950 __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
3951 } else {
3952 int vlen_enc = vector_length_encoding(this);
3953       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); // 512bit vxorps requires AVX512DQ
3954 }
3955 %}
3956 ins_pipe( fpu_reg_reg );
3957 %}
3958
3959 // ====================ReplicateD=======================================
3960
3961 // Replicate double (8 bytes) scalar to be vector
3962 instruct ReplD_reg(vec dst, vlRegD src) %{
3963 match(Set dst (ReplicateD src));
3964 format %{ "replicateD $dst,$src" %}
3965 ins_encode %{
3966 uint vlen = vector_length(this);
3967 if (vlen == 2) {
3968 __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
3969 } else if (VM_Version::supports_avx2()) {
3970 int vlen_enc = vector_length_encoding(this);
3971 __ vbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
3972 } else {
3973 assert(vlen == 4, "sanity");
3974 __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
3975 __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3976 }
3977 %}
3978 ins_pipe( pipe_slow );
3979 %}
3980
3981 instruct ReplD_mem(vec dst, memory mem) %{
3982 match(Set dst (ReplicateD (LoadD mem)));
3983 format %{ "replicateD $dst,$mem" %}
3984 ins_encode %{
3985 uint vlen = vector_length(this);
3986 if (vlen == 2) {
3987 __ movq($dst$$XMMRegister, $mem$$Address);
3988 __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x44);
3989 } else {
3990 assert(VM_Version::supports_avx(), "sanity");
3991 int vlen_enc = vector_length_encoding(this);
3992 __ vbroadcastsd($dst$$XMMRegister, $mem$$Address, vlen_enc);
3993 }
3994 %}
3995 ins_pipe( pipe_slow );
3996 %}
3997
3998 instruct ReplD_zero(vec dst, immD0 zero) %{
3999 match(Set dst (ReplicateD zero));
4000 format %{ "replicateD $dst,$zero" %}
4001 ins_encode %{
4002 uint vlen = vector_length(this);
4003 if (vlen == 2) {
4004 __ xorpd($dst$$XMMRegister, $dst$$XMMRegister);
4005 } else {
4006 int vlen_enc = vector_length_encoding(this);
4007       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); // 512bit vxorpd requires AVX512DQ
4008 }
4009 %}
4010 ins_pipe( fpu_reg_reg );
4011 %}
4012
4013 // ====================VECTOR INSERT=======================================
4014
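// For vectors wider than 128 bits, insertion works lane-wise: y_idx selects
// the 128bit lane holding the element and x_idx the position within that lane.
// The lane is extracted into a temp, the element inserted there, and the lane
// written back.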
4015 instruct insert(vec dst, rRegI val, immU8 idx) %{
4016 predicate(vector_length_in_bytes(n) >= 8 &&
4017 vector_length_in_bytes(n) <= 16);
4018 match(Set dst (VectorInsert (Binary dst val) idx));
4019 format %{ "vector_insert $dst,$val,$idx" %}
4020 ins_encode %{
4021 assert(UseSSE >= 4, "required");
4022
4023 BasicType elem_bt = vector_element_basic_type(this);
4024
4025     assert(is_integral_type(elem_bt), "sanity");
4026 assert($idx$$constant < (int)vector_length(this), "out of bounds");
4027
4028 __ insert(elem_bt, $dst$$XMMRegister, $val$$Register, $idx$$constant);
4029 %}
4030 ins_pipe( pipe_slow );
4031 %}
4032
4033 instruct insert32(vec dst, vec src, rRegI val, immU8 idx, vec vtmp) %{
4034 predicate(vector_length_in_bytes(n) == 32);
4035 match(Set dst (VectorInsert (Binary src val) idx));
4036 effect(TEMP vtmp);
4037 format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4038 ins_encode %{
4040 BasicType elem_bt = vector_element_basic_type(this);
4041 int elem_per_lane = 16/type2aelembytes(elem_bt);
4042 int log2epr = log2(elem_per_lane);
4043
4044 assert(is_integral_type(elem_bt), "sanity");
4045 assert($idx$$constant < (int)vector_length(this), "out of bounds");
4046
4047 uint x_idx = $idx$$constant & right_n_bits(log2epr);
4048 uint y_idx = ($idx$$constant >> log2epr) & 1;
4049 __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4050 __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
4051 __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4052 %}
4053 ins_pipe( pipe_slow );
4054 %}
4055
4056 instruct insert64(vec dst, vec src, rRegI val, immU8 idx, legVec vtmp) %{
4057 predicate(vector_length_in_bytes(n) == 64);
4058 match(Set dst (VectorInsert (Binary src val) idx));
4059 effect(TEMP vtmp);
4060 format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4061 ins_encode %{
4062 assert(UseAVX > 2, "sanity");
4063
4064 BasicType elem_bt = vector_element_basic_type(this);
4065 int elem_per_lane = 16/type2aelembytes(elem_bt);
4066 int log2epr = log2(elem_per_lane);
4067
4068     assert(is_integral_type(elem_bt), "sanity");
4069 assert($idx$$constant < (int)vector_length(this), "out of bounds");
4070
4071 uint x_idx = $idx$$constant & right_n_bits(log2epr);
4072 uint y_idx = ($idx$$constant >> log2epr) & 3;
4073 __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4074 __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
4075 __ vinserti32x4($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4076 %}
4077 ins_pipe( pipe_slow );
4078 %}
4079
4080 #ifdef _LP64
4081 instruct insert2L(vec dst, rRegL val, immU8 idx) %{
4082 predicate(vector_length(n) == 2);
4083 match(Set dst (VectorInsert (Binary dst val) idx));
4084 format %{ "vector_insert $dst,$val,$idx" %}
4085 ins_encode %{
4086 assert(UseSSE >= 4, "required");
4087     assert(vector_element_basic_type(this) == T_LONG, "sanity");
4088 assert($idx$$constant < (int)vector_length(this), "out of bounds");
4089
4090 __ pinsrq($dst$$XMMRegister, $val$$Register, $idx$$constant);
4091 %}
4092 ins_pipe( pipe_slow );
4093 %}
4094
4095 instruct insert4L(vec dst, vec src, rRegL val, immU8 idx, vec vtmp) %{
4096 predicate(vector_length(n) == 4);
4097 match(Set dst (VectorInsert (Binary src val) idx));
4098 effect(TEMP vtmp);
4099 format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4100 ins_encode %{
4101     assert(vector_element_basic_type(this) == T_LONG, "sanity");
4102 assert($idx$$constant < (int)vector_length(this), "out of bounds");
4103
4104 uint x_idx = $idx$$constant & right_n_bits(1);
4105 uint y_idx = ($idx$$constant >> 1) & 1;
4107 __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4108 __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
4109 __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4110 %}
4111 ins_pipe( pipe_slow );
4112 %}
4113
4114 instruct insert8L(vec dst, vec src, rRegL val, immU8 idx, legVec vtmp) %{
4115 predicate(vector_length(n) == 8);
4116 match(Set dst (VectorInsert (Binary src val) idx));
4117 effect(TEMP vtmp);
4118 format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4119 ins_encode %{
4120 assert(vector_element_basic_type(this) == T_LONG, "sanity");
4121 assert($idx$$constant < (int)vector_length(this), "out of bounds");
4122
4123 uint x_idx = $idx$$constant & right_n_bits(1);
4124 uint y_idx = ($idx$$constant >> 1) & 3;
4125 __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4126 __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
4127 __ vinserti32x4($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4128 %}
4129 ins_pipe( pipe_slow );
4130 %}
4131 #endif
4132
4133 instruct insertF(vec dst, regF val, immU8 idx) %{
4134 predicate(vector_length(n) >= 2 &&
4135 vector_length(n) <= 4);
4136 match(Set dst (VectorInsert (Binary dst val) idx));
4137 format %{ "vector_insert $dst,$val,$idx" %}
4138 ins_encode %{
4139 assert(UseSSE >= 4, "sanity");
4140
4141 assert(vector_element_basic_type(this) == T_FLOAT, "sanity");
4142 assert($idx$$constant < (int)vector_length(this), "out of bounds");
4143
4144 __ insertps($dst$$XMMRegister, $val$$XMMRegister, $idx$$constant);
4145 %}
4146 ins_pipe( pipe_slow );
4147 %}
4148
4149 instruct vinsertF(vec dst, vec src, regF val, immU8 idx, vec vtmp) %{
4150 predicate(vector_length(n) >= 8);
4151 match(Set dst (VectorInsert (Binary src val) idx));
4152 effect(TEMP vtmp);
4153 format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4154 ins_encode %{
4155 assert(vector_element_basic_type(this) == T_FLOAT, "sanity");
4156 assert($idx$$constant < (int)vector_length(this), "out of bounds");
4157
4158 int vlen = vector_length(this);
4159 uint x_idx = $idx$$constant & right_n_bits(2);
4160 if (vlen == 8) {
4161 uint y_idx = ($idx$$constant >> 2) & 1;
4163 __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4164 __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx);
4165 __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4166 } else {
4167 assert(vlen == 16, "sanity");
4168 uint y_idx = ($idx$$constant >> 2) & 3;
4169 __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4170 __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx);
4171 __ vinserti32x4($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4172 }
4173 %}
4174 ins_pipe( pipe_slow );
4175 %}
4176
4177 #ifdef _LP64
4178 instruct insert2D(vec dst, regD val, immU8 idx, rRegL tmp) %{
4179 predicate(vector_length(n) == 2);
4180 match(Set dst (VectorInsert (Binary dst val) idx));
4181 effect(TEMP tmp);
4182 format %{ "vector_insert $dst,$val,$idx\t!using $tmp as TEMP" %}
4183 ins_encode %{
4184 assert(UseSSE >= 4, "sanity");
4185 assert(vector_element_basic_type(this) == T_DOUBLE, "sanity");
4186 assert($idx$$constant < (int)vector_length(this), "out of bounds");
4187
4188 __ movq($tmp$$Register, $val$$XMMRegister);
4189 __ pinsrq($dst$$XMMRegister, $tmp$$Register, $idx$$constant);
4190 %}
4191 ins_pipe( pipe_slow );
4192 %}
4193
4194 instruct insert4D(vec dst, vec src, regD val, immU8 idx, rRegL tmp, vec vtmp) %{
4195 predicate(vector_length(n) == 4);
4196 match(Set dst (VectorInsert (Binary src val) idx));
4197 effect(TEMP vtmp, TEMP tmp);
4198 format %{ "vector_insert $dst,$src,$val,$idx\t!using $tmp, $vtmp as TEMP" %}
4199 ins_encode %{
4200 assert(vector_element_basic_type(this) == T_DOUBLE, "sanity");
4201 assert($idx$$constant < (int)vector_length(this), "out of bounds");
4202
4203 uint x_idx = $idx$$constant & right_n_bits(1);
4204 uint y_idx = ($idx$$constant >> 1) & 1;
4206 __ movq($tmp$$Register, $val$$XMMRegister);
4207 __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4208 __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
4209 __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4210 %}
4211 ins_pipe( pipe_slow );
4212 %}
4213
4214 instruct insert8D(vec dst, vec src, regD val, immI idx, rRegL tmp, legVec vtmp) %{
4215 predicate(vector_length(n) == 8);
4216 match(Set dst (VectorInsert (Binary src val) idx));
4217 effect(TEMP tmp, TEMP vtmp);
4218   format %{ "vector_insert $dst,$src,$val,$idx\t!using $tmp, $vtmp as TEMP" %}
4219 ins_encode %{
4220 assert(vector_element_basic_type(this) == T_DOUBLE, "sanity");
4221 assert($idx$$constant < (int)vector_length(this), "out of bounds");
4222
4223 uint x_idx = $idx$$constant & right_n_bits(1);
4224 uint y_idx = ($idx$$constant >> 1) & 3;
4225 __ movq($tmp$$Register, $val$$XMMRegister);
4226 __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4227 __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
4228 __ vinserti32x4($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4229 %}
4230 ins_pipe( pipe_slow );
4231 %}
4232 #endif
4233
4234 // ====================REDUCTION ARITHMETIC=======================================
4235
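// Reductions combine a scalar accumulator (src1, or dst for FP) with all lanes
// of a vector (src2). The widest form of each type uses legVec operands and
// temps because the implementation splits the vector and finishes with legacy
// (non-EVEX) instructions, which cannot encode registers above xmm15.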
4236 // =======================Int Reduction==========================================
4237
4238 instruct reductionI(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
4239 predicate(vector_element_basic_type(n->in(2)) == T_INT &&
4240 vector_length(n->in(2)) < 16); // src2
4241 match(Set dst (AddReductionVI src1 src2));
4242 match(Set dst (MulReductionVI src1 src2));
4243 match(Set dst (AndReductionV src1 src2));
4244 match(Set dst ( OrReductionV src1 src2));
4245 match(Set dst (XorReductionV src1 src2));
4246 match(Set dst (MinReductionV src1 src2));
4247 match(Set dst (MaxReductionV src1 src2));
4248 effect(TEMP vtmp1, TEMP vtmp2);
4249 format %{ "vector_reduction_int $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4250 ins_encode %{
4251 int opcode = this->ideal_Opcode();
4252 int vlen = vector_length(this, $src2);
4253 __ reduceI(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4254 %}
4255 ins_pipe( pipe_slow );
4256 %}
4257
4258 instruct reduction16I(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
4259 predicate(vector_element_basic_type(n->in(2)) == T_INT &&
4260 vector_length(n->in(2)) == 16); // src2
4261 match(Set dst (AddReductionVI src1 src2));
4262 match(Set dst (MulReductionVI src1 src2));
4263 match(Set dst (AndReductionV src1 src2));
4264 match(Set dst ( OrReductionV src1 src2));
4265 match(Set dst (XorReductionV src1 src2));
4266 match(Set dst (MinReductionV src1 src2));
4267 match(Set dst (MaxReductionV src1 src2));
4268 effect(TEMP vtmp1, TEMP vtmp2);
4269 format %{ "vector_reduction_int $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4270 ins_encode %{
4271 int opcode = this->ideal_Opcode();
4272 int vlen = vector_length(this, $src2);
4273 __ reduceI(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4274 %}
4275 ins_pipe( pipe_slow );
4276 %}
4277
4278 // =======================Long Reduction==========================================
4279
4280 #ifdef _LP64
4281 instruct reductionL(rRegL dst, rRegL src1, vec src2, vec vtmp1, vec vtmp2) %{
4282 predicate(vector_element_basic_type(n->in(2)) == T_LONG &&
4283 vector_length(n->in(2)) < 8); // src2
4284 match(Set dst (AddReductionVL src1 src2));
4285 match(Set dst (MulReductionVL src1 src2));
4286 match(Set dst (AndReductionV src1 src2));
4287 match(Set dst ( OrReductionV src1 src2));
4288 match(Set dst (XorReductionV src1 src2));
4289 match(Set dst (MinReductionV src1 src2));
4290 match(Set dst (MaxReductionV src1 src2));
4291 effect(TEMP vtmp1, TEMP vtmp2);
4292 format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4293 ins_encode %{
4294 int opcode = this->ideal_Opcode();
4295 int vlen = vector_length(this, $src2);
4296 __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4297 %}
4298 ins_pipe( pipe_slow );
4299 %}
4300
4301 instruct reduction8L(rRegL dst, rRegL src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
4302 predicate(vector_element_basic_type(n->in(2)) == T_LONG &&
4303 vector_length(n->in(2)) == 8); // src2
4304 match(Set dst (AddReductionVL src1 src2));
4305 match(Set dst (MulReductionVL src1 src2));
4306 match(Set dst (AndReductionV src1 src2));
4307 match(Set dst ( OrReductionV src1 src2));
4308 match(Set dst (XorReductionV src1 src2));
4309 match(Set dst (MinReductionV src1 src2));
4310 match(Set dst (MaxReductionV src1 src2));
4311 effect(TEMP vtmp1, TEMP vtmp2);
4312 format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4313 ins_encode %{
4314 int opcode = this->ideal_Opcode();
4315 int vlen = vector_length(this, $src2);
4316 __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4317 %}
4318 ins_pipe( pipe_slow );
4319 %}
4320 #endif // _LP64
4321
4322 // =======================Float Reduction==========================================
4323
4324 instruct reductionF128(regF dst, vec src, vec vtmp) %{
4325 predicate(vector_length(n->in(2)) <= 4); // src
4326 match(Set dst (AddReductionVF dst src));
4327 match(Set dst (MulReductionVF dst src));
4328 effect(TEMP dst, TEMP vtmp);
4329 format %{ "vector_reduction_float $dst,$src ; using $vtmp as TEMP" %}
4330 ins_encode %{
4331 int opcode = this->ideal_Opcode();
4332 int vlen = vector_length(this, $src);
4333 __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
4334 %}
4335 ins_pipe( pipe_slow );
4336 %}
4337
4338 instruct reduction8F(regF dst, vec src, vec vtmp1, vec vtmp2) %{
4339 predicate(vector_length(n->in(2)) == 8); // src
4340 match(Set dst (AddReductionVF dst src));
4341 match(Set dst (MulReductionVF dst src));
4342 effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
4343 format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
4344 ins_encode %{
4345 int opcode = this->ideal_Opcode();
4346 int vlen = vector_length(this, $src);
4347 __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4348 %}
4349 ins_pipe( pipe_slow );
4350 %}
4351
4352 instruct reduction16F(regF dst, legVec src, legVec vtmp1, legVec vtmp2) %{
4353 predicate(vector_length(n->in(2)) == 16); // src
4354 match(Set dst (AddReductionVF dst src));
4355 match(Set dst (MulReductionVF dst src));
4356 effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
4357 format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
4358 ins_encode %{
4359 int opcode = this->ideal_Opcode();
4360 int vlen = vector_length(this, $src);
4361 __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4362 %}
4363 ins_pipe( pipe_slow );
4364 %}
4365
4366 // =======================Double Reduction==========================================
4367
4368 instruct reduction2D(regD dst, vec src, vec vtmp) %{
4369 predicate(vector_length(n->in(2)) == 2); // src
4370 match(Set dst (AddReductionVD dst src));
4371 match(Set dst (MulReductionVD dst src));
4372 effect(TEMP dst, TEMP vtmp);
4373 format %{ "vector_reduction_double $dst,$src ; using $vtmp as TEMP" %}
4374 ins_encode %{
4375 int opcode = this->ideal_Opcode();
4376 int vlen = vector_length(this, $src);
4377 __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
4378 %}
4379 ins_pipe( pipe_slow );
4380 %}
4381
4382 instruct reduction4D(regD dst, vec src, vec vtmp1, vec vtmp2) %{
4383 predicate(vector_length(n->in(2)) == 4); // src
4384 match(Set dst (AddReductionVD dst src));
4385 match(Set dst (MulReductionVD dst src));
4386 effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
4387 format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
4388 ins_encode %{
4389 int opcode = this->ideal_Opcode();
4390 int vlen = vector_length(this, $src);
4391 __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4392 %}
4393 ins_pipe( pipe_slow );
4394 %}
4395
4396 instruct reduction8D(regD dst, legVec src, legVec vtmp1, legVec vtmp2) %{
4397 predicate(vector_length(n->in(2)) == 8); // src
4398 match(Set dst (AddReductionVD dst src));
4399 match(Set dst (MulReductionVD dst src));
4400 effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
4401 format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
4402 ins_encode %{
4403 int opcode = this->ideal_Opcode();
4404 int vlen = vector_length(this, $src);
4405 __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4406 %}
4407 ins_pipe( pipe_slow );
4408 %}
4409
4410 // =======================Byte Reduction==========================================
4411
4412 #ifdef _LP64
4413 instruct reductionB(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
4414 predicate(vector_element_basic_type(n->in(2)) == T_BYTE &&
4415 vector_length(n->in(2)) <= 32); // src2
4416 match(Set dst (AddReductionVI src1 src2));
4417 match(Set dst (AndReductionV src1 src2));
4418 match(Set dst ( OrReductionV src1 src2));
4419 match(Set dst (XorReductionV src1 src2));
4420 match(Set dst (MinReductionV src1 src2));
4421 match(Set dst (MaxReductionV src1 src2));
4422 effect(TEMP vtmp1, TEMP vtmp2);
4423 format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4424 ins_encode %{
4425 int opcode = this->ideal_Opcode();
4426 int vlen = vector_length(this, $src2);
4427 __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4428 %}
4429 ins_pipe( pipe_slow );
4430 %}
4431
4432 instruct reduction64B(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
4433 predicate(vector_element_basic_type(n->in(2)) == T_BYTE &&
4434 vector_length(n->in(2)) == 64); // src2
4435 match(Set dst (AddReductionVI src1 src2));
4436 match(Set dst (AndReductionV src1 src2));
4437 match(Set dst ( OrReductionV src1 src2));
4438 match(Set dst (XorReductionV src1 src2));
4439 match(Set dst (MinReductionV src1 src2));
4440 match(Set dst (MaxReductionV src1 src2));
4441 effect(TEMP vtmp1, TEMP vtmp2);
4442 format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4443 ins_encode %{
4444 int opcode = this->ideal_Opcode();
4445 int vlen = vector_length(this, $src2);
4446 __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4447 %}
4448 ins_pipe( pipe_slow );
4449 %}
4450 #endif
4451
4452 // =======================Short Reduction==========================================
4453
4454 instruct reductionS(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
4455 predicate(vector_element_basic_type(n->in(2)) == T_SHORT &&
4456 vector_length(n->in(2)) <= 16); // src2
4457 match(Set dst (AddReductionVI src1 src2));
4458 match(Set dst (MulReductionVI src1 src2));
4459 match(Set dst (AndReductionV src1 src2));
4460 match(Set dst ( OrReductionV src1 src2));
4461 match(Set dst (XorReductionV src1 src2));
4462 match(Set dst (MinReductionV src1 src2));
4463 match(Set dst (MaxReductionV src1 src2));
4464 effect(TEMP vtmp1, TEMP vtmp2);
4465 format %{ "vector_reduction_short $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4466 ins_encode %{
4467 int opcode = this->ideal_Opcode();
4468 int vlen = vector_length(this, $src2);
4469 __ reduceS(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4470 %}
4471 ins_pipe( pipe_slow );
4472 %}
4473
4474 instruct reduction32S(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
4475 predicate(vector_element_basic_type(n->in(2)) == T_SHORT &&
4476 vector_length(n->in(2)) == 32); // src2
4477 match(Set dst (AddReductionVI src1 src2));
4478 match(Set dst (MulReductionVI src1 src2));
4479 match(Set dst (AndReductionV src1 src2));
4480 match(Set dst ( OrReductionV src1 src2));
4481 match(Set dst (XorReductionV src1 src2));
4482 match(Set dst (MinReductionV src1 src2));
4483 match(Set dst (MaxReductionV src1 src2));
4484 effect(TEMP vtmp1, TEMP vtmp2);
4485 format %{ "vector_reduction_short $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4486 ins_encode %{
4487 int opcode = this->ideal_Opcode();
4488 int vlen = vector_length(this, $src2);
4489 __ reduceS(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4490 %}
4491 ins_pipe( pipe_slow );
4492 %}
4493
4494 // =======================Mul Reduction==========================================
4495
4496 instruct mul_reductionB(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
4497 predicate(vector_element_basic_type(n->in(2)) == T_BYTE &&
4498 vector_length(n->in(2)) <= 32); // src2
4499 match(Set dst (MulReductionVI src1 src2));
4500 effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
4501   format %{ "vector_mul_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4502 ins_encode %{
4503 int opcode = this->ideal_Opcode();
4504 int vlen = vector_length(this, $src2);
4505 __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4506 %}
4507 ins_pipe( pipe_slow );
4508 %}
4509
4510 instruct mul_reduction64B(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
4511 predicate(vector_element_basic_type(n->in(2)) == T_BYTE &&
4512 vector_length(n->in(2)) == 64); // src2
4513 match(Set dst (MulReductionVI src1 src2));
4514 effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
4515   format %{ "vector_mul_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4516 ins_encode %{
4517 int opcode = this->ideal_Opcode();
4518 int vlen = vector_length(this, $src2);
4519 __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4520 %}
4521 ins_pipe( pipe_slow );
4522 %}
4523
4524 //--------------------Min/Max Float Reduction --------------------
4525 // Float Min Reduction
4526 instruct minmax_reduction2F(legRegF dst, immF src1, legVec src2, legVec tmp,
4527 legVec atmp, legVec btmp, legVec xmm_1, rFlagsReg cr) %{
4528 predicate(vector_element_basic_type(n->in(2)) == T_FLOAT &&
4529 ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
4530 (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
4531 vector_length(n->in(2)) == 2);
4532 match(Set dst (MinReductionV src1 src2));
4533 match(Set dst (MaxReductionV src1 src2));
4534 effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
4535 format %{ "vector_minmax2F_reduction $dst,$src1,$src2 ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
4536 ins_encode %{
4537 assert(UseAVX > 0, "sanity");
4538
4539 int opcode = this->ideal_Opcode();
4540 int vlen = vector_length(this, $src2);
4541 __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
4542 $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
4543 %}
4544 ins_pipe( pipe_slow );
4545 %}
4546
4547 instruct minmax_reductionF(legRegF dst, immF src1, legVec src2, legVec tmp, legVec atmp,
4548 legVec btmp, legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
4549 predicate(vector_element_basic_type(n->in(2)) == T_FLOAT &&
4550 ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
4551 (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
4552 vector_length(n->in(2)) >= 4);
4553 match(Set dst (MinReductionV src1 src2));
4554 match(Set dst (MaxReductionV src1 src2));
4555 effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
4556 format %{ "vector_minmaxF_reduction $dst,$src1,$src2 ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
4557 ins_encode %{
4558 assert(UseAVX > 0, "sanity");
4559
4560 int opcode = this->ideal_Opcode();
4561 int vlen = vector_length(this, $src2);
4562 __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
4563 $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
4564 %}
4565 ins_pipe( pipe_slow );
4566 %}
4567
4568 instruct minmax_reduction2F_av(legRegF dst, legVec src, legVec tmp,
4569 legVec atmp, legVec btmp, legVec xmm_1, rFlagsReg cr) %{
4570 predicate(vector_element_basic_type(n->in(2)) == T_FLOAT &&
4571 vector_length(n->in(2)) == 2);
4572 match(Set dst (MinReductionV dst src));
4573 match(Set dst (MaxReductionV dst src));
4574 effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
4575 format %{ "vector_minmax2F_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
4576 ins_encode %{
4577 assert(UseAVX > 0, "sanity");
4578
4579 int opcode = this->ideal_Opcode();
4580 int vlen = vector_length(this, $src);
4581 __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
4582 $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
4583 %}
4584 ins_pipe( pipe_slow );
4585 %}
4586
4587
4588 instruct minmax_reductionF_av(legRegF dst, legVec src, legVec tmp,
4589 legVec atmp, legVec btmp, legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
4590 predicate(vector_element_basic_type(n->in(2)) == T_FLOAT &&
4591 vector_length(n->in(2)) >= 4);
4592 match(Set dst (MinReductionV dst src));
4593 match(Set dst (MaxReductionV dst src));
4594 effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
4595 format %{ "vector_minmaxF_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
4596 ins_encode %{
4597 assert(UseAVX > 0, "sanity");
4598
4599 int opcode = this->ideal_Opcode();
4600 int vlen = vector_length(this, $src);
4601 __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
4602 $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
4603 %}
4604 ins_pipe( pipe_slow );
4605 %}
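// In the immF variants above, src1 is the reduction's identity element
// (+inf for min, -inf for max), which is why the predicates pin it to
// TypeF::POS_INF/NEG_INF. A minimal scalar sketch of the Java semantics
// reduceFloatMinMax() must preserve (illustrative only, assumes <cmath>):
// NaN propagates, and -0.0f orders below +0.0f.
//
//   float reduce_min_vf(const float* lane, int vlen) {
//     float acc = +INFINITY;                    // identity for min
//     for (int i = 0; i < vlen; i++) {
//       float b = lane[i];
//       acc = (acc != acc || b != b) ? NAN      // NaN wins
//           : (acc < b) ? acc
//           : (b < acc) ? b
//           : (std::signbit(acc) ? acc : b);    // ties: prefer -0.0f
//     }
//     return acc;
//   }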
4606
4607
4608 //--------------------Min Double Reduction --------------------
4609 instruct minmax_reduction2D(legRegD dst, immD src1, legVec src2,
4610 legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, // TEMPs
4611 rFlagsReg cr) %{
4612 predicate(vector_element_basic_type(n->in(2)) == T_DOUBLE &&
4613 ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
4614 (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
4615 vector_length(n->in(2)) == 2);
4616 match(Set dst (MinReductionV src1 src2));
4617 match(Set dst (MaxReductionV src1 src2));
4618 effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
4619 format %{ "vector_minmax2D_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
4620 ins_encode %{
4621 assert(UseAVX > 0, "sanity");
4622
4623 int opcode = this->ideal_Opcode();
4624 int vlen = vector_length(this, $src2);
4625 __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
4626 $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
4627 %}
4628 ins_pipe( pipe_slow );
4629 %}
4630
4631 instruct minmax_reductionD(legRegD dst, immD src1, legVec src2,
4632 legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, legVec tmp5, // TEMPs
4633 rFlagsReg cr) %{
4634 predicate(vector_element_basic_type(n->in(2)) == T_DOUBLE &&
4635 ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
4636 (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
4637 vector_length(n->in(2)) >= 4);
4638 match(Set dst (MinReductionV src1 src2));
4639 match(Set dst (MaxReductionV src1 src2));
4640 effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
4641 format %{ "vector_minmaxD_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
4642 ins_encode %{
4643 assert(UseAVX > 0, "sanity");
4644
4645 int opcode = this->ideal_Opcode();
4646 int vlen = vector_length(this, $src2);
4647 __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
4648 $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
4649 %}
4650 ins_pipe( pipe_slow );
4651 %}
4652
4653
4654 instruct minmax_reduction2D_av(legRegD dst, legVec src,
4655 legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, // TEMPs
4656 rFlagsReg cr) %{
4657 predicate(vector_element_basic_type(n->in(2)) == T_DOUBLE &&
4658 vector_length(n->in(2)) == 2);
4659 match(Set dst (MinReductionV dst src));
4660 match(Set dst (MaxReductionV dst src));
4661 effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
4662 format %{ "vector_minmax2D_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
4663 ins_encode %{
4664 assert(UseAVX > 0, "sanity");
4665
4666 int opcode = this->ideal_Opcode();
4667 int vlen = vector_length(this, $src);
4668 __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
4669 $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
4670 %}
4671 ins_pipe( pipe_slow );
4672 %}
4673
4674 instruct minmax_reductionD_av(legRegD dst, legVec src,
4675 legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, legVec tmp5, // TEMPs
4676 rFlagsReg cr) %{
4677 predicate(vector_element_basic_type(n->in(2)) == T_DOUBLE &&
4678 vector_length(n->in(2)) >= 4);
4679 match(Set dst (MinReductionV dst src));
4680 match(Set dst (MaxReductionV dst src));
4681 effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
4682 format %{ "vector_minmaxD_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
4683 ins_encode %{
4684 assert(UseAVX > 0, "sanity");
4685
4686 int opcode = this->ideal_Opcode();
4687 int vlen = vector_length(this, $src);
4688 __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
4689 $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
4690 %}
4691 ins_pipe( pipe_slow );
4692 %}
4693
4694 // ====================VECTOR ARITHMETIC=======================================
4695
4696 // --------------------------------- ADD --------------------------------------
4697
4698 // Bytes vector add
4699 instruct vaddB(vec dst, vec src) %{
4700 predicate(UseAVX == 0);
4701 match(Set dst (AddVB dst src));
4702 format %{ "paddb $dst,$src\t! add packedB" %}
4703 ins_encode %{
4704 __ paddb($dst$$XMMRegister, $src$$XMMRegister);
4705 %}
4706 ins_pipe( pipe_slow );
4707 %}
4708
4709 instruct vaddB_reg(vec dst, vec src1, vec src2) %{
4710 predicate(UseAVX > 0);
4711 match(Set dst (AddVB src1 src2));
4712 format %{ "vpaddb $dst,$src1,$src2\t! add packedB" %}
4713 ins_encode %{
4714 int vlen_enc = vector_length_encoding(this);
4715 __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
4716 %}
4717 ins_pipe( pipe_slow );
4718 %}
4719
4720 instruct vaddB_mem(vec dst, vec src, memory mem) %{
4721 predicate(UseAVX > 0);
4722 match(Set dst (AddVB src (LoadVector mem)));
4723 format %{ "vpaddb $dst,$src,$mem\t! add packedB" %}
4724 ins_encode %{
4725 int vlen_enc = vector_length_encoding(this);
4726 __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
4727 %}
4728 ins_pipe( pipe_slow );
4729 %}
4730
4731 // Shorts/Chars vector add
4732 instruct vaddS(vec dst, vec src) %{
4733 predicate(UseAVX == 0);
4734 match(Set dst (AddVS dst src));
4735 format %{ "paddw $dst,$src\t! add packedS" %}
4736 ins_encode %{
4737 __ paddw($dst$$XMMRegister, $src$$XMMRegister);
4738 %}
4739 ins_pipe( pipe_slow );
4740 %}
4741
4742 instruct vaddS_reg(vec dst, vec src1, vec src2) %{
4743 predicate(UseAVX > 0);
4744 match(Set dst (AddVS src1 src2));
4745 format %{ "vpaddw $dst,$src1,$src2\t! add packedS" %}
4746 ins_encode %{
4747 int vlen_enc = vector_length_encoding(this);
4748 __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
4749 %}
4750 ins_pipe( pipe_slow );
4751 %}
4752
4753 instruct vaddS_mem(vec dst, vec src, memory mem) %{
4754 predicate(UseAVX > 0);
4755 match(Set dst (AddVS src (LoadVector mem)));
4756 format %{ "vpaddw $dst,$src,$mem\t! add packedS" %}
4757 ins_encode %{
4758 int vlen_enc = vector_length_encoding(this);
4759 __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
4760 %}
4761 ins_pipe( pipe_slow );
4762 %}
4763
4764 // Integers vector add
4765 instruct vaddI(vec dst, vec src) %{
4766 predicate(UseAVX == 0);
4767 match(Set dst (AddVI dst src));
4768 format %{ "paddd $dst,$src\t! add packedI" %}
4769 ins_encode %{
4770 __ paddd($dst$$XMMRegister, $src$$XMMRegister);
4771 %}
4772 ins_pipe( pipe_slow );
4773 %}
4774
4775 instruct vaddI_reg(vec dst, vec src1, vec src2) %{
4776 predicate(UseAVX > 0);
4777 match(Set dst (AddVI src1 src2));
4778 format %{ "vpaddd $dst,$src1,$src2\t! add packedI" %}
4779 ins_encode %{
4780 int vlen_enc = vector_length_encoding(this);
4781 __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
4782 %}
4783 ins_pipe( pipe_slow );
4784 %}
4785
4786
4787 instruct vaddI_mem(vec dst, vec src, memory mem) %{
4788 predicate(UseAVX > 0);
4789 match(Set dst (AddVI src (LoadVector mem)));
4790 format %{ "vpaddd $dst,$src,$mem\t! add packedI" %}
4791 ins_encode %{
4792 int vlen_enc = vector_length_encoding(this);
4793 __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
4794 %}
4795 ins_pipe( pipe_slow );
4796 %}
4797
4798 // Longs vector add
4799 instruct vaddL(vec dst, vec src) %{
4800 predicate(UseAVX == 0);
4801 match(Set dst (AddVL dst src));
4802 format %{ "paddq $dst,$src\t! add packedL" %}
4803 ins_encode %{
4804 __ paddq($dst$$XMMRegister, $src$$XMMRegister);
4805 %}
4806 ins_pipe( pipe_slow );
4807 %}
4808
4809 instruct vaddL_reg(vec dst, vec src1, vec src2) %{
4810 predicate(UseAVX > 0);
4811 match(Set dst (AddVL src1 src2));
4812 format %{ "vpaddq $dst,$src1,$src2\t! add packedL" %}
4813 ins_encode %{
4814 int vlen_enc = vector_length_encoding(this);
4815 __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
4816 %}
4817 ins_pipe( pipe_slow );
4818 %}
4819
4820 instruct vaddL_mem(vec dst, vec src, memory mem) %{
4821 predicate(UseAVX > 0);
4822 match(Set dst (AddVL src (LoadVector mem)));
4823 format %{ "vpaddq $dst,$src,$mem\t! add packedL" %}
4824 ins_encode %{
4825 int vlen_enc = vector_length_encoding(this);
4826 __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
4827 %}
4828 ins_pipe( pipe_slow );
4829 %}
4830
4831 // Floats vector add
4832 instruct vaddF(vec dst, vec src) %{
4833 predicate(UseAVX == 0);
4834 match(Set dst (AddVF dst src));
4835 format %{ "addps $dst,$src\t! add packedF" %}
4836 ins_encode %{
4837 __ addps($dst$$XMMRegister, $src$$XMMRegister);
4838 %}
4839 ins_pipe( pipe_slow );
4840 %}
4841
4842 instruct vaddF_reg(vec dst, vec src1, vec src2) %{
4843 predicate(UseAVX > 0);
4844 match(Set dst (AddVF src1 src2));
4845 format %{ "vaddps $dst,$src1,$src2\t! add packedF" %}
4846 ins_encode %{
4847 int vlen_enc = vector_length_encoding(this);
4848 __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
4849 %}
4850 ins_pipe( pipe_slow );
4851 %}
4852
4853 instruct vaddF_mem(vec dst, vec src, memory mem) %{
4854 predicate(UseAVX > 0);
4855 match(Set dst (AddVF src (LoadVector mem)));
4856 format %{ "vaddps $dst,$src,$mem\t! add packedF" %}
4857 ins_encode %{
4858 int vlen_enc = vector_length_encoding(this);
4859 __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
4860 %}
4861 ins_pipe( pipe_slow );
4862 %}
4863
4864 // Doubles vector add
4865 instruct vaddD(vec dst, vec src) %{
4866 predicate(UseAVX == 0);
4867 match(Set dst (AddVD dst src));
4868 format %{ "addpd $dst,$src\t! add packedD" %}
4869 ins_encode %{
4870 __ addpd($dst$$XMMRegister, $src$$XMMRegister);
4871 %}
4872 ins_pipe( pipe_slow );
4873 %}
4874
4875 instruct vaddD_reg(vec dst, vec src1, vec src2) %{
4876 predicate(UseAVX > 0);
4877 match(Set dst (AddVD src1 src2));
4878 format %{ "vaddpd $dst,$src1,$src2\t! add packedD" %}
4879 ins_encode %{
4880 int vlen_enc = vector_length_encoding(this);
4881 __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
4882 %}
4883 ins_pipe( pipe_slow );
4884 %}
4885
4886 instruct vaddD_mem(vec dst, vec src, memory mem) %{
4887 predicate(UseAVX > 0);
4888 match(Set dst (AddVD src (LoadVector mem)));
4889 format %{ "vaddpd $dst,$src,$mem\t! add packedD" %}
4890 ins_encode %{
4891 int vlen_enc = vector_length_encoding(this);
4892 __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
4893 %}
4894 ins_pipe( pipe_slow );
4895 %}
4896
4897 // --------------------------------- SUB --------------------------------------
4898
4899 // Bytes vector sub
4900 instruct vsubB(vec dst, vec src) %{
4901 predicate(UseAVX == 0);
4902 match(Set dst (SubVB dst src));
4903 format %{ "psubb $dst,$src\t! sub packedB" %}
4904 ins_encode %{
4905 __ psubb($dst$$XMMRegister, $src$$XMMRegister);
4906 %}
4907 ins_pipe( pipe_slow );
4908 %}
4909
4910 instruct vsubB_reg(vec dst, vec src1, vec src2) %{
4911 predicate(UseAVX > 0);
4912 match(Set dst (SubVB src1 src2));
4913 format %{ "vpsubb $dst,$src1,$src2\t! sub packedB" %}
4914 ins_encode %{
4915 int vlen_enc = vector_length_encoding(this);
4916 __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
4917 %}
4918 ins_pipe( pipe_slow );
4919 %}
4920
4921 instruct vsubB_mem(vec dst, vec src, memory mem) %{
4922 predicate(UseAVX > 0);
4923 match(Set dst (SubVB src (LoadVector mem)));
4924 format %{ "vpsubb $dst,$src,$mem\t! sub packedB" %}
4925 ins_encode %{
4926 int vlen_enc = vector_length_encoding(this);
4927 __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
4928 %}
4929 ins_pipe( pipe_slow );
4930 %}
4931
4932 // Shorts/Chars vector sub
4933 instruct vsubS(vec dst, vec src) %{
4934 predicate(UseAVX == 0);
4935 match(Set dst (SubVS dst src));
4936 format %{ "psubw $dst,$src\t! sub packedS" %}
4937 ins_encode %{
4938 __ psubw($dst$$XMMRegister, $src$$XMMRegister);
4939 %}
4940 ins_pipe( pipe_slow );
4941 %}
4942
4943
4944 instruct vsubS_reg(vec dst, vec src1, vec src2) %{
4945 predicate(UseAVX > 0);
4946 match(Set dst (SubVS src1 src2));
4947 format %{ "vpsubw $dst,$src1,$src2\t! sub packedS" %}
4948 ins_encode %{
4949 int vlen_enc = vector_length_encoding(this);
4950 __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
4951 %}
4952 ins_pipe( pipe_slow );
4953 %}
4954
4955 instruct vsubS_mem(vec dst, vec src, memory mem) %{
4956 predicate(UseAVX > 0);
4957 match(Set dst (SubVS src (LoadVector mem)));
4958 format %{ "vpsubw $dst,$src,$mem\t! sub packedS" %}
4959 ins_encode %{
4960 int vlen_enc = vector_length_encoding(this);
4961 __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
4962 %}
4963 ins_pipe( pipe_slow );
4964 %}
4965
4966 // Integers vector sub
4967 instruct vsubI(vec dst, vec src) %{
4968 predicate(UseAVX == 0);
4969 match(Set dst (SubVI dst src));
4970 format %{ "psubd $dst,$src\t! sub packedI" %}
4971 ins_encode %{
4972 __ psubd($dst$$XMMRegister, $src$$XMMRegister);
4973 %}
4974 ins_pipe( pipe_slow );
4975 %}
4976
4977 instruct vsubI_reg(vec dst, vec src1, vec src2) %{
4978 predicate(UseAVX > 0);
4979 match(Set dst (SubVI src1 src2));
4980 format %{ "vpsubd $dst,$src1,$src2\t! sub packedI" %}
4981 ins_encode %{
4982 int vlen_enc = vector_length_encoding(this);
4983 __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
4984 %}
4985 ins_pipe( pipe_slow );
4986 %}
4987
4988 instruct vsubI_mem(vec dst, vec src, memory mem) %{
4989 predicate(UseAVX > 0);
4990 match(Set dst (SubVI src (LoadVector mem)));
4991 format %{ "vpsubd $dst,$src,$mem\t! sub packedI" %}
4992 ins_encode %{
4993 int vlen_enc = vector_length_encoding(this);
4994 __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
4995 %}
4996 ins_pipe( pipe_slow );
4997 %}
4998
4999 // Longs vector sub
5000 instruct vsubL(vec dst, vec src) %{
5001 predicate(UseAVX == 0);
5002 match(Set dst (SubVL dst src));
5003 format %{ "psubq $dst,$src\t! sub packedL" %}
5004 ins_encode %{
5005 __ psubq($dst$$XMMRegister, $src$$XMMRegister);
5006 %}
5007 ins_pipe( pipe_slow );
5008 %}
5009
5010 instruct vsubL_reg(vec dst, vec src1, vec src2) %{
5011 predicate(UseAVX > 0);
5012 match(Set dst (SubVL src1 src2));
5013 format %{ "vpsubq $dst,$src1,$src2\t! sub packedL" %}
5014 ins_encode %{
5015 int vlen_enc = vector_length_encoding(this);
5016 __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5017 %}
5018 ins_pipe( pipe_slow );
5019 %}
5020
5021
5022 instruct vsubL_mem(vec dst, vec src, memory mem) %{
5023 predicate(UseAVX > 0);
5024 match(Set dst (SubVL src (LoadVector mem)));
5025 format %{ "vpsubq $dst,$src,$mem\t! sub packedL" %}
5026 ins_encode %{
5027 int vlen_enc = vector_length_encoding(this);
5028 __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5029 %}
5030 ins_pipe( pipe_slow );
5031 %}
5032
5033 // Floats vector sub
5034 instruct vsubF(vec dst, vec src) %{
5035 predicate(UseAVX == 0);
5036 match(Set dst (SubVF dst src));
5037 format %{ "subps $dst,$src\t! sub packedF" %}
5038 ins_encode %{
5039 __ subps($dst$$XMMRegister, $src$$XMMRegister);
5040 %}
5041 ins_pipe( pipe_slow );
5042 %}
5043
5044 instruct vsubF_reg(vec dst, vec src1, vec src2) %{
5045 predicate(UseAVX > 0);
5046 match(Set dst (SubVF src1 src2));
5047 format %{ "vsubps $dst,$src1,$src2\t! sub packedF" %}
5048 ins_encode %{
5049 int vlen_enc = vector_length_encoding(this);
5050 __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5051 %}
5052 ins_pipe( pipe_slow );
5053 %}
5054
5055 instruct vsubF_mem(vec dst, vec src, memory mem) %{
5056 predicate(UseAVX > 0);
5057 match(Set dst (SubVF src (LoadVector mem)));
5058 format %{ "vsubps $dst,$src,$mem\t! sub packedF" %}
5059 ins_encode %{
5060 int vlen_enc = vector_length_encoding(this);
5061 __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5062 %}
5063 ins_pipe( pipe_slow );
5064 %}
5065
5066 // Doubles vector sub
5067 instruct vsubD(vec dst, vec src) %{
5068 predicate(UseAVX == 0);
5069 match(Set dst (SubVD dst src));
5070 format %{ "subpd $dst,$src\t! sub packedD" %}
5071 ins_encode %{
5072 __ subpd($dst$$XMMRegister, $src$$XMMRegister);
5073 %}
5074 ins_pipe( pipe_slow );
5075 %}
5076
5077 instruct vsubD_reg(vec dst, vec src1, vec src2) %{
5078 predicate(UseAVX > 0);
5079 match(Set dst (SubVD src1 src2));
5080 format %{ "vsubpd $dst,$src1,$src2\t! sub packedD" %}
5081 ins_encode %{
5082 int vlen_enc = vector_length_encoding(this);
5083 __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5084 %}
5085 ins_pipe( pipe_slow );
5086 %}
5087
5088 instruct vsubD_mem(vec dst, vec src, memory mem) %{
5089 predicate(UseAVX > 0);
5090 match(Set dst (SubVD src (LoadVector mem)));
5091 format %{ "vsubpd $dst,$src,$mem\t! sub packedD" %}
5092 ins_encode %{
5093 int vlen_enc = vector_length_encoding(this);
5094 __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5095 %}
5096 ins_pipe( pipe_slow );
5097 %}
5098
5099 // --------------------------------- MUL --------------------------------------
5100
5101 // Byte vector mul
5102 instruct mulB_reg(vec dst, vec src1, vec src2, vec tmp, rRegI scratch) %{
5103 predicate(vector_length(n) == 4 ||
5104 vector_length(n) == 8);
5105 match(Set dst (MulVB src1 src2));
5106 effect(TEMP dst, TEMP tmp, TEMP scratch);
5107 format %{"vector_mulB $dst,$src1,$src2" %}
5108 ins_encode %{
5109 assert(UseSSE > 3, "required");
5110 __ pmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister);
5111 __ pmovsxbw($dst$$XMMRegister, $src2$$XMMRegister);
5112 __ pmullw($tmp$$XMMRegister, $dst$$XMMRegister);
5113 __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5114 __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
5115 __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
5116 %}
5117 ins_pipe( pipe_slow );
5118 %}
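// SSE has no byte multiply, so the byte-mul rules widen each byte to a 16-bit
// lane (pmovsxbw), multiply with pmullw, mask off the high bytes and pack back
// down (pand + packuswb). A minimal per-lane sketch of the idiom
// (illustrative only):
//
//   int8_t mul_byte_lane(int8_t a, int8_t b) {
//     int16_t wide = (int16_t)a * (int16_t)b;  // pmovsxbw + pmullw
//     return (int8_t)(wide & 0xFF);            // pand mask + packuswb
//   }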
5119
5120 instruct mul16B_reg(vec dst, vec src1, vec src2, vec tmp1, vec tmp2, rRegI scratch) %{
5121 predicate(vector_length(n) == 16 && UseAVX <= 1);
5122 match(Set dst (MulVB src1 src2));
5123 effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
5124 format %{"vector_mulB $dst,$src1,$src2" %}
5125 ins_encode %{
5126 assert(UseSSE > 3, "required");
5127 __ pmovsxbw($tmp1$$XMMRegister, $src1$$XMMRegister);
5128 __ pmovsxbw($tmp2$$XMMRegister, $src2$$XMMRegister);
5129 __ pmullw($tmp1$$XMMRegister, $tmp2$$XMMRegister);
5130 __ pshufd($tmp2$$XMMRegister, $src1$$XMMRegister, 0xEE);
5131 __ pshufd($dst$$XMMRegister, $src2$$XMMRegister, 0xEE);
5132 __ pmovsxbw($tmp2$$XMMRegister, $tmp2$$XMMRegister);
5133 __ pmovsxbw($dst$$XMMRegister, $dst$$XMMRegister);
5134 __ pmullw($tmp2$$XMMRegister, $dst$$XMMRegister);
5135 __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5136 __ pand($tmp2$$XMMRegister, $dst$$XMMRegister);
5137 __ pand($dst$$XMMRegister, $tmp1$$XMMRegister);
5138 __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister);
5139 %}
5140 ins_pipe( pipe_slow );
5141 %}
5142
5143 instruct vmul16B_reg_avx(vec dst, vec src1, vec src2, vec tmp, rRegI scratch) %{
5144 predicate(vector_length(n) == 16 && UseAVX > 1);
5145 match(Set dst (MulVB src1 src2));
5146 effect(TEMP dst, TEMP tmp, TEMP scratch);
5147 format %{"vector_mulB $dst,$src1,$src2" %}
5148 ins_encode %{
5149 int vlen_enc = Assembler::AVX_256bit;
5150 __ vpmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister, vlen_enc);
5151 __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5152 __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5153 __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5154 __ vpand($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
5155 __ vextracti128_high($tmp$$XMMRegister, $dst$$XMMRegister);
5156 __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, 0);
5157 %}
5158 ins_pipe( pipe_slow );
5159 %}
5160
5161 instruct vmul32B_reg_avx(vec dst, vec src1, vec src2, vec tmp1, vec tmp2, rRegI scratch) %{
5162 predicate(vector_length(n) == 32);
5163 match(Set dst (MulVB src1 src2));
5164 effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
5165 format %{"vector_mulB $dst,$src1,$src2" %}
5166 ins_encode %{
5167 assert(UseAVX > 1, "required");
5168 int vlen_enc = Assembler::AVX_256bit;
5169 __ vextracti128_high($tmp1$$XMMRegister, $src1$$XMMRegister);
5170 __ vextracti128_high($dst$$XMMRegister, $src2$$XMMRegister);
5171 __ vpmovsxbw($tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
5172 __ vpmovsxbw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5173 __ vpmullw($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5174 __ vpmovsxbw($tmp2$$XMMRegister, $src1$$XMMRegister, vlen_enc);
5175 __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5176 __ vpmullw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5177 __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5178 __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5179 __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5180 __ vpand($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
5181 __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
5182 __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
5183 %}
5184 ins_pipe( pipe_slow );
5185 %}
5186
5187 instruct vmul64B_reg_avx(vec dst, vec src1, vec src2, vec tmp1, vec tmp2, rRegI scratch) %{
5188 predicate(vector_length(n) == 64);
5189 match(Set dst (MulVB src1 src2));
5190 effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
5191   format %{"vector_mulB $dst,$src1,$src2" %}
5192 ins_encode %{
5193 assert(UseAVX > 2, "required");
5194 int vlen_enc = Assembler::AVX_512bit;
5195 __ vextracti64x4_high($tmp1$$XMMRegister, $src1$$XMMRegister);
5196 __ vextracti64x4_high($dst$$XMMRegister, $src2$$XMMRegister);
5197 __ vpmovsxbw($tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
5198 __ vpmovsxbw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5199 __ vpmullw($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5200 __ vpmovsxbw($tmp2$$XMMRegister, $src1$$XMMRegister, vlen_enc);
5201 __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5202 __ vpmullw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5203 __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5204 __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5205 __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5206 __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5207 __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
5208 __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vlen_enc, $scratch$$Register);
5209 __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5210 %}
5211 ins_pipe( pipe_slow );
5212 %}
5213
5214 // Shorts/Chars vector mul
5215 instruct vmulS(vec dst, vec src) %{
5216 predicate(UseAVX == 0);
5217 match(Set dst (MulVS dst src));
5218 format %{ "pmullw $dst,$src\t! mul packedS" %}
5219 ins_encode %{
5220 __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
5221 %}
5222 ins_pipe( pipe_slow );
5223 %}
5224
5225 instruct vmulS_reg(vec dst, vec src1, vec src2) %{
5226 predicate(UseAVX > 0);
5227 match(Set dst (MulVS src1 src2));
5228 format %{ "vpmullw $dst,$src1,$src2\t! mul packedS" %}
5229 ins_encode %{
5230 int vlen_enc = vector_length_encoding(this);
5231 __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5232 %}
5233 ins_pipe( pipe_slow );
5234 %}
5235
5236 instruct vmulS_mem(vec dst, vec src, memory mem) %{
5237 predicate(UseAVX > 0);
5238 match(Set dst (MulVS src (LoadVector mem)));
5239 format %{ "vpmullw $dst,$src,$mem\t! mul packedS" %}
5240 ins_encode %{
5241 int vlen_enc = vector_length_encoding(this);
5242 __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5243 %}
5244 ins_pipe( pipe_slow );
5245 %}
5246
5247 // Integers vector mul
5248 instruct vmulI(vec dst, vec src) %{
5249 predicate(UseAVX == 0);
5250 match(Set dst (MulVI dst src));
5251 format %{ "pmulld $dst,$src\t! mul packedI" %}
5252 ins_encode %{
5253 assert(UseSSE > 3, "required");
5254 __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
5255 %}
5256 ins_pipe( pipe_slow );
5257 %}
5258
5259 instruct vmulI_reg(vec dst, vec src1, vec src2) %{
5260 predicate(UseAVX > 0);
5261 match(Set dst (MulVI src1 src2));
5262 format %{ "vpmulld $dst,$src1,$src2\t! mul packedI" %}
5263 ins_encode %{
5264 int vlen_enc = vector_length_encoding(this);
5265 __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5266 %}
5267 ins_pipe( pipe_slow );
5268 %}
5269
5270 instruct vmulI_mem(vec dst, vec src, memory mem) %{
5271 predicate(UseAVX > 0);
5272 match(Set dst (MulVI src (LoadVector mem)));
5273 format %{ "vpmulld $dst,$src,$mem\t! mul packedI" %}
5274 ins_encode %{
5275 int vlen_enc = vector_length_encoding(this);
5276 __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5277 %}
5278 ins_pipe( pipe_slow );
5279 %}
5280
5281 // Longs vector mul
5282 instruct vmulL_reg(vec dst, vec src1, vec src2) %{
5283 predicate(VM_Version::supports_avx512dq());
5284 match(Set dst (MulVL src1 src2));
5285 format %{ "vpmullq $dst,$src1,$src2\t! mul packedL" %}
5286 ins_encode %{
5287 assert(UseAVX > 2, "required");
5288 int vlen_enc = vector_length_encoding(this);
5289 __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5290 %}
5291 ins_pipe( pipe_slow );
5292 %}
5293
5294 instruct vmulL_mem(vec dst, vec src, memory mem) %{
5295 predicate(VM_Version::supports_avx512dq());
5296 match(Set dst (MulVL src (LoadVector mem)));
5297 format %{ "vpmullq $dst,$src,$mem\t! mul packedL" %}
5298 ins_encode %{
5299 assert(UseAVX > 2, "required");
5300 int vlen_enc = vector_length_encoding(this);
5301 __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5302 %}
5303 ins_pipe( pipe_slow );
5304 %}
5305
5306 instruct mul2L_reg(vec dst, vec src2, vec tmp) %{
5307 predicate(vector_length(n) == 2 && !VM_Version::supports_avx512dq());
5308 match(Set dst (MulVL dst src2));
5309 effect(TEMP dst, TEMP tmp);
5310 format %{ "pshufd $tmp,$src2, 177\n\t"
5311 "pmulld $tmp,$dst\n\t"
5312 "phaddd $tmp,$tmp\n\t"
5313 "pmovzxdq $tmp,$tmp\n\t"
5314 "psllq $tmp, 32\n\t"
5315 "pmuludq $dst,$src2\n\t"
5316 "paddq $dst,$tmp\n\t! mul packed2L" %}
5317
5318 ins_encode %{
5319 assert(VM_Version::supports_sse4_1(), "required");
5320 int vlen_enc = Assembler::AVX_128bit;
5321 __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 177);
5322 __ pmulld($tmp$$XMMRegister, $dst$$XMMRegister);
5323 __ phaddd($tmp$$XMMRegister, $tmp$$XMMRegister);
5324 __ pmovzxdq($tmp$$XMMRegister, $tmp$$XMMRegister);
5325 __ psllq($tmp$$XMMRegister, 32);
5326 __ pmuludq($dst$$XMMRegister, $src2$$XMMRegister);
5327 __ paddq($dst$$XMMRegister, $tmp$$XMMRegister);
5328 %}
5329 ins_pipe( pipe_slow );
5330 %}
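// Without AVX-512DQ there is no packed 64x64->64 multiply, so the rule above
// (and the 4-lane AVX variant below) builds it from 32-bit halves. A minimal
// scalar sketch of the identity being implemented (illustrative only):
//
//   uint64_t mul64(uint64_t a, uint64_t b) {
//     uint64_t lo_a = (uint32_t)a, hi_a = a >> 32;
//     uint64_t lo_b = (uint32_t)b, hi_b = b >> 32;
//     uint64_t cross = (lo_a * hi_b + hi_a * lo_b) << 32; // pshufd/pmulld/phaddd/psllq
//     return lo_a * lo_b + cross;                         // pmuludq + paddq
//   }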
5331
5332 instruct vmul4L_reg_avx(vec dst, vec src1, vec src2, vec tmp, vec tmp1) %{
5333 predicate(vector_length(n) == 4 && !VM_Version::supports_avx512dq());
5334 match(Set dst (MulVL src1 src2));
5335 effect(TEMP tmp1, TEMP tmp);
5336   format %{ "vpshufd $tmp,$src2,177\n\t"
5337             "vpmulld $tmp,$src1,$tmp\n\t"
5338             "vphaddd $tmp,$tmp,$tmp1\n\t"
5339             "vpmovzxdq $tmp,$tmp\n\t"
5340             "vpsllq $tmp,$tmp,32\n\t"
5341 "vpmuludq $tmp1,$src1,$src2\n\t"
5342 "vpaddq $dst,$tmp,$tmp1\t! mul packed4L" %}
5343 ins_encode %{
5344 int vlen_enc = Assembler::AVX_256bit;
5345 __ vpshufd($tmp$$XMMRegister, $src2$$XMMRegister, 177, vlen_enc);
5346 __ vpmulld($tmp$$XMMRegister, $src1$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
5347 __ vextracti128_high($tmp1$$XMMRegister, $tmp$$XMMRegister);
5348 __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
5349 __ vpmovzxdq($tmp$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
5350 __ vpsllq($tmp$$XMMRegister, $tmp$$XMMRegister, 32, vlen_enc);
5351 __ vpmuludq($tmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5352 __ vpaddq($dst$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
5353 %}
5354 ins_pipe( pipe_slow );
5355 %}
5356
5357 // Floats vector mul
5358 instruct vmulF(vec dst, vec src) %{
5359 predicate(UseAVX == 0);
5360 match(Set dst (MulVF dst src));
5361 format %{ "mulps $dst,$src\t! mul packedF" %}
5362 ins_encode %{
5363 __ mulps($dst$$XMMRegister, $src$$XMMRegister);
5364 %}
5365 ins_pipe( pipe_slow );
5366 %}
5367
5368 instruct vmulF_reg(vec dst, vec src1, vec src2) %{
5369 predicate(UseAVX > 0);
5370 match(Set dst (MulVF src1 src2));
5371 format %{ "vmulps $dst,$src1,$src2\t! mul packedF" %}
5372 ins_encode %{
5373 int vlen_enc = vector_length_encoding(this);
5374 __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5375 %}
5376 ins_pipe( pipe_slow );
5377 %}
5378
5379 instruct vmulF_mem(vec dst, vec src, memory mem) %{
5380 predicate(UseAVX > 0);
5381 match(Set dst (MulVF src (LoadVector mem)));
5382 format %{ "vmulps $dst,$src,$mem\t! mul packedF" %}
5383 ins_encode %{
5384 int vlen_enc = vector_length_encoding(this);
5385 __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5386 %}
5387 ins_pipe( pipe_slow );
5388 %}
5389
5390 // Doubles vector mul
5391 instruct vmulD(vec dst, vec src) %{
5392 predicate(UseAVX == 0);
5393 match(Set dst (MulVD dst src));
5394 format %{ "mulpd $dst,$src\t! mul packedD" %}
5395 ins_encode %{
5396 __ mulpd($dst$$XMMRegister, $src$$XMMRegister);
5397 %}
5398 ins_pipe( pipe_slow );
5399 %}
5400
5401 instruct vmulD_reg(vec dst, vec src1, vec src2) %{
5402 predicate(UseAVX > 0);
5403 match(Set dst (MulVD src1 src2));
5404 format %{ "vmulpd $dst,$src1,$src2\t! mul packedD" %}
5405 ins_encode %{
5406 int vlen_enc = vector_length_encoding(this);
5407 __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5408 %}
5409 ins_pipe( pipe_slow );
5410 %}
5411
5412 instruct vmulD_mem(vec dst, vec src, memory mem) %{
5413 predicate(UseAVX > 0);
5414 match(Set dst (MulVD src (LoadVector mem)));
5415 format %{ "vmulpd $dst,$src,$mem\t! mul packedD" %}
5416 ins_encode %{
5417 int vlen_enc = vector_length_encoding(this);
5418 __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5419 %}
5420 ins_pipe( pipe_slow );
5421 %}
5422
5423 instruct vcmov8F_reg(legVec dst, legVec src1, legVec src2, immI8 cop, cmpOp_vcmppd copnd) %{
5424 predicate(vector_length(n) == 8);
5425 match(Set dst (CMoveVF (Binary copnd cop) (Binary src1 src2)));
5426 effect(TEMP dst, USE src1, USE src2);
5427   format %{ "vcmpps.$copnd $dst,$src1,$src2 ! vcmovevf, cond=$cop\n\t"
5428             "vblendvps $dst,$src1,$src2,$dst ! vcmovevf"
5429 %}
5430 ins_encode %{
5431 assert(UseAVX > 0, "required");
5432
5433 int vlen_enc = Assembler::AVX_256bit;
5434 int cond = (Assembler::Condition)($copnd$$cmpcode);
5435 __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vlen_enc);
5436 __ vblendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5437 %}
5438 ins_pipe( pipe_slow );
5439 %}
5440
5441 instruct vcmov4D_reg(legVec dst, legVec src1, legVec src2, immI8 cop, cmpOp_vcmppd copnd) %{
5442 predicate(vector_length(n) == 4);
5443 match(Set dst (CMoveVD (Binary copnd cop) (Binary src1 src2)));
5444 effect(TEMP dst, USE src1, USE src2);
5445   format %{ "vcmppd.$copnd $dst,$src1,$src2 ! vcmovevd, cond=$cop\n\t"
5446             "vblendvpd $dst,$src1,$src2,$dst ! vcmovevd"
5447 %}
5448 ins_encode %{
5449 assert(UseAVX > 0, "required");
5450
5451 int vlen_enc = Assembler::AVX_256bit;
5452 int cond = (Assembler::Condition)($copnd$$cmpcode);
5453 __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vlen_enc);
5454 __ vblendvpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5455 %}
5456 ins_pipe( pipe_slow );
5457 %}
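// Vector conditional move is compare-then-blend: vcmpps/vcmppd writes an
// all-ones or all-zeros lane mask into dst, and vblendvps/vblendvpd selects
// src2 where the mask is set, src1 elsewhere. A minimal per-lane sketch
// (illustrative only):
//
//   float cmove_lane(float a, float b, bool pred) {
//     uint32_t mask = pred ? 0xFFFFFFFFu : 0u;  // vcmpps lane result
//     return mask ? b : a;                      // vblendvps picks by mask MSB
//   }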
5458
5459 // --------------------------------- DIV --------------------------------------
5460
5461 // Floats vector div
5462 instruct vdivF(vec dst, vec src) %{
5463 predicate(UseAVX == 0);
5464 match(Set dst (DivVF dst src));
5465 format %{ "divps $dst,$src\t! div packedF" %}
5466 ins_encode %{
5467 __ divps($dst$$XMMRegister, $src$$XMMRegister);
5468 %}
5469 ins_pipe( pipe_slow );
5470 %}
5471
5472 instruct vdivF_reg(vec dst, vec src1, vec src2) %{
5473 predicate(UseAVX > 0);
5474 match(Set dst (DivVF src1 src2));
5475 format %{ "vdivps $dst,$src1,$src2\t! div packedF" %}
5476 ins_encode %{
5477 int vlen_enc = vector_length_encoding(this);
5478 __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5479 %}
5480 ins_pipe( pipe_slow );
5481 %}
5482
5483 instruct vdivF_mem(vec dst, vec src, memory mem) %{
5484 predicate(UseAVX > 0);
5485 match(Set dst (DivVF src (LoadVector mem)));
5486 format %{ "vdivps $dst,$src,$mem\t! div packedF" %}
5487 ins_encode %{
5488 int vlen_enc = vector_length_encoding(this);
5489 __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5490 %}
5491 ins_pipe( pipe_slow );
5492 %}
5493
5494 // Doubles vector div
5495 instruct vdivD(vec dst, vec src) %{
5496 predicate(UseAVX == 0);
5497 match(Set dst (DivVD dst src));
5498 format %{ "divpd $dst,$src\t! div packedD" %}
5499 ins_encode %{
5500 __ divpd($dst$$XMMRegister, $src$$XMMRegister);
5501 %}
5502 ins_pipe( pipe_slow );
5503 %}
5504
5505 instruct vdivD_reg(vec dst, vec src1, vec src2) %{
5506 predicate(UseAVX > 0);
5507 match(Set dst (DivVD src1 src2));
5508 format %{ "vdivpd $dst,$src1,$src2\t! div packedD" %}
5509 ins_encode %{
5510 int vlen_enc = vector_length_encoding(this);
5511 __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5512 %}
5513 ins_pipe( pipe_slow );
5514 %}
5515
5516 instruct vdivD_mem(vec dst, vec src, memory mem) %{
5517 predicate(UseAVX > 0);
5518 match(Set dst (DivVD src (LoadVector mem)));
5519 format %{ "vdivpd $dst,$src,$mem\t! div packedD" %}
5520 ins_encode %{
5521 int vlen_enc = vector_length_encoding(this);
5522 __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5523 %}
5524 ins_pipe( pipe_slow );
5525 %}
5526
5527 // ------------------------------ MinMax ---------------------------------------
5528
5529 // Byte, Short, Int vector Min/Max
5530 instruct minmax_reg_sse(vec dst, vec src) %{
5531 predicate(is_integral_type(vector_element_basic_type(n)) && vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
5532 UseAVX == 0);
5533 match(Set dst (MinV dst src));
5534 match(Set dst (MaxV dst src));
5535 format %{ "vector_minmax $dst,$src\t! " %}
5536 ins_encode %{
5537 assert(UseSSE >= 4, "required");
5538
5539 int opcode = this->ideal_Opcode();
5540 BasicType elem_bt = vector_element_basic_type(this);
5541 __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister);
5542 %}
5543 ins_pipe( pipe_slow );
5544 %}
5545
5546 instruct vminmax_reg(vec dst, vec src1, vec src2) %{
5547 predicate(is_integral_type(vector_element_basic_type(n)) && vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
5548 UseAVX > 0);
5549 match(Set dst (MinV src1 src2));
5550 match(Set dst (MaxV src1 src2));
5551 format %{ "vector_minmax $dst,$src1,$src2\t! " %}
5552 ins_encode %{
5553 int opcode = this->ideal_Opcode();
5554 int vlen_enc = vector_length_encoding(this);
5555 BasicType elem_bt = vector_element_basic_type(this);
5556
5557 __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5558 %}
5559 ins_pipe( pipe_slow );
5560 %}
5561
5562 // Long vector Min/Max
5563 instruct minmaxL_reg_sse(vec dst, vec src, rxmm0 tmp) %{
5564 predicate(vector_length_in_bytes(n) == 16 && vector_element_basic_type(n) == T_LONG &&
5565 UseAVX == 0);
5566 match(Set dst (MinV dst src));
5567 match(Set dst (MaxV src dst));
5568 effect(TEMP dst, TEMP tmp);
5569 format %{ "vector_minmaxL $dst,$src\t!using $tmp as TEMP" %}
5570 ins_encode %{
5571 assert(UseSSE >= 4, "required");
5572
5573 int opcode = this->ideal_Opcode();
5574 BasicType elem_bt = vector_element_basic_type(this);
5575 assert(elem_bt == T_LONG, "sanity");
5576
5577 __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister);
5578 %}
5579 ins_pipe( pipe_slow );
5580 %}
5581
5582 instruct vminmaxL_reg_avx(legVec dst, legVec src1, legVec src2) %{
5583 predicate(vector_length_in_bytes(n) <= 32 && vector_element_basic_type(n) == T_LONG &&
5584 UseAVX > 0 && !VM_Version::supports_avx512vl());
5585 match(Set dst (MinV src1 src2));
5586 match(Set dst (MaxV src1 src2));
5587 effect(TEMP dst);
5588 format %{ "vector_minmaxL $dst,$src1,$src2\t! " %}
5589 ins_encode %{
5590 int vlen_enc = vector_length_encoding(this);
5591 int opcode = this->ideal_Opcode();
5592 BasicType elem_bt = vector_element_basic_type(this);
5593 assert(elem_bt == T_LONG, "sanity");
5594
5595 __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5596 %}
5597 ins_pipe( pipe_slow );
5598 %}
5599
5600 instruct vminmaxL_reg_evex(vec dst, vec src1, vec src2) %{
5601 predicate((vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()) &&
5602 vector_element_basic_type(n) == T_LONG);
5603 match(Set dst (MinV src1 src2));
5604 match(Set dst (MaxV src1 src2));
5605   format %{ "vector_minmaxL $dst,$src1,$src2\t! " %}
5606 ins_encode %{
5607 assert(UseAVX > 2, "required");
5608
5609 int vlen_enc = vector_length_encoding(this);
5610 int opcode = this->ideal_Opcode();
5611 BasicType elem_bt = vector_element_basic_type(this);
5612 assert(elem_bt == T_LONG, "sanity");
5613
5614 __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5615 %}
5616 ins_pipe( pipe_slow );
5617 %}
5618
5619 // Float/Double vector Min/Max
5620 instruct minmaxFP_reg(legVec dst, legVec a, legVec b, legVec tmp, legVec atmp, legVec btmp) %{
5621 predicate(vector_length_in_bytes(n) <= 32 &&
5622 is_floating_point_type(vector_element_basic_type(n)) && // T_FLOAT, T_DOUBLE
5623 UseAVX > 0);
5624 match(Set dst (MinV a b));
5625 match(Set dst (MaxV a b));
5626 effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
5627 format %{ "vector_minmaxFP $dst,$a,$b\t!using $tmp, $atmp, $btmp as TEMP" %}
5628 ins_encode %{
5629 assert(UseAVX > 0, "required");
5630
5631 int opcode = this->ideal_Opcode();
5632 int vlen_enc = vector_length_encoding(this);
5633 BasicType elem_bt = vector_element_basic_type(this);
5634
5635 __ vminmax_fp(opcode, elem_bt,
5636 $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
5637 $tmp$$XMMRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vlen_enc);
5638 %}
5639 ins_pipe( pipe_slow );
5640 %}
5641
5642 instruct evminmaxFP_reg_eavx(vec dst, vec a, vec b, vec atmp, vec btmp) %{
5643 predicate(vector_length_in_bytes(n) == 64 &&
5644 is_floating_point_type(vector_element_basic_type(n))); // T_FLOAT, T_DOUBLE
5645 match(Set dst (MinV a b));
5646 match(Set dst (MaxV a b));
5647 effect(USE a, USE b, TEMP atmp, TEMP btmp);
5648 format %{ "vector_minmaxFP $dst,$a,$b\t!using $atmp, $btmp as TEMP" %}
5649 ins_encode %{
5650 assert(UseAVX > 2, "required");
5651
5652 int opcode = this->ideal_Opcode();
5653 int vlen_enc = vector_length_encoding(this);
5654 BasicType elem_bt = vector_element_basic_type(this);
5655
5656 KRegister ktmp = k1;
5657 __ evminmax_fp(opcode, elem_bt,
5658 $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
5659 ktmp, $atmp$$XMMRegister , $btmp$$XMMRegister, vlen_enc);
5660 %}
5661 ins_pipe( pipe_slow );
5662 %}
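// The bare x86 vminps/vminpd do not match Java semantics: they return the
// second operand when either input is NaN and treat -0.0 and +0.0 as equal.
// That is why the min/max rules above need TEMP registers and extra blending.
// A minimal sketch of the gap (illustrative only):
//
//   float x86_minps_lane(float a, float b) {
//     return a < b ? a : b;  // NaN or a == b (incl. -0.0 vs +0.0) -> returns b
//   }
//   // whereas Java Math.min propagates NaN and yields -0.0f for min(-0.0f, +0.0f)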
5663
5664 // --------------------------------- Sqrt --------------------------------------
5665
5666 instruct vsqrtF_reg(vec dst, vec src) %{
5667 match(Set dst (SqrtVF src));
5668 format %{ "vsqrtps $dst,$src\t! sqrt packedF" %}
5669 ins_encode %{
5670 assert(UseAVX > 0, "required");
5671 int vlen_enc = vector_length_encoding(this);
5672 __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
5673 %}
5674 ins_pipe( pipe_slow );
5675 %}
5676
5677 instruct vsqrtF_mem(vec dst, memory mem) %{
5678 match(Set dst (SqrtVF (LoadVector mem)));
5679 format %{ "vsqrtps $dst,$mem\t! sqrt packedF" %}
5680 ins_encode %{
5681 assert(UseAVX > 0, "required");
5682 int vlen_enc = vector_length_encoding(this);
5683 __ vsqrtps($dst$$XMMRegister, $mem$$Address, vlen_enc);
5684 %}
5685 ins_pipe( pipe_slow );
5686 %}
5687
5688 // Floating point vector sqrt
5689 instruct vsqrtD_reg(vec dst, vec src) %{
5690 match(Set dst (SqrtVD src));
5691 format %{ "vsqrtpd $dst,$src\t! sqrt packedD" %}
5692 ins_encode %{
5693 assert(UseAVX > 0, "required");
5694 int vlen_enc = vector_length_encoding(this);
5695 __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
5696 %}
5697 ins_pipe( pipe_slow );
5698 %}
5699
5700 instruct vsqrtD_mem(vec dst, memory mem) %{
5701 match(Set dst (SqrtVD (LoadVector mem)));
5702 format %{ "vsqrtpd $dst,$mem\t! sqrt packedD" %}
5703 ins_encode %{
5704 assert(UseAVX > 0, "required");
5705 int vlen_enc = vector_length_encoding(this);
5706 __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vlen_enc);
5707 %}
5708 ins_pipe( pipe_slow );
5709 %}
5710
5711 // ------------------------------ Shift ---------------------------------------
5712
5713 // Left and right shift count vectors are the same on x86
5714 // (only the low bits of the xmm register are used as the count).
5715 instruct vshiftcnt(vec dst, rRegI cnt) %{
5716 match(Set dst (LShiftCntV cnt));
5717 match(Set dst (RShiftCntV cnt));
5718 format %{ "movdl $dst,$cnt\t! load shift count" %}
5719 ins_encode %{
5720 __ movdl($dst$$XMMRegister, $cnt$$Register);
5721 %}
5722 ins_pipe( pipe_slow );
5723 %}
5724
5725 // Byte vector shift
5726 instruct vshiftB(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
5727 predicate(vector_length(n) <= 8);
5728 match(Set dst ( LShiftVB src shift));
5729 match(Set dst ( RShiftVB src shift));
5730 match(Set dst (URShiftVB src shift));
5731 effect(TEMP dst, USE src, USE shift, TEMP tmp, TEMP scratch);
5732 format %{"vector_byte_shift $dst,$src,$shift" %}
5733 ins_encode %{
5734 assert(UseSSE > 3, "required");
5735 int opcode = this->ideal_Opcode();
5736 bool sign = (opcode == Op_URShiftVB) ? false : true;
5737 __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister);
5738 __ vshiftw(opcode, $tmp$$XMMRegister, $shift$$XMMRegister);
5739 __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5740 __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
5741 __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
5742 %}
5743 ins_pipe( pipe_slow );
5744 %}
5745
5746 instruct vshift16B(vec dst, vec src, vec shift, vec tmp1, vec tmp2, rRegI scratch) %{
5747 predicate(vector_length(n) == 16 && UseAVX <= 1);
5748 match(Set dst ( LShiftVB src shift));
5749 match(Set dst ( RShiftVB src shift));
5750 match(Set dst (URShiftVB src shift));
5751 effect(TEMP dst, USE src, USE shift, TEMP tmp1, TEMP tmp2, TEMP scratch);
5752 format %{"vector_byte_shift $dst,$src,$shift" %}
5753 ins_encode %{
5754 assert(UseSSE > 3, "required");
5755 int opcode = this->ideal_Opcode();
5756 bool sign = (opcode == Op_URShiftVB) ? false : true;
5757 __ vextendbw(sign, $tmp1$$XMMRegister, $src$$XMMRegister);
5758 __ vshiftw(opcode, $tmp1$$XMMRegister, $shift$$XMMRegister);
5759 __ pshufd($tmp2$$XMMRegister, $src$$XMMRegister, 0xE);
5760 __ vextendbw(sign, $tmp2$$XMMRegister, $tmp2$$XMMRegister);
5761 __ vshiftw(opcode, $tmp2$$XMMRegister, $shift$$XMMRegister);
5762 __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5763 __ pand($tmp2$$XMMRegister, $dst$$XMMRegister);
5764 __ pand($dst$$XMMRegister, $tmp1$$XMMRegister);
5765 __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister);
5766 %}
5767 ins_pipe( pipe_slow );
5768 %}
5769
5770 instruct vshift16B_avx(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
5771 predicate(vector_length(n) == 16 && UseAVX > 1);
5772 match(Set dst ( LShiftVB src shift));
5773 match(Set dst ( RShiftVB src shift));
5774 match(Set dst (URShiftVB src shift));
5775 effect(TEMP dst, TEMP tmp, TEMP scratch);
5776 format %{"vector_byte_shift $dst,$src,$shift" %}
5777 ins_encode %{
5778 int opcode = this->ideal_Opcode();
5779 bool sign = (opcode == Op_URShiftVB) ? false : true;
5780 int vlen_enc = Assembler::AVX_256bit;
5781 __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
5782 __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
5783 __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, $scratch$$Register);
5784 __ vextracti128_high($dst$$XMMRegister, $tmp$$XMMRegister);
5785 __ vpackuswb($dst$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, 0);
5786 %}
5787 ins_pipe( pipe_slow );
5788 %}
5789
5790 instruct vshift32B_avx(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
5791 predicate(vector_length(n) == 32);
5792 match(Set dst ( LShiftVB src shift));
5793 match(Set dst ( RShiftVB src shift));
5794 match(Set dst (URShiftVB src shift));
5795 effect(TEMP dst, TEMP tmp, TEMP scratch);
5796 format %{"vector_byte_shift $dst,$src,$shift" %}
5797 ins_encode %{
5798 assert(UseAVX > 1, "required");
5799 int opcode = this->ideal_Opcode();
5800 bool sign = (opcode == Op_URShiftVB) ? false : true;
5801 int vlen_enc = Assembler::AVX_256bit;
5802 __ vextracti128_high($tmp$$XMMRegister, $src$$XMMRegister);
5803 __ vextendbw(sign, $tmp$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
5804 __ vextendbw(sign, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
5805 __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
5806 __ vshiftw(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $shift$$XMMRegister, vlen_enc);
5807 __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, $scratch$$Register);
5808 __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, $scratch$$Register);
5809 __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
5810 __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
5811 %}
5812 ins_pipe( pipe_slow );
5813 %}
5814
5815 instruct vshift64B_avx(vec dst, vec src, vec shift, vec tmp1, vec tmp2, rRegI scratch) %{
5816 predicate(vector_length(n) == 64);
5817 match(Set dst ( LShiftVB src shift));
5818 match(Set dst (RShiftVB src shift));
5819 match(Set dst (URShiftVB src shift));
5820 effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
5821 format %{"vector_byte_shift $dst,$src,$shift" %}
5822 ins_encode %{
5823 assert(UseAVX > 2, "required");
5824 int opcode = this->ideal_Opcode();
5825 bool sign = (opcode == Op_URShiftVB) ? false : true;
5826 int vlen_enc = Assembler::AVX_512bit;
5827 __ vextracti64x4($tmp1$$XMMRegister, $src$$XMMRegister, 1);
5828 __ vextendbw(sign, $tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
5829 __ vextendbw(sign, $tmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
5830 __ vshiftw(opcode, $tmp1$$XMMRegister, $tmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
5831 __ vshiftw(opcode, $tmp2$$XMMRegister, $tmp2$$XMMRegister, $shift$$XMMRegister, vlen_enc);
5832 __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5833 __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5834 __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5835 __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5836 __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
5837 __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vlen_enc, $scratch$$Register);
5838 __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5839 %}
5840 ins_pipe( pipe_slow );
5841 %}
5842
5843 // A plain 16-bit logical right shift would produce an incorrect Java result
5844 // for negative shorts, because Java converts the short value to an int with
5845 // sign extension before shifting. Char vectors are fine, since chars are
5846 // unsigned values.
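// For example, with a short value of -1 and a shift of 3 (illustrative only):
//
//   int16_t s = -1;
//   uint32_t as_java  = (uint32_t)(int32_t)s >> 3;  // 0x1FFFFFFF: Java's s >>> 3
//   uint16_t as_psrlw = (uint16_t)s >> 3;           // 0x1FFF: 16-bit lane shift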
5847 // Shorts/Chars vector left shift
5848 instruct vshiftS(vec dst, vec src, vec shift) %{
5849 match(Set dst ( LShiftVS src shift));
5850 match(Set dst ( RShiftVS src shift));
5851 match(Set dst (URShiftVS src shift));
5852 effect(TEMP dst, USE src, USE shift);
5853 format %{ "vshiftw $dst,$src,$shift\t! shift packedS" %}
5854 ins_encode %{
5855 int opcode = this->ideal_Opcode();
5856 if (UseAVX > 0) {
5857 int vlen_enc = vector_length_encoding(this);
5858 __ vshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
5859 } else {
5860 int vlen = vector_length(this);
5861 if (vlen == 2) {
5862 __ movflt($dst$$XMMRegister, $src$$XMMRegister);
5863 __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
5864 } else if (vlen == 4) {
5865 __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
5866 __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
5867 } else {
5868 assert (vlen == 8, "sanity");
5869 __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
5870 __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
5871 }
5872 }
5873 %}
5874 ins_pipe( pipe_slow );
5875 %}
5876
// Integers vector shift
instruct vshiftI(vec dst, vec src, vec shift) %{
  match(Set dst ( LShiftVI src shift));
  match(Set dst ( RShiftVI src shift));
  match(Set dst (URShiftVI src shift));
  effect(TEMP dst, USE src, USE shift);
  format %{ "vshiftd $dst,$src,$shift\t! shift packedI" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    if (UseAVX > 0) {
      int vlen_enc = vector_length_encoding(this);
      __ vshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
    } else {
      int vlen = vector_length(this);
      if (vlen == 2) {
        __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
        __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
      } else {
        assert(vlen == 4, "sanity");
        __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
        __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
      }
    }
  %}
  ins_pipe( pipe_slow );
%}

// Longs vector shift
instruct vshiftL(vec dst, vec src, vec shift) %{
  match(Set dst ( LShiftVL src shift));
  match(Set dst (URShiftVL src shift));
  effect(TEMP dst, USE src, USE shift);
  format %{ "vshiftq $dst,$src,$shift\t! shift packedL" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    if (UseAVX > 0) {
      int vlen_enc = vector_length_encoding(this);
      __ vshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
    } else {
      assert(vector_length(this) == 2, "");
      __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
      __ vshiftq(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
    }
  %}
  ins_pipe( pipe_slow );
%}

// -------------------ArithmeticRightShift -----------------------------------
// Long vector arithmetic right shift
instruct vshiftL_arith_reg(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
  predicate(UseAVX <= 2);
  match(Set dst (RShiftVL src shift));
  effect(TEMP dst, TEMP tmp, TEMP scratch);
  format %{ "vshiftq $dst,$src,$shift" %}
  ins_encode %{
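    // SSE/AVX2 lack an arithmetic right shift for longs, so it is synthesized
    // from a logical one: with m = (0x8000000000000000 >>> n), the identity
    //   (x >> n) == ((x >>> n) ^ m) - m
    // recovers the sign extension. vector_long_sign_mask() supplies the
    // 0x8000000000000000 lanes.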
    uint vlen = vector_length(this);
    if (vlen == 2) {
      assert(UseSSE >= 2, "required");
      __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
      __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
      __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), $scratch$$Register);
      __ psrlq($tmp$$XMMRegister, $shift$$XMMRegister);
      __ pxor($dst$$XMMRegister, $tmp$$XMMRegister);
      __ psubq($dst$$XMMRegister, $tmp$$XMMRegister);
    } else {
      assert(vlen == 4, "sanity");
      assert(UseAVX > 1, "required");
      int vlen_enc = Assembler::AVX_256bit;
      __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
      __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), $scratch$$Register);
      __ vpsrlq($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
      __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
      __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
    }
  %}
  ins_pipe( pipe_slow );
%}

instruct vshiftL_arith_reg_evex(vec dst, vec src, vec shift) %{
  predicate(UseAVX > 2);
  match(Set dst (RShiftVL src shift));
  format %{ "vshiftq $dst,$src,$shift" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// ------------------- Variable Shift -----------------------------
// Byte variable shift
instruct vshift8B_var_nobw(vec dst, vec src, vec shift, vec vtmp, rRegP scratch) %{
  predicate(vector_length(n) <= 8 && vector_element_basic_type(n) == T_BYTE &&
            !VM_Version::supports_avx512bw());
  match(Set dst ( VLShiftV src shift));
  match(Set dst ( VRShiftV src shift));
  match(Set dst (VURShiftV src shift));
  effect(TEMP dst, TEMP vtmp, TEMP scratch);
  format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp, $scratch as TEMP" %}
  ins_encode %{
    assert(UseAVX >= 2, "required");

    int opcode = this->ideal_Opcode();
    int vlen_enc = Assembler::AVX_128bit;
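    // varshiftbw shifts the bytes as words; vpackuswb then narrows the word
    // results back down to bytes.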
    __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister, $scratch$$Register);
    __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
  %}
  ins_pipe( pipe_slow );
%}

instruct vshift16B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, rRegP scratch) %{
  predicate(vector_length(n) == 16 && vector_element_basic_type(n) == T_BYTE &&
            !VM_Version::supports_avx512bw());
  match(Set dst ( VLShiftV src shift));
  match(Set dst ( VRShiftV src shift));
  match(Set dst (VURShiftV src shift));
  effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
  format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 and $scratch as TEMP" %}
  ins_encode %{
    assert(UseAVX >= 2, "required");

    int opcode = this->ideal_Opcode();
    int vlen_enc = Assembler::AVX_128bit;
    // Shift lower half and get word result in dst
    __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister, $scratch$$Register);

    // Shift upper half and get word result in vtmp1
    __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
    __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
    __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister, $scratch$$Register);

    // Merge and down convert the two word results to byte in dst
    __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
  %}
  ins_pipe( pipe_slow );
%}

instruct vshift32B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, vec vtmp3, vec vtmp4, rRegP scratch) %{
  predicate(vector_length(n) == 32 && vector_element_basic_type(n) == T_BYTE &&
            !VM_Version::supports_avx512bw());
  match(Set dst ( VLShiftV src shift));
  match(Set dst ( VRShiftV src shift));
  match(Set dst (VURShiftV src shift));
  effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP vtmp4, TEMP scratch);
  format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2, $vtmp3, $vtmp4 and $scratch as TEMP" %}
  ins_encode %{
    assert(UseAVX >= 2, "required");

    int opcode = this->ideal_Opcode();
    int vlen_enc = Assembler::AVX_128bit;
    // Process lower 128 bits and get result in dst
    __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister, $scratch$$Register);
    __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
    __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
    __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister, $scratch$$Register);
    __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);

    // Process higher 128 bits and get result in vtmp3
    __ vextracti128_high($vtmp1$$XMMRegister, $src$$XMMRegister);
    __ vextracti128_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
    __ varshiftbw(opcode, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp4$$XMMRegister, $scratch$$Register);
    __ vpshufd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, 0xE, 0);
    __ vpshufd($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, 0xE, 0);
    __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister, $scratch$$Register);
    __ vpackuswb($vtmp1$$XMMRegister, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, 0);

    // Merge the two results in dst
    __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
  %}
  ins_pipe( pipe_slow );
%}

instruct vshiftB_var_evex_bw(vec dst, vec src, vec shift, vec vtmp, rRegP scratch) %{
  predicate(vector_length(n) <= 32 && vector_element_basic_type(n) == T_BYTE &&
            VM_Version::supports_avx512bw());
  match(Set dst ( VLShiftV src shift));
  match(Set dst ( VRShiftV src shift));
  match(Set dst (VURShiftV src shift));
  effect(TEMP dst, TEMP vtmp, TEMP scratch);
  format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp, $scratch as TEMP" %}
  ins_encode %{
    assert(UseAVX > 2, "required");

    int opcode = this->ideal_Opcode();
    int vlen_enc = vector_length_encoding(this);
    __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct vshift64B_var_evex_bw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, rRegP scratch) %{
  predicate(vector_length(n) == 64 && vector_element_basic_type(n) == T_BYTE &&
            VM_Version::supports_avx512bw());
  match(Set dst ( VLShiftV src shift));
  match(Set dst ( VRShiftV src shift));
  match(Set dst (VURShiftV src shift));
  effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
  format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 and $scratch as TEMP" %}
  ins_encode %{
    assert(UseAVX > 2, "required");

    int opcode = this->ideal_Opcode();
    int vlen_enc = Assembler::AVX_256bit;
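    // Shift the lower 256 bits into dst, shift the upper 256 bits via
    // vtmp1/vtmp2, then merge the two halves into the 512-bit result.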
    __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister, $scratch$$Register);
    __ vextracti64x4_high($vtmp1$$XMMRegister, $src$$XMMRegister);
    __ vextracti64x4_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
    __ evarshiftb(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister, $scratch$$Register);
    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
  %}
  ins_pipe( pipe_slow );
%}

// Short variable shift
instruct vshift8S_var_nobw(vec dst, vec src, vec shift, vec vtmp, rRegP scratch) %{
  predicate(vector_length(n) <= 8 && vector_element_basic_type(n) == T_SHORT &&
            !VM_Version::supports_avx512bw());
  match(Set dst (VLShiftV src shift));
  match(Set dst (VRShiftV src shift));
  match(Set dst (VURShiftV src shift));
  effect(TEMP dst, TEMP vtmp, TEMP scratch);
  format %{ "vector_varshift_short $dst, $src, $shift\n\t! using $vtmp, $scratch as TEMP" %}
  ins_encode %{
    assert(UseAVX >= 2, "required");

    int opcode = this->ideal_Opcode();
    bool sign = (opcode == Op_VURShiftV) ? false : true;
    int vlen_enc = Assembler::AVX_256bit;
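    // Widen the shorts to ints (the sign of the extension follows the shift
    // kind; the shift counts are always zero-extended), shift as ints, mask
    // the results back to 16 bits, and narrow to shorts.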
    __ vextendwd(sign, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
    __ vpmovzxwd($vtmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
    __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
    __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);
    __ vextracti128_high($vtmp$$XMMRegister, $dst$$XMMRegister);
    __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
  %}
  ins_pipe( pipe_slow );
%}

instruct vshift16S_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, rRegP scratch) %{
  predicate(vector_length(n) == 16 && vector_element_basic_type(n) == T_SHORT &&
            !VM_Version::supports_avx512bw());
  match(Set dst (VLShiftV src shift));
  match(Set dst (VRShiftV src shift));
  match(Set dst (VURShiftV src shift));
  effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
  format %{ "vector_varshift_short $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 and $scratch as TEMP" %}
  ins_encode %{
    assert(UseAVX >= 2, "required");

    int opcode = this->ideal_Opcode();
    bool sign = (opcode == Op_VURShiftV) ? false : true;
    int vlen_enc = Assembler::AVX_256bit;
    // Shift lower half, with result in vtmp2 using vtmp1 as TEMP
    __ vextendwd(sign, $vtmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
    __ vpmovzxwd($vtmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
    __ varshiftd(opcode, $vtmp2$$XMMRegister, $vtmp2$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
    __ vpand($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);

    // Shift upper half, with result in dst using vtmp1 as TEMP
    __ vextracti128_high($dst$$XMMRegister, $src$$XMMRegister);
    __ vextracti128_high($vtmp1$$XMMRegister, $shift$$XMMRegister);
    __ vextendwd(sign, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
    __ vpmovzxwd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
    __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
    __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);

    // Merge lower and upper half result into dst
    __ vpackusdw($dst$$XMMRegister, $vtmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
    __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vshift16S_var_evex_bw(vec dst, vec src, vec shift) %{
  predicate(vector_element_basic_type(n) == T_SHORT &&
            VM_Version::supports_avx512bw());
  match(Set dst (VLShiftV src shift));
  match(Set dst (VRShiftV src shift));
  match(Set dst (VURShiftV src shift));
  format %{ "vector_varshift_short $dst,$src,$shift\t!" %}
  ins_encode %{
    assert(UseAVX > 2, "required");

    int opcode = this->ideal_Opcode();
    int vlen_enc = vector_length_encoding(this);
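    // Without AVX512VL only the 512-bit encoding is available, so use it
    // regardless of the logical vector length.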
    if (!VM_Version::supports_avx512vl()) {
      vlen_enc = Assembler::AVX_512bit;
    }
    __ varshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// Integer variable shift
instruct vshiftI_var(vec dst, vec src, vec shift) %{
  predicate(vector_element_basic_type(n) == T_INT);
  match(Set dst ( VLShiftV src shift));
  match(Set dst ( VRShiftV src shift));
  match(Set dst (VURShiftV src shift));
  format %{ "vector_varshift_int $dst,$src,$shift\t!" %}
  ins_encode %{
    assert(UseAVX >= 2, "required");

    int opcode = this->ideal_Opcode();
    int vlen_enc = vector_length_encoding(this);
    __ varshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// Long variable shift
instruct vshiftL_var(vec dst, vec src, vec shift) %{
  predicate(vector_element_basic_type(n) == T_LONG);
  match(Set dst ( VLShiftV src shift));
  match(Set dst (VURShiftV src shift));
  format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
  ins_encode %{
    assert(UseAVX >= 2, "required");

    int opcode = this->ideal_Opcode();
    int vlen_enc = vector_length_encoding(this);
    __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// Long variable arithmetic right shift
instruct vshiftL_arith_var(vec dst, vec src, vec shift, vec vtmp) %{
  predicate(vector_length(n) <= 4 && vector_element_basic_type(n) == T_LONG &&
            UseAVX == 2);
  match(Set dst (VRShiftV src shift));
  effect(TEMP dst, TEMP vtmp);
  format %{ "vector_varshift_long $dst,$src,$shift\n\t! using $vtmp as TEMP" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    int vlen_enc = vector_length_encoding(this);
    __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc,
                 $vtmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vshiftL_arith_var_evex(vec dst, vec src, vec shift) %{
  predicate(vector_element_basic_type(n) == T_LONG &&
            UseAVX > 2);
  match(Set dst (VRShiftV src shift));
  format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    int vlen_enc = vector_length_encoding(this);
    __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// --------------------------------- AND --------------------------------------

instruct vand(vec dst, vec src) %{
  predicate(UseAVX == 0);
  match(Set dst (AndV dst src));
  format %{ "pand $dst,$src\t! and vectors" %}
  ins_encode %{
    __ pand($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vand_reg(vec dst, vec src1, vec src2) %{
  predicate(UseAVX > 0);
  match(Set dst (AndV src1 src2));
  format %{ "vpand $dst,$src1,$src2\t! and vectors" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vand_mem(vec dst, vec src, memory mem) %{
  predicate(UseAVX > 0);
  match(Set dst (AndV src (LoadVector mem)));
  format %{ "vpand $dst,$src,$mem\t! and vectors" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// --------------------------------- OR ---------------------------------------

instruct vor(vec dst, vec src) %{
  predicate(UseAVX == 0);
  match(Set dst (OrV dst src));
  format %{ "por $dst,$src\t! or vectors" %}
  ins_encode %{
    __ por($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vor_reg(vec dst, vec src1, vec src2) %{
  predicate(UseAVX > 0);
  match(Set dst (OrV src1 src2));
  format %{ "vpor $dst,$src1,$src2\t! or vectors" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vor_mem(vec dst, vec src, memory mem) %{
  predicate(UseAVX > 0);
  match(Set dst (OrV src (LoadVector mem)));
  format %{ "vpor $dst,$src,$mem\t! or vectors" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// --------------------------------- XOR --------------------------------------

instruct vxor(vec dst, vec src) %{
  predicate(UseAVX == 0);
  match(Set dst (XorV dst src));
  format %{ "pxor $dst,$src\t! xor vectors" %}
  ins_encode %{
    __ pxor($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vxor_reg(vec dst, vec src1, vec src2) %{
  predicate(UseAVX > 0);
  match(Set dst (XorV src1 src2));
  format %{ "vpxor $dst,$src1,$src2\t! xor vectors" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vxor_mem(vec dst, vec src, memory mem) %{
  predicate(UseAVX > 0);
  match(Set dst (XorV src (LoadVector mem)));
  format %{ "vpxor $dst,$src,$mem\t! xor vectors" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// --------------------------------- VectorCast --------------------------------------

instruct vcastBtoX(vec dst, vec src) %{
  match(Set dst (VectorCastB2X src));
  format %{ "vector_cast_b2x $dst,$src\t!" %}
  ins_encode %{
    assert(UseAVX > 0, "required");

    BasicType to_elem_bt = vector_element_basic_type(this);
    int vlen_enc = vector_length_encoding(this);
    switch (to_elem_bt) {
      case T_SHORT:
        __ vpmovsxbw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
        break;
      case T_INT:
        __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
        break;
      case T_FLOAT:
        __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
        __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
        break;
      case T_LONG:
        __ vpmovsxbq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
        break;
      case T_DOUBLE:
        __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
        __ vcvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
        break;

      default: assert(false, "%s", type2name(to_elem_bt));
    }
  %}
  ins_pipe( pipe_slow );
%}

instruct castStoX(vec dst, vec src, rRegP scratch) %{
  predicate(UseAVX <= 2 &&
            vector_length(n->in(1)) <= 8 && // src
            vector_element_basic_type(n) == T_BYTE);
  effect(TEMP scratch);
  match(Set dst (VectorCastS2X src));
  format %{ "vector_cast_s2x $dst,$src\t! using $scratch as TEMP" %}
  ins_encode %{
    assert(UseAVX > 0, "required");

    __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), 0, $scratch$$Register);
    __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcastStoX(vec dst, vec src, vec vtmp, rRegP scratch) %{
  predicate(UseAVX <= 2 &&
            vector_length(n->in(1)) == 16 && // src
            vector_element_basic_type(n) == T_BYTE);
  effect(TEMP dst, TEMP vtmp, TEMP scratch);
  match(Set dst (VectorCastS2X src));
  format %{ "vector_cast_s2x $dst,$src\t! using $vtmp, $scratch as TEMP" %}
  ins_encode %{
    assert(UseAVX > 0, "required");

    int vlen_enc = vector_length_encoding(vector_length_in_bytes(this, $src));
    __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, $scratch$$Register);
    __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
    __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcastStoX_evex(vec dst, vec src) %{
  predicate(UseAVX > 2 ||
            (vector_length_in_bytes(n) >= vector_length_in_bytes(n->in(1)))); // dst >= src
  match(Set dst (VectorCastS2X src));
  format %{ "vector_cast_s2x $dst,$src\t!" %}
  ins_encode %{
    BasicType to_elem_bt = vector_element_basic_type(this);
    int src_vlen_enc = vector_length_encoding(this, $src);
    int vlen_enc = vector_length_encoding(this);
    switch (to_elem_bt) {
      case T_BYTE:
        if (!VM_Version::supports_avx512vl()) {
          src_vlen_enc = Assembler::AVX_512bit;
        }
        __ evpmovwb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
        break;
      case T_INT:
        __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
        break;
      case T_FLOAT:
        __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
        __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
        break;
      case T_LONG:
        __ vpmovsxwq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
        break;
      case T_DOUBLE:
        __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
        __ vcvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
        break;
      default:
        ShouldNotReachHere();
    }
  %}
  ins_pipe( pipe_slow );
%}

instruct castItoX(vec dst, vec src, rRegP scratch) %{
  predicate(UseAVX <= 2 &&
            (vector_length_in_bytes(n->in(1)) <= 16) &&
            (vector_length_in_bytes(n) < vector_length_in_bytes(n->in(1)))); // dst < src
  match(Set dst (VectorCastI2X src));
  format %{ "vector_cast_i2x $dst,$src\t! using $scratch as TEMP" %}
  effect(TEMP scratch);
  ins_encode %{
    assert(UseAVX > 0, "required");

    BasicType to_elem_bt = vector_element_basic_type(this);
    int vlen_enc = vector_length_encoding(this, $src);

    if (to_elem_bt == T_BYTE) {
      __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, $scratch$$Register);
      __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
      __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
    } else {
      assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
      __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);
      __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
    }
  %}
  ins_pipe( pipe_slow );
%}

instruct vcastItoX(vec dst, vec src, vec vtmp, rRegP scratch) %{
  predicate(UseAVX <= 2 &&
            (vector_length_in_bytes(n->in(1)) == 32) &&
            (vector_length_in_bytes(n) < vector_length_in_bytes(n->in(1)))); // dst < src
  match(Set dst (VectorCastI2X src));
  format %{ "vector_cast_i2x $dst,$src\t! using $vtmp and $scratch as TEMP" %}
  effect(TEMP dst, TEMP vtmp, TEMP scratch);
  ins_encode %{
    assert(UseAVX > 0, "required");

    BasicType to_elem_bt = vector_element_basic_type(this);
    int vlen_enc = vector_length_encoding(this, $src);

    if (to_elem_bt == T_BYTE) {
      __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, $scratch$$Register);
      __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
      __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
      __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
    } else {
      assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
      __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);
      __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
      __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
    }
  %}
  ins_pipe( pipe_slow );
%}

instruct vcastItoX_evex(vec dst, vec src) %{
  predicate(UseAVX > 2 ||
            (vector_length_in_bytes(n) >= vector_length_in_bytes(n->in(1)))); // dst >= src
  match(Set dst (VectorCastI2X src));
  format %{ "vector_cast_i2x $dst,$src\t!" %}
  ins_encode %{
    assert(UseAVX > 0, "required");

    BasicType dst_elem_bt = vector_element_basic_type(this);
    int src_vlen_enc = vector_length_encoding(this, $src);
    int dst_vlen_enc = vector_length_encoding(this);
    switch (dst_elem_bt) {
      case T_BYTE:
        if (!VM_Version::supports_avx512vl()) {
          src_vlen_enc = Assembler::AVX_512bit;
        }
        __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
        break;
      case T_SHORT:
        if (!VM_Version::supports_avx512vl()) {
          src_vlen_enc = Assembler::AVX_512bit;
        }
        __ evpmovdw($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
        break;
      case T_FLOAT:
        __ vcvtdq2ps($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
        break;
      case T_LONG:
        __ vpmovsxdq($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
        break;
      case T_DOUBLE:
        __ vcvtdq2pd($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
        break;
      default:
        ShouldNotReachHere();
    }
  %}
  ins_pipe( pipe_slow );
%}

instruct vcastLtoBS(vec dst, vec src, rRegP scratch) %{
  predicate((vector_element_basic_type(n) == T_BYTE || vector_element_basic_type(n) == T_SHORT) &&
            UseAVX <= 2);
  match(Set dst (VectorCastL2X src));
  effect(TEMP scratch);
  format %{ "vector_cast_l2x $dst,$src\t! using $scratch as TEMP" %}
  ins_encode %{
    assert(UseAVX > 0, "required");

    int vlen = vector_length_in_bytes(this, $src);
    BasicType to_elem_bt = vector_element_basic_type(this);
    AddressLiteral mask_addr = (to_elem_bt == T_BYTE) ? ExternalAddress(vector_int_to_byte_mask())
                                                      : ExternalAddress(vector_int_to_short_mask());
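    // Gather the low 32 bits of each long lane into the bottom of the vector,
    // then mask and narrow them down to the target element size.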
    if (vlen <= 16) {
      __ vpshufd($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_128bit);
      __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, $scratch$$Register);
      __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
    } else {
      assert(vlen <= 32, "required");
      __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_256bit);
      __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, Assembler::AVX_256bit);
      __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, $scratch$$Register);
      __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
    }
    if (to_elem_bt == T_BYTE) {
      __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
    }
  %}
  ins_pipe( pipe_slow );
%}

instruct vcastLtoX_evex(vec dst, vec src) %{
  predicate(UseAVX > 2 ||
            (vector_element_basic_type(n) == T_INT ||
             vector_element_basic_type(n) == T_FLOAT ||
             vector_element_basic_type(n) == T_DOUBLE));
  match(Set dst (VectorCastL2X src));
  format %{ "vector_cast_l2x $dst,$src\t!" %}
  ins_encode %{
    BasicType to_elem_bt = vector_element_basic_type(this);
    int vlen = vector_length_in_bytes(this, $src);
    int vlen_enc = vector_length_encoding(this, $src);
    switch (to_elem_bt) {
      case T_BYTE:
        if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
          vlen_enc = Assembler::AVX_512bit;
        }
        __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
        break;
      case T_SHORT:
        if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
          vlen_enc = Assembler::AVX_512bit;
        }
        __ evpmovqw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
        break;
      case T_INT:
        if (vlen == 8) {
          if ($dst$$XMMRegister != $src$$XMMRegister) {
            __ movflt($dst$$XMMRegister, $src$$XMMRegister);
          }
        } else if (vlen == 16) {
          __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 8);
        } else if (vlen == 32) {
          if (UseAVX > 2) {
            if (!VM_Version::supports_avx512vl()) {
              vlen_enc = Assembler::AVX_512bit;
            }
            __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
          } else {
            __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, vlen_enc);
            __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, vlen_enc);
          }
        } else { // vlen == 64
          __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
        }
        break;
      case T_FLOAT:
        assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
        __ evcvtqq2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
        break;
      case T_DOUBLE:
        assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
        __ evcvtqq2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
        break;

      default: assert(false, "%s", type2name(to_elem_bt));
    }
  %}
  ins_pipe( pipe_slow );
%}

instruct vcastFtoD_reg(vec dst, vec src) %{
  predicate(vector_element_basic_type(n) == T_DOUBLE);
  match(Set dst (VectorCastF2X src));
  format %{ "vector_cast_f2x $dst,$src\t!" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vcvtps2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcastDtoF_reg(vec dst, vec src) %{
  predicate(vector_element_basic_type(n) == T_FLOAT);
  match(Set dst (VectorCastD2X src));
  format %{ "vector_cast_d2x $dst,$src\t!" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this, $src);
    __ vcvtpd2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// --------------------------------- VectorMaskCmp --------------------------------------

instruct vcmpFD(legVec dst, legVec src1, legVec src2, immI8 cond) %{
  predicate(vector_length_in_bytes(n->in(1)->in(1)) >= 8 && // src1
            vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
            is_floating_point_type(vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
  match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
  format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this, $src1);
    Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
    if (vector_element_basic_type(this, $src1) == T_FLOAT) {
      __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
    } else {
      __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
    }
  %}
  ins_pipe( pipe_slow );
%}

instruct evcmpFD(vec dst, vec src1, vec src2, immI8 cond, rRegP scratch) %{
  predicate(vector_length_in_bytes(n->in(1)->in(1)) == 64 && // src1
            is_floating_point_type(vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
  match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
  effect(TEMP scratch);
  format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $scratch as TEMP" %}
  ins_encode %{
    int vlen_enc = Assembler::AVX_512bit;
    Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
    KRegister ktmp = k2; // Use a hardcoded temp because k registers are not allocatable.
    KRegister mask = k0; // The comparison itself is not being masked.
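    // Compare into the mask register, then expand the result to an
    // all-bits-set vector by doing a masked load from vector_all_bits_set().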
    if (vector_element_basic_type(this, $src1) == T_FLOAT) {
      __ evcmpps(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
      __ evmovdqul($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vlen_enc, $scratch$$Register);
    } else {
      __ evcmppd(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
      __ evmovdquq($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vlen_enc, $scratch$$Register);
    }
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmp(legVec dst, legVec src1, legVec src2, immI8 cond, rRegP scratch) %{
  predicate(vector_length_in_bytes(n->in(1)->in(1)) >= 8 && // src1
            vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
            is_integral_type(vector_element_basic_type(n->in(1)->in(1)))); // src1
  match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
  effect(TEMP scratch);
  format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $scratch as TEMP" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this, $src1);
    Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
    Assembler::Width ww = widthForType(vector_element_basic_type(this, $src1));
    __ vpcmpCCW($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, ww, vlen_enc, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct evcmp(vec dst, vec src1, vec src2, immI8 cond, rRegP scratch) %{
  predicate(vector_length_in_bytes(n->in(1)->in(1)) == 64 && // src1
            is_integral_type(vector_element_basic_type(n->in(1)->in(1)))); // src1
  match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
  effect(TEMP scratch);
  format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $scratch as TEMP" %}
  ins_encode %{
    assert(UseAVX > 2, "required");

    int vlen_enc = Assembler::AVX_512bit;
    Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
    KRegister ktmp = k2; // Use a hardcoded temp because k registers are not allocatable.
    KRegister mask = k0; // The comparison itself is not being masked.
    bool merge = false;
    BasicType src1_elem_bt = vector_element_basic_type(this, $src1);

    switch (src1_elem_bt) {
      case T_BYTE: {
        __ evpcmpb(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
        __ evmovdqub($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, $scratch$$Register);
        break;
      }
      case T_SHORT: {
        __ evpcmpw(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
        __ evmovdquw($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, $scratch$$Register);
        break;
      }
      case T_INT: {
        __ evpcmpd(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
        __ evmovdqul($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, $scratch$$Register);
        break;
      }
      case T_LONG: {
        __ evpcmpq(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
        __ evmovdquq($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, $scratch$$Register);
        break;
      }

      default: assert(false, "%s", type2name(src1_elem_bt));
    }
  %}
  ins_pipe( pipe_slow );
%}

// Extract

instruct extractI(rRegI dst, legVec src, immU8 idx) %{
  predicate(vector_length_in_bytes(n->in(1)) <= 16); // src
  match(Set dst (ExtractI src idx));
  match(Set dst (ExtractS src idx));
#ifdef _LP64
  match(Set dst (ExtractB src idx));
#endif
  ins_encode %{
    assert($idx$$constant < (int)vector_length(this, $src), "out of bounds");

    BasicType elem_bt = vector_element_basic_type(this, $src);
    __ get_elem(elem_bt, $dst$$Register, $src$$XMMRegister, $idx$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct vextractI(rRegI dst, legVec src, immI idx, legVec vtmp) %{
  predicate(vector_length_in_bytes(n->in(1)) == 32 || // src
            vector_length_in_bytes(n->in(1)) == 64); // src
  match(Set dst (ExtractI src idx));
  match(Set dst (ExtractS src idx));
#ifdef _LP64
  match(Set dst (ExtractB src idx));
#endif
  effect(TEMP vtmp);
  ins_encode %{
    assert($idx$$constant < (int)vector_length(this, $src), "out of bounds");

    BasicType elem_bt = vector_element_basic_type(this, $src);
    XMMRegister lane_xmm = __ get_lane(elem_bt, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
    __ get_elem(elem_bt, $dst$$Register, lane_xmm, $idx$$constant);
  %}
  ins_pipe( pipe_slow );
%}

#ifdef _LP64
instruct extractL(rRegL dst, legVec src, immU8 idx) %{
  predicate(vector_length(n->in(1)) <= 2); // src
  match(Set dst (ExtractL src idx));
  ins_encode %{
    assert(UseSSE >= 4, "required");
    assert($idx$$constant < (int)vector_length(this, $src), "out of bounds");

    __ get_elem(T_LONG, $dst$$Register, $src$$XMMRegister, $idx$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct vextractL(rRegL dst, legVec src, immU8 idx, legVec vtmp) %{
  predicate(vector_length(n->in(1)) == 4 || // src
            vector_length(n->in(1)) == 8); // src
  match(Set dst (ExtractL src idx));
  effect(TEMP vtmp);
  ins_encode %{
    assert($idx$$constant < (int)vector_length(this, $src), "out of bounds");

    XMMRegister lane_reg = __ get_lane(T_LONG, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
    __ get_elem(T_LONG, $dst$$Register, lane_reg, $idx$$constant);
  %}
  ins_pipe( pipe_slow );
%}
#endif

instruct extractF(legRegF dst, legVec src, immU8 idx, rRegI tmp, legVec vtmp) %{
  predicate(vector_length(n->in(1)) <= 4);
  match(Set dst (ExtractF src idx));
  effect(TEMP dst, TEMP tmp, TEMP vtmp);
  ins_encode %{
    assert($idx$$constant < (int)vector_length(this, $src), "out of bounds");

    __ get_elem(T_FLOAT, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant, $tmp$$Register, $vtmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vextractF(legRegF dst, legVec src, immU8 idx, rRegI tmp, legVec vtmp) %{
  predicate(vector_length(n->in(1)/*src*/) == 8 ||
            vector_length(n->in(1)/*src*/) == 16);
  match(Set dst (ExtractF src idx));
  effect(TEMP tmp, TEMP vtmp);
  ins_encode %{
    assert($idx$$constant < (int)vector_length(this, $src), "out of bounds");

    XMMRegister lane_reg = __ get_lane(T_FLOAT, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
    __ get_elem(T_FLOAT, $dst$$XMMRegister, lane_reg, $idx$$constant, $tmp$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct extractD(legRegD dst, legVec src, immU8 idx) %{
  predicate(vector_length(n->in(1)) == 2); // src
  match(Set dst (ExtractD src idx));
  ins_encode %{
    assert($idx$$constant < (int)vector_length(this, $src), "out of bounds");

    __ get_elem(T_DOUBLE, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct vextractD(legRegD dst, legVec src, immU8 idx, legVec vtmp) %{
  predicate(vector_length(n->in(1)) == 4 || // src
            vector_length(n->in(1)) == 8); // src
  match(Set dst (ExtractD src idx));
  effect(TEMP vtmp);
  ins_encode %{
    assert($idx$$constant < (int)vector_length(this, $src), "out of bounds");

    XMMRegister lane_reg = __ get_lane(T_DOUBLE, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
    __ get_elem(T_DOUBLE, $dst$$XMMRegister, lane_reg, $idx$$constant);
  %}
  ins_pipe( pipe_slow );
%}

// --------------------------------- Vector Blend --------------------------------------

instruct blendvp(vec dst, vec src, vec mask, rxmm0 tmp) %{
  predicate(UseAVX == 0);
  match(Set dst (VectorBlend (Binary dst src) mask));
  format %{ "vector_blend $dst,$src,$mask\t! using $tmp as TEMP" %}
  effect(TEMP tmp);
  ins_encode %{
    assert(UseSSE >= 4, "required");

    if ($mask$$XMMRegister != $tmp$$XMMRegister) {
      __ movdqu($tmp$$XMMRegister, $mask$$XMMRegister);
    }
    __ pblendvb($dst$$XMMRegister, $src$$XMMRegister); // uses xmm0 as mask
  %}
  ins_pipe( pipe_slow );
%}

instruct vblendvpI(legVec dst, legVec src1, legVec src2, legVec mask) %{
  predicate(UseAVX > 0 &&
            vector_length_in_bytes(n) <= 32 &&
            is_integral_type(vector_element_basic_type(n)));
  match(Set dst (VectorBlend (Binary src1 src2) mask));
  format %{ "vector_blend $dst,$src1,$src2,$mask\t!" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vpblendvb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vblendvpFD(legVec dst, legVec src1, legVec src2, legVec mask) %{
  predicate(UseAVX > 0 &&
            vector_length_in_bytes(n) <= 32 &&
            !is_integral_type(vector_element_basic_type(n)));
  match(Set dst (VectorBlend (Binary src1 src2) mask));
  format %{ "vector_blend $dst,$src1,$src2,$mask\t!" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vblendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct evblendvp64(vec dst, vec src1, vec src2, vec mask, rRegP scratch) %{
  predicate(vector_length_in_bytes(n) == 64);
  match(Set dst (VectorBlend (Binary src1 src2) mask));
  format %{ "vector_blend $dst,$src1,$src2,$mask\t! using $scratch and k2 as TEMP" %}
  effect(TEMP scratch);
  ins_encode %{
    int vlen_enc = Assembler::AVX_512bit;
    BasicType elem_bt = vector_element_basic_type(this);
    KRegister ktmp = k2;
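    // Turn the byte-wise vector mask into a k-register mask by comparing it
    // against all-ones, then blend src1/src2 under that mask.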
    __ evpcmp(elem_bt, ktmp, k0, $mask$$XMMRegister, ExternalAddress(vector_all_bits_set()), Assembler::eq, vlen_enc, $scratch$$Register);
    __ evpblend(elem_bt, $dst$$XMMRegister, ktmp, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// --------------------------------- ABS --------------------------------------
// a = |a|
instruct vabsB_reg(vec dst, vec src) %{
  match(Set dst (AbsVB src));
  format %{ "vabsb $dst,$src\t# $dst = |$src| abs packedB" %}
  ins_encode %{
    uint vlen = vector_length(this);
    if (vlen <= 16) {
      __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
    } else {
      int vlen_enc = vector_length_encoding(this);
      __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
    }
  %}
  ins_pipe( pipe_slow );
%}

instruct vabsS_reg(vec dst, vec src) %{
  match(Set dst (AbsVS src));
  format %{ "vabsw $dst,$src\t# $dst = |$src| abs packedS" %}
  ins_encode %{
    uint vlen = vector_length(this);
    if (vlen <= 8) {
      __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
    } else {
      int vlen_enc = vector_length_encoding(this);
      __ vpabsw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
    }
  %}
  ins_pipe( pipe_slow );
%}

instruct vabsI_reg(vec dst, vec src) %{
  match(Set dst (AbsVI src));
  format %{ "pabsd $dst,$src\t# $dst = |$src| abs packedI" %}
  ins_encode %{
    uint vlen = vector_length(this);
    if (vlen <= 4) {
      __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
    } else {
      int vlen_enc = vector_length_encoding(this);
      __ vpabsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
    }
  %}
  ins_pipe( pipe_slow );
%}

instruct vabsL_reg(vec dst, vec src) %{
  match(Set dst (AbsVL src));
  format %{ "evpabsq $dst,$src\t# $dst = |$src| abs packedL" %}
  ins_encode %{
    assert(UseAVX > 2, "required");
    int vlen_enc = vector_length_encoding(this);
    if (!VM_Version::supports_avx512vl()) {
      vlen_enc = Assembler::AVX_512bit;
    }
    __ evpabsq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// --------------------------------- ABSNEG --------------------------------------

instruct vabsnegF(vec dst, vec src, rRegI scratch) %{
  predicate(vector_length(n) != 4); // handled by 1-operand instruction vabsneg4F
  match(Set dst (AbsVF src));
  match(Set dst (NegVF src));
  effect(TEMP scratch);
  format %{ "vabsnegf $dst,$src,[mask]\t# absneg packedF" %}
  ins_cost(150);
  ins_encode %{
    int opcode = this->ideal_Opcode();
    int vlen = vector_length(this);
    if (vlen == 2) {
      __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister, $scratch$$Register);
    } else {
      assert(vlen == 8 || vlen == 16, "required");
      int vlen_enc = vector_length_encoding(this);
      __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc, $scratch$$Register);
    }
  %}
  ins_pipe( pipe_slow );
%}

instruct vabsneg4F(vec dst, rRegI scratch) %{
  predicate(vector_length(n) == 4);
  match(Set dst (AbsVF dst));
  match(Set dst (NegVF dst));
  effect(TEMP scratch);
  format %{ "vabsnegf $dst,[mask]\t# absneg packed4F" %}
  ins_cost(150);
  ins_encode %{
    int opcode = this->ideal_Opcode();
    __ vabsnegf(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct vabsnegD(vec dst, vec src, rRegI scratch) %{
  match(Set dst (AbsVD src));
  match(Set dst (NegVD src));
  effect(TEMP scratch);
  format %{ "vabsnegd $dst,$src,[mask]\t# absneg packedD" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    uint vlen = vector_length(this);
    if (vlen == 2) {
      assert(UseSSE >= 2, "required");
      __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $scratch$$Register);
    } else {
      int vlen_enc = vector_length_encoding(this);
      __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc, $scratch$$Register);
    }
  %}
  ins_pipe( pipe_slow );
%}

//------------------------------------- NOT --------------------------------------------

instruct vnotB(vec dst, vec src) %{
  predicate(UseAVX == 0);
  match(Set dst (NotV src));
  effect(TEMP dst);
  format %{ "vector_not $dst,$src\t!" %}
  ins_encode %{
    int vlen = vector_length_in_bytes(this);
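    // There is no vector NOT instruction; xor the source with an all-bits-set
    // constant of the matching width instead.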
    switch (vlen) {
      default:
        assert(false, "Incorrect vector length");
        break;
      case 4: {
        __ movdl($dst$$XMMRegister, ExternalAddress(vector_all_bits_set()));
        __ pxor($dst$$XMMRegister, $src$$XMMRegister);
      } break;
      case 8: {
        __ movq($dst$$XMMRegister, ExternalAddress(vector_all_bits_set()));
        __ pxor($dst$$XMMRegister, $src$$XMMRegister);
      } break;
      case 16: {
        __ movdqu($dst$$XMMRegister, ExternalAddress(vector_all_bits_set()));
        __ pxor($dst$$XMMRegister, $src$$XMMRegister);
      } break;
    }
  %}
  ins_pipe( pipe_slow );
%}

instruct vnotB_reg(vec dst, vec src, rRegP scratch) %{
  predicate(UseAVX > 0);
  match(Set dst (NotV src));
  effect(TEMP scratch);
  format %{ "vector_not $dst,$src\t! using $scratch as TEMP" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vpxor($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_all_bits_set()), vlen_enc, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

//------------------------------------- VectorTest --------------------------------------------

#ifdef _LP64
instruct vptest_alltrue(rRegI dst, legVec src1, legVec src2, rFlagsReg cr) %{
  predicate(static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow);
  match(Set dst (VectorTest src1 src2));
  effect(KILL cr);
  format %{ "vector_test $dst,$src1,$src2\t! using $cr as TEMP" %}
  ins_encode %{
    int vlen = vector_length_in_bytes(this, $src1);
    int vlen_enc = vector_length_encoding(vlen);
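    // All-true relies on the carry flag: ptest/vptest set CF iff
    // ($src2 & ~$src1) == 0, i.e. every bit of the all-ones operand is also
    // set in the mask. The AVX-512 path tests a k-mask instead, where kortest
    // sets CF iff the mask is all ones.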
    if (vlen <= 32) {
      if (UseAVX == 0) {
        assert(vlen <= 16, "required");
        __ ptest($src1$$XMMRegister, $src2$$XMMRegister);
      } else {
        __ vptest($src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
      }
    } else {
      KRegister ktmp = k2; // Use a hardcoded temp because k registers are not allocatable.
      __ evpcmpeqb(ktmp, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
      __ kortestql(ktmp, ktmp);
    }
    __ setb(Assembler::carrySet, $dst$$Register);
    __ movzbl($dst$$Register, $dst$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct vptest_anytrue(rRegI dst, legVec src1, legVec src2, rFlagsReg cr) %{
  predicate(static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::ne);
  match(Set dst (VectorTest src1 src2));
  effect(KILL cr);
  format %{ "vector_test_any_true $dst,$src1,$src2\t! using $cr as TEMP" %}
  ins_encode %{
    int vlen = vector_length_in_bytes(this, $src1);
    int vlen_enc = vector_length_encoding(vlen);
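    // Any-true only needs the zero flag: ptest/vptest clear ZF iff
    // ($src1 & $src2) != 0, i.e. at least one mask lane is set.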
    if (vlen <= 32) {
      if (UseAVX == 0) {
        assert(vlen <= 16, "required");
        __ ptest($src1$$XMMRegister, $src2$$XMMRegister);
      } else {
        __ vptest($src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
      }
    } else {
      KRegister ktmp = k2; // Use a hardcoded temp because k registers are not allocatable.
      __ evpcmpeqb(ktmp, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
      __ ktestql(ktmp, ktmp);
    }
    __ setb(Assembler::notZero, $dst$$Register);
    __ movzbl($dst$$Register, $dst$$Register);
  %}
  ins_pipe( pipe_slow );
%}
#endif

//------------------------------------- LoadMask --------------------------------------------

instruct loadMask(vec dst, vec src) %{
  match(Set dst (VectorLoadMask src));
  effect(TEMP dst);
  format %{ "vector_loadmask_byte $dst,$src\n\t" %}
  ins_encode %{
    int vlen_in_bytes = vector_length_in_bytes(this);
    BasicType elem_bt = vector_element_basic_type(this);

    __ load_vector_mask($dst$$XMMRegister, $src$$XMMRegister, vlen_in_bytes, elem_bt);
  %}
  ins_pipe( pipe_slow );
%}

//------------------------------------- StoreMask --------------------------------------------

instruct storeMask1B(vec dst, vec src, immI_1 size) %{
  predicate(vector_length(n) < 64 || VM_Version::supports_avx512vlbw());
  match(Set dst (VectorStoreMask src size));
  format %{ "vector_store_mask $dst,$src\t!" %}
  ins_encode %{
    assert(UseSSE >= 3, "required");
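    // Mask lanes are 0 or -1; the absolute value turns them into the 0/1
    // bytes that VectorStoreMask is expected to produce.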
    if (vector_length_in_bytes(this) <= 16) {
      __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
    } else {
      assert(UseAVX >= 2, "required");
      int src_vlen_enc = vector_length_encoding(this, $src);
      __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
    }
  %}
  ins_pipe( pipe_slow );
%}

instruct storeMask2B(vec dst, vec src, immI_2 size) %{
  predicate(vector_length(n) <= 8);
  match(Set dst (VectorStoreMask src size));
  format %{ "vector_store_mask $dst,$src\n\t" %}
  ins_encode %{
    assert(UseSSE >= 3, "required");
    __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
    __ packsswb($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vstoreMask2B(vec dst, vec src, immI_2 size) %{
  predicate(vector_length(n) == 16 && !VM_Version::supports_avx512bw());
  match(Set dst (VectorStoreMask src size));
  effect(TEMP dst);
  format %{ "vector_store_mask $dst,$src\t!" %}
  ins_encode %{
    int vlen_enc = Assembler::AVX_128bit;
    __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
    __ vpacksswb($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
    __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vstoreMask2B_evex(vec dst, vec src, immI_2 size) %{
  predicate(VM_Version::supports_avx512bw());
  match(Set dst (VectorStoreMask src size));
  format %{ "vector_store_mask $dst,$src\t!" %}
  ins_encode %{
    int src_vlen_enc = vector_length_encoding(this, $src);
    int dst_vlen_enc = vector_length_encoding(this);
    __ evpmovwb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
    __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct storeMask4B(vec dst, vec src, immI_4 size) %{
  predicate(vector_length(n) <= 4 && UseAVX <= 2);
  match(Set dst (VectorStoreMask src size));
  format %{ "vector_store_mask $dst,$src\t!" %}
  ins_encode %{
    assert(UseSSE >= 3, "required");
    __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
    __ packssdw($dst$$XMMRegister, $dst$$XMMRegister);
    __ packsswb($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vstoreMask4B(vec dst, vec src, immI_4 size) %{
  predicate(vector_length(n) == 8 && UseAVX <= 2);
  match(Set dst (VectorStoreMask src size));
  format %{ "vector_store_mask $dst,$src\t!" %}
  effect(TEMP dst);
  ins_encode %{
    int vlen_enc = Assembler::AVX_128bit;
    __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
    __ vpackssdw($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
    __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
    __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vstoreMask4B_evex(vec dst, vec src, immI_4 size) %{
  predicate(UseAVX > 2);
  match(Set dst (VectorStoreMask src size));
  format %{ "vector_store_mask $dst,$src\t!" %}
  ins_encode %{
    int src_vlen_enc = vector_length_encoding(this, $src);
    int dst_vlen_enc = vector_length_encoding(this);
    if (!VM_Version::supports_avx512vl()) {
      src_vlen_enc = Assembler::AVX_512bit;
    }
    __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
    __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

7254 instruct storeMask8B(vec dst, vec src, immI_8 size) %{
7255 predicate(vector_length(n) == 2 && UseAVX <= 2);
7256 match(Set dst (VectorStoreMask src size));
7257 format %{ "vector_store_mask $dst,$src\t!" %}
7258 ins_encode %{
7259 assert(UseSSE >= 3, "required");
7260 __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x8);
7261 __ packssdw($dst$$XMMRegister, $dst$$XMMRegister);
7262 __ packsswb($dst$$XMMRegister, $dst$$XMMRegister);
7263 __ pabsb($dst$$XMMRegister, $dst$$XMMRegister);
7264 %}
7265 ins_pipe( pipe_slow );
7266 %}
7267
instruct storeMask8B_avx(vec dst, vec src, immI_8 size, legVec vtmp) %{
  predicate(vector_length(n) == 4 && UseAVX <= 2);
  match(Set dst (VectorStoreMask src size));
  format %{ "vector_store_mask $dst,$src\t! using $vtmp as TEMP" %}
  effect(TEMP dst, TEMP vtmp);
  ins_encode %{
    int vlen_enc = Assembler::AVX_128bit;
    __ vpshufps($dst$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 0x88, Assembler::AVX_256bit);
    __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
    __ vblendps($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0xC, vlen_enc);
    __ vpackssdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
    __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
    __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vstoreMask8B_evex(vec dst, vec src, immI_8 size) %{
  predicate(UseAVX > 2);
  match(Set dst (VectorStoreMask src size));
  format %{ "vector_store_mask $dst,$src\t!" %}
  ins_encode %{
    int src_vlen_enc = vector_length_encoding(this, $src);
    int dst_vlen_enc = vector_length_encoding(this);
    if (!VM_Version::supports_avx512vl()) {
      src_vlen_enc = Assembler::AVX_512bit;
    }
    __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
    __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

//-------------------------------- Load Iota Indices ----------------------------------

instruct loadIotaIndices(vec dst, immI_0 src, rRegP scratch) %{
  predicate(vector_element_basic_type(n) == T_BYTE);
  match(Set dst (VectorLoadConst src));
  effect(TEMP scratch);
  format %{ "vector_load_iota $dst CONSTANT_MEMORY\t! load iota indices" %}
  ins_encode %{
    int vlen_in_bytes = vector_length_in_bytes(this);
    __ load_iota_indices($dst$$XMMRegister, $scratch$$Register, vlen_in_bytes);
  %}
  ins_pipe( pipe_slow );
%}

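// For illustration only: the iota constant loaded above is simply the byte
// sequence 0, 1, 2, ..., vlen-1, i.e. each lane holds its own lane index:
//
//   for (int i = 0; i < vlen_in_bytes; i++) {
//     dst_bytes[i] = (jbyte)i;
//   }
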
//-------------------------------- Rearrange ----------------------------------

// LoadShuffle/Rearrange for Byte

instruct loadShuffleB(vec dst) %{
  predicate(vector_element_basic_type(n) == T_BYTE);
  match(Set dst (VectorLoadShuffle dst));
  format %{ "vector_load_shuffle $dst, $dst" %}
  ins_encode %{
    // empty: byte shuffle indices can be consumed directly by pshufb/vpermb
  %}
  ins_pipe( pipe_slow );
%}

instruct rearrangeB(vec dst, vec shuffle) %{
  predicate(vector_element_basic_type(n) == T_BYTE &&
            vector_length(n) < 32);
  match(Set dst (VectorRearrange dst shuffle));
  format %{ "vector_rearrange $dst, $shuffle, $dst" %}
  ins_encode %{
    assert(UseSSE >= 4, "required");
    __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

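// A rough scalar sketch (illustration only) of the rearrange semantics that
// the pshufb/vpshufb/vpermb rules here implement: each destination lane picks
// the source lane named by the corresponding shuffle index:
//
//   for (int i = 0; i < vlen; i++) {
//     dst[i] = src[shuffle[i]];
//   }
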
instruct rearrangeB_avx(vec dst, vec src, vec shuffle) %{
  predicate(vector_element_basic_type(n) == T_BYTE &&
            vector_length(n) == 32 && !VM_Version::supports_avx512_vbmi());
  match(Set dst (VectorRearrange src shuffle));
  format %{ "vector_rearrange $dst, $shuffle, $src" %}
  ins_encode %{
    // vpshufb takes the data operand first and the byte indices second
    __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
  %}
  ins_pipe( pipe_slow );
%}

instruct rearrangeB_evex(vec dst, vec src, vec shuffle) %{
  predicate(vector_element_basic_type(n) == T_BYTE &&
            vector_length(n) >= 32 && VM_Version::supports_avx512_vbmi());
  match(Set dst (VectorRearrange src shuffle));
  format %{ "vector_rearrange $dst, $shuffle, $src" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vpermb($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// LoadShuffle/Rearrange for Short

instruct loadShuffleS(vec dst, vec src, vec vtmp, rRegP scratch) %{
  predicate(vector_element_basic_type(n) == T_SHORT &&
            vector_length(n) <= 8 && !VM_Version::supports_avx512bw()); // NB! aligned with rearrangeS
  match(Set dst (VectorLoadShuffle src));
  effect(TEMP dst, TEMP vtmp, TEMP scratch);
  format %{ "vector_load_shuffle $dst, $src\t! using $vtmp and $scratch as TEMP" %}
  ins_encode %{
    // Create a byte shuffle mask from the short shuffle mask;
    // only a byte shuffle instruction is available on these platforms.

    // Multiply each shuffle index by two to get a byte index
    __ pmovzxbw($vtmp$$XMMRegister, $src$$XMMRegister);
    __ psllw($vtmp$$XMMRegister, 1);

    // Duplicate to create two copies of the byte index
    __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
    __ psllw($dst$$XMMRegister, 8);
    __ por($dst$$XMMRegister, $vtmp$$XMMRegister);

    // Add one to every second byte to get the alternate byte index
    __ movdqu($vtmp$$XMMRegister, ExternalAddress(vector_short_shufflemask()), $scratch$$Register);
    __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

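// Worked example (illustration only) of the expansion above: a short shuffle
// index i becomes the byte pair {2*i, 2*i+1}, so a byte shuffle moves both
// bytes of the selected short. E.g. short indices {3, 0, ...} yield byte
// indices {6, 7, 0, 1, ...}:
//
//   for (int i = 0; i < short_lanes; i++) {
//     byte_shuffle[2*i]   = (jbyte)(2 * short_shuffle[i]);
//     byte_shuffle[2*i+1] = (jbyte)(2 * short_shuffle[i] + 1);
//   }
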
instruct rearrangeS(vec dst, vec shuffle) %{
  predicate(vector_element_basic_type(n) == T_SHORT &&
            vector_length(n) <= 8 && !VM_Version::supports_avx512bw());
  match(Set dst (VectorRearrange dst shuffle));
  format %{ "vector_rearrange $dst, $shuffle, $dst" %}
  ins_encode %{
    assert(UseSSE >= 4, "required");
    __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct loadShuffleS_evex(vec dst, vec src) %{
  predicate(vector_element_basic_type(n) == T_SHORT &&
            VM_Version::supports_avx512bw());
  match(Set dst (VectorLoadShuffle src));
  format %{ "vector_load_shuffle $dst, $src" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    if (!VM_Version::supports_avx512vl()) {
      vlen_enc = Assembler::AVX_512bit;
    }
    __ vpmovzxbw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct rearrangeS_evex(vec dst, vec src, vec shuffle) %{
  predicate(vector_element_basic_type(n) == T_SHORT &&
            VM_Version::supports_avx512bw());
  match(Set dst (VectorRearrange src shuffle));
  format %{ "vector_rearrange $dst, $shuffle, $src" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    if (!VM_Version::supports_avx512vl()) {
      vlen_enc = Assembler::AVX_512bit;
    }
    __ vpermw($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// LoadShuffle/Rearrange for Integer and Float

instruct loadShuffleI(vec dst, vec src, vec vtmp, rRegP scratch) %{
  predicate((vector_element_basic_type(n) == T_INT || vector_element_basic_type(n) == T_FLOAT) &&
            vector_length(n) == 4 && UseAVX < 2);
  match(Set dst (VectorLoadShuffle src));
  effect(TEMP dst, TEMP vtmp, TEMP scratch);
  format %{ "vector_load_shuffle $dst, $src\t! using $vtmp and $scratch as TEMP" %}
  ins_encode %{
    assert(UseSSE >= 4, "required");

    // Create a byte shuffle mask from the int shuffle mask;
    // only a byte shuffle instruction is available on these platforms.

    // Duplicate each shuffle index and multiply it by 4
    __ pmovzxbd($vtmp$$XMMRegister, $src$$XMMRegister);
    __ pshuflw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
    __ pshufhw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
    __ psllw($vtmp$$XMMRegister, 2);

    // Duplicate again to create four copies of the byte index
    __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
    __ psllw($dst$$XMMRegister, 8);
    __ por($vtmp$$XMMRegister, $dst$$XMMRegister);

    // Add 0,1,2,3 to consecutive bytes to get the alternate byte indices
    __ movdqu($dst$$XMMRegister, ExternalAddress(vector_int_shufflemask()), $scratch$$Register);
    __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

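// For illustration only: an int shuffle index i is expanded into the four
// byte indices {4*i, 4*i+1, 4*i+2, 4*i+3}, so a byte shuffle moves all four
// bytes of the selected int:
//
//   for (int i = 0; i < int_lanes; i++) {
//     for (int j = 0; j < 4; j++) {
//       byte_shuffle[4*i + j] = (jbyte)(4 * int_shuffle[i] + j);
//     }
//   }
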
instruct rearrangeI(vec dst, vec shuffle) %{
  predicate((vector_element_basic_type(n) == T_INT || vector_element_basic_type(n) == T_FLOAT) &&
            vector_length(n) == 4 && UseAVX < 2);
  match(Set dst (VectorRearrange dst shuffle));
  format %{ "vector_rearrange $dst, $shuffle, $dst" %}
  ins_encode %{
    assert(UseSSE >= 4, "required");
    __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct loadShuffleI_avx(vec dst, vec src) %{
  predicate((vector_element_basic_type(n) == T_INT || vector_element_basic_type(n) == T_FLOAT) &&
            UseAVX >= 2);
  match(Set dst (VectorLoadShuffle src));
  format %{ "vector_load_shuffle $dst, $src" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vpmovzxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct rearrangeI_avx(vec dst, vec src, vec shuffle) %{
  predicate((vector_element_basic_type(n) == T_INT || vector_element_basic_type(n) == T_FLOAT) &&
            UseAVX >= 2);
  match(Set dst (VectorRearrange src shuffle));
  format %{ "vector_rearrange $dst, $shuffle, $src" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    if (vlen_enc == Assembler::AVX_128bit) {
      vlen_enc = Assembler::AVX_256bit;
    }
    __ vpermd($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// LoadShuffle/Rearrange for Long and Double

instruct loadShuffleL(vec dst, vec src, vec vtmp, rRegP scratch) %{
  predicate(is_double_word_type(vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
            vector_length(n) < 8 && !VM_Version::supports_avx512vl());
  match(Set dst (VectorLoadShuffle src));
  effect(TEMP dst, TEMP vtmp, TEMP scratch);
  format %{ "vector_load_shuffle $dst, $src\t! using $vtmp and $scratch as TEMP" %}
  ins_encode %{
    assert(UseAVX >= 2, "required");

    int vlen_enc = vector_length_encoding(this);
    // Create a double word shuffle mask from the long shuffle mask;
    // only a double word shuffle instruction is available on these platforms.

    // Multiply each shuffle index by two to get a double word index
    __ vpmovzxbq($vtmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
    __ vpsllq($vtmp$$XMMRegister, $vtmp$$XMMRegister, 1, vlen_enc);

    // Duplicate each double word shuffle index
    __ vpsllq($dst$$XMMRegister, $vtmp$$XMMRegister, 32, vlen_enc);
    __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);

    // Add one to the upper double word of each pair to get the alternate index
    __ vpaddd($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_long_shufflemask()), vlen_enc, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

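// For illustration only: a long shuffle index i is expanded into the double
// word index pair {2*i, 2*i+1}, which vpermd then consumes to move both
// halves of the selected long:
//
//   for (int i = 0; i < long_lanes; i++) {
//     dword_shuffle[2*i]   = (jint)(2 * long_shuffle[i]);
//     dword_shuffle[2*i+1] = (jint)(2 * long_shuffle[i] + 1);
//   }
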
instruct rearrangeL(vec dst, vec src, vec shuffle) %{
  predicate(is_double_word_type(vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
            vector_length(n) < 8 && !VM_Version::supports_avx512vl());
  match(Set dst (VectorRearrange src shuffle));
  format %{ "vector_rearrange $dst, $shuffle, $src" %}
  ins_encode %{
    assert(UseAVX >= 2, "required");

    int vlen_enc = vector_length_encoding(this);
    __ vpermd($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct loadShuffleL_evex(vec dst, vec src) %{
  predicate(is_double_word_type(vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
            (vector_length(n) == 8 || VM_Version::supports_avx512vl()));
  match(Set dst (VectorLoadShuffle src));
  format %{ "vector_load_shuffle $dst, $src" %}
  ins_encode %{
    assert(UseAVX > 2, "required");

    int vlen_enc = vector_length_encoding(this);
    __ vpmovzxbq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct rearrangeL_evex(vec dst, vec src, vec shuffle) %{
  predicate(is_double_word_type(vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
            (vector_length(n) == 8 || VM_Version::supports_avx512vl()));
  match(Set dst (VectorRearrange src shuffle));
  format %{ "vector_rearrange $dst, $shuffle, $src" %}
  ins_encode %{
    assert(UseAVX > 2, "required");

    int vlen_enc = vector_length_encoding(this);
    if (vlen_enc == Assembler::AVX_128bit) {
      vlen_enc = Assembler::AVX_256bit;
    }
    __ vpermq($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// --------------------------------- FMA --------------------------------------
// a * b + c

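// Lane-wise semantics of the FMA rules below, for illustration only; the
// hardware performs the multiply-add with a single rounding step, matching
// C's fma():
//
//   for (int i = 0; i < vlen; i++) {
//     c[i] = fma(a[i], b[i], c[i]);   // c[i] = a[i] * b[i] + c[i]
//   }
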
instruct vfmaF_reg(vec a, vec b, vec c) %{
  match(Set c (FmaVF c (Binary a b)));
  format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
  ins_cost(150);
  ins_encode %{
    assert(UseFMA, "not enabled");
    int vlen_enc = vector_length_encoding(this);
    __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vfmaF_mem(vec a, memory b, vec c) %{
  match(Set c (FmaVF c (Binary a (LoadVector b))));
  format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
  ins_cost(150);
  ins_encode %{
    assert(UseFMA, "not enabled");
    int vlen_enc = vector_length_encoding(this);
    __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vfmaD_reg(vec a, vec b, vec c) %{
  match(Set c (FmaVD c (Binary a b)));
  format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
  ins_cost(150);
  ins_encode %{
    assert(UseFMA, "not enabled");
    int vlen_enc = vector_length_encoding(this);
    __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vfmaD_mem(vec a, memory b, vec c) %{
  match(Set c (FmaVD c (Binary a (LoadVector b))));
  format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
  ins_cost(150);
  ins_encode %{
    assert(UseFMA, "not enabled");
    int vlen_enc = vector_length_encoding(this);
    __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// --------------------------------- Vector Multiply Add --------------------------------------

instruct vmuladdS2I_reg_sse(vec dst, vec src1) %{
  predicate(UseAVX == 0);
  match(Set dst (MulAddVS2VI dst src1));
  format %{ "pmaddwd $dst,$src1\t! muladd packedStoI" %}
  ins_encode %{
    __ pmaddwd($dst$$XMMRegister, $src1$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vmuladdS2I_reg_avx(vec dst, vec src1, vec src2) %{
  predicate(UseAVX > 0);
  match(Set dst (MulAddVS2VI src1 src2));
  format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packedStoI" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

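// For illustration only: pmaddwd/vpmaddwd multiplies adjacent pairs of 16-bit
// elements and adds each pair into one 32-bit result lane:
//
//   for (int i = 0; i < int_lanes; i++) {
//     dst_int[i] = (jint)src1_short[2*i]   * src2_short[2*i] +
//                  (jint)src1_short[2*i+1] * src2_short[2*i+1];
//   }
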
// --------------------------------- Vector Multiply Add Add ----------------------------------

instruct vmuladdaddS2I_reg(vec dst, vec src1, vec src2) %{
  predicate(VM_Version::supports_avx512_vnni());
  match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
  format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packedStoI" %}
  ins_encode %{
    assert(UseAVX > 2, "required");
    int vlen_enc = vector_length_encoding(this);
    __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
  ins_cost(10);
%}

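// For illustration only: the AVX-512 VNNI evpdpwssd instruction fuses the
// multiply-add above with the accumulation into $dst, i.e. it replaces a
// vpmaddwd + vpaddd pair:
//
//   for (int i = 0; i < int_lanes; i++) {
//     dst_int[i] += (jint)src1_short[2*i]   * src2_short[2*i] +
//                   (jint)src1_short[2*i+1] * src2_short[2*i+1];
//   }
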
// --------------------------------- PopCount --------------------------------------

instruct vpopcountI(vec dst, vec src) %{
  match(Set dst (PopCountVI src));
  format %{ "vpopcntd $dst,$src\t! vector popcount packedI" %}
  ins_encode %{
    assert(UsePopCountInstruction, "not enabled");

    int vlen_enc = vector_length_encoding(this);
    __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

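// For illustration only: vpopcntd counts the set bits in each 32-bit lane
// independently:
//
//   for (int i = 0; i < int_lanes; i++) {
//     dst_int[i] = population_count(src_int[i]);   // e.g. 0x0000000F -> 4
//   }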