XMM3, XMM3b, XMM3c, XMM3d, XMM3e, XMM3f, XMM3g, XMM3h, XMM3i, XMM3j, XMM3k, XMM3l, XMM3m, XMM3n, XMM3o, XMM3p,
XMM4, XMM4b, XMM4c, XMM4d, XMM4e, XMM4f, XMM4g, XMM4h, XMM4i, XMM4j, XMM4k, XMM4l, XMM4m, XMM4n, XMM4o, XMM4p,
XMM5, XMM5b, XMM5c, XMM5d, XMM5e, XMM5f, XMM5g, XMM5h, XMM5i, XMM5j, XMM5k, XMM5l, XMM5m, XMM5n, XMM5o, XMM5p,
XMM6, XMM6b, XMM6c, XMM6d, XMM6e, XMM6f, XMM6g, XMM6h, XMM6i, XMM6j, XMM6k, XMM6l, XMM6m, XMM6n, XMM6o, XMM6p,
XMM7, XMM7b, XMM7c, XMM7d, XMM7e, XMM7f, XMM7g, XMM7h, XMM7i, XMM7j, XMM7k, XMM7l, XMM7m, XMM7n, XMM7o, XMM7p
#ifdef _LP64
,XMM8, XMM8b, XMM8c, XMM8d, XMM8e, XMM8f, XMM8g, XMM8h, XMM8i, XMM8j, XMM8k, XMM8l, XMM8m, XMM8n, XMM8o, XMM8p,
XMM9, XMM9b, XMM9c, XMM9d, XMM9e, XMM9f, XMM9g, XMM9h, XMM9i, XMM9j, XMM9k, XMM9l, XMM9m, XMM9n, XMM9o, XMM9p,
XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
#endif
);

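// Dynamically select the vector register class: use the EVEX class (xmm0-31)
// when the CPU supports EVEX encoding, otherwise fall back to the legacy
// class (xmm0-15). The _vl variant additionally requires AVX512VL so that
// EVEX encoding can also be used for vectors shorter than 512 bits.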
reg_class_dynamic vectorz_reg(vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectorz_reg_vl(vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );

%}


//----------SOURCE BLOCK-------------------------------------------------------
// This is a block of C++ code which provides values, functions, and
// definitions necessary in the rest of the architecture description

source_hpp %{
// Header information of the source block.
// Method declarations/definitions which are used outside
// the ad-scope can conveniently be defined here.
//
// To keep related declarations/definitions/uses close together,
// we switch between source %{ }% and source_hpp %{ }% freely as needed.

class NativeJump;

class CallStubImpl {

  //--------------------------------------------------------------
    return NativeJump::instruction_size;
  }

#ifdef _LP64
  static uint size_deopt_handler() {
    // three 5 byte instructions plus one move for unreachable address.
    return 15+3;
  }
#else
  static uint size_deopt_handler() {
    // NativeCall instruction size is the same as NativeJump.
    // exception handler starts out as jump and can be patched to
    // a call by deoptimization. (4932387)
    // Note that this value is also credited (in output.cpp) to
    // the size of the code section.
    return 5 + NativeJump::instruction_size; // pushl(); jmp;
  }
#endif
};

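// Platform-dependent node flags. Flag_intel_jcc_erratum marks mach nodes
// whose jump/fused-jump encoding may be affected by the Intel JCC erratum
// and may therefore need extra alignment padding.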
class Node::PD {
public:
  enum NodeFlags {
    Flag_intel_jcc_erratum = Node::_last_flag << 1,
    _last_flag             = Flag_intel_jcc_erratum
  };
};

%} // end source_hpp

source %{

#include "opto/addnode.hpp"
#include "c2_intelJccErratum_x86.hpp"

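// Tag mach nodes affected by the Intel JCC erratum and reserve the extra
// code buffer space needed for the alignment padding they may require.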
void PhaseOutput::pd_perform_mach_node_analysis() {
  if (VM_Version::has_intel_jcc_erratum()) {
    int extra_padding = IntelJccErratum::tag_affected_machnodes(C, C->cfg(), C->regalloc());
    _buf_sizes._code += extra_padding;
  }
  Label next;
  // push a "the_pc" on the stack without destroying any registers
  // as they all may be live.

  // push address of "next"
  __ call(next, relocInfo::none); // reloc none is fine since it is a disp32
  __ bind(next);
  // adjust it so it matches "the_pc"
  __ subptr(Address(rsp, 0), __ offset() - offset);
#else
  InternalAddress here(__ pc());
  __ pushptr(here.addr());
#endif

  __ jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
  assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow %d", (__ offset() - offset));
  __ end_a_stub();
  return offset;
}


//=============================================================================

// Float masks come from different places depending on platform.
#ifdef _LP64
static address float_signmask()  { return StubRoutines::x86::float_sign_mask(); }
static address float_signflip()  { return StubRoutines::x86::float_sign_flip(); }
static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
#else
static address float_signmask()  { return (address)float_signmask_pool; }
static address float_signflip()  { return (address)float_signflip_pool; }
static address double_signmask() { return (address)double_signmask_pool; }
static address double_signflip() { return (address)double_signflip_pool; }
#endif
static address vector_short_to_byte_mask() { return StubRoutines::x86::vector_short_to_byte_mask(); }
static address vector_byte_perm_mask() { return StubRoutines::x86::vector_byte_perm_mask(); }
static address vector_long_sign_mask() { return StubRoutines::x86::vector_long_sign_mask(); }

//=============================================================================
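// Reject an ideal opcode when a match rule exists but the current CPU lacks
// the ISA features its implementation relies on.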
const bool Matcher::match_rule_supported(int opcode) {
  if (!has_match_rule(opcode)) {
    return false; // no match rule present
  }
  switch (opcode) {
    case Op_AbsVL:
      if (UseAVX < 3) {
        return false;
      }
      break;
    case Op_PopCountI:
    case Op_PopCountL:
      if (!UsePopCountInstruction) {
        return false;
      }
      break;
    case Op_PopCountVI:
      if (!UsePopCountInstruction || !VM_Version::supports_avx512_vpopcntdq()) {
        return false;
      }
      break;
    case Op_MulVI:
      if ((UseSSE < 4) && (UseAVX < 1)) { // only with SSE4_1 or AVX
        return false;
      }
      break;
    case Op_MulVL:
    case Op_MulReductionVL:
      if (VM_Version::supports_avx512dq() == false) {
        return false;
      }
      break;
    case Op_AbsVB:
    case Op_AbsVS:
    case Op_AbsVI:
    case Op_AddReductionVI:
    case Op_AndReductionV:
    case Op_OrReductionV:
    case Op_XorReductionV:
      if (UseSSE < 3) { // requires at least SSSE3
        return false;
      }
      break;
    case Op_MulReductionVI:
      if (UseSSE < 4) { // requires at least SSE4
        return false;
      }
      break;
    case Op_SqrtVD:
    case Op_SqrtVF:
      if (UseAVX < 1) { // enabled for AVX only
        return false;
      }
      break;
    case Op_CompareAndSwapL:
#ifdef _LP64
    case Op_CompareAndSwapP:
#endif
      if (!VM_Version::supports_cx8()) {
        return false;
      }
      break;
    case Op_CMoveVF:
    case Op_CMoveVD:
      if (UseAVX < 1 || UseAVX > 2) {
        return false;
      }
      break;
    case Op_StrIndexOf:
      if (!UseSSE42Intrinsics) {
        return false;
      }
      break;
    case Op_StrIndexOfChar:
      if (!UseSSE42Intrinsics) {
        return false;
      }
      break;
    case Op_OnSpinWait:
      if (VM_Version::supports_on_spin_wait() == false) {
        return false;
      }
      break;
    case Op_MulVB:
    case Op_LShiftVB:
    case Op_RShiftVB:
    case Op_URShiftVB:
      if (UseSSE < 4) {
        return false;
      }
      break;
#ifdef _LP64
    case Op_MaxD:
    case Op_MaxF:
    case Op_MinD:
    case Op_MinF:
      if (UseAVX < 1) { // enabled for AVX only
        return false;
      }
      break;
#endif
    case Op_CacheWB:
    case Op_CacheWBPreSync:
    case Op_CacheWBPostSync:
      if (!VM_Version::supports_data_cache_line_flush()) {
        return false;
      }
      break;
    case Op_RoundDoubleMode:
      if (UseSSE < 4) {
        return false;
      }
      break;
    case Op_RoundDoubleModeV:
      if (VM_Version::supports_avx() == false) {
        return false; // 128bit vroundpd is not available
      }
      break;
    case Op_MacroLogicV:
      if (UseAVX < 3 || !UseVectorMacroLogic) {
        return false;
      }
      break;
#ifndef _LP64
    case Op_AddReductionVF:
    case Op_AddReductionVD:
    case Op_MulReductionVF:
    case Op_MulReductionVD:
      if (UseSSE < 1) { // requires at least SSE
        return false;
      }
      break;
    case Op_MulAddVS2VI:
    case Op_RShiftVL:
    case Op_AbsVD:
    case Op_NegVD:
      if (UseSSE < 2) {
        return false;
  //   * AVX512BW supports 512bit vectors for BYTE, SHORT, and CHAR types.
  // There's also a limit on minimum vector size supported: 2 elements (or 4 bytes for BYTE).
  // And MaxVectorSize is taken into account as well.
  if (!vector_size_supported(bt, vlen)) {
    return false;
  }
  // Special cases which require vector length follow:
  //   * implementation limitations
  //   * some 512bit vector operations on FLOAT and DOUBLE types require AVX512DQ
  //   * 128bit vroundpd instruction is present only in AVX1
  int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
  switch (opcode) {
    case Op_AbsVF:
    case Op_NegVF:
      if ((vlen == 16) && (VM_Version::supports_avx512dq() == false)) {
        return false; // 512bit vandps and vxorps are not available
      }
      break;
    case Op_AbsVD:
    case Op_NegVD:
      if ((vlen == 8) && (VM_Version::supports_avx512dq() == false)) {
        return false; // 512bit vandpd and vxorpd are not available
      }
      break;
    case Op_CMoveVF:
      if (vlen != 8) {
        return false; // implementation limitation (only vcmov8F_reg is present)
      }
      break;
    case Op_MacroLogicV:
      if (!VM_Version::supports_evex() ||
          ((size_in_bits != 512) && !VM_Version::supports_avx512vl())) {
        return false;
      }
      break;
    case Op_CMoveVD:
      if (vlen != 4) {
        return false; // implementation limitation (only vcmov4D_reg is present)
      }
      break;
  }
  return true; // By default, match rules are supported.
}

// x86 supports generic vector operands: vec and legVec.
const bool Matcher::supports_generic_vector_operands = true;

MachOper* Matcher::pd_specialize_generic_vector_operand(MachOper* generic_opnd, uint ideal_reg, bool is_temp) {
  assert(Matcher::is_generic_vector(generic_opnd), "not generic");
  bool legacy = (generic_opnd->opcode() == LEGVEC);
  if (!VM_Version::supports_avx512vlbwdq() && // KNL
      is_temp && !legacy && (ideal_reg == Op_VecZ)) {
    // Conservatively specialize 512bit vec TEMP operands to legVecZ (zmm0-15) on KNL.
    return new legVecZOper();
  }
  if (legacy) {
    switch (ideal_reg) {
      case Op_VecS: return new legVecSOper();
      case Op_VecD: return new legVecDOper();
      case Op_VecX: return new legVecXOper();
    case MoveVec2Leg_rule:
    case MoveLeg2Vec_rule:
      return true;
    default:
      return false;
  }
}

bool Matcher::is_generic_vector(MachOper* opnd) {
  switch (opnd->opcode()) {
    case VEC:
    case LEGVEC:
      return true;
    default:
      return false;
  }
}

//------------------------------------------------------------------------

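// Masked (predicated) vector operations are supported only on AVX-512
// hardware that also provides the VL extension.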
const bool Matcher::has_predicated_vectors(void) {
  bool ret_value = false;
  if (UseAVX > 2) {
    ret_value = VM_Version::supports_avx512vl();
  }

  return ret_value;
}

const int Matcher::float_pressure(int default_pressure_threshold) {
  int float_pressure_threshold = default_pressure_threshold;
#ifdef _LP64
  if (UseAVX > 2) {
    // Increase pressure threshold on machines with AVX3 which have
    // 2x more XMM registers.
    float_pressure_threshold = default_pressure_threshold * 2;
  }
#endif
  return float_pressure_threshold;
}
    } else {
      mstack.push(adr, Pre_Visit);
    }

    // Clone X+offset as it also folds into most addressing expressions
    mstack.push(off, Visit);
    mstack.push(m->in(AddPNode::Base), Pre_Visit);
    return true;
  } else if (clone_shift(off, this, mstack, address_visited)) {
    address_visited.test_set(m->_idx); // Flag as address_visited
    mstack.push(m->in(AddPNode::Address), Pre_Visit);
    mstack.push(m->in(AddPNode::Base), Pre_Visit);
    return true;
  }
  return false;
}

void Compile::reshape_address(AddPNode* addp) {
}

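// Helpers that read a mach node's vector length (in elements or in bytes)
// from its vector type, either for the node itself or for one of its
// operands, and map a byte length to the AVX encoding used by the assembler.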
static inline uint vector_length(const MachNode* n) {
  const TypeVect* vt = n->bottom_type()->is_vect();
  return vt->length();
}

static inline uint vector_length(const MachNode* use, MachOper* opnd) {
  uint def_idx = use->operand_index(opnd);
  Node* def = use->in(def_idx);
  return def->bottom_type()->is_vect()->length();
}

static inline uint vector_length_in_bytes(const MachNode* n) {
  const TypeVect* vt = n->bottom_type()->is_vect();
  return vt->length_in_bytes();
}

static inline uint vector_length_in_bytes(const MachNode* use, MachOper* opnd) {
  uint def_idx = use->operand_index(opnd);
  Node* def = use->in(def_idx);
  return def->bottom_type()->is_vect()->length_in_bytes();
}

static inline Assembler::AvxVectorLen vector_length_encoding(const MachNode* n) {
  switch(vector_length_in_bytes(n)) {
    case  4: // fall-through
    case  8: // fall-through
    case 16: return Assembler::AVX_128bit;
    case 32: return Assembler::AVX_256bit;
    case 64: return Assembler::AVX_512bit;

    default: {
      ShouldNotReachHere();
      return Assembler::AVX_NoVec;
    }
  }
}

// Helper methods for MachSpillCopyNode::implementation().
static int vec_mov_helper(CodeBuffer *cbuf, bool do_size, int src_lo, int dst_lo,
                          int src_hi, int dst_hi, uint ireg, outputStream* st) {
  // In the 64-bit VM the size calculation is very complex, so the size is
  // obtained by emitting the instructions into a scratch buffer instead.
  LP64_ONLY( assert(!do_size, "this method calculates size only for 32-bit VM"); )
  assert(ireg == Op_VecS || // 32bit vector
         (src_lo & 1) == 0 && (src_lo + 1) == src_hi &&
         (dst_lo & 1) == 0 && (dst_lo + 1) == dst_hi,
         "no non-adjacent vector moves" );
  if (cbuf) {
    C2_MacroAssembler _masm(cbuf);
    int offset = __ offset();
    switch (ireg) {
      case Op_VecS: // copy whole register
      case Op_VecD:
      case Op_VecX:
%}

encode %{

  enc_class call_epilog %{
    if (VerifyStackAtCalls) {
      // Check that stack depth is unchanged: find majik cookie on stack
      int framesize = ra_->reg2offset_unchecked(OptoReg::add(ra_->_matcher._old_SP, -3*VMRegImpl::slots_per_word));
      C2_MacroAssembler _masm(&cbuf);
      Label L;
      __ cmpptr(Address(rsp, framesize), (int32_t)0xbadb100d);
      __ jccb(Assembler::equal, L);
      // Die if stack mismatch
      __ int3();
      __ bind(L);
    }
  %}

%}


//----------OPERANDS-----------------------------------------------------------
// Operand definitions must precede instruction definitions for correct parsing
// in the ADLC because operands constitute user defined types which are used in
// instruction definitions.

// Vectors

// Dummy generic vector class. Should be used for all vector operands.
// Replaced with vec[SDXYZ] during post-selection pass.
operand vec() %{
  constraint(ALLOC_IN_RC(dynamic));
  match(VecX);
  match(VecY);
  match(VecZ);
  match(VecS);
  match(VecD);

  format %{ %}
  interface(REG_INTER);
  ins_pipe(pipe_slow);
%}

instruct absF_reg(regF dst) %{
  predicate((UseSSE>=1) && (UseAVX == 0));
  match(Set dst (AbsF dst));
  ins_cost(150);
  format %{ "andps $dst, [0x7fffffff]\t# abs float by sign masking" %}
  ins_encode %{
    __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
  %}
  ins_pipe(pipe_slow);
%}

instruct absF_reg_reg(vlRegF dst, vlRegF src) %{
  predicate(UseAVX > 0);
  match(Set dst (AbsF src));
  ins_cost(150);
  format %{ "vandps $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
  ins_encode %{
    int vector_len = 0;
    __ vandps($dst$$XMMRegister, $src$$XMMRegister,
              ExternalAddress(float_signmask()), vector_len);
  %}
  ins_pipe(pipe_slow);
%}

instruct absD_reg(regD dst) %{
  predicate((UseSSE>=2) && (UseAVX == 0));
  match(Set dst (AbsD dst));
  ins_cost(150);
  format %{ "andpd $dst, [0x7fffffffffffffff]\t"
            "# abs double by sign masking" %}
  ins_encode %{
    __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
  %}
  ins_pipe(pipe_slow);
%}

instruct absD_reg_reg(vlRegD dst, vlRegD src) %{
  predicate(UseAVX > 0);
  match(Set dst (AbsD src));
  ins_cost(150);
  format %{ "vandpd $dst, $src, [0x7fffffffffffffff]\t"
            "# abs double by sign masking" %}
  ins_encode %{
    int vector_len = 0;
    __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
              ExternalAddress(double_signmask()), vector_len);
  %}
  ins_pipe(pipe_slow);
%}

instruct negF_reg(regF dst) %{
  predicate((UseSSE>=1) && (UseAVX == 0));
  match(Set dst (NegF dst));
  ins_cost(150);
  format %{ "xorps $dst, [0x80000000]\t# neg float by sign flipping" %}
  ins_encode %{
    __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
  %}
  ins_pipe(pipe_slow);
%}

instruct negF_reg_reg(vlRegF dst, vlRegF src) %{
  predicate(UseAVX > 0);
  match(Set dst (NegF src));
  ins_cost(150);
  format %{ "vnegatess $dst, $src, [0x80000000]\t# neg float by sign flipping" %}

  format %{ "sqrtsd $dst, $src" %}
  ins_cost(150);
  ins_encode %{
    __ sqrtsd($dst$$XMMRegister, $src$$Address);
  %}
  ins_pipe(pipe_slow);
%}

instruct sqrtD_imm(regD dst, immD con) %{
  predicate(UseSSE>=2);
  match(Set dst (SqrtD con));
  format %{ "sqrtsd $dst, [$constantaddress]\t# load from constant table: double=$con" %}
  ins_cost(150);
  ins_encode %{
    __ sqrtsd($dst$$XMMRegister, $constantaddress($con));
  %}
  ins_pipe(pipe_slow);
%}


#ifdef _LP64
instruct roundD_reg(legRegD dst, legRegD src, immU8 rmode) %{
  match(Set dst (RoundDoubleMode src rmode));
  format %{ "roundsd $dst,$src" %}
  ins_cost(150);
  ins_encode %{
    assert(UseSSE >= 4, "required");
    __ roundsd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant);
  %}
  ins_pipe(pipe_slow);
%}

instruct roundD_mem(legRegD dst, memory src, immU8 rmode) %{
  match(Set dst (RoundDoubleMode (LoadD src) rmode));
  format %{ "roundsd $dst,$src" %}
  ins_cost(150);
  ins_encode %{
    assert(UseSSE >= 4, "required");
    __ roundsd($dst$$XMMRegister, $src$$Address, $rmode$$constant);
  %}
  ins_pipe(pipe_slow);
%}

instruct roundD_imm(legRegD dst, immD con, immU8 rmode, rRegI scratch_reg) %{
  match(Set dst (RoundDoubleMode con rmode));
  effect(TEMP scratch_reg);
  format %{ "roundsd $dst,[$constantaddress]\t# load from constant table: double=$con" %}
  ins_cost(150);
  ins_encode %{
    assert(UseSSE >= 4, "required");
    __ roundsd($dst$$XMMRegister, $constantaddress($con), $rmode$$constant, $scratch_reg$$Register);
  %}
  ins_pipe(pipe_slow);
%}

instruct vroundD_reg(legVec dst, legVec src, immU8 rmode) %{
  predicate(n->as_Vector()->length() < 8);
  match(Set dst (RoundDoubleModeV src rmode));
  format %{ "vroundpd $dst,$src,$rmode\t! round packedD" %}
  ins_encode %{
    assert(UseAVX > 0, "required");
    int vector_len = vector_length_encoding(this);
    __ vroundpd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vround8D_reg(vec dst, vec src, immU8 rmode) %{
  predicate(n->as_Vector()->length() == 8);
  match(Set dst (RoundDoubleModeV src rmode));
  format %{ "vrndscalepd $dst,$src,$rmode\t! round packed8D" %}
  ins_encode %{
    assert(UseAVX > 2, "required");
    __ vrndscalepd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, Assembler::AVX_512bit);
  %}
  ins_pipe( pipe_slow );
%}

instruct vroundD_mem(legVec dst, memory mem, immU8 rmode) %{
  predicate(n->as_Vector()->length() < 8);
  match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
  format %{ "vroundpd $dst, $mem, $rmode\t! round packedD" %}
  ins_encode %{
    assert(UseAVX > 0, "required");
    int vector_len = vector_length_encoding(this);
    __ vroundpd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vround8D_mem(vec dst, memory mem, immU8 rmode) %{
  predicate(n->as_Vector()->length() == 8);
  match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
  format %{ "vrndscalepd $dst,$mem,$rmode\t! round packed8D" %}
  ins_encode %{
    assert(UseAVX > 2, "required");
    __ vrndscalepd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, Assembler::AVX_512bit);
  %}
  ins_pipe( pipe_slow );
%}
#endif // _LP64

instruct onspinwait() %{
  match(OnSpinWait);
  ins_cost(200);

  format %{
    $$template
    $$emit$$"pause\t! membar_onspinwait"
  %}
  ins_encode %{
    __ pause();
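// Pseudo instructions for moves between the generic (vec) and legacy (legVec)
// vector operand classes. They are recognized as trivial register-to-register
// moves (see the MoveVec2Leg_rule/MoveLeg2Vec_rule cases above) and are never
// actually emitted, hence ShouldNotReachHere().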
instruct MoveVec2Leg(legVec dst, vec src) %{
  match(Set dst src);
  format %{ "" %}
  ins_encode %{
    ShouldNotReachHere();
  %}
  ins_pipe( fpu_reg_reg );
%}

instruct MoveLeg2Vec(vec dst, legVec src) %{
  match(Set dst src);
  format %{ "" %}
  ins_encode %{
    ShouldNotReachHere();
  %}
  ins_pipe( fpu_reg_reg );
%}

// ============================================================================

// Load vectors
instruct loadV(vec dst, memory mem) %{
  match(Set dst (LoadVector mem));
  ins_cost(125);
  format %{ "load_vector $dst,$mem" %}
  ins_encode %{
    switch (vector_length_in_bytes(this)) {
      case  4: __ movdl    ($dst$$XMMRegister, $mem$$Address); break;
      case  8: __ movq     ($dst$$XMMRegister, $mem$$Address); break;
      case 16: __ movdqu   ($dst$$XMMRegister, $mem$$Address); break;
      case 32: __ vmovdqu  ($dst$$XMMRegister, $mem$$Address); break;
      case 64: __ evmovdqul($dst$$XMMRegister, $mem$$Address, Assembler::AVX_512bit); break;
      default: ShouldNotReachHere();
    }
  %}
  ins_pipe( pipe_slow );
%}

// Store vectors generic operand pattern.
instruct storeV(memory mem, vec src) %{
  match(Set mem (StoreVector mem src));
  ins_cost(145);
  format %{ "store_vector $mem,$src\n\t" %}
  ins_encode %{
    switch (vector_length_in_bytes(this, $src)) {
      case  4: __ movdl    ($mem$$Address, $src$$XMMRegister); break;
      case  8: __ movq     ($mem$$Address, $src$$XMMRegister); break;
      case 16: __ movdqu   ($mem$$Address, $src$$XMMRegister); break;
      case 32: __ vmovdqu  ($mem$$Address, $src$$XMMRegister); break;
      case 64: __ evmovdqul($mem$$Address, $src$$XMMRegister, Assembler::AVX_512bit); break;
      default: ShouldNotReachHere();
    }
  %}
  ins_pipe( pipe_slow );
%}

// ====================REPLICATE=======================================

// Replicate byte scalar to be vector
instruct ReplB_reg(vec dst, rRegI src) %{
  match(Set dst (ReplicateB src));
  format %{ "replicateB $dst,$src" %}
  ins_encode %{
    uint vlen = vector_length(this);
    if (vlen == 64 || VM_Version::supports_avx512vlbw()) { // AVX512VL for <512bit operands
      assert(VM_Version::supports_avx512bw(), "required"); // 512-bit byte vectors assume AVX512BW
      int vlen_enc = vector_length_encoding(this);
      __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vlen_enc);
    } else {
      __ movdl($dst$$XMMRegister, $src$$Register);
      __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
      __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
      if (vlen >= 16) {
        __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
        if (vlen >= 32) {
          assert(vlen == 32, "sanity");
          __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
        }
      }
    }
  %}
  ins_pipe( pipe_slow );
%}

instruct ReplB_mem(vec dst, memory mem) %{
  predicate(VM_Version::supports_avx2());
  match(Set dst (ReplicateB (LoadB mem)));
  format %{ "replicateB $dst,$mem" %}
  ins_encode %{
    int vector_len = vector_length_encoding(this);
    __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct ReplB_imm(vec dst, immI con) %{
  match(Set dst (ReplicateB con));
  format %{ "replicateB $dst,$con" %}
  ins_encode %{
    uint vlen = vector_length(this);
    InternalAddress const_addr = $constantaddress(replicate8_imm($con$$constant, 1));
    if (vlen == 4) {
      __ movdl($dst$$XMMRegister, const_addr);
    } else {
      __ movq($dst$$XMMRegister, const_addr);
      if (vlen >= 16) {
        if (VM_Version::supports_avx2()) {
          int vlen_enc = vector_length_encoding(this);
          __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
        } else {
          assert(vlen == 16, "sanity");
          __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
        }
      }
    }
  %}
  ins_pipe( pipe_slow );
%}

// Replicate byte scalar zero to be vector
instruct ReplB_zero(vec dst, immI0 zero) %{
  match(Set dst (ReplicateB zero));
  format %{ "replicateB $dst,$zero" %}
  ins_encode %{
    uint vlen = vector_length(this);
    if (vlen <= 16) {
      __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
    } else {
      // Use vpxor since AVX512F does not have 512bit vxorpd (requires AVX512DQ).
      int vlen_enc = vector_length_encoding(this);
      __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
    }
  %}
  ins_pipe( fpu_reg_reg );
%}

// ====================ReplicateS=======================================

instruct ReplS_reg(vec dst, rRegI src) %{
  match(Set dst (ReplicateS src));
  format %{ "replicateS $dst,$src" %}
    uint vlen = vector_length(this);
    InternalAddress const_addr = $constantaddress(replicate8_imm($con$$constant, 2));
    if (vlen == 2) {
      __ movdl($dst$$XMMRegister, const_addr);
    } else {
      __ movq($dst$$XMMRegister, const_addr);
      if (vlen >= 8) {
        if (VM_Version::supports_avx2()) {
          int vlen_enc = vector_length_encoding(this);
          __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
        } else {
          assert(vlen == 8, "sanity");
          __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
        }
      }
    }
  %}
  ins_pipe( fpu_reg_reg );
%}

instruct ReplS_zero(vec dst, immI0 zero) %{
  match(Set dst (ReplicateS zero));
  format %{ "replicateS $dst,$zero" %}
  ins_encode %{
    uint vlen = vector_length(this);
    if (vlen <= 8) {
      __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
    } else {
      int vlen_enc = vector_length_encoding(this);
      __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
    }
  %}
  ins_pipe( fpu_reg_reg );
%}

// ====================ReplicateI=======================================

instruct ReplI_reg(vec dst, rRegI src) %{
  match(Set dst (ReplicateI src));
  format %{ "replicateI $dst,$src" %}
  ins_encode %{
      __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
      if (vlen >= 8) {
        assert(vlen == 8, "sanity");
        __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
      }
    }
  %}
  ins_pipe( pipe_slow );
%}

instruct ReplI_mem(vec dst, memory mem) %{
  match(Set dst (ReplicateI (LoadI mem)));
  format %{ "replicateI $dst,$mem" %}
  ins_encode %{
    uint vlen = vector_length(this);
    if (vlen <= 4) {
      __ movdl($dst$$XMMRegister, $mem$$Address);
      __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
    } else {
      assert(VM_Version::supports_avx2(), "sanity");
      int vector_len = vector_length_encoding(this);
      __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
    }
  %}
  ins_pipe( pipe_slow );
%}

instruct ReplI_imm(vec dst, immI con) %{
  match(Set dst (ReplicateI con));
  format %{ "replicateI $dst,$con" %}
  ins_encode %{
    uint vlen = vector_length(this);
    InternalAddress const_addr = $constantaddress(replicate8_imm($con$$constant, 4));
    if (vlen <= 4) {
      __ movq($dst$$XMMRegister, const_addr);
      if (vlen == 4) {
        __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
      }
    } else {
      assert(VM_Version::supports_avx2(), "sanity");
      int vector_len = vector_length_encoding(this);
      __ movq($dst$$XMMRegister, const_addr);
      __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
    }
  %}
  ins_pipe( pipe_slow );
%}

// Replicate integer (4 byte) scalar zero to be vector
instruct ReplI_zero(vec dst, immI0 zero) %{
  match(Set dst (ReplicateI zero));
  format %{ "replicateI $dst,$zero" %}
  ins_encode %{
    uint vlen = vector_length(this);
    if (vlen <= 4) {
      __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
    } else {
      int vlen_enc = vector_length_encoding(this);
      __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
    }
  %}
  ins_pipe( fpu_reg_reg );
%}

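// Replicate scalar -1 (all bits set): the same all-ones vector serves for
// byte, short, and int element sizes, so one instruct matches all three.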
instruct ReplI_M1(vec dst, immI_M1 con) %{
  predicate(UseAVX > 0);
  match(Set dst (ReplicateB con));
  match(Set dst (ReplicateS con));
  match(Set dst (ReplicateI con));
  effect(TEMP dst);
  ins_encode %{
    uint vlen = vector_length(this);
    if (vlen == 2) {
      __ movdq($dst$$XMMRegister, $src$$Register);
      __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
    } else if (vlen == 8 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
      int vlen_enc = vector_length_encoding(this);
      __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vlen_enc);
    } else {
      assert(vlen == 4, "sanity");
      __ movdq($dst$$XMMRegister, $src$$Register);
      __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
      __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
    }
  %}
  ins_pipe( pipe_slow );
%}
#else // _LP64
// Replicate long (8 byte) scalar to be vector
instruct ReplL_reg(vec dst, eRegL src, vec tmp) %{
  predicate(n->as_Vector()->length() <= 4);
  match(Set dst (ReplicateL src));
  effect(TEMP dst, USE src, TEMP tmp);
  format %{ "replicateL $dst,$src" %}
  ins_encode %{
    uint vlen = vector_length(this);
    if (vlen == 2) {
      __ movdl($dst$$XMMRegister, $src$$Register);
      __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
      __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
      __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
    } else if (VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
      int vector_len = Assembler::AVX_256bit;
      __ movdl($dst$$XMMRegister, $src$$Register);
      __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
      __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
      __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
    } else {
      __ movdl($dst$$XMMRegister, $src$$Register);
      __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
      __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
      __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
      __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
    }
  %}
  ins_pipe( pipe_slow );
%}

instruct ReplL_reg_leg(legVec dst, eRegL src, legVec tmp) %{
  predicate(n->as_Vector()->length() == 8);
  match(Set dst (ReplicateL src));
  effect(TEMP dst, USE src, TEMP tmp);
  format %{ "replicateL $dst,$src" %}
  ins_encode %{
    if (VM_Version::supports_avx512vl()) {
      __ movdl($dst$$XMMRegister, $src$$Register);
      __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
      __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
      __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
      __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
      __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
    } else {
      int vector_len = Assembler::AVX_512bit;
      __ movdl($dst$$XMMRegister, $src$$Register);
      __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
      __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
      __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
    }
  %}
  ins_pipe( pipe_slow );
%}
#endif // _LP64

instruct ReplL_mem(vec dst, memory mem) %{
  match(Set dst (ReplicateL (LoadL mem)));
  format %{ "replicateL $dst,$mem" %}
  ins_encode %{
    uint vlen = vector_length(this);
    if (vlen == 2) {
      __ movq($dst$$XMMRegister, $mem$$Address);
      __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
    } else {
      assert(VM_Version::supports_avx2(), "sanity");
      int vlen_enc = vector_length_encoding(this);
      __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vlen_enc);
    }
  %}
  match(Set dst (ReplicateL con));
  effect(TEMP dst);
  format %{ "vallones $dst" %}
  ins_encode %{
    int vector_len = vector_length_encoding(this);
    __ vallones($dst$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// ====================ReplicateF=======================================

instruct ReplF_reg(vec dst, vlRegF src) %{
  match(Set dst (ReplicateF src));
  format %{ "replicateF $dst,$src" %}
  ins_encode %{
    uint vlen = vector_length(this);
    if (vlen <= 4) {
      __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
    } else if (VM_Version::supports_avx2()) {
      int vector_len = vector_length_encoding(this);
      __ vbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vector_len); // reg-to-reg variant requires AVX2
    } else {
      assert(vlen == 8, "sanity");
      __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
      __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
    }
  %}
  ins_pipe( pipe_slow );
%}

instruct ReplF_mem(vec dst, memory mem) %{
  match(Set dst (ReplicateF (LoadF mem)));
  format %{ "replicateF $dst,$mem" %}
  ins_encode %{
    uint vlen = vector_length(this);
    if (vlen <= 4) {
      __ movdl($dst$$XMMRegister, $mem$$Address);
      __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
    } else {
      assert(VM_Version::supports_avx(), "sanity");
      int vector_len = vector_length_encoding(this);
      __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vector_len);
    }
  %}
  ins_pipe( pipe_slow );
%}

instruct ReplF_zero(vec dst, immF0 zero) %{
  match(Set dst (ReplicateF zero));
  format %{ "replicateF $dst,$zero" %}
  ins_encode %{
    uint vlen = vector_length(this);
    if (vlen <= 4) {
      __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
    } else {
      int vlen_enc = vector_length_encoding(this);
      __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); // 512bit vxorps requires AVX512DQ
    }
  %}
  ins_pipe( fpu_reg_reg );
%}

// ====================ReplicateD=======================================

// Replicate double (8 bytes) scalar to be vector
instruct ReplD_reg(vec dst, vlRegD src) %{
  match(Set dst (ReplicateD src));
  format %{ "replicateD $dst,$src" %}
  ins_encode %{
    uint vlen = vector_length(this);
    if (vlen == 2) {
      __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
    } else if (VM_Version::supports_avx2()) {
      int vector_len = vector_length_encoding(this);
      __ vbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vector_len); // reg-to-reg variant requires AVX2
    } else {
      assert(vlen == 4, "sanity");
      __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
      __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
    }
  %}
  ins_pipe( pipe_slow );
%}

instruct ReplD_mem(vec dst, memory mem) %{
  match(Set dst (ReplicateD (LoadD mem)));
  format %{ "replicateD $dst,$mem" %}
  ins_encode %{
    uint vlen = vector_length(this);
    if (vlen == 2) {
      __ movq($dst$$XMMRegister, $mem$$Address);
      __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x44);
    } else {
      assert(VM_Version::supports_avx(), "sanity");
      int vector_len = vector_length_encoding(this);
      __ vbroadcastsd($dst$$XMMRegister, $mem$$Address, vector_len);
    }
  %}
  ins_pipe( pipe_slow );
%}

instruct ReplD_zero(vec dst, immD0 zero) %{
  match(Set dst (ReplicateD zero));
  format %{ "replicateD $dst,$zero" %}
  ins_encode %{
    uint vlen = vector_length(this);
    if (vlen == 2) {
      __ xorpd($dst$$XMMRegister, $dst$$XMMRegister);
    } else {
      int vlen_enc = vector_length_encoding(this);
      __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); // 512bit vxorps requires AVX512DQ
    }
  %}
  ins_pipe( fpu_reg_reg );
%}

// ====================REDUCTION ARITHMETIC=======================================
// =======================Int Reduction==========================================
instruct reductionI(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
  predicate(n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT &&
            n->in(2)->bottom_type()->is_vect()->length() < 16);
  match(Set dst (AddReductionVI src1 src2));
  match(Set dst (MulReductionVI src1 src2));
  match(Set dst (AndReductionV src1 src2));
  match(Set dst ( OrReductionV src1 src2));
  match(Set dst (XorReductionV src1 src2));
  effect(TEMP vtmp1, TEMP vtmp2);
  format %{ "vector_reduction_int $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    int vlen = vector_length(this, $src2);
    __ reduceI(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct reduction16I(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
  predicate(n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT &&
            n->in(2)->bottom_type()->is_vect()->length() == 16);
  match(Set dst (AddReductionVI src1 src2));
  match(Set dst (MulReductionVI src1 src2));
  match(Set dst (AndReductionV src1 src2));
  match(Set dst ( OrReductionV src1 src2));
  match(Set dst (XorReductionV src1 src2));
  effect(TEMP vtmp1, TEMP vtmp2);
  format %{ "vector_reduction_int $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    int vlen = vector_length(this, $src2);
    __ reduceI(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// =======================Long Reduction==========================================

#ifdef _LP64
instruct reductionL(rRegL dst, rRegL src1, vec src2, vec vtmp1, vec vtmp2) %{
  predicate(n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG &&
            n->in(2)->bottom_type()->is_vect()->length() < 8);
  match(Set dst (AddReductionVL src1 src2));
  match(Set dst (MulReductionVL src1 src2));
  match(Set dst (AndReductionV src1 src2));
  match(Set dst ( OrReductionV src1 src2));
  match(Set dst (XorReductionV src1 src2));
  effect(TEMP vtmp1, TEMP vtmp2);
  format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    int vlen = vector_length(this, $src2);
    __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct reduction8L(rRegL dst, rRegL src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
  predicate(n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG &&
            n->in(2)->bottom_type()->is_vect()->length() == 8);
  match(Set dst (AddReductionVL src1 src2));
  match(Set dst (MulReductionVL src1 src2));
  match(Set dst (AndReductionV src1 src2));
  match(Set dst ( OrReductionV src1 src2));
  match(Set dst (XorReductionV src1 src2));
  effect(TEMP vtmp1, TEMP vtmp2);
  format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    int vlen = vector_length(this, $src2);
    __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
#endif // _LP64

// =======================Float Reduction==========================================

instruct reductionF128(regF dst, vec src, vec vtmp) %{
  predicate(n->in(2)->bottom_type()->is_vect()->length() <= 4);
  match(Set dst (AddReductionVF dst src));
  match(Set dst (MulReductionVF dst src));
  effect(TEMP dst, TEMP vtmp);
  format %{ "vector_reduction_fp $dst,$src ; using $vtmp as TEMP" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    int vlen = vector_length(this, $src);
    __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct reduction8F(regF dst, vec src, vec vtmp1, vec vtmp2) %{
  predicate(n->in(2)->bottom_type()->is_vect()->length() == 8);
  match(Set dst (AddReductionVF dst src));
  match(Set dst (MulReductionVF dst src));
  effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
  format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    int vlen = vector_length(this, $src);
    __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct reduction16F(regF dst, legVec src, legVec vtmp1, legVec vtmp2) %{
  predicate(n->in(2)->bottom_type()->is_vect()->length() == 16);
  match(Set dst (AddReductionVF dst src));
  match(Set dst (MulReductionVF dst src));
  effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
  format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    int vlen = vector_length(this, $src);
    __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// =======================Double Reduction==========================================

instruct reduction2D(regD dst, vec src, vec vtmp) %{
  predicate(n->in(2)->bottom_type()->is_vect()->length() == 2);
  match(Set dst (AddReductionVD dst src));
  match(Set dst (MulReductionVD dst src));
  effect(TEMP dst, TEMP vtmp);
  format %{ "vector_reduction_double $dst,$src ; using $vtmp as TEMP" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    int vlen = vector_length(this, $src);
    __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct reduction4D(regD dst, vec src, vec vtmp1, vec vtmp2) %{
  predicate(n->in(2)->bottom_type()->is_vect()->length() == 4);
  match(Set dst (AddReductionVD dst src));
  match(Set dst (MulReductionVD dst src));
  effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
  format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    int vlen = vector_length(this, $src);
    __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct reduction8D(regD dst, legVec src, legVec vtmp1, legVec vtmp2) %{
  predicate(n->in(2)->bottom_type()->is_vect()->length() == 8);
  match(Set dst (AddReductionVD dst src));
  match(Set dst (MulReductionVD dst src));
  effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
  format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    int vlen = vector_length(this, $src);
    __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// ====================VECTOR ARITHMETIC=======================================
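// Each arithmetic operation below comes in three flavors: a destructive SSE
// form (dst = dst op src), a three-operand AVX register form, and an AVX
// form that folds a vector load into the operation.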
3947
3948 // --------------------------------- ADD --------------------------------------
3949
3950 // Bytes vector add
3951 instruct vaddB(vec dst, vec src) %{
3952 predicate(UseAVX == 0);
3953 match(Set dst (AddVB dst src));
3954 format %{ "paddb $dst,$src\t! add packedB" %}
3955 ins_encode %{
3956 __ paddb($dst$$XMMRegister, $src$$XMMRegister);
3957 %}
3958 ins_pipe( pipe_slow );
3959 %}
3960
3961 instruct vaddB_reg(vec dst, vec src1, vec src2) %{
3962 predicate(UseAVX > 0);
3963 match(Set dst (AddVB src1 src2));
3964 format %{ "vpaddb $dst,$src1,$src2\t! add packedB" %}
3965 ins_encode %{
3966 int vector_len = vector_length_encoding(this);
3967 __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
3968 %}
3969 ins_pipe( pipe_slow );
3970 %}
3971
3972 instruct vaddB_mem(vec dst, vec src, memory mem) %{
3973 predicate(UseAVX > 0);
3974 match(Set dst (AddVB src (LoadVector mem)));
3975 format %{ "vpaddb $dst,$src,$mem\t! add packedB" %}
3976 ins_encode %{
3977 int vector_len = vector_length_encoding(this);
3978 __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
3979 %}
3980 ins_pipe( pipe_slow );
3981 %}
3982
3983 // Shorts/Chars vector add
3984 instruct vaddS(vec dst, vec src) %{
3985 predicate(UseAVX == 0);
3986 match(Set dst (AddVS dst src));
3987 format %{ "paddw $dst,$src\t! add packedS" %}
3988 ins_encode %{
3989 __ paddw($dst$$XMMRegister, $src$$XMMRegister);
3990 %}
3991 ins_pipe( pipe_slow );
3992 %}
3993
3994 instruct vaddS_reg(vec dst, vec src1, vec src2) %{
3995 predicate(UseAVX > 0);
3996 match(Set dst (AddVS src1 src2));
3997 format %{ "vpaddw $dst,$src1,$src2\t! add packedS" %}
3998 ins_encode %{
3999 int vector_len = vector_length_encoding(this);
4000 __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
4001 %}
4002 ins_pipe( pipe_slow );
4003 %}
4004
4005 instruct vaddS_mem(vec dst, vec src, memory mem) %{
4006 predicate(UseAVX > 0);
4007 match(Set dst (AddVS src (LoadVector mem)));
4008 format %{ "vpaddw $dst,$src,$mem\t! add packedS" %}
4009 ins_encode %{
4010 int vector_len = vector_length_encoding(this);
4011 __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
4012 %}
4013 ins_pipe( pipe_slow );
4014 %}
4015
4016 // Integers vector add
4017 instruct vaddI(vec dst, vec src) %{
4018 predicate(UseAVX == 0);
4019 match(Set dst (AddVI dst src));
4020 format %{ "paddd $dst,$src\t! add packedI" %}
4021 ins_encode %{
4022 __ paddd($dst$$XMMRegister, $src$$XMMRegister);
4023 %}
4024 ins_pipe( pipe_slow );
4025 %}
4026
4027 instruct vaddI_reg(vec dst, vec src1, vec src2) %{
4028 predicate(UseAVX > 0);
4029 match(Set dst (AddVI src1 src2));
4030 format %{ "vpaddd $dst,$src1,$src2\t! add packedI" %}
4031 ins_encode %{
4032 int vector_len = vector_length_encoding(this);
4033 __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
4034 %}
4035 ins_pipe( pipe_slow );
4036 %}
4037
4038
4039 instruct vaddI_mem(vec dst, vec src, memory mem) %{
4040 predicate(UseAVX > 0);
4041 match(Set dst (AddVI src (LoadVector mem)));
4042 format %{ "vpaddd $dst,$src,$mem\t! add packedI" %}
4043 ins_encode %{
4044 int vector_len = vector_length_encoding(this);
4045 __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
4046 %}
4047 ins_pipe( pipe_slow );
4048 %}
4049
4050 // Longs vector add
4051 instruct vaddL(vec dst, vec src) %{
4052 predicate(UseAVX == 0);
4053 match(Set dst (AddVL dst src));
4054 format %{ "paddq $dst,$src\t! add packedL" %}
4055 ins_encode %{
4056 __ paddq($dst$$XMMRegister, $src$$XMMRegister);
4057 %}
4058 ins_pipe( pipe_slow );
4059 %}
4060
4061 instruct vaddL_reg(vec dst, vec src1, vec src2) %{
4062 predicate(UseAVX > 0);
4063 match(Set dst (AddVL src1 src2));
4064 format %{ "vpaddq $dst,$src1,$src2\t! add packedL" %}
4065 ins_encode %{
4066 int vector_len = vector_length_encoding(this);
4067 __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
4068 %}
4069 ins_pipe( pipe_slow );
4070 %}
4071
4072 instruct vaddL_mem(vec dst, vec src, memory mem) %{
4073 predicate(UseAVX > 0);
4074 match(Set dst (AddVL src (LoadVector mem)));
4075 format %{ "vpaddq $dst,$src,$mem\t! add packedL" %}
4076 ins_encode %{
4077 int vector_len = vector_length_encoding(this);
4078 __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
4079 %}
4080 ins_pipe( pipe_slow );
4081 %}
4082
4083 // Floats vector add
4084 instruct vaddF(vec dst, vec src) %{
4085 predicate(UseAVX == 0);
4086 match(Set dst (AddVF dst src));
4087 format %{ "addps $dst,$src\t! add packedF" %}
4088 ins_encode %{
4089 __ addps($dst$$XMMRegister, $src$$XMMRegister);
4090 %}
4091 ins_pipe( pipe_slow );
4092 %}
4093
4094 instruct vaddF_reg(vec dst, vec src1, vec src2) %{
4095 predicate(UseAVX > 0);
4096 match(Set dst (AddVF src1 src2));
4097 format %{ "vaddps $dst,$src1,$src2\t! add packedF" %}
4098 ins_encode %{
4099 int vector_len = vector_length_encoding(this);
4100 __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
4101 %}
4102 ins_pipe( pipe_slow );
4103 %}
4104
4105 instruct vaddF_mem(vec dst, vec src, memory mem) %{
4106 predicate(UseAVX > 0);
4107 match(Set dst (AddVF src (LoadVector mem)));
4108 format %{ "vaddps $dst,$src,$mem\t! add packedF" %}
4109 ins_encode %{
4110 int vector_len = vector_length_encoding(this);
4111 __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
4112 %}
4113 ins_pipe( pipe_slow );
4114 %}
4115
4116 // Doubles vector add
4117 instruct vaddD(vec dst, vec src) %{
4118 predicate(UseAVX == 0);
4119 match(Set dst (AddVD dst src));
4120 format %{ "addpd $dst,$src\t! add packedD" %}
4121 ins_encode %{
4122 __ addpd($dst$$XMMRegister, $src$$XMMRegister);
4123 %}
4124 ins_pipe( pipe_slow );
4125 %}
4126
4127 instruct vaddD_reg(vec dst, vec src1, vec src2) %{
4128 predicate(UseAVX > 0);
4129 match(Set dst (AddVD src1 src2));
4130 format %{ "vaddpd $dst,$src1,$src2\t! add packedD" %}
4131 ins_encode %{
4132 int vector_len = vector_length_encoding(this);
4133 __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
4134 %}
4135 ins_pipe( pipe_slow );
4136 %}
4137
4138 instruct vaddD_mem(vec dst, vec src, memory mem) %{
4139 predicate(UseAVX > 0);
4140 match(Set dst (AddVD src (LoadVector mem)));
4141 format %{ "vaddpd $dst,$src,$mem\t! add packedD" %}
4142 ins_encode %{
4143 int vector_len = vector_length_encoding(this);
4144 __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
4145 %}
4146 ins_pipe( pipe_slow );
4147 %}
4148
4149 // --------------------------------- SUB --------------------------------------
4150
4151 // Bytes vector sub
4152 instruct vsubB(vec dst, vec src) %{
4153 predicate(UseAVX == 0);
4154 match(Set dst (SubVB dst src));
4155 format %{ "psubb $dst,$src\t! sub packedB" %}
4156 ins_encode %{
4157 __ psubb($dst$$XMMRegister, $src$$XMMRegister);
4158 %}
4159 ins_pipe( pipe_slow );
4160 %}
4161
4162 instruct vsubB_reg(vec dst, vec src1, vec src2) %{
4163 predicate(UseAVX > 0);
4164 match(Set dst (SubVB src1 src2));
4165 format %{ "vpsubb $dst,$src1,$src2\t! sub packedB" %}
4166 ins_encode %{
4167 int vector_len = vector_length_encoding(this);
4168 __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
4169 %}
4170 ins_pipe( pipe_slow );
4171 %}
4172
4173 instruct vsubB_mem(vec dst, vec src, memory mem) %{
4174 predicate(UseAVX > 0);
4175 match(Set dst (SubVB src (LoadVector mem)));
4176 format %{ "vpsubb $dst,$src,$mem\t! sub packedB" %}
4177 ins_encode %{
4178 int vector_len = vector_length_encoding(this);
4179 __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
4180 %}
4181 ins_pipe( pipe_slow );
4182 %}
4183
4184 // Shorts/Chars vector sub
4185 instruct vsubS(vec dst, vec src) %{
4186 predicate(UseAVX == 0);
4187 match(Set dst (SubVS dst src));
4188 format %{ "psubw $dst,$src\t! sub packedS" %}
4189 ins_encode %{
4190 __ psubw($dst$$XMMRegister, $src$$XMMRegister);
4191 %}
4192 ins_pipe( pipe_slow );
4193 %}
4194
4195
4196 instruct vsubS_reg(vec dst, vec src1, vec src2) %{
4197 predicate(UseAVX > 0);
4198 match(Set dst (SubVS src1 src2));
4199 format %{ "vpsubw $dst,$src1,$src2\t! sub packedS" %}
4200 ins_encode %{
4201 int vector_len = vector_length_encoding(this);
4202 __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
4203 %}
4204 ins_pipe( pipe_slow );
4205 %}
4206
4207 instruct vsubS_mem(vec dst, vec src, memory mem) %{
4208 predicate(UseAVX > 0);
4209 match(Set dst (SubVS src (LoadVector mem)));
4210 format %{ "vpsubw $dst,$src,$mem\t! sub packedS" %}
4211 ins_encode %{
4212 int vector_len = vector_length_encoding(this);
4213 __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
4214 %}
4215 ins_pipe( pipe_slow );
4216 %}
4217
4218 // Integers vector sub
4219 instruct vsubI(vec dst, vec src) %{
4220 predicate(UseAVX == 0);
4221 match(Set dst (SubVI dst src));
4222 format %{ "psubd $dst,$src\t! sub packedI" %}
4223 ins_encode %{
4224 __ psubd($dst$$XMMRegister, $src$$XMMRegister);
4225 %}
4226 ins_pipe( pipe_slow );
4227 %}
4228
4229 instruct vsubI_reg(vec dst, vec src1, vec src2) %{
4230 predicate(UseAVX > 0);
4231 match(Set dst (SubVI src1 src2));
4232 format %{ "vpsubd $dst,$src1,$src2\t! sub packedI" %}
4233 ins_encode %{
4234 int vector_len = vector_length_encoding(this);
4235 __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
4236 %}
4237 ins_pipe( pipe_slow );
4238 %}
4239
4240 instruct vsubI_mem(vec dst, vec src, memory mem) %{
4241 predicate(UseAVX > 0);
4242 match(Set dst (SubVI src (LoadVector mem)));
4243 format %{ "vpsubd $dst,$src,$mem\t! sub packedI" %}
4244 ins_encode %{
4245 int vector_len = vector_length_encoding(this);
4246 __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
4247 %}
4248 ins_pipe( pipe_slow );
4249 %}
4250
4251 // Longs vector sub
4252 instruct vsubL(vec dst, vec src) %{
4253 predicate(UseAVX == 0);
4254 match(Set dst (SubVL dst src));
4255 format %{ "psubq $dst,$src\t! sub packedL" %}
4256 ins_encode %{
4257 __ psubq($dst$$XMMRegister, $src$$XMMRegister);
4258 %}
4259 ins_pipe( pipe_slow );
4260 %}
4261
4262 instruct vsubL_reg(vec dst, vec src1, vec src2) %{
4263 predicate(UseAVX > 0);
4264 match(Set dst (SubVL src1 src2));
4265 format %{ "vpsubq $dst,$src1,$src2\t! sub packedL" %}
4266 ins_encode %{
4267 int vector_len = vector_length_encoding(this);
4268 __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
4269 %}
4270 ins_pipe( pipe_slow );
4271 %}
4272
4273
4274 instruct vsubL_mem(vec dst, vec src, memory mem) %{
4275 predicate(UseAVX > 0);
4276 match(Set dst (SubVL src (LoadVector mem)));
4277 format %{ "vpsubq $dst,$src,$mem\t! sub packedL" %}
4278 ins_encode %{
4279 int vector_len = vector_length_encoding(this);
4280 __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
4281 %}
4282 ins_pipe( pipe_slow );
4283 %}
4284
4285 // Floats vector sub
4286 instruct vsubF(vec dst, vec src) %{
4287 predicate(UseAVX == 0);
4288 match(Set dst (SubVF dst src));
4289 format %{ "subps $dst,$src\t! sub packedF" %}
4290 ins_encode %{
4291 __ subps($dst$$XMMRegister, $src$$XMMRegister);
4292 %}
4293 ins_pipe( pipe_slow );
4294 %}
4295
4296 instruct vsubF_reg(vec dst, vec src1, vec src2) %{
4297 predicate(UseAVX > 0);
4298 match(Set dst (SubVF src1 src2));
4299 format %{ "vsubps $dst,$src1,$src2\t! sub packedF" %}
4300 ins_encode %{
4301 int vector_len = vector_length_encoding(this);
4302 __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
4303 %}
4304 ins_pipe( pipe_slow );
4305 %}
4306
4307 instruct vsubF_mem(vec dst, vec src, memory mem) %{
4308 predicate(UseAVX > 0);
4309 match(Set dst (SubVF src (LoadVector mem)));
4310 format %{ "vsubps $dst,$src,$mem\t! sub packedF" %}
4311 ins_encode %{
4312 int vector_len = vector_length_encoding(this);
4313 __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
4314 %}
4315 ins_pipe( pipe_slow );
4316 %}
4317
4318 // Doubles vector sub
4319 instruct vsubD(vec dst, vec src) %{
4320 predicate(UseAVX == 0);
4321 match(Set dst (SubVD dst src));
4322 format %{ "subpd $dst,$src\t! sub packedD" %}
4323 ins_encode %{
4324 __ subpd($dst$$XMMRegister, $src$$XMMRegister);
4325 %}
4326 ins_pipe( pipe_slow );
4327 %}
4328
4329 instruct vsubD_reg(vec dst, vec src1, vec src2) %{
4330 predicate(UseAVX > 0);
4331 match(Set dst (SubVD src1 src2));
4332 format %{ "vsubpd $dst,$src1,$src2\t! sub packedD" %}
4333 ins_encode %{
4334 int vector_len = vector_length_encoding(this);
4335 __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
4336 %}
4337 ins_pipe( pipe_slow );
4338 %}
4339
4340 instruct vsubD_mem(vec dst, vec src, memory mem) %{
4341 predicate(UseAVX > 0);
4342 match(Set dst (SubVD src (LoadVector mem)));
4343 format %{ "vsubpd $dst,$src,$mem\t! sub packedD" %}
4344 ins_encode %{
4345 int vector_len = vector_length_encoding(this);
4346 __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
4347 %}
4348 ins_pipe( pipe_slow );
4349 %}
4350
4351 // --------------------------------- MUL --------------------------------------
4352
4353 // Byte vector mul
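// There is no byte-wise multiply in SSE/AVX, so MulVB is emulated: sign-extend
// bytes to words (pmovsxbw), multiply with pmullw/vpmullw, mask each word
// result down to its low byte, and pack back with packuswb. E.g. (byte)(-3 * 7):
// the word product 0xFFEB masked to 0x00EB packs to byte 0xEB, i.e. -21. The
// trailing vpermq in the wider variants undoes the 128-bit-lane interleaving
// that vpackuswb leaves behind.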
4354 instruct mulB_reg(vec dst, vec src1, vec src2, vec tmp, rRegI scratch) %{
4355 predicate(n->as_Vector()->length() == 4 ||
4356 n->as_Vector()->length() == 8);
4357 match(Set dst (MulVB src1 src2));
4358 effect(TEMP dst, TEMP tmp, TEMP scratch);
4359 format %{"vector_mulB $dst,$src1,$src2" %}
4360 ins_encode %{
4361 assert(UseSSE > 3, "required");
4362 __ pmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister);
4363 __ pmovsxbw($dst$$XMMRegister, $src2$$XMMRegister);
4364 __ pmullw($tmp$$XMMRegister, $dst$$XMMRegister);
4365 __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
4366 __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
4367 __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
4368 %}
4369 ins_pipe( pipe_slow );
4370 %}
4371
4372 instruct mul16B_reg(vec dst, vec src1, vec src2, vec tmp1, vec tmp2, rRegI scratch) %{
4373 predicate(n->as_Vector()->length() == 16 && UseAVX <= 1);
4374 match(Set dst (MulVB src1 src2));
4375 effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
4376 format %{"vector_mulB $dst,$src1,$src2" %}
4377 ins_encode %{
4378 assert(UseSSE > 3, "required");
4379 __ pmovsxbw($tmp1$$XMMRegister, $src1$$XMMRegister);
4380 __ pmovsxbw($tmp2$$XMMRegister, $src2$$XMMRegister);
4381 __ pmullw($tmp1$$XMMRegister, $tmp2$$XMMRegister);
4382 __ pshufd($tmp2$$XMMRegister, $src1$$XMMRegister, 0xEE);
4383 __ pshufd($dst$$XMMRegister, $src2$$XMMRegister, 0xEE);
4384 __ pmovsxbw($tmp2$$XMMRegister, $tmp2$$XMMRegister);
4385 __ pmovsxbw($dst$$XMMRegister, $dst$$XMMRegister);
4386 __ pmullw($tmp2$$XMMRegister, $dst$$XMMRegister);
4387 __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
4388 __ pand($tmp2$$XMMRegister, $dst$$XMMRegister);
4389 __ pand($dst$$XMMRegister, $tmp1$$XMMRegister);
4390 __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister);
4391 %}
4392 ins_pipe( pipe_slow );
4393 %}
4394
4395 instruct vmul16B_reg_avx(vec dst, vec src1, vec src2, vec tmp, rRegI scratch) %{
4396 predicate(n->as_Vector()->length() == 16 && UseAVX > 1);
4397 match(Set dst (MulVB src1 src2));
4398 effect(TEMP dst, TEMP tmp, TEMP scratch);
4399 format %{"vector_mulB $dst,$src1,$src2" %}
4400 ins_encode %{
4401 int vector_len = Assembler::AVX_256bit;
4402 __ vpmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister, vector_len);
4403 __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vector_len);
4404 __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, vector_len);
4405 __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
4406 __ vpand($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
4407 __ vextracti128_high($tmp$$XMMRegister, $dst$$XMMRegister);
4408 __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, 0);
4409 %}
4410 ins_pipe( pipe_slow );
4411 %}
4412
4413 instruct vmul32B_reg_avx(vec dst, vec src1, vec src2, vec tmp1, vec tmp2, rRegI scratch) %{
4414 predicate(n->as_Vector()->length() == 32);
4415 match(Set dst (MulVB src1 src2));
4416 effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
4417 format %{"vector_mulB $dst,$src1,$src2" %}
4418 ins_encode %{
4419 assert(UseAVX > 1, "required");
4420 int vector_len = Assembler::AVX_256bit;
4421 __ vextracti128_high($tmp1$$XMMRegister, $src1$$XMMRegister);
4422 __ vextracti128_high($dst$$XMMRegister, $src2$$XMMRegister);
4423 __ vpmovsxbw($tmp1$$XMMRegister, $tmp1$$XMMRegister, vector_len);
4424 __ vpmovsxbw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4425 __ vpmullw($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vector_len);
4426 __ vpmovsxbw($tmp2$$XMMRegister, $src1$$XMMRegister, vector_len);
4427 __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vector_len);
4428 __ vpmullw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vector_len);
4429 __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
4430 __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4431 __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vector_len);
4432 __ vpand($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4433 __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp1$$XMMRegister, vector_len);
4434 __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vector_len);
4435 %}
4436 ins_pipe( pipe_slow );
4437 %}
4438
4439 instruct vmul64B_reg_avx(vec dst, vec src1, vec src2, vec tmp1, vec tmp2, rRegI scratch) %{
4440 predicate(n->as_Vector()->length() == 64);
4441 match(Set dst (MulVB src1 src2));
4442 effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
4443   format %{"vector_mulB $dst,$src1,$src2" %}
4444 ins_encode %{
4445 assert(UseAVX > 2, "required");
4446 int vector_len = Assembler::AVX_512bit;
4447 __ vextracti64x4_high($tmp1$$XMMRegister, $src1$$XMMRegister);
4448 __ vextracti64x4_high($dst$$XMMRegister, $src2$$XMMRegister);
4449 __ vpmovsxbw($tmp1$$XMMRegister, $tmp1$$XMMRegister, vector_len);
4450 __ vpmovsxbw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4451 __ vpmullw($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vector_len);
4452 __ vpmovsxbw($tmp2$$XMMRegister, $src1$$XMMRegister, vector_len);
4453 __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vector_len);
4454 __ vpmullw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vector_len);
4455 __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
4456 __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4457 __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vector_len);
4458 __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vector_len);
4459 __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4460 __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vector_len, $scratch$$Register);
4461 __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vector_len);
4462 %}
4463 ins_pipe( pipe_slow );
4464 %}
4465
4466 // Shorts/Chars vector mul
4467 instruct vmulS(vec dst, vec src) %{
4468 predicate(UseAVX == 0);
4469 match(Set dst (MulVS dst src));
4470 format %{ "pmullw $dst,$src\t! mul packedS" %}
4471 ins_encode %{
4472 __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
4473 %}
4474 ins_pipe( pipe_slow );
4475 %}
4476
4477 instruct vmulS_reg(vec dst, vec src1, vec src2) %{
4478 predicate(UseAVX > 0);
4479 match(Set dst (MulVS src1 src2));
4480 format %{ "vpmullw $dst,$src1,$src2\t! mul packedS" %}
4481 ins_encode %{
4482 int vector_len = vector_length_encoding(this);
4483 __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
4484 %}
4485 ins_pipe( pipe_slow );
4486 %}
4487
4488 instruct vmulS_mem(vec dst, vec src, memory mem) %{
4489 predicate(UseAVX > 0);
4490 match(Set dst (MulVS src (LoadVector mem)));
4491 format %{ "vpmullw $dst,$src,$mem\t! mul packedS" %}
4492 ins_encode %{
4493 int vector_len = vector_length_encoding(this);
4494 __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
4495 %}
4496 ins_pipe( pipe_slow );
4497 %}
4498
4499 // Integers vector mul
4500 instruct vmulI(vec dst, vec src) %{
4501 predicate(UseAVX == 0);
4502 match(Set dst (MulVI dst src));
4503 format %{ "pmulld $dst,$src\t! mul packedI" %}
4504 ins_encode %{
4505 assert(UseSSE > 3, "required");
4506 __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
4507 %}
4508 ins_pipe( pipe_slow );
4509 %}
4510
4511 instruct vmulI_reg(vec dst, vec src1, vec src2) %{
4512 predicate(UseAVX > 0);
4513 match(Set dst (MulVI src1 src2));
4514 format %{ "vpmulld $dst,$src1,$src2\t! mul packedI" %}
4515 ins_encode %{
4516 int vector_len = vector_length_encoding(this);
4517 __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
4518 %}
4519 ins_pipe( pipe_slow );
4520 %}
4521
4522 instruct vmulI_mem(vec dst, vec src, memory mem) %{
4523 predicate(UseAVX > 0);
4524 match(Set dst (MulVI src (LoadVector mem)));
4525 format %{ "vpmulld $dst,$src,$mem\t! mul packedI" %}
4526 ins_encode %{
4527 int vector_len = vector_length_encoding(this);
4528 __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
4529 %}
4530 ins_pipe( pipe_slow );
4531 %}
4532
4533 // Longs vector mul
4534 instruct vmulL_reg(vec dst, vec src1, vec src2) %{
4535 match(Set dst (MulVL src1 src2));
4536 format %{ "vpmullq $dst,$src1,$src2\t! mul packedL" %}
4537 ins_encode %{
4538 assert(UseAVX > 2, "required");
4539 int vector_len = vector_length_encoding(this);
4540 __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
4541 %}
4542 ins_pipe( pipe_slow );
4543 %}
4544
4545 instruct vmulL_mem(vec dst, vec src, memory mem) %{
4546 match(Set dst (MulVL src (LoadVector mem)));
4547 format %{ "vpmullq $dst,$src,$mem\t! mul packedL" %}
4548 ins_encode %{
4549 assert(UseAVX > 2, "required");
4550 int vector_len = vector_length_encoding(this);
4551 __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
4552 %}
4553 ins_pipe( pipe_slow );
4554 %}
4555
4556 // Floats vector mul
4557 instruct vmulF(vec dst, vec src) %{
4558 predicate(UseAVX == 0);
4559 match(Set dst (MulVF dst src));
4560 format %{ "mulps $dst,$src\t! mul packedF" %}
4561 ins_encode %{
4562 __ mulps($dst$$XMMRegister, $src$$XMMRegister);
4563 %}
4564 ins_pipe( pipe_slow );
4565 %}
4566
4567 instruct vmulF_reg(vec dst, vec src1, vec src2) %{
4568 predicate(UseAVX > 0);
4569 match(Set dst (MulVF src1 src2));
4570 format %{ "vmulps $dst,$src1,$src2\t! mul packedF" %}
4571 ins_encode %{
4572 int vector_len = vector_length_encoding(this);
4573 __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
4574 %}
4575 ins_pipe( pipe_slow );
4576 %}
4577
4578 instruct vmulF_mem(vec dst, vec src, memory mem) %{
4579 predicate(UseAVX > 0);
4580 match(Set dst (MulVF src (LoadVector mem)));
4581 format %{ "vmulps $dst,$src,$mem\t! mul packedF" %}
4582 ins_encode %{
4583 int vector_len = vector_length_encoding(this);
4584 __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
4585 %}
4586 ins_pipe( pipe_slow );
4587 %}
4588
4589 // Doubles vector mul
4590 instruct vmulD(vec dst, vec src) %{
4591 predicate(UseAVX == 0);
4592 match(Set dst (MulVD dst src));
4593 format %{ "mulpd $dst,$src\t! mul packedD" %}
4594 ins_encode %{
4595 __ mulpd($dst$$XMMRegister, $src$$XMMRegister);
4596 %}
4597 ins_pipe( pipe_slow );
4598 %}
4599
4600 instruct vmulD_reg(vec dst, vec src1, vec src2) %{
4601 predicate(UseAVX > 0);
4602 match(Set dst (MulVD src1 src2));
4603 format %{ "vmulpd $dst,$src1,$src2\t! mul packedD" %}
4604 ins_encode %{
4605 int vector_len = vector_length_encoding(this);
4606 __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
4607 %}
4608 ins_pipe( pipe_slow );
4609 %}
4610
4611 instruct vmulD_mem(vec dst, vec src, memory mem) %{
4612 predicate(UseAVX > 0);
4613 match(Set dst (MulVD src (LoadVector mem)));
4614 format %{ "vmulpd $dst,$src,$mem\t! mul packedD" %}
4615 ins_encode %{
4616 int vector_len = vector_length_encoding(this);
4617 __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
4618 %}
4619 ins_pipe( pipe_slow );
4620 %}
4621
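// Vector conditional move in two steps: cmpps/cmppd materializes the condition
// as an all-ones or all-zeros mask per lane, then blendvps/blendvpd uses that
// mask to pick each lane from $src2 (mask set) or $src1 (mask clear).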
4622 instruct vcmov8F_reg(legVec dst, legVec src1, legVec src2, immI8 cop, cmpOp_vcmppd copnd) %{
4623 predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
4624 match(Set dst (CMoveVF (Binary copnd cop) (Binary src1 src2)));
4625 effect(TEMP dst, USE src1, USE src2);
4626 format %{ "cmpps.$copnd $dst, $src1, $src2 ! vcmovevf, cond=$cop\n\t"
4627 "blendvps $dst,$src1,$src2,$dst ! vcmovevf\n\t"
4628 %}
4629 ins_encode %{
4630 int vector_len = 1;
4631 int cond = (Assembler::Condition)($copnd$$cmpcode);
4632 __ cmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vector_len);
4633 __ blendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vector_len);
4634 %}
4635 ins_pipe( pipe_slow );
4636 %}
4637
4638 instruct vcmov4D_reg(legVec dst, legVec src1, legVec src2, immI8 cop, cmpOp_vcmppd copnd) %{
4639 predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
4640 match(Set dst (CMoveVD (Binary copnd cop) (Binary src1 src2)));
4641 effect(TEMP dst, USE src1, USE src2);
4642 format %{ "cmppd.$copnd $dst, $src1, $src2 ! vcmovevd, cond=$cop\n\t"
4643 "blendvpd $dst,$src1,$src2,$dst ! vcmovevd\n\t"
4644 %}
4645 ins_encode %{
4646 int vector_len = 1;
4647 int cond = (Assembler::Condition)($copnd$$cmpcode);
4648 __ cmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vector_len);
4649 __ blendvpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vector_len);
4650 %}
4651 ins_pipe( pipe_slow );
4652 %}
4653
4654 // --------------------------------- DIV --------------------------------------
4655
4656 // Floats vector div
4657 instruct vdivF(vec dst, vec src) %{
4658 predicate(UseAVX == 0);
4659 match(Set dst (DivVF dst src));
4660 format %{ "divps $dst,$src\t! div packedF" %}
4661 ins_encode %{
4662 __ divps($dst$$XMMRegister, $src$$XMMRegister);
4663 %}
4664 ins_pipe( pipe_slow );
4665 %}
4666
4667 instruct vdivF_reg(vec dst, vec src1, vec src2) %{
4668 predicate(UseAVX > 0);
4669 match(Set dst (DivVF src1 src2));
4670 format %{ "vdivps $dst,$src1,$src2\t! div packedF" %}
4671 ins_encode %{
4672 int vector_len = vector_length_encoding(this);
4673 __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
4674 %}
4675 ins_pipe( pipe_slow );
4676 %}
4677
4678 instruct vdivF_mem(vec dst, vec src, memory mem) %{
4679 predicate(UseAVX > 0);
4680 match(Set dst (DivVF src (LoadVector mem)));
4681 format %{ "vdivps $dst,$src,$mem\t! div packedF" %}
4682 ins_encode %{
4683 int vector_len = vector_length_encoding(this);
4684 __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
4685 %}
4686 ins_pipe( pipe_slow );
4687 %}
4688
4689 // Doubles vector div
4690 instruct vdivD(vec dst, vec src) %{
4691 predicate(UseAVX == 0);
4692 match(Set dst (DivVD dst src));
4693 format %{ "divpd $dst,$src\t! div packedD" %}
4694 ins_encode %{
4695 __ divpd($dst$$XMMRegister, $src$$XMMRegister);
4696 %}
4697 ins_pipe( pipe_slow );
4698 %}
4699
4700 instruct vdivD_reg(vec dst, vec src1, vec src2) %{
4701 predicate(UseAVX > 0);
4702 match(Set dst (DivVD src1 src2));
4703 format %{ "vdivpd $dst,$src1,$src2\t! div packedD" %}
4704 ins_encode %{
4705 int vector_len = vector_length_encoding(this);
4706 __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
4707 %}
4708 ins_pipe( pipe_slow );
4709 %}
4710
4711 instruct vdivD_mem(vec dst, vec src, memory mem) %{
4712 predicate(UseAVX > 0);
4713 match(Set dst (DivVD src (LoadVector mem)));
4714 format %{ "vdivpd $dst,$src,$mem\t! div packedD" %}
4715 ins_encode %{
4716 int vector_len = vector_length_encoding(this);
4717 __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
4718 %}
4719 ins_pipe( pipe_slow );
4720 %}
4721
4722 // --------------------------------- Sqrt --------------------------------------
4723
4724 instruct vsqrtF_reg(vec dst, vec src) %{
4725 match(Set dst (SqrtVF src));
4726 format %{ "vsqrtps $dst,$src\t! sqrt packedF" %}
4727 ins_encode %{
4728 assert(UseAVX > 0, "required");
4729 int vector_len = vector_length_encoding(this);
4730 __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
4731 %}
4732 ins_pipe( pipe_slow );
4733 %}
4734
4735 instruct vsqrtF_mem(vec dst, memory mem) %{
4736 match(Set dst (SqrtVF (LoadVector mem)));
4737 format %{ "vsqrtps $dst,$mem\t! sqrt packedF" %}
4738 ins_encode %{
4739 assert(UseAVX > 0, "required");
4740 int vector_len = vector_length_encoding(this);
4741 __ vsqrtps($dst$$XMMRegister, $mem$$Address, vector_len);
4742 %}
4743 ins_pipe( pipe_slow );
4744 %}
4745
4746 // Doubles vector sqrt
4747 instruct vsqrtD_reg(vec dst, vec src) %{
4748 match(Set dst (SqrtVD src));
4749 format %{ "vsqrtpd $dst,$src\t! sqrt packedD" %}
4750 ins_encode %{
4751 assert(UseAVX > 0, "required");
4752 int vector_len = vector_length_encoding(this);
4753 __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
4754 %}
4755 ins_pipe( pipe_slow );
4756 %}
4757
4758 instruct vsqrtD_mem(vec dst, memory mem) %{
4759 match(Set dst (SqrtVD (LoadVector mem)));
4760 format %{ "vsqrtpd $dst,$mem\t! sqrt packedD" %}
4761 ins_encode %{
4762 assert(UseAVX > 0, "required");
4763 int vector_len = vector_length_encoding(this);
4764 __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
4765 %}
4766 ins_pipe( pipe_slow );
4767 %}
4768
4769 // ------------------------------ Shift ---------------------------------------
4770
4771 // Left and right shift count vectors are the same on x86
4772 // (only the lowest bits of the xmm register are used for the count).
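// E.g. both a left shift and a logical right shift by the same scalar count
// can share a single movdl: psllw/psrlw and friends all read the count from
// the low 64 bits of their xmm operand.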
4773 instruct vshiftcnt(vec dst, rRegI cnt) %{
4774 match(Set dst (LShiftCntV cnt));
4775 match(Set dst (RShiftCntV cnt));
4776 format %{ "movdl $dst,$cnt\t! load shift count" %}
4777 ins_encode %{
4778 __ movdl($dst$$XMMRegister, $cnt$$Register);
4779 %}
4780 ins_pipe( pipe_slow );
4781 %}
4782
4783 // Byte vector shift
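// As with byte multiply above, there is no byte-wise shift instruction, so
// the bytes are widened to words (vextendbw), shifted with the word-sized
// form (vshiftw), masked back to their low bytes, and re-packed.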
4784 instruct vshiftB(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
4785 predicate(n->as_Vector()->length() <= 8);
4786 match(Set dst (LShiftVB src shift));
4787 match(Set dst (RShiftVB src shift));
4788 match(Set dst (URShiftVB src shift));
4789 effect(TEMP dst, USE src, USE shift, TEMP tmp, TEMP scratch);
4790 format %{"vector_byte_shift $dst,$src,$shift" %}
4791 ins_encode %{
4792 assert(UseSSE > 3, "required");
4793 int opcode = this->ideal_Opcode();
4794 __ vextendbw(opcode, $tmp$$XMMRegister, $src$$XMMRegister);
4795 __ vshiftw(opcode, $tmp$$XMMRegister, $shift$$XMMRegister);
4796 __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
4797 __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
4798 __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
4799 %}
4800 ins_pipe( pipe_slow );
4801 %}
4802
4803 instruct vshift16B(vec dst, vec src, vec shift, vec tmp1, vec tmp2, rRegI scratch) %{
4804 predicate(n->as_Vector()->length() == 16 && UseAVX <= 1);
4805 match(Set dst (LShiftVB src shift));
4806 match(Set dst (RShiftVB src shift));
4807 match(Set dst (URShiftVB src shift));
4808 effect(TEMP dst, USE src, USE shift, TEMP tmp1, TEMP tmp2, TEMP scratch);
4809 format %{"vector_byte_shift $dst,$src,$shift" %}
4810 ins_encode %{
4811 assert(UseSSE > 3, "required");
4812 int opcode = this->ideal_Opcode();
4813
4814 __ vextendbw(opcode, $tmp1$$XMMRegister, $src$$XMMRegister);
4815 __ vshiftw(opcode, $tmp1$$XMMRegister, $shift$$XMMRegister);
4816 __ pshufd($tmp2$$XMMRegister, $src$$XMMRegister, 0xE);
4817 __ vextendbw(opcode, $tmp2$$XMMRegister, $tmp2$$XMMRegister);
4818 __ vshiftw(opcode, $tmp2$$XMMRegister, $shift$$XMMRegister);
4819 __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
4820 __ pand($tmp2$$XMMRegister, $dst$$XMMRegister);
4821 __ pand($dst$$XMMRegister, $tmp1$$XMMRegister);
4822 __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister);
4823 %}
4824 ins_pipe( pipe_slow );
4825 %}
4826
4827 instruct vshift16B_avx(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
4828 predicate(n->as_Vector()->length() == 16 && UseAVX > 1);
4829 match(Set dst (LShiftVB src shift));
4830 match(Set dst (RShiftVB src shift));
4831 match(Set dst (URShiftVB src shift));
4832 effect(TEMP dst, TEMP tmp, TEMP scratch);
4833 format %{"vector_byte_shift $dst,$src,$shift" %}
4834 ins_encode %{
4835 int opcode = this->ideal_Opcode();
4836 int vector_len = Assembler::AVX_256bit;
4837 __ vextendbw(opcode, $tmp$$XMMRegister, $src$$XMMRegister, vector_len);
4838 __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vector_len);
4839 __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vector_len, $scratch$$Register);
4840 __ vextracti128_high($dst$$XMMRegister, $tmp$$XMMRegister);
4841 __ vpackuswb($dst$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, 0);
4842 %}
4843 ins_pipe( pipe_slow );
4844 %}
4845
4846 instruct vshift32B_avx(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
4847 predicate(n->as_Vector()->length() == 32);
4848 match(Set dst (LShiftVB src shift));
4849 match(Set dst (RShiftVB src shift));
4850 match(Set dst (URShiftVB src shift));
4851 effect(TEMP dst, TEMP tmp, TEMP scratch);
4852 format %{"vector_byte_shift $dst,$src,$shift" %}
4853 ins_encode %{
4854 assert(UseAVX > 1, "required");
4855 int opcode = this->ideal_Opcode();
4856 int vector_len = Assembler::AVX_256bit;
4857 __ vextracti128_high($tmp$$XMMRegister, $src$$XMMRegister);
4858 __ vextendbw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
4859 __ vextendbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, vector_len);
4860 __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vector_len);
4861 __ vshiftw(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $shift$$XMMRegister, vector_len);
4862 __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vector_len, $scratch$$Register);
4863 __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vector_len, $scratch$$Register);
4864 __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
4865 __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vector_len);
4866 %}
4867 ins_pipe( pipe_slow );
4868 %}
4869
4870 instruct vshift64B_avx(vec dst, vec src, vec shift, vec tmp1, vec tmp2, rRegI scratch) %{
4871 predicate(n->as_Vector()->length() == 64);
4872 match(Set dst (LShiftVB src shift));
4873 match(Set dst (RShiftVB src shift));
4874 match(Set dst (URShiftVB src shift));
4875 effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
4876 format %{"vector_byte_shift $dst,$src,$shift" %}
4877 ins_encode %{
4878 assert(UseAVX > 2, "required");
4879 int opcode = this->ideal_Opcode();
4880 int vector_len = Assembler::AVX_512bit;
4881 __ vextracti64x4($tmp1$$XMMRegister, $src$$XMMRegister, 1);
4882 __ vextendbw(opcode, $tmp1$$XMMRegister, $tmp1$$XMMRegister, vector_len);
4883 __ vextendbw(opcode, $tmp2$$XMMRegister, $src$$XMMRegister, vector_len);
4884 __ vshiftw(opcode, $tmp1$$XMMRegister, $tmp1$$XMMRegister, $shift$$XMMRegister, vector_len);
4885 __ vshiftw(opcode, $tmp2$$XMMRegister, $tmp2$$XMMRegister, $shift$$XMMRegister, vector_len);
4886 __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
4887 __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4888 __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vector_len);
4889 __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vector_len);
4890 __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4891 __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vector_len, $scratch$$Register);
4892 __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vector_len);
4893 %}
4894 ins_pipe( pipe_slow );
4895 %}
4896
4897 // Shorts vector logical right shift produces an incorrect Java result
4898 // for negative data because Java code converts the short value into an int
4899 // with sign extension before the shift. But char vectors are fine since
4900 // chars are unsigned values.
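// E.g. for short s = -1: Java computes (short)(((int)s) >>> 2) ==
// (short)0x3FFFFFFF == -1, while a plain 16-bit psrlw would yield
// 0xFFFF >>> 2 == 0x3FFF == 16383.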
4901 // Shorts/Chars vector left shift
4902 instruct vshiftS(vec dst, vec src, vec shift) %{
4903 match(Set dst (LShiftVS src shift));
4904 match(Set dst (RShiftVS src shift));
4905 match(Set dst (URShiftVS src shift));
4906 effect(TEMP dst, USE src, USE shift);
4907 format %{ "vshiftw $dst,$src,$shift\t! shift packedS" %}
4908 ins_encode %{
4909 int opcode = this->ideal_Opcode();
4910 if (UseAVX > 0) {
4911 int vlen_enc = vector_length_encoding(this);
4912 __ vshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
4913 } else {
4914 int vlen = vector_length(this);
4915 if (vlen == 2) {
4916 __ movflt($dst$$XMMRegister, $src$$XMMRegister);
4917 __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
4918 } else if (vlen == 4) {
4919 __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
4920 __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
4921 } else {
4922 assert (vlen == 8, "sanity");
4923 __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
4924 __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
4925 }
4926 }
4927 %}
4928 ins_pipe( pipe_slow );
4929 %}
4930
4931 // Integers vector left shift
4932 instruct vshiftI(vec dst, vec src, vec shift) %{
4933 match(Set dst (LShiftVI src shift));
4934 match(Set dst (RShiftVI src shift));
4935 match(Set dst (URShiftVI src shift));
4936 effect(TEMP dst, USE src, USE shift);
4937 format %{ "vshiftd $dst,$src,$shift\t! shift packedI" %}
4938 ins_encode %{
4939 int opcode = this->ideal_Opcode();
4940 if (UseAVX > 0) {
4941 int vector_len = vector_length_encoding(this);
4942 __ vshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
4943 } else {
4944 int vlen = vector_length(this);
4945 if (vlen == 2) {
4946 __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
4947 __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
4948 } else {
4949 assert(vlen == 4, "sanity");
4950 __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
4951 __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
4952 }
4953 }
4954 %}
4955 ins_pipe( pipe_slow );
4956 %}
4957
4958 // Longs vector shift
4959 instruct vshiftL(vec dst, vec src, vec shift) %{
4960 match(Set dst (LShiftVL src shift));
4961 match(Set dst (URShiftVL src shift));
4962 effect(TEMP dst, USE src, USE shift);
4963 format %{ "vshiftq $dst,$src,$shift\t! shift packedL" %}
4964 ins_encode %{
4965 int opcode = this->ideal_Opcode();
4966 if (UseAVX > 0) {
4967 int vector_len = vector_length_encoding(this);
4968 __ vshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
4969 } else {
4970       assert(vector_length(this) == 2, "sanity");
4971 __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
4972 __ vshiftq(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
4973 }
4974 %}
4975 ins_pipe( pipe_slow );
4976 %}
4977
4978 // -------------------ArithmeticRightShift -----------------------------------
4979 // Long vector arithmetic right shift
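// SSE2/AVX2 have no packed 64-bit arithmetic right shift, so it is emulated
// with the identity x >>s n == ((x >>u n) ^ m) - m, where m is the long sign
// bit mask shifted right by the same n: the xor/sub pair re-extends the sign
// into the bits vacated by the logical shift.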
4980 instruct vshiftL_arith_reg(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
4981 predicate(UseAVX <= 2);
4982 match(Set dst (RShiftVL src shift));
4983 effect(TEMP dst, TEMP tmp, TEMP scratch);
4984 format %{ "vshiftq $dst,$src,$shift" %}
4985 ins_encode %{
4986 uint vlen = vector_length(this);
4987 if (vlen == 2) {
4988 assert(UseSSE >= 2, "required");
4989 __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
4990 __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
4991 __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), $scratch$$Register);
4992 __ psrlq($tmp$$XMMRegister, $shift$$XMMRegister);
4993 __ pxor($dst$$XMMRegister, $tmp$$XMMRegister);
4994 __ psubq($dst$$XMMRegister, $tmp$$XMMRegister);
4995 } else {
4996 assert(vlen == 4, "sanity");
4997 assert(UseAVX > 1, "required");
4998 int vector_len = Assembler::AVX_256bit;
4999 __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
5000 __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), $scratch$$Register);
5001 __ vpsrlq($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vector_len);
5002 __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
5003 __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
5004 }
5005 %}
5006 ins_pipe( pipe_slow );
5007 %}
5008
5009 instruct vshiftL_arith_reg_evex(vec dst, vec src, vec shift) %{
5010 predicate(UseAVX > 2);
5011 match(Set dst (RShiftVL src shift));
5012 format %{ "vshiftq $dst,$src,$shift" %}
5013 ins_encode %{
5014 int vector_len = vector_length_encoding(this);
5015 __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
5016 %}
5017 ins_pipe( pipe_slow );
5018 %}
5019
5020 // --------------------------------- AND --------------------------------------
5021
5022 instruct vand(vec dst, vec src) %{
5023 predicate(UseAVX == 0);
5024 match(Set dst (AndV dst src));
5025 format %{ "pand $dst,$src\t! and vectors" %}
5026 ins_encode %{
5027 __ pand($dst$$XMMRegister, $src$$XMMRegister);
5028 %}
5029 ins_pipe( pipe_slow );
5030 %}
5031
5032 instruct vand_reg(vec dst, vec src1, vec src2) %{
5033 predicate(UseAVX > 0);
5034 match(Set dst (AndV src1 src2));
5035 format %{ "vpand $dst,$src1,$src2\t! and vectors" %}
5036 ins_encode %{
5037 int vector_len = vector_length_encoding(this);
5038 __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5039 %}
5040 ins_pipe( pipe_slow );
5041 %}
5042
5043 instruct vand_mem(vec dst, vec src, memory mem) %{
5044 predicate(UseAVX > 0);
5045 match(Set dst (AndV src (LoadVector mem)));
5046 format %{ "vpand $dst,$src,$mem\t! and vectors" %}
5047 ins_encode %{
5048 int vector_len = vector_length_encoding(this);
5049 __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5050 %}
5051 ins_pipe( pipe_slow );
5052 %}
5053
5054 // --------------------------------- OR ---------------------------------------
5055
5056 instruct vor(vec dst, vec src) %{
5057 predicate(UseAVX == 0);
5058 match(Set dst (OrV dst src));
5059 format %{ "por $dst,$src\t! or vectors" %}
5060 ins_encode %{
5061 __ por($dst$$XMMRegister, $src$$XMMRegister);
5062 %}
5063 ins_pipe( pipe_slow );
5064 %}
5065
5066 instruct vor_reg(vec dst, vec src1, vec src2) %{
5067 predicate(UseAVX > 0);
5068 match(Set dst (OrV src1 src2));
5069 format %{ "vpor $dst,$src1,$src2\t! or vectors" %}
5070 ins_encode %{
5071 int vector_len = vector_length_encoding(this);
5072 __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5073 %}
5074 ins_pipe( pipe_slow );
5075 %}
5076
5077 instruct vor_mem(vec dst, vec src, memory mem) %{
5078 predicate(UseAVX > 0);
5079 match(Set dst (OrV src (LoadVector mem)));
5080 format %{ "vpor $dst,$src,$mem\t! or vectors" %}
5081 ins_encode %{
5082 int vector_len = vector_length_encoding(this);
5083 __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5084 %}
5085 ins_pipe( pipe_slow );
5086 %}
5087
5088 // --------------------------------- XOR --------------------------------------
5089
5090 instruct vxor(vec dst, vec src) %{
5091 predicate(UseAVX == 0);
5092 match(Set dst (XorV dst src));
5093 format %{ "pxor $dst,$src\t! xor vectors" %}
5094 ins_encode %{
5095 __ pxor($dst$$XMMRegister, $src$$XMMRegister);
5096 %}
5097 ins_pipe( pipe_slow );
5098 %}
5099
5100 instruct vxor_reg(vec dst, vec src1, vec src2) %{
5101 predicate(UseAVX > 0);
5102 match(Set dst (XorV src1 src2));
5103 format %{ "vpxor $dst,$src1,$src2\t! xor vectors" %}
5104 ins_encode %{
5105 int vector_len = vector_length_encoding(this);
5106 __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5107 %}
5108 ins_pipe( pipe_slow );
5109 %}
5110
5111 instruct vxor_mem(vec dst, vec src, memory mem) %{
5112 predicate(UseAVX > 0);
5113 match(Set dst (XorV src (LoadVector mem)));
5114 format %{ "vpxor $dst,$src,$mem\t! xor vectors" %}
5115 ins_encode %{
5116 int vector_len = vector_length_encoding(this);
5117 __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5118 %}
5119 ins_pipe( pipe_slow );
5120 %}
5121
5122 // --------------------------------- ABS --------------------------------------
5123 // a = |a|
5124 instruct vabsB_reg(vec dst, vec src) %{
5125 match(Set dst (AbsVB src));
5126 format %{ "vabsb $dst,$src\t# $dst = |$src| abs packedB" %}
5127 ins_encode %{
5128 uint vlen = vector_length(this);
5129 if (vlen <= 16) {
5130 __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
5131 } else {
5132 int vlen_enc = vector_length_encoding(this);
5133 __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
5134 }
5135 %}
5136 ins_pipe( pipe_slow );
5137 %}
5138
5139 instruct vabsS_reg(vec dst, vec src) %{
5140 match(Set dst (AbsVS src));
5141 format %{ "vabsw $dst,$src\t# $dst = |$src| abs packedS" %}
5142 ins_encode %{
5143 uint vlen = vector_length(this);
5144 if (vlen <= 8) {
5145 __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
5146 } else {
5147 int vlen_enc = vector_length_encoding(this);
5148 __ vpabsw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
5149 }
5150 %}
5151 ins_pipe( pipe_slow );
5152 %}
5153
5154 instruct vabsI_reg(vec dst, vec src) %{
5155 match(Set dst (AbsVI src));
5156 format %{ "pabsd $dst,$src\t# $dst = |$src| abs packedI" %}
5157 ins_encode %{
5158 uint vlen = vector_length(this);
5159 if (vlen <= 4) {
5160 __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
5161 } else {
5162 int vlen_enc = vector_length_encoding(this);
5163 __ vpabsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
5164 }
5165 %}
5166 ins_pipe( pipe_slow );
5167 %}
5168
5169 instruct vabsL_reg(vec dst, vec src) %{
5170 match(Set dst (AbsVL src));
5171 format %{ "evpabsq $dst,$src\t# $dst = |$src| abs packedL" %}
5172 ins_encode %{
5173 assert(UseAVX > 2, "required");
5174 int vector_len = vector_length_encoding(this);
5175 __ evpabsq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
5176 %}
5177 ins_pipe( pipe_slow );
5178 %}
5179
5180 // --------------------------------- ABSNEG --------------------------------------
5181
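// Abs and neg of packed floats/doubles are implemented with bitmask
// constants: abs clears the sign bit (andps/andpd with 0x7FFF...) and neg
// flips it (xorps/xorpd with 0x8000...), selected by the ideal opcode.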
5182 instruct vabsnegF(vec dst, vec src, rRegI scratch) %{
5183 predicate(n->as_Vector()->length() != 4); // handled by 1-operand instruction vabsneg4F
5184 match(Set dst (AbsVF src));
5185 match(Set dst (NegVF src));
5186 effect(TEMP scratch);
5187 format %{ "vabsnegf $dst,$src,[mask]\t# absneg packedF" %}
5188 ins_cost(150);
5189 ins_encode %{
5190 int opcode = this->ideal_Opcode();
5191 int vlen = vector_length(this);
5192 if (vlen == 2) {
5193 __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister, $scratch$$Register);
5194 } else {
5195 assert(vlen == 8 || vlen == 16, "required");
5196 int vlen_enc = vector_length_encoding(this);
5197 __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc, $scratch$$Register);
5198 }
5199 %}
5200 ins_pipe( pipe_slow );
5201 %}
5202
5203 instruct vabsneg4F(vec dst, rRegI scratch) %{
5204 predicate(n->as_Vector()->length() == 4);
5205 match(Set dst (AbsVF dst));
5206 match(Set dst (NegVF dst));
5207 effect(TEMP scratch);
5208 format %{ "vabsnegf $dst,[mask]\t# absneg packed4F" %}
5209 ins_cost(150);
5210 ins_encode %{
5211 int opcode = this->ideal_Opcode();
5212 __ vabsnegf(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $scratch$$Register);
5213 %}
5214 ins_pipe( pipe_slow );
5215 %}
5216
5217 instruct vabsnegD(vec dst, vec src, rRegI scratch) %{
5218 match(Set dst (AbsVD src));
5219 match(Set dst (NegVD src));
5220 effect(TEMP scratch);
5221 format %{ "vabsnegd $dst,$src,[mask]\t# absneg packedD" %}
5222 ins_encode %{
5223 int opcode = this->ideal_Opcode();
5224 uint vlen = vector_length(this);
5225 if (vlen == 2) {
5226 assert(UseSSE >= 2, "required");
5227 __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $scratch$$Register);
5228 } else {
5229 int vlen_enc = vector_length_encoding(this);
5230 __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc, $scratch$$Register);
5231 }
5232 %}
5233 ins_pipe( pipe_slow );
5234 %}
5235
5236 // --------------------------------- FMA --------------------------------------
5237 // a * b + c
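// The fused form rounds once, matching java.lang.Math.fma semantics; a
// separate vmul + vadd would round twice.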
5238
5239 instruct vfmaF_reg(vec a, vec b, vec c) %{
5240 match(Set c (FmaVF c (Binary a b)));
5241 format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
5242 ins_cost(150);
5243 ins_encode %{
5244 assert(UseFMA, "not enabled");
5245 int vector_len = vector_length_encoding(this);
5246 __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
5247 %}
5248 ins_pipe( pipe_slow );
5249 %}
5250
5251 instruct vfmaF_mem(vec a, memory b, vec c) %{
5252 match(Set c (FmaVF c (Binary a (LoadVector b))));
5253 format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
5254 ins_cost(150);
5255 ins_encode %{
5256 assert(UseFMA, "not enabled");
5257 int vector_len = vector_length_encoding(this);
5258 __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
5259 %}
5260 ins_pipe( pipe_slow );
5261 %}
5262
5263 instruct vfmaD_reg(vec a, vec b, vec c) %{
5264 match(Set c (FmaVD c (Binary a b)));
5265 format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
5266 ins_cost(150);
5267 ins_encode %{
5268 assert(UseFMA, "not enabled");
5269 int vector_len = vector_length_encoding(this);
5270 __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
5271 %}
5272 ins_pipe( pipe_slow );
5273 %}
5274
5275 instruct vfmaD_mem(vec a, memory b, vec c) %{
5276 match(Set c (FmaVD c (Binary a (LoadVector b))));
5277 format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
5278 ins_cost(150);
5279 ins_encode %{
5280 assert(UseFMA, "not enabled");
5281 int vector_len = vector_length_encoding(this);
5282 __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
5283 %}
5284 ins_pipe( pipe_slow );
5285 %}
5286
5287 // --------------------------------- Vector Multiply Add --------------------------------------
5288
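// pmaddwd multiplies adjacent signed short pairs and adds each pair of 32-bit
// products into one int lane:
//   dst[i] = src1[2i]*src2[2i] + src1[2i+1]*src2[2i+1]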
5289 instruct vmuladdS2I_reg_sse(vec dst, vec src1) %{
5290 predicate(UseAVX == 0);
5291 match(Set dst (MulAddVS2VI dst src1));
5292 format %{ "pmaddwd $dst,$dst,$src1\t! muladd packedStoI" %}
5293 ins_encode %{
5294 __ pmaddwd($dst$$XMMRegister, $src1$$XMMRegister);
5295 %}
5296 ins_pipe( pipe_slow );
5297 %}
5298
5299 instruct vmuladdS2I_reg_avx(vec dst, vec src1, vec src2) %{
5300 predicate(UseAVX > 0);
5301 match(Set dst (MulAddVS2VI src1 src2));
5302 format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packedStoI" %}
5303 ins_encode %{
5304 int vector_len = vector_length_encoding(this);
5305 __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5306 %}
5307 ins_pipe( pipe_slow );
5308 %}
5309
5310 // --------------------------------- Vector Multiply Add Add ----------------------------------
5311
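// With AVX512_VNNI, evpdpwssd fuses the pmaddwd and the following vector add
// into a single instruction:
//   dst[i] += src1[2i]*src2[2i] + src1[2i+1]*src2[2i+1]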
5312 instruct vmuladdaddS2I_reg(vec dst, vec src1, vec src2) %{
5313 predicate(VM_Version::supports_avx512_vnni());
5314 match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
5315 format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packedStoI" %}
5316 ins_encode %{
5317 assert(UseAVX > 2, "required");
5318 int vector_len = vector_length_encoding(this);
5319 __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5320 %}
5321 ins_pipe( pipe_slow );
5322 ins_cost(10);
5323 %}
5324
5325 // --------------------------------- PopCount --------------------------------------
5326
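// vpopcntd (AVX512_VPOPCNTDQ) counts the set bits in each 32-bit lane
// independently, e.g. a lane holding 0x000000FF yields 8.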
5327 instruct vpopcountI(vec dst, vec src) %{
5328 match(Set dst (PopCountVI src));
5329 format %{ "vpopcntd $dst,$src\t! vector popcount packedI" %}
5330 ins_encode %{
5331 assert(UsePopCountInstruction, "not enabled");
5332
5333 int vector_len = vector_length_encoding(this);
5334 __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
5335 %}
5336 ins_pipe( pipe_slow );
5337 %}
5338
5339 // --------------------------------- Bitwise Ternary Logic ----------------------------------
5340
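// vpternlogd evaluates an arbitrary three-input boolean function bit-wise:
// for input bits a (dst), b (src2), c (src3), the result is bit
// ((a << 2) | (b << 1) | c) of the $func immediate. E.g. func 0x96 is a
// three-way XOR and 0xE8 is a bit-wise majority vote.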
5341 instruct vpternlog(vec dst, vec src2, vec src3, immU8 func) %{
5342 match(Set dst (MacroLogicV (Binary dst src2) (Binary src3 func)));
5343 effect(TEMP dst);
5344 format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
5345 ins_encode %{
5346 int vector_len = vector_length_encoding(this);
5347 __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$XMMRegister, vector_len);
5348 %}
5349 ins_pipe( pipe_slow );
5350 %}
5351
5352 instruct vpternlog_mem(vec dst, vec src2, memory src3, immU8 func) %{
5353 match(Set dst (MacroLogicV (Binary dst src2) (Binary (LoadVector src3) func)));
5354 effect(TEMP dst);
5355   format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
5356   ins_encode %{
5357     int vector_len = vector_length_encoding(this);
5358     __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$Address, vector_len);
5359   %}
5360   ins_pipe( pipe_slow );
5361 %}
1100 reg_class xmm0_reg(XMM0, XMM0b, XMM0c, XMM0d);
1101 %}
1169
1170 inline uint vector_length(const Node* n) {
1171 const TypeVect* vt = n->bottom_type()->is_vect();
1172 return vt->length();
1173 }
1174
1175 inline uint vector_length(const MachNode* use, MachOper* opnd) {
1176 uint def_idx = use->operand_index(opnd);
1177 Node* def = use->in(def_idx);
1178 return def->bottom_type()->is_vect()->length();
1179 }
1180
1181 inline uint vector_length_in_bytes(const Node* n) {
1182 const TypeVect* vt = n->bottom_type()->is_vect();
1183 return vt->length_in_bytes();
1184 }
1185
1186 inline uint vector_length_in_bytes(const MachNode* use, MachOper* opnd) {
1187 uint def_idx = use->operand_index(opnd);
1188 Node* def = use->in(def_idx);
1189 return def->bottom_type()->is_vect()->length_in_bytes();
1190 }
1191
1192 inline BasicType vector_element_basic_type(const Node *n) {
1193 return n->bottom_type()->is_vect()->element_basic_type();
1194 }
1195
1196 inline BasicType vector_element_basic_type(const MachNode *use, MachOper* opnd) {
1197 uint def_idx = use->operand_index(opnd);
1198 Node* def = use->in(def_idx);
1199 return def->bottom_type()->is_vect()->element_basic_type();
1200 }
1201
1202 inline Assembler::AvxVectorLen vector_length_encoding(int bytes) {
1203   switch (bytes) {
1204 case 4: // fall-through
1205 case 8: // fall-through
1206 case 16: return Assembler::AVX_128bit;
1207 case 32: return Assembler::AVX_256bit;
1208 case 64: return Assembler::AVX_512bit;
1209
1210 default: {
1211 ShouldNotReachHere();
1212 return Assembler::AVX_NoVec;
1213 }
1214 }
1215 }
1216
1217 static inline Assembler::AvxVectorLen vector_length_encoding(const Node* n) {
1218 return vector_length_encoding(vector_length_in_bytes(n));
1219 }
1220
1221 static inline Assembler::AvxVectorLen vector_length_encoding(const MachNode* use, MachOper* opnd) {
1222 uint def_idx = use->operand_index(opnd);
1223 Node* def = use->in(def_idx);
1224 return vector_length_encoding(def);
1225 }
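// The instruct encodings in this file typically call vector_length_encoding(this)
// to pick the AVX encoding from the node's vector type; e.g. a 4-element int
// vector is 16 bytes and therefore encodes as Assembler::AVX_128bit.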
1226
1235 %} // end source_hpp
1236
1237 source %{
1238
1239 #include "opto/addnode.hpp"
1240 #include "c2_intelJccErratum_x86.hpp"
1241
1242 void PhaseOutput::pd_perform_mach_node_analysis() {
1243 if (VM_Version::has_intel_jcc_erratum()) {
1244 int extra_padding = IntelJccErratum::tag_affected_machnodes(C, C->cfg(), C->regalloc());
1245 _buf_sizes._code += extra_padding;
1246 }
1304 Label next;
1305 // push a "the_pc" on the stack without destroying any registers
1306 // as they all may be live.
1307
1308 // push address of "next"
1309 __ call(next, relocInfo::none); // reloc none is fine since it is a disp32
1310 __ bind(next);
1311 // adjust it so it matches "the_pc"
1312 __ subptr(Address(rsp, 0), __ offset() - offset);
1313 #else
1314 InternalAddress here(__ pc());
1315 __ pushptr(here.addr());
1316 #endif
1317
1318 __ jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
1319 assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow %d", (__ offset() - offset));
1320 __ end_a_stub();
1321 return offset;
1322 }
1323
1324 Assembler::Width widthForType(BasicType bt) {
1325 if (bt == T_BYTE) {
1326 return Assembler::B;
1327 } else if (bt == T_SHORT) {
1328 return Assembler::W;
1329 } else if (bt == T_INT) {
1330 return Assembler::D;
1331 } else {
1332 assert(bt == T_LONG, "not a long: %s", type2name(bt));
1333 return Assembler::Q;
1334 }
1335 }
1336
1337 //=============================================================================
1338
1339 // Float masks come from different places depending on platform.
1340 #ifdef _LP64
1341 static address float_signmask() { return StubRoutines::x86::float_sign_mask(); }
1342 static address float_signflip() { return StubRoutines::x86::float_sign_flip(); }
1343 static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
1344 static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
1345 #else
1346 static address float_signmask() { return (address)float_signmask_pool; }
1347 static address float_signflip() { return (address)float_signflip_pool; }
1348 static address double_signmask() { return (address)double_signmask_pool; }
1349 static address double_signflip() { return (address)double_signflip_pool; }
1350 #endif
1351 static address vector_short_to_byte_mask() { return StubRoutines::x86::vector_short_to_byte_mask(); }
1352 static address vector_int_to_byte_mask() { return StubRoutines::x86::vector_int_to_byte_mask(); }
1353 static address vector_byte_perm_mask() { return StubRoutines::x86::vector_byte_perm_mask(); }
1354 static address vector_long_sign_mask() { return StubRoutines::x86::vector_long_sign_mask(); }
1355 static address vector_all_bits_set() { return StubRoutines::x86::vector_all_bits_set(); }
1356 static address vector_int_to_short_mask() { return StubRoutines::x86::vector_int_to_short_mask(); }
1357 static address vector_short_shufflemask() { return StubRoutines::x86::vector_short_shuffle_mask(); }
1358 static address vector_int_shufflemask() { return StubRoutines::x86::vector_int_shuffle_mask(); }
1359 static address vector_long_shufflemask() { return StubRoutines::x86::vector_long_shuffle_mask(); }
1360 static address vector_32_bit_mask() { return StubRoutines::x86::vector_32_bit_mask(); }
1361 static address vector_64_bit_mask() { return StubRoutines::x86::vector_64_bit_mask(); }
1362
1363 //=============================================================================
1364 const bool Matcher::match_rule_supported(int opcode) {
1365 if (!has_match_rule(opcode)) {
1366 return false; // no match rule present
1367 }
1368 switch (opcode) {
1369 case Op_AbsVL:
1370 case Op_StoreVectorScatter:
1371 if (UseAVX < 3) {
1372 return false;
1373 }
1374 break;
1375 case Op_PopCountI:
1376 case Op_PopCountL:
1377 if (!UsePopCountInstruction) {
1378 return false;
1379 }
1380 break;
1381 case Op_PopCountVI:
1382 if (!UsePopCountInstruction || !VM_Version::supports_avx512_vpopcntdq()) {
1383 return false;
1384 }
1385 break;
1386 case Op_MulVI:
1387 if ((UseSSE < 4) && (UseAVX < 1)) { // only with SSE4_1 or AVX
1388 return false;
1389 }
1390 break;
1391 case Op_MulVL:
1392 if (UseSSE < 4) { // only with SSE4_1 or AVX
1393 return false;
1394 }
1395 break;
1396 case Op_MulReductionVL:
1397 if (VM_Version::supports_avx512dq() == false) {
1398 return false;
1399 }
1400 break;
1401 case Op_AddReductionVL:
1402 if (UseSSE < 2) { // requires at least SSE2
1403 return false;
1404 }
1405 break;
1406 case Op_AbsVB:
1407 case Op_AbsVS:
1408 case Op_AbsVI:
1409 case Op_AddReductionVI:
1410 case Op_AndReductionV:
1411 case Op_OrReductionV:
1412 case Op_XorReductionV:
1413 if (UseSSE < 3) { // requires at least SSSE3
1414 return false;
1415 }
1416 break;
1417 case Op_VectorLoadShuffle:
1418 case Op_VectorRearrange:
1419 case Op_MulReductionVI:
1420 if (UseSSE < 4) { // requires at least SSE4
1421 return false;
1422 }
1423 break;
1424 case Op_SqrtVD:
1425 case Op_SqrtVF:
1426 case Op_VectorMaskCmp:
1427 case Op_VectorCastB2X:
1428 case Op_VectorCastS2X:
1429 case Op_VectorCastI2X:
1430 case Op_VectorCastL2X:
1431 case Op_VectorCastF2X:
1432 case Op_VectorCastD2X:
1433 if (UseAVX < 1) { // enabled for AVX only
1434 return false;
1435 }
1436 break;
1437 case Op_CompareAndSwapL:
1438 #ifdef _LP64
1439 case Op_CompareAndSwapP:
1440 #endif
1441 if (!VM_Version::supports_cx8()) {
1442 return false;
1443 }
1444 break;
1445 case Op_CMoveVF:
1446 case Op_CMoveVD:
1447 if (UseAVX < 1) { // enabled for AVX only
1448 return false;
1449 }
1450 break;
1451 case Op_StrIndexOf:
1452 if (!UseSSE42Intrinsics) {
1453 return false;
1454 }
1455 break;
1456 case Op_StrIndexOfChar:
1457 if (!UseSSE42Intrinsics) {
1458 return false;
1459 }
1460 break;
1461 case Op_OnSpinWait:
1462 if (VM_Version::supports_on_spin_wait() == false) {
1463 return false;
1464 }
1465 break;
1466 case Op_MulVB:
1467 case Op_LShiftVB:
1468 case Op_RShiftVB:
1469 case Op_URShiftVB:
1470 case Op_VectorInsert:
1471 case Op_VectorLoadMask:
1472 case Op_VectorStoreMask:
1473 case Op_VectorBlend:
1474 if (UseSSE < 4) {
1475 return false;
1476 }
1477 break;
1478 #ifdef _LP64
1479 case Op_MaxD:
1480 case Op_MaxF:
1481 case Op_MinD:
1482 case Op_MinF:
1483 if (UseAVX < 1) { // enabled for AVX only
1484 return false;
1485 }
1486 break;
1487 #endif
1488 case Op_CacheWB:
1489 case Op_CacheWBPreSync:
1490 case Op_CacheWBPostSync:
1491 if (!VM_Version::supports_data_cache_line_flush()) {
1492 return false;
1493 }
1494 break;
1495 case Op_ExtractB:
1496 case Op_ExtractL:
1497 case Op_ExtractI:
1498 case Op_RoundDoubleMode:
1499 if (UseSSE < 4) {
1500 return false;
1501 }
1502 break;
1503 case Op_RoundDoubleModeV:
1504 if (VM_Version::supports_avx() == false) {
1505 return false; // 128bit vroundpd is not available
1506 }
1507 break;
1508 case Op_LoadVectorGather:
1509 if (UseAVX < 2) {
1510 return false;
1511 }
1512 break;
1513 case Op_FmaVD:
1514 case Op_FmaVF:
1515 if (!UseFMA) {
1516 return false;
1517 }
1518 break;
1519 case Op_MacroLogicV:
1520 if (UseAVX < 3 || !UseVectorMacroLogic) {
1521 return false;
1522 }
1523 break;
1524 #ifndef _LP64
1525 case Op_AddReductionVF:
1526 case Op_AddReductionVD:
1527 case Op_MulReductionVF:
1528 case Op_MulReductionVD:
1529 if (UseSSE < 1) { // requires at least SSE
1530 return false;
1531 }
1532 break;
1533 case Op_MulAddVS2VI:
1534 case Op_RShiftVL:
1535 case Op_AbsVD:
1536 case Op_NegVD:
1537 if (UseSSE < 2) {
1538 return false;
1559 // * AVX512BW supports 512bit vectors for BYTE, SHORT, and CHAR types.
1560 // There's also a limit on minimum vector size supported: 2 elements (or 4 bytes for BYTE).
1561 // And MaxVectorSize is taken into account as well.
1562 if (!vector_size_supported(bt, vlen)) {
1563 return false;
1564 }
1565 // Special cases which require vector length follow:
1566 // * implementation limitations
1567 // * some 512bit vector operations on FLOAT and DOUBLE types require AVX512DQ
1568 // * 128bit vroundpd instruction is present only in AVX1
1569 int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
1570 switch (opcode) {
1571 case Op_AbsVF:
1572 case Op_NegVF:
1573 if ((vlen == 16) && (VM_Version::supports_avx512dq() == false)) {
1574 return false; // 512bit vandps and vxorps are not available
1575 }
1576 break;
1577 case Op_AbsVD:
1578 case Op_NegVD:
1579 case Op_MulVL:
1580 if ((vlen == 8) && (VM_Version::supports_avx512dq() == false)) {
1581 return false; // 512bit vpmullq, vandpd and vxorpd are not available
1582 }
1583 break;
1584 case Op_CMoveVF:
1585 if (vlen != 8) {
1586 return false; // implementation limitation (only vcmov8F_reg is present)
1587 }
1588 break;
1589 case Op_MacroLogicV:
1590 if (!VM_Version::supports_evex() ||
1591 ((size_in_bits != 512) && !VM_Version::supports_avx512vl())) {
1592 return false;
1593 }
1594 break;
1595 case Op_CMoveVD:
1596 if (vlen != 4) {
1597 return false; // implementation limitation (only vcmov4D_reg is present)
1598 }
1599 break;
1600 case Op_MaxV:
1601 case Op_MinV:
1602 if (UseSSE < 4 && is_integral_type(bt)) {
1603 return false;
1604 }
1605 if ((bt == T_FLOAT || bt == T_DOUBLE)) {
1606 // Float/Double intrinsics are enabled for AVX family currently.
1607 if (UseAVX == 0) {
1608 return false;
1609 }
1610 if (UseAVX > 2 && (!VM_Version::supports_avx512dq() && size_in_bits == 512)) { // 512 bit Float/Double intrinsics need AVX512DQ
1611 return false;
1612 }
1613 }
1614 break;
1615 case Op_AddReductionVI:
1616 if (bt == T_INT && (UseSSE < 3 || !VM_Version::supports_ssse3())) {
1617 return false;
1618 }
1619 // fallthrough
1620 case Op_AndReductionV:
1621 case Op_OrReductionV:
1622 case Op_XorReductionV:
1623 if (is_subword_type(bt) && (UseSSE < 4)) {
1624 return false;
1625 }
1626 #ifndef _LP64
1627 if (bt == T_BYTE || bt == T_LONG) {
1628 return false;
1629 }
1630 #endif
1631 break;
1632 #ifndef _LP64
1633 case Op_VectorInsert:
1634 if (bt == T_LONG || bt == T_DOUBLE) {
1635 return false;
1636 }
1637 break;
1638 #endif
1639 case Op_MinReductionV:
1640 case Op_MaxReductionV:
1641 if ((bt == T_INT || is_subword_type(bt)) && UseSSE < 4) {
1642 return false;
1643 } else if (bt == T_LONG && (UseAVX < 3 || !VM_Version::supports_avx512vlbwdq())) {
1644 return false;
1645 }
1646 // Float/Double min/max reductions are currently enabled only for the AVX family.
1647 if (UseAVX == 0 && (bt == T_FLOAT || bt == T_DOUBLE)) {
1648 return false;
1649 }
1650 if (UseAVX > 2 && size_in_bits == 512 && !VM_Version::supports_avx512dq()) {
1651 return false;
1652 }
1653 #ifndef _LP64
1654 if (bt == T_BYTE || bt == T_LONG) {
1655 return false;
1656 }
1657 #endif
1658 break;
1659 case Op_VectorTest:
1660 if (UseSSE < 4) {
1661 return false; // Implementation limitation
1662 } else if (size_in_bits < 128) {
1663 return false; // Implementation limitation
1664 } else if (size_in_bits == 512 && !VM_Version::supports_avx512bw()) {
1665 return false; // Implementation limitation
1666 }
1667 break;
1668 case Op_VectorLoadShuffle:
1669 case Op_VectorRearrange:
1670 if (vlen == 2) {
1671 return false; // Implementation limitation due to how shuffle is loaded
1672 } else if (size_in_bits == 256 && UseAVX < 2) {
1673 return false; // Implementation limitation
1674 } else if (bt == T_BYTE && size_in_bits >= 256 && !VM_Version::supports_avx512_vbmi()) {
1675 return false; // Implementation limitation
1676 } else if (bt == T_SHORT && size_in_bits >= 256 && !VM_Version::supports_avx512bw()) {
1677 return false; // Implementation limitation
1678 }
1679 break;
1680 case Op_VectorLoadMask:
1681 if (size_in_bits == 256 && UseAVX < 2) {
1682 return false; // Implementation limitation
1683 }
1684 // fallthrough
1685 case Op_VectorStoreMask:
1686 if (vlen == 2) {
1687 return false; // Implementation limitation
1688 }
1689 break;
1690 case Op_VectorCastB2X:
1691 if (size_in_bits == 256 && UseAVX < 2) {
1692 return false; // Implementation limitation
1693 }
1694 break;
1695 case Op_VectorCastS2X:
1696 if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) {
1697 return false;
1698 }
1699 break;
1700 case Op_VectorCastI2X:
1701 if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) {
1702 return false;
1703 }
1704 break;
1705 case Op_VectorCastL2X:
1706 if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) {
1707 return false;
1708 } else if (!is_integral_type(bt) && !VM_Version::supports_avx512dq()) {
1709 return false;
1710 }
1711 break;
1712 case Op_VectorCastF2X:
1713 case Op_VectorCastD2X:
1714 if (is_integral_type(bt)) {
1715 // Casts from FP to integral types require special fixup logic not easily
1716 // implementable with vectors.
1717 return false; // Implementation limitation
1718 }
break;
1719 case Op_MulReductionVI:
1720 if (bt == T_BYTE && size_in_bits == 512 && !VM_Version::supports_avx512bw()) {
1721 return false;
1722 }
1723 break;
1724 case Op_StoreVectorScatter:
1725 if (bt == T_BYTE || bt == T_SHORT) {
1726 return false;
1727 } else if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
1728 return false;
1729 }
1730 // fallthrough
1731 case Op_LoadVectorGather:
1732 if (size_in_bits == 64) {
1733 return false;
1734 }
1735 break;
1736 }
1737 return true; // By default, match rules are supported.
1738 }
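// Illustrative queries (assuming a plain AVX2 machine with the default MaxVectorSize):
// match_rule_supported_vector(Op_RoundDoubleModeV, 4, T_DOUBLE) returns true, while
// match_rule_supported_vector(Op_MacroLogicV, 8, T_INT) returns false for lack of EVEX support.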
1739
1740 // x86 supports generic vector operands: vec and legVec.
1741 const bool Matcher::supports_generic_vector_operands = true;
1742
1743 MachOper* Matcher::pd_specialize_generic_vector_operand(MachOper* generic_opnd, uint ideal_reg, bool is_temp) {
1744 assert(Matcher::is_generic_vector(generic_opnd), "not generic");
1745 bool legacy = (generic_opnd->opcode() == LEGVEC);
1746 if (!VM_Version::supports_avx512vlbwdq() && // KNL
1747 is_temp && !legacy && (ideal_reg == Op_VecZ)) {
1748 // Conservatively specialize 512bit vec TEMP operands to legVecZ (zmm0-15) on KNL.
1749 return new legVecZOper();
1750 }
1751 if (legacy) {
1752 switch (ideal_reg) {
1753 case Op_VecS: return new legVecSOper();
1754 case Op_VecD: return new legVecDOper();
1755 case Op_VecX: return new legVecXOper();
1774 case MoveVec2Leg_rule:
1775 case MoveLeg2Vec_rule:
1776 return true;
1777 default:
1778 return false;
1779 }
1780 }
1781
1782 bool Matcher::is_generic_vector(MachOper* opnd) {
1783 switch (opnd->opcode()) {
1784 case VEC:
1785 case LEGVEC:
1786 return true;
1787 default:
1788 return false;
1789 }
1790 }
1791
1792 //------------------------------------------------------------------------
1793
1794 bool Matcher::supports_vector_variable_shifts(void) {
1795 return (UseAVX >= 2);
1796 }
1797
1798 const bool Matcher::has_predicated_vectors(void) {
1799 return (UseAVX > 2) && VM_Version::supports_avx512vl();
1805 }
1806
1807 const int Matcher::float_pressure(int default_pressure_threshold) {
1808 int float_pressure_threshold = default_pressure_threshold;
1809 #ifdef _LP64
1810 if (UseAVX > 2) {
1811 // Increase pressure threshold on machines with AVX3 which have
1812 // 2x more XMM registers.
1813 float_pressure_threshold = default_pressure_threshold * 2;
1814 }
1815 #endif
1816 return float_pressure_threshold;
1817 }
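// Illustrative: a caller-supplied threshold of 14 becomes 28 on AVX-512 machines,
// reflecting the doubled XMM register file (xmm0-xmm31).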
2061 } else {
2062 mstack.push(adr, Pre_Visit);
2063 }
2064
2065 // Clone X+offset as it also folds into most addressing expressions
2066 mstack.push(off, Visit);
2067 mstack.push(m->in(AddPNode::Base), Pre_Visit);
2068 return true;
2069 } else if (clone_shift(off, this, mstack, address_visited)) {
2070 address_visited.test_set(m->_idx); // Flag as address_visited
2071 mstack.push(m->in(AddPNode::Address), Pre_Visit);
2072 mstack.push(m->in(AddPNode::Base), Pre_Visit);
2073 return true;
2074 }
2075 return false;
2076 }
2077
2078 void Compile::reshape_address(AddPNode* addp) {
2079 }
2080
2081 static inline Assembler::ComparisonPredicate booltest_pred_to_comparison_pred(int bt) {
2082 switch (bt) {
2083 case BoolTest::eq: return Assembler::eq;
2084 case BoolTest::ne: return Assembler::neq;
2085 case BoolTest::le: return Assembler::le;
2086 case BoolTest::ge: return Assembler::nlt;
2087 case BoolTest::lt: return Assembler::lt;
2088 case BoolTest::gt: return Assembler::nle;
2089 default: ShouldNotReachHere(); return Assembler::_false;
2090 }
2091 }
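// (The integer predicate encoding has no ge/gt forms, hence the complements nlt/nle above.)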
2092
2093 static inline Assembler::ComparisonPredicateFP booltest_pred_to_comparison_pred_fp(int bt) {
2094 switch (bt) {
2095 case BoolTest::eq: return Assembler::EQ_OQ; // ordered non-signaling
2096 // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare.
2097 case BoolTest::ne: return Assembler::NEQ_UQ; // unordered non-signaling
2098 case BoolTest::le: return Assembler::LE_OQ; // ordered non-signaling
2099 case BoolTest::ge: return Assembler::GE_OQ; // ordered non-signaling
2100 case BoolTest::lt: return Assembler::LT_OQ; // ordered non-signaling
2101 case BoolTest::gt: return Assembler::GT_OQ; // ordered non-signaling
2102 default: ShouldNotReachHere(); return Assembler::FALSE_OS;
2103 }
2104 }
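// Concretely: a vector compare with NEQ_UQ sets a lane to all-ones when either input
// is NaN, matching the Java rule that (NaN != x) is true for every x.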
2105
2106 // Helper methods for MachSpillCopyNode::implementation().
2107 static int vec_mov_helper(CodeBuffer *cbuf, bool do_size, int src_lo, int dst_lo,
2108 int src_hi, int dst_hi, uint ireg, outputStream* st) {
2109 // In the 64-bit VM, size calculation is very complex, so the size is
2110 // obtained by emitting the instructions into a scratch buffer.
2111 LP64_ONLY( assert(!do_size, "this method calculates size only for 32-bit VM"); )
2112 assert(ireg == Op_VecS || // 32bit vector
2113 ((src_lo & 1) == 0 && (src_lo + 1) == src_hi &&
2114 (dst_lo & 1) == 0 && (dst_lo + 1) == dst_hi),
2115 "no non-adjacent vector moves");
2116 if (cbuf) {
2117 C2_MacroAssembler _masm(cbuf);
2118 int offset = __ offset();
2119 switch (ireg) {
2120 case Op_VecS: // copy whole register
2121 case Op_VecD:
2122 case Op_VecX:
2409 %}
2410
2411 encode %{
2412
2413 enc_class call_epilog %{
2414 if (VerifyStackAtCalls) {
2415 // Check that stack depth is unchanged: find majik cookie on stack
2416 int framesize = ra_->reg2offset_unchecked(OptoReg::add(ra_->_matcher._old_SP, -3*VMRegImpl::slots_per_word));
2417 C2_MacroAssembler _masm(&cbuf);
2418 Label L;
2419 __ cmpptr(Address(rsp, framesize), (int32_t)0xbadb100d);
2420 __ jccb(Assembler::equal, L);
2421 // Die if stack mismatch
2422 __ int3();
2423 __ bind(L);
2424 }
2425 %}
2426
2427 %}
2428
2429 // Operands for bound floating pointer register arguments
2430 operand rxmm0() %{
2431 constraint(ALLOC_IN_RC(xmm0_reg));
2432 match(VecX);
2433 format %{ %}
2434 interface(REG_INTER);
2435 %}
2436
2437 //----------OPERANDS-----------------------------------------------------------
2438 // Operand definitions must precede instruction definitions for correct parsing
2439 // in the ADLC because operands constitute user-defined types which are used in
2440 // instruction definitions.
2441
2442 // Vectors
2443
2444 // Dummy generic vector class. Should be used for all vector operands.
2445 // Replaced with vec[SDXYZ] during post-selection pass.
2446 operand vec() %{
2447 constraint(ALLOC_IN_RC(dynamic));
2448 match(VecX);
2449 match(VecY);
2450 match(VecZ);
2451 match(VecS);
2452 match(VecD);
2453
2454 format %{ %}
2455 interface(REG_INTER);
3182 ins_pipe(pipe_slow);
3183 %}
3184
3185 instruct absF_reg(regF dst) %{
3186 predicate((UseSSE>=1) && (UseAVX == 0));
3187 match(Set dst (AbsF dst));
3188 ins_cost(150);
3189 format %{ "andps $dst, [0x7fffffff]\t# abs float by sign masking" %}
3190 ins_encode %{
3191 __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
3192 %}
3193 ins_pipe(pipe_slow);
3194 %}
3195
3196 instruct absF_reg_reg(vlRegF dst, vlRegF src) %{
3197 predicate(UseAVX > 0);
3198 match(Set dst (AbsF src));
3199 ins_cost(150);
3200 format %{ "vandps $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
3201 ins_encode %{
3202 int vlen_enc = Assembler::AVX_128bit;
3203 __ vandps($dst$$XMMRegister, $src$$XMMRegister,
3204 ExternalAddress(float_signmask()), vlen_enc);
3205 %}
3206 ins_pipe(pipe_slow);
3207 %}
3208
3209 instruct absD_reg(regD dst) %{
3210 predicate((UseSSE>=2) && (UseAVX == 0));
3211 match(Set dst (AbsD dst));
3212 ins_cost(150);
3213 format %{ "andpd $dst, [0x7fffffffffffffff]\t"
3214 "# abs double by sign masking" %}
3215 ins_encode %{
3216 __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
3217 %}
3218 ins_pipe(pipe_slow);
3219 %}
3220
3221 instruct absD_reg_reg(vlRegD dst, vlRegD src) %{
3222 predicate(UseAVX > 0);
3223 match(Set dst (AbsD src));
3224 ins_cost(150);
3225 format %{ "vandpd $dst, $src, [0x7fffffffffffffff]\t"
3226 "# abs double by sign masking" %}
3227 ins_encode %{
3228 int vlen_enc = Assembler::AVX_128bit;
3229 __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
3230 ExternalAddress(double_signmask()), vlen_enc);
3231 %}
3232 ins_pipe(pipe_slow);
3233 %}
3234
3235 instruct negF_reg(regF dst) %{
3236 predicate((UseSSE>=1) && (UseAVX == 0));
3237 match(Set dst (NegF dst));
3238 ins_cost(150);
3239 format %{ "xorps $dst, [0x80000000]\t# neg float by sign flipping" %}
3240 ins_encode %{
3241 __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
3242 %}
3243 ins_pipe(pipe_slow);
3244 %}
3245
3246 instruct negF_reg_reg(vlRegF dst, vlRegF src) %{
3247 predicate(UseAVX > 0);
3248 match(Set dst (NegF src));
3249 ins_cost(150);
3250 format %{ "vnegatess $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
3334
3335 format %{ "sqrtsd $dst, $src" %}
3336 ins_cost(150);
3337 ins_encode %{
3338 __ sqrtsd($dst$$XMMRegister, $src$$Address);
3339 %}
3340 ins_pipe(pipe_slow);
3341 %}
3342
3343 instruct sqrtD_imm(regD dst, immD con) %{
3344 predicate(UseSSE>=2);
3345 match(Set dst (SqrtD con));
3346 format %{ "sqrtsd $dst, [$constantaddress]\t# load from constant table: double=$con" %}
3347 ins_cost(150);
3348 ins_encode %{
3349 __ sqrtsd($dst$$XMMRegister, $constantaddress($con));
3350 %}
3351 ins_pipe(pipe_slow);
3352 %}
3353
3354 // ---------------------------------------- VectorReinterpret ------------------------------------
3355
3356 instruct reinterpret(vec dst) %{
3357 predicate(vector_length_in_bytes(n) == vector_length_in_bytes(n->in(1))); // dst == src
3358 match(Set dst (VectorReinterpret dst));
3359 ins_cost(125);
3360 format %{ "vector_reinterpret $dst\t!" %}
3361 ins_encode %{
3362 // empty
3363 %}
3364 ins_pipe( pipe_slow );
3365 %}
3366
3367 instruct reinterpret_expand(vec dst, vec src, rRegP scratch) %{
3368 predicate(UseAVX == 0 &&
3369 (vector_length_in_bytes(n->in(1)) < vector_length_in_bytes(n))); // src < dst
3370 match(Set dst (VectorReinterpret src));
3371 ins_cost(125);
3372 effect(TEMP dst, TEMP scratch);
3373 format %{ "vector_reinterpret_expand $dst,$src\t! using $scratch as TEMP" %}
3374 ins_encode %{
3375 assert(vector_length_in_bytes(this) <= 16, "required");
3376 assert(vector_length_in_bytes(this, $src) <= 8, "required");
3377
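// A widening reinterpret must zero the new upper lanes: AND the source with a mask
// that preserves only its low 4 or 8 bytes.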
3378 int src_vlen_in_bytes = vector_length_in_bytes(this, $src);
3379 if (src_vlen_in_bytes == 4) {
3380 __ movdqu($dst$$XMMRegister, ExternalAddress(vector_32_bit_mask()), $scratch$$Register);
3381 } else {
3382 assert(src_vlen_in_bytes == 8, "");
3383 __ movdqu($dst$$XMMRegister, ExternalAddress(vector_64_bit_mask()), $scratch$$Register);
3384 }
3385 __ pand($dst$$XMMRegister, $src$$XMMRegister);
3386 %}
3387 ins_pipe( pipe_slow );
3388 %}
3389
3390 instruct vreinterpret_expand4(legVec dst, vec src, rRegP scratch) %{
3391 predicate(UseAVX > 0 &&
3392 (vector_length_in_bytes(n->in(1)) == 4) && // src
3393 (vector_length_in_bytes(n->in(1)) < vector_length_in_bytes(n))); // src < dst
3394 match(Set dst (VectorReinterpret src));
3395 ins_cost(125);
3396 effect(TEMP scratch);
3397 format %{ "vector_reinterpret_expand $dst,$src\t! using $scratch as TEMP" %}
3398 ins_encode %{
3399 __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_32_bit_mask()), 0, $scratch$$Register);
3400 %}
3401 ins_pipe( pipe_slow );
3402 %}
3403
3404
3405 instruct vreinterpret_expand(legVec dst, vec src) %{
3406 predicate(UseAVX > 0 &&
3407 (vector_length_in_bytes(n->in(1)) > 4) && // src
3408 (vector_length_in_bytes(n->in(1)) < vector_length_in_bytes(n))); // src < dst
3409 match(Set dst (VectorReinterpret src));
3410 ins_cost(125);
3411 format %{ "vector_reinterpret_expand $dst,$src\t!" %}
3412 ins_encode %{
3413 switch (vector_length_in_bytes(this, $src)) {
3414 case 8: __ movq ($dst$$XMMRegister, $src$$XMMRegister); break;
3415 case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
3416 case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
3417 default: ShouldNotReachHere();
3418 }
3419 %}
3420 ins_pipe( pipe_slow );
3421 %}
3422
3423 instruct reinterpret_shrink(vec dst, legVec src) %{
3424 predicate(vector_length_in_bytes(n->in(1)) > vector_length_in_bytes(n)); // src > dst
3425 match(Set dst (VectorReinterpret src));
3426 ins_cost(125);
3427 format %{ "vector_reinterpret_shrink $dst,$src\t!" %}
3428 ins_encode %{
3429 switch (vector_length_in_bytes(this)) {
3430 case 4: __ movflt ($dst$$XMMRegister, $src$$XMMRegister); break;
3431 case 8: __ movq ($dst$$XMMRegister, $src$$XMMRegister); break;
3432 case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
3433 case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
3434 default: ShouldNotReachHere();
3435 }
3436 %}
3437 ins_pipe( pipe_slow );
3438 %}
3439
3440 // ----------------------------------------------------------------------------------------------------
3441
3442 #ifdef _LP64
3443 instruct roundD_reg(legRegD dst, legRegD src, immU8 rmode) %{
3444 match(Set dst (RoundDoubleMode src rmode));
3445 format %{ "roundsd $dst,$src" %}
3446 ins_cost(150);
3447 ins_encode %{
3448 assert(UseSSE >= 4, "required");
3449 __ roundsd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant);
3450 %}
3451 ins_pipe(pipe_slow);
3452 %}
3453
3454 instruct roundD_mem(legRegD dst, memory src, immU8 rmode) %{
3455 match(Set dst (RoundDoubleMode (LoadD src) rmode));
3456 format %{ "roundsd $dst,$src" %}
3457 ins_cost(150);
3458 ins_encode %{
3459 assert(UseSSE >= 4, "required");
3460 __ roundsd($dst$$XMMRegister, $src$$Address, $rmode$$constant);
3461 %}
3462 ins_pipe(pipe_slow);
3463 %}
3464
3465 instruct roundD_imm(legRegD dst, immD con, immU8 rmode, rRegI scratch_reg) %{
3466 match(Set dst (RoundDoubleMode con rmode));
3467 effect(TEMP scratch_reg);
3468 format %{ "roundsd $dst,[$constantaddress]\t# load from constant table: double=$con" %}
3469 ins_cost(150);
3470 ins_encode %{
3471 assert(UseSSE >= 4, "required");
3472 __ roundsd($dst$$XMMRegister, $constantaddress($con), $rmode$$constant, $scratch_reg$$Register);
3473 %}
3474 ins_pipe(pipe_slow);
3475 %}
3476
3477 instruct vroundD_reg(legVec dst, legVec src, immU8 rmode) %{
3478 predicate(vector_length(n) < 8);
3479 match(Set dst (RoundDoubleModeV src rmode));
3480 format %{ "vroundpd $dst,$src,$rmode\t! round packedD" %}
3481 ins_encode %{
3482 assert(UseAVX > 0, "required");
3483 int vlen_enc = vector_length_encoding(this);
3484 __ vroundpd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, vlen_enc);
3485 %}
3486 ins_pipe( pipe_slow );
3487 %}
3488
3489 instruct vround8D_reg(vec dst, vec src, immU8 rmode) %{
3490 predicate(vector_length(n) == 8);
3491 match(Set dst (RoundDoubleModeV src rmode));
3492 format %{ "vrndscalepd $dst,$src,$rmode\t! round packed8D" %}
3493 ins_encode %{
3494 assert(UseAVX > 2, "required");
3495 __ vrndscalepd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, Assembler::AVX_512bit);
3496 %}
3497 ins_pipe( pipe_slow );
3498 %}
3499
3500 instruct vroundD_mem(legVec dst, memory mem, immU8 rmode) %{
3501 predicate(vector_length(n) < 8);
3502 match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
3503 format %{ "vroundpd $dst, $mem, $rmode\t! round packedD" %}
3504 ins_encode %{
3505 assert(UseAVX > 0, "required");
3506 int vlen_enc = vector_length_encoding(this);
3507 __ vroundpd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, vlen_enc);
3508 %}
3509 ins_pipe( pipe_slow );
3510 %}
3511
3512 instruct vround8D_mem(vec dst, memory mem, immU8 rmode) %{
3513 predicate(vector_length(n) == 8);
3514 match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
3515 format %{ "vrndscalepd $dst,$mem,$rmode\t! round packed8D" %}
3516 ins_encode %{
3517 assert(UseAVX > 2, "required");
3518 __ vrndscalepd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, Assembler::AVX_512bit);
3519 %}
3520 ins_pipe( pipe_slow );
3521 %}
3522 #endif // _LP64
3523
3524 instruct onspinwait() %{
3525 match(OnSpinWait);
3526 ins_cost(200);
3527
3528 format %{
3529 $$template
3530 $$emit$$"pause\t! membar_onspinwait"
3531 %}
3532 ins_encode %{
3533 __ pause();
3565 instruct MoveVec2Leg(legVec dst, vec src) %{
3566 match(Set dst src);
3567 format %{ "" %}
3568 ins_encode %{
3569 ShouldNotReachHere();
3570 %}
3571 ins_pipe( fpu_reg_reg );
3572 %}
3573
3574 instruct MoveLeg2Vec(vec dst, legVec src) %{
3575 match(Set dst src);
3576 format %{ "" %}
3577 ins_encode %{
3578 ShouldNotReachHere();
3579 %}
3580 ins_pipe( fpu_reg_reg );
3581 %}
3582
3583 // ============================================================================
3584
3585 // Generic load-vector operand pattern
3586 instruct loadV(vec dst, memory mem) %{
3587 match(Set dst (LoadVector mem));
3588 ins_cost(125);
3589 format %{ "load_vector $dst,$mem" %}
3590 ins_encode %{
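// Pick an SSE, VEX, or EVEX encoded move based on the total vector width; only the
// EVEX-encoded evmovdqul can move a full 64-byte ZMM register.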
3591 switch (vector_length_in_bytes(this)) {
3592 case 4: __ movdl ($dst$$XMMRegister, $mem$$Address); break;
3593 case 8: __ movq ($dst$$XMMRegister, $mem$$Address); break;
3594 case 16: __ movdqu ($dst$$XMMRegister, $mem$$Address); break;
3595 case 32: __ vmovdqu ($dst$$XMMRegister, $mem$$Address); break;
3596 case 64: __ evmovdqul($dst$$XMMRegister, $mem$$Address, Assembler::AVX_512bit); break;
3597 default: ShouldNotReachHere();
3598 }
3599 %}
3600 ins_pipe( pipe_slow );
3601 %}
3602
3603 // Generic store-vector operand pattern.
3604 instruct storeV(memory mem, vec src) %{
3605 match(Set mem (StoreVector mem src));
3606 ins_cost(145);
3607 format %{ "store_vector $mem,$src" %}
3608 ins_encode %{
3609 switch (vector_length_in_bytes(this, $src)) {
3610 case 4: __ movdl ($mem$$Address, $src$$XMMRegister); break;
3611 case 8: __ movq ($mem$$Address, $src$$XMMRegister); break;
3612 case 16: __ movdqu ($mem$$Address, $src$$XMMRegister); break;
3613 case 32: __ vmovdqu ($mem$$Address, $src$$XMMRegister); break;
3614 case 64: __ evmovdqul($mem$$Address, $src$$XMMRegister, Assembler::AVX_512bit); break;
3615 default: ShouldNotReachHere();
3616 }
3617 %}
3618 ins_pipe( pipe_slow );
3619 %}
3620
3621 // ---------------------------------------- Gather ------------------------------------
3622
3623 // Gather INT, LONG, FLOAT, DOUBLE
3624
3625 instruct gather(legVec dst, memory mem, legVec idx, rRegP tmp, legVec mask) %{
3626 predicate(vector_length_in_bytes(n) <= 32);
3627 match(Set dst (LoadVectorGather mem idx));
3628 effect(TEMP dst, TEMP tmp, TEMP mask);
3629 format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and $mask as TEMP" %}
3630 ins_encode %{
3631 assert(UseAVX >= 2, "sanity");
3632
3633 int vlen_enc = vector_length_encoding(this);
3634 BasicType elem_bt = vector_element_basic_type(this);
3635
3636 assert(vector_length_in_bytes(this) >= 16, "sanity");
3637 assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
3638
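// AVX2 gathers consume (and clobber) a per-lane merge mask, so an all-ones mask
// (gather every lane) is loaded into the TEMP vector first.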
3639 if (vlen_enc == Assembler::AVX_128bit) {
3640 __ movdqu($mask$$XMMRegister, ExternalAddress(vector_all_bits_set()));
3641 } else {
3642 __ vmovdqu($mask$$XMMRegister, ExternalAddress(vector_all_bits_set()));
3643 }
3644 __ lea($tmp$$Register, $mem$$Address);
3645 __ vgather(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx$$XMMRegister, $mask$$XMMRegister, vlen_enc);
3646 %}
3647 ins_pipe( pipe_slow );
3648 %}
3649
3650 instruct evgather(vec dst, memory mem, vec idx, rRegP tmp) %{
3651 predicate(vector_length_in_bytes(n) == 64);
3652 match(Set dst (LoadVectorGather mem idx));
3653 effect(TEMP dst, TEMP tmp);
3654 format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and k2 as TEMP" %}
3655 ins_encode %{
3656 assert(UseAVX > 2, "sanity");
3657
3658 int vlen_enc = vector_length_encoding(this);
3659 BasicType elem_bt = vector_element_basic_type(this);
3660
3661 assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
3662
3663 KRegister ktmp = k2;
3664 __ kmovwl(ktmp, ExternalAddress(vector_all_bits_set()), $tmp$$Register);
3665 __ lea($tmp$$Register, $mem$$Address);
3666 __ evgather(elem_bt, $dst$$XMMRegister, ktmp, $tmp$$Register, $idx$$XMMRegister, vlen_enc);
3667 %}
3668 ins_pipe( pipe_slow );
3669 %}
3670
3671 // ====================Scatter=======================================
3672
3673 // Scatter INT, LONG, FLOAT, DOUBLE
3674
3675 instruct scatter(memory mem, vec src, vec idx, rRegP tmp) %{
3676 match(Set mem (StoreVectorScatter mem (Binary src idx)));
3677 effect(TEMP tmp);
3678 format %{ "store_vector_scatter $mem, $idx, $src\t! using k2 and $tmp as TEMP" %}
3679 ins_encode %{
3680 assert(UseAVX > 2, "sanity");
3681
3682 int vlen_enc = vector_length_encoding(this, $src);
3683 BasicType elem_bt = vector_element_basic_type(this, $src);
3684
3685 assert(vector_length_in_bytes(this, $src) >= 16, "sanity");
3686 assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
3687
3688 KRegister ktmp = k2;
3689 __ kmovwl(ktmp, ExternalAddress(vector_all_bits_set()), $tmp$$Register);
3690 __ lea($tmp$$Register, $mem$$Address);
3691 __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, ktmp, $src$$XMMRegister, vlen_enc);
3692 %}
3693 ins_pipe( pipe_slow );
3694 %}
3695
3696 // ====================REPLICATE=======================================
3697
3698 // Replicate a byte scalar into a vector
3699 instruct ReplB_reg(vec dst, rRegI src) %{
3700 match(Set dst (ReplicateB src));
3701 format %{ "replicateB $dst,$src" %}
3702 ins_encode %{
3703 uint vlen = vector_length(this);
3704 if (vlen == 64 || VM_Version::supports_avx512vlbw()) { // AVX512VL for <512bit operands
3705 assert(VM_Version::supports_avx512bw(), "required"); // 512-bit byte vectors assume AVX512BW
3706 int vlen_enc = vector_length_encoding(this);
3707 __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vlen_enc);
3708 } else {
3709 __ movdl($dst$$XMMRegister, $src$$Register);
3710 __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3711 __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3712 if (vlen >= 16) {
3713 __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3714 if (vlen >= 32) {
3715 assert(vlen == 32, "sanity");
3716 __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3717 }
3718 }
3719 }
3720 %}
3721 ins_pipe( pipe_slow );
3722 %}
3723
3724 instruct ReplB_mem(vec dst, memory mem) %{
3725 predicate(VM_Version::supports_avx2());
3726 match(Set dst (ReplicateB (LoadB mem)));
3727 format %{ "replicateB $dst,$mem" %}
3728 ins_encode %{
3729 int vlen_enc = vector_length_encoding(this);
3730 __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vlen_enc);
3731 %}
3732 ins_pipe( pipe_slow );
3733 %}
3734
3735 instruct ReplB_imm(vec dst, immI con) %{
3736 match(Set dst (ReplicateB con));
3737 format %{ "replicateB $dst,$con" %}
3738 ins_encode %{
3739 uint vlen = vector_length(this);
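// replicate8_imm widens the 1-byte immediate into a 64-bit pattern, e.g.
// (illustrative) replicate8_imm(0x42, 1) == 0x4242424242424242.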
3740 InternalAddress const_addr = $constantaddress(replicate8_imm($con$$constant, 1));
3741 if (vlen == 4) {
3742 __ movdl($dst$$XMMRegister, const_addr);
3743 } else {
3744 __ movq($dst$$XMMRegister, const_addr);
3745 if (vlen >= 16) {
3746 if (VM_Version::supports_avx2()) {
3747 int vlen_enc = vector_length_encoding(this);
3748 __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3749 } else {
3750 assert(vlen == 16, "sanity");
3751 __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3752 }
3753 }
3754 }
3755 %}
3756 ins_pipe( pipe_slow );
3757 %}
3758
3759 // Replicate a zero byte into a vector
3760 instruct ReplB_zero(vec dst, immI_0 zero) %{
3761 match(Set dst (ReplicateB zero));
3762 format %{ "replicateB $dst,$zero" %}
3763 ins_encode %{
3764 uint vlen = vector_length(this);
3765 if (vlen <= 16) {
3766 __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3767 } else {
3768 // Use vpxor since AVX512F does not have 512bit vxorpd (requires AVX512DQ).
3769 int vlen_enc = vector_length_encoding(this);
3770 __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3771 }
3772 %}
3773 ins_pipe( fpu_reg_reg );
3774 %}
3775
3776 // ====================ReplicateS=======================================
3777
3778 instruct ReplS_reg(vec dst, rRegI src) %{
3779 match(Set dst (ReplicateS src));
3780 format %{ "replicateS $dst,$src" %}
3817 uint vlen = vector_length(this);
3818 InternalAddress const_addr = $constantaddress(replicate8_imm($con$$constant, 2));
3819 if (vlen == 2) {
3820 __ movdl($dst$$XMMRegister, const_addr);
3821 } else {
3822 __ movq($dst$$XMMRegister, const_addr);
3823 if (vlen >= 8) {
3824 if (VM_Version::supports_avx2()) {
3825 int vlen_enc = vector_length_encoding(this);
3826 __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3827 } else {
3828 assert(vlen == 8, "sanity");
3829 __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3830 }
3831 }
3832 }
3833 %}
3834 ins_pipe( fpu_reg_reg );
3835 %}
3836
3837 instruct ReplS_zero(vec dst, immI_0 zero) %{
3838 match(Set dst (ReplicateS zero));
3839 format %{ "replicateS $dst,$zero" %}
3840 ins_encode %{
3841 uint vlen = vector_length(this);
3842 if (vlen <= 8) {
3843 __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3844 } else {
3845 int vlen_enc = vector_length_encoding(this);
3846 __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3847 }
3848 %}
3849 ins_pipe( fpu_reg_reg );
3850 %}
3851
3852 // ====================ReplicateI=======================================
3853
3854 instruct ReplI_reg(vec dst, rRegI src) %{
3855 match(Set dst (ReplicateI src));
3856 format %{ "replicateI $dst,$src" %}
3857 ins_encode %{
3864 __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3865 if (vlen >= 8) {
3866 assert(vlen == 8, "sanity");
3867 __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3868 }
3869 }
3870 %}
3871 ins_pipe( pipe_slow );
3872 %}
3873
3874 instruct ReplI_mem(vec dst, memory mem) %{
3875 match(Set dst (ReplicateI (LoadI mem)));
3876 format %{ "replicateI $dst,$mem" %}
3877 ins_encode %{
3878 uint vlen = vector_length(this);
3879 if (vlen <= 4) {
3880 __ movdl($dst$$XMMRegister, $mem$$Address);
3881 __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3882 } else {
3883 assert(VM_Version::supports_avx2(), "sanity");
3884 int vlen_enc = vector_length_encoding(this);
3885 __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vlen_enc);
3886 }
3887 %}
3888 ins_pipe( pipe_slow );
3889 %}
3890
3891 instruct ReplI_imm(vec dst, immI con) %{
3892 match(Set dst (ReplicateI con));
3893 format %{ "replicateI $dst,$con" %}
3894 ins_encode %{
3895 uint vlen = vector_length(this);
3896 InternalAddress const_addr = $constantaddress(replicate8_imm($con$$constant, 4));
3897 if (vlen <= 4) {
3898 __ movq($dst$$XMMRegister, const_addr);
3899 if (vlen == 4) {
3900 __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3901 }
3902 } else {
3903 assert(VM_Version::supports_avx2(), "sanity");
3904 int vlen_enc = vector_length_encoding(this);
3905 __ movq($dst$$XMMRegister, const_addr);
3906 __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3907 }
3908 %}
3909 ins_pipe( pipe_slow );
3910 %}
3911
3912 // Replicate an integer (4-byte) zero into a vector
3913 instruct ReplI_zero(vec dst, immI_0 zero) %{
3914 match(Set dst (ReplicateI zero));
3915 format %{ "replicateI $dst,$zero" %}
3916 ins_encode %{
3917 uint vlen = vector_length(this);
3918 if (vlen <= 4) {
3919 __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3920 } else {
3921 int vlen_enc = vector_length_encoding(this);
3922 __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3923 }
3924 %}
3925 ins_pipe( fpu_reg_reg );
3926 %}
3927
3928 instruct ReplI_M1(vec dst, immI_M1 con) %{
3929 predicate(UseAVX > 0);
3930 match(Set dst (ReplicateB con));
3931 match(Set dst (ReplicateS con));
3932 match(Set dst (ReplicateI con));
3933 effect(TEMP dst);
3949 ins_encode %{
3950 uint vlen = vector_length(this);
3951 if (vlen == 2) {
3952 __ movdq($dst$$XMMRegister, $src$$Register);
3953 __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3954 } else if (vlen == 8 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
3955 int vlen_enc = vector_length_encoding(this);
3956 __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vlen_enc);
3957 } else {
3958 assert(vlen == 4, "sanity");
3959 __ movdq($dst$$XMMRegister, $src$$Register);
3960 __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3961 __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3962 }
3963 %}
3964 ins_pipe( pipe_slow );
3965 %}
3966 #else // _LP64
3967 // Replicate a long (8-byte) scalar into a vector
3968 instruct ReplL_reg(vec dst, eRegL src, vec tmp) %{
3969 predicate(vector_length(n) <= 4);
3970 match(Set dst (ReplicateL src));
3971 effect(TEMP dst, USE src, TEMP tmp);
3972 format %{ "replicateL $dst,$src" %}
3973 ins_encode %{
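// With no 64-bit GPRs, the long arrives as a lo/hi register pair; the halves are
// merged into a single 64-bit XMM lane via punpckldq before being broadcast.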
3974 uint vlen = vector_length(this);
3975 if (vlen == 2) {
3976 __ movdl($dst$$XMMRegister, $src$$Register);
3977 __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3978 __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
3979 __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3980 } else if (VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
3981 int vlen_enc = Assembler::AVX_256bit;
3982 __ movdl($dst$$XMMRegister, $src$$Register);
3983 __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3984 __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
3985 __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3986 } else {
3987 __ movdl($dst$$XMMRegister, $src$$Register);
3988 __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3989 __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
3990 __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3991 __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3992 }
3993 %}
3994 ins_pipe( pipe_slow );
3995 %}
3996
3997 instruct ReplL_reg_leg(legVec dst, eRegL src, legVec tmp) %{
3998 predicate(vector_length(n) == 8);
3999 match(Set dst (ReplicateL src));
4000 effect(TEMP dst, USE src, TEMP tmp);
4001 format %{ "replicateL $dst,$src" %}
4002 ins_encode %{
4003 if (VM_Version::supports_avx512vl()) {
4004 __ movdl($dst$$XMMRegister, $src$$Register);
4005 __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4006 __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4007 __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4008 __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4009 __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
4010 } else {
4011 int vlen_enc = Assembler::AVX_512bit;
4012 __ movdl($dst$$XMMRegister, $src$$Register);
4013 __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4014 __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4015 __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4016 }
4017 %}
4018 ins_pipe( pipe_slow );
4019 %}
4020 #endif // _LP64
4021
4022 instruct ReplL_mem(vec dst, memory mem) %{
4023 match(Set dst (ReplicateL (LoadL mem)));
4024 format %{ "replicateL $dst,$mem" %}
4025 ins_encode %{
4026 uint vlen = vector_length(this);
4027 if (vlen == 2) {
4028 __ movq($dst$$XMMRegister, $mem$$Address);
4029 __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4030 } else {
4031 assert(VM_Version::supports_avx2(), "sanity");
4032 int vlen_enc = vector_length_encoding(this);
4033 __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vlen_enc);
4034 }
4035 %}
4076 match(Set dst (ReplicateL con));
4077 effect(TEMP dst);
4078 format %{ "vallones $dst" %}
4079 ins_encode %{
4080 int vlen_enc = vector_length_encoding(this);
4081 __ vallones($dst$$XMMRegister, vlen_enc);
4082 %}
4083 ins_pipe( pipe_slow );
4084 %}
4085
4086 // ====================ReplicateF=======================================
4087
4088 instruct ReplF_reg(vec dst, vlRegF src) %{
4089 match(Set dst (ReplicateF src));
4090 format %{ "replicateF $dst,$src" %}
4091 ins_encode %{
4092 uint vlen = vector_length(this);
4093 if (vlen <= 4) {
4094 __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
4095 } else if (VM_Version::supports_avx2()) {
4096 int vlen_enc = vector_length_encoding(this);
4097 __ vbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
4098 } else {
4099 assert(vlen == 8, "sanity");
4100 __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
4101 __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
4102 }
4103 %}
4104 ins_pipe( pipe_slow );
4105 %}
4106
4107 instruct ReplF_mem(vec dst, memory mem) %{
4108 match(Set dst (ReplicateF (LoadF mem)));
4109 format %{ "replicateF $dst,$mem" %}
4110 ins_encode %{
4111 uint vlen = vector_length(this);
4112 if (vlen <= 4) {
4113 __ movdl($dst$$XMMRegister, $mem$$Address);
4114 __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4115 } else {
4116 assert(VM_Version::supports_avx(), "sanity");
4117 int vlen_enc = vector_length_encoding(this);
4118 __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vlen_enc);
4119 }
4120 %}
4121 ins_pipe( pipe_slow );
4122 %}
4123
4124 instruct ReplF_zero(vec dst, immF0 zero) %{
4125 match(Set dst (ReplicateF zero));
4126 format %{ "replicateF $dst,$zero" %}
4127 ins_encode %{
4128 uint vlen = vector_length(this);
4129 if (vlen <= 4) {
4130 __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
4131 } else {
4132 int vlen_enc = vector_length_encoding(this);
4133 __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); // 512bit vxorps requires AVX512DQ
4134 }
4135 %}
4136 ins_pipe( fpu_reg_reg );
4137 %}
4138
4139 // ====================ReplicateD=======================================
4140
4141 // Replicate a double (8-byte) scalar into a vector
4142 instruct ReplD_reg(vec dst, vlRegD src) %{
4143 match(Set dst (ReplicateD src));
4144 format %{ "replicateD $dst,$src" %}
4145 ins_encode %{
4146 uint vlen = vector_length(this);
4147 if (vlen == 2) {
4148 __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
4149 } else if (VM_Version::supports_avx2()) {
4150 int vlen_enc = vector_length_encoding(this);
4151 __ vbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
4152 } else {
4153 assert(vlen == 4, "sanity");
4154 __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
4155 __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
4156 }
4157 %}
4158 ins_pipe( pipe_slow );
4159 %}
4160
4161 instruct ReplD_mem(vec dst, memory mem) %{
4162 match(Set dst (ReplicateD (LoadD mem)));
4163 format %{ "replicateD $dst,$mem" %}
4164 ins_encode %{
4165 uint vlen = vector_length(this);
4166 if (vlen == 2) {
4167 __ movq($dst$$XMMRegister, $mem$$Address);
4168 __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x44);
4169 } else {
4170 assert(VM_Version::supports_avx(), "sanity");
4171 int vlen_enc = vector_length_encoding(this);
4172 __ vbroadcastsd($dst$$XMMRegister, $mem$$Address, vlen_enc);
4173 }
4174 %}
4175 ins_pipe( pipe_slow );
4176 %}
4177
4178 instruct ReplD_zero(vec dst, immD0 zero) %{
4179 match(Set dst (ReplicateD zero));
4180 format %{ "replicateD $dst,$zero" %}
4181 ins_encode %{
4182 uint vlen = vector_length(this);
4183 if (vlen == 2) {
4184 __ xorpd($dst$$XMMRegister, $dst$$XMMRegister);
4185 } else {
4186 int vlen_enc = vector_length_encoding(this);
4187 __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); // 512bit vxorpd requires AVX512DQ
4188 }
4189 %}
4190 ins_pipe( fpu_reg_reg );
4191 %}
4192
4193 // ====================VECTOR INSERT=======================================
4194
4195 instruct insert(vec dst, rRegI val, immU8 idx) %{
4196 predicate(vector_length_in_bytes(n) < 32);
4197 match(Set dst (VectorInsert (Binary dst val) idx));
4198 format %{ "vector_insert $dst,$val,$idx" %}
4199 ins_encode %{
4200 assert(UseSSE >= 4, "required");
4201 assert(vector_length_in_bytes(this) >= 8, "required");
4202
4203 BasicType elem_bt = vector_element_basic_type(this);
4204
4205 assert(is_integral_type(elem_bt), "");
4206 assert($idx$$constant < (int)vector_length(this), "out of bounds");
4207
4208 __ insert(elem_bt, $dst$$XMMRegister, $val$$Register, $idx$$constant);
4209 %}
4210 ins_pipe( pipe_slow );
4211 %}
4212
4213 instruct insert32(vec dst, vec src, rRegI val, immU8 idx, vec vtmp) %{
4214 predicate(vector_length_in_bytes(n) == 32);
4215 match(Set dst (VectorInsert (Binary src val) idx));
4216 effect(TEMP vtmp);
4217 format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4218 ins_encode %{
4220 BasicType elem_bt = vector_element_basic_type(this);
4221 int elem_per_lane = 16/type2aelembytes(elem_bt);
4222 int log2epr = log2(elem_per_lane);
4223
4224 assert(is_integral_type(elem_bt), "sanity");
4225 assert($idx$$constant < (int)vector_length(this), "out of bounds");
4226
4227 uint x_idx = $idx$$constant & right_n_bits(log2epr);
4228 uint y_idx = ($idx$$constant >> log2epr) & 1;
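// Example: for T_SHORT (8 elements per 128-bit lane, log2epr == 3), idx == 11
// selects 128-bit lane 1, slot 3 within that lane.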
4229 __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4230 __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
4231 __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4232 %}
4233 ins_pipe( pipe_slow );
4234 %}
4235
4236 instruct insert64(vec dst, vec src, rRegI val, immU8 idx, legVec vtmp) %{
4237 predicate(vector_length_in_bytes(n) == 64);
4238 match(Set dst (VectorInsert (Binary src val) idx));
4239 effect(TEMP vtmp);
4240 format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4241 ins_encode %{
4242 assert(UseAVX > 2, "sanity");
4243
4244 BasicType elem_bt = vector_element_basic_type(this);
4245 int elem_per_lane = 16/type2aelembytes(elem_bt);
4246 int log2epr = log2(elem_per_lane);
4247
4248 assert(is_integral_type(elem_bt), "");
4249 assert($idx$$constant < (int)vector_length(this), "out of bounds");
4250
4251 uint x_idx = $idx$$constant & right_n_bits(log2epr);
4252 uint y_idx = ($idx$$constant >> log2epr) & 3;
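// Example: for T_INT (4 elements per 128-bit lane, log2epr == 2), idx == 13
// selects 128-bit lane 3, slot 1 within that lane.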
4253 __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4254 __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
4255 __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4256 %}
4257 ins_pipe( pipe_slow );
4258 %}
4259
4260 #ifdef _LP64
4261 instruct insert2L(vec dst, rRegL val, immU8 idx) %{
4262 predicate(vector_length(n) == 2);
4263 match(Set dst (VectorInsert (Binary dst val) idx));
4264 format %{ "vector_insert $dst,$val,$idx" %}
4265 ins_encode %{
4266 assert(UseSSE >= 4, "required");
4267 assert(vector_element_basic_type(this) == T_LONG, "");
4268 assert($idx$$constant < (int)vector_length(this), "out of bounds");
4269
4270 __ pinsrq($dst$$XMMRegister, $val$$Register, $idx$$constant);
4271 %}
4272 ins_pipe( pipe_slow );
4273 %}
4274
4275 instruct insert4L(vec dst, vec src, rRegL val, immU8 idx, vec vtmp) %{
4276 predicate(vector_length(n) == 4);
4277 match(Set dst (VectorInsert (Binary src val) idx));
4278 effect(TEMP vtmp);
4279 format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4280 ins_encode %{
4281 assert(vector_element_basic_type(this) == T_LONG, "");
4282 assert($idx$$constant < (int)vector_length(this), "out of bounds");
4283
4284 uint x_idx = $idx$$constant & right_n_bits(1);
4285 uint y_idx = ($idx$$constant >> 1) & 1;
4287 __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4288 __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
4289 __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4290 %}
4291 ins_pipe( pipe_slow );
4292 %}
4293
4294 instruct insert8L(vec dst, vec src, rRegL val, immU8 idx, legVec vtmp) %{
4295 predicate(vector_length(n) == 8);
4296 match(Set dst (VectorInsert (Binary src val) idx));
4297 effect(TEMP vtmp);
4298 format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4299 ins_encode %{
4300 assert(vector_element_basic_type(this) == T_LONG, "sanity");
4301 assert($idx$$constant < (int)vector_length(this), "out of bounds");
4302
4303 uint x_idx = $idx$$constant & right_n_bits(1);
4304 uint y_idx = ($idx$$constant >> 1) & 3;
4305 __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4306 __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
4307 __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4308 %}
4309 ins_pipe( pipe_slow );
4310 %}
4311 #endif
4312
4313 instruct insertF(vec dst, regF val, immU8 idx) %{
4314 predicate(vector_length(n) < 8);
4315 match(Set dst (VectorInsert (Binary dst val) idx));
4316 format %{ "vector_insert $dst,$val,$idx" %}
4317 ins_encode %{
4318 assert(UseSSE >= 4, "sanity");
4319
4320 assert(vector_element_basic_type(this) == T_FLOAT, "sanity");
4321 assert($idx$$constant < (int)vector_length(this), "out of bounds");
4322
4323 __ insertps($dst$$XMMRegister, $val$$XMMRegister, $idx$$constant);
4324 %}
4325 ins_pipe( pipe_slow );
4326 %}
4327
4328 instruct vinsertF(vec dst, vec src, regF val, immU8 idx, vec vtmp) %{
4329 predicate(vector_length(n) >= 8);
4330 match(Set dst (VectorInsert (Binary src val) idx));
4331 effect(TEMP vtmp);
4332 format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4333 ins_encode %{
4334 assert(vector_element_basic_type(this) == T_FLOAT, "sanity");
4335 assert($idx$$constant < (int)vector_length(this), "out of bounds");
4336
4337 int vlen = vector_length(this);
4338 uint x_idx = $idx$$constant & right_n_bits(2);
4339 if (vlen == 8) {
4340 uint y_idx = ($idx$$constant >> 2) & 1;
4342 __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4343 __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx);
4344 __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4345 } else {
4346 assert(vlen == 16, "sanity");
4347 uint y_idx = ($idx$$constant >> 2) & 3;
4348 __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4349 __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx);
4350 __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4351 }
4352 %}
4353 ins_pipe( pipe_slow );
4354 %}
4355
4356 #ifdef _LP64
4357 instruct insert2D(vec dst, regD val, immU8 idx, rRegL tmp) %{
4358 predicate(vector_length(n) == 2);
4359 match(Set dst (VectorInsert (Binary dst val) idx));
4360 effect(TEMP tmp);
4361 format %{ "vector_insert $dst,$val,$idx\t!using $tmp as TEMP" %}
4362 ins_encode %{
4363 assert(UseSSE >= 4, "sanity");
4364 assert(vector_element_basic_type(this) == T_DOUBLE, "sanity");
4365 assert($idx$$constant < (int)vector_length(this), "out of bounds");
4366
4367 __ movq($tmp$$Register, $val$$XMMRegister);
4368 __ pinsrq($dst$$XMMRegister, $tmp$$Register, $idx$$constant);
4369 %}
4370 ins_pipe( pipe_slow );
4371 %}
4372
4373 instruct insert4D(vec dst, vec src, regD val, immU8 idx, rRegL tmp, vec vtmp) %{
4374 predicate(vector_length(n) == 4);
4375 match(Set dst (VectorInsert (Binary src val) idx));
4376 effect(TEMP vtmp, TEMP tmp);
4377 format %{ "vector_insert $dst,$src,$val,$idx\t!using $tmp, $vtmp as TEMP" %}
4378 ins_encode %{
4379 assert(vector_element_basic_type(this) == T_DOUBLE, "sanity");
4380 assert($idx$$constant < (int)vector_length(this), "out of bounds");
4381
4382 uint x_idx = $idx$$constant & right_n_bits(1);
4383 uint y_idx = ($idx$$constant >> 1) & 1;
4385 __ movq($tmp$$Register, $val$$XMMRegister);
4386 __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4387 __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
4388 __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4389 %}
4390 ins_pipe( pipe_slow );
4391 %}
4392
4393 instruct insert8D(vec dst, vec src, regD val, immU8 idx, rRegL tmp, legVec vtmp) %{
4394 predicate(vector_length(n) == 8);
4395 match(Set dst (VectorInsert (Binary src val) idx));
4396 effect(TEMP tmp, TEMP vtmp);
4397 format %{ "vector_insert $dst,$src,$val,$idx\t!using $tmp, $vtmp as TEMP" %}
4398 ins_encode %{
4399 assert(vector_element_basic_type(this) == T_DOUBLE, "sanity");
4400 assert($idx$$constant < (int)vector_length(this), "out of bounds");
4401
4402 uint x_idx = $idx$$constant & right_n_bits(1);
4403 uint y_idx = ($idx$$constant >> 1) & 3;
4404 __ movq($tmp$$Register, $val$$XMMRegister);
4405 __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4406 __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
4407 __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4408 %}
4409 ins_pipe( pipe_slow );
4410 %}
4411 #endif
4412
4413 // ====================REDUCTION ARITHMETIC=======================================
4414
4415 // =======================Int Reduction==========================================
4416
4417 instruct reductionI(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
4418 predicate(vector_element_basic_type(n->in(2)) == T_INT &&
4419 vector_length(n->in(2)) < 16); // src2
4420 match(Set dst (AddReductionVI src1 src2));
4421 match(Set dst (MulReductionVI src1 src2));
4422 match(Set dst (AndReductionV src1 src2));
4423 match(Set dst ( OrReductionV src1 src2));
4424 match(Set dst (XorReductionV src1 src2));
4425 match(Set dst (MinReductionV src1 src2));
4426 match(Set dst (MaxReductionV src1 src2));
4427 effect(TEMP vtmp1, TEMP vtmp2);
4428 format %{ "vector_reduction_int $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4429 ins_encode %{
4430 int opcode = this->ideal_Opcode();
4431 int vlen = vector_length(this, $src2);
4432 __ reduceI(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4433 %}
4434 ins_pipe( pipe_slow );
4435 %}
4436
4437 instruct reduction16I(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
4438 predicate(vector_element_basic_type(n->in(2)) == T_INT &&
4439 vector_length(n->in(2)) == 16); // src2
4440 match(Set dst (AddReductionVI src1 src2));
4441 match(Set dst (MulReductionVI src1 src2));
4442 match(Set dst (AndReductionV src1 src2));
4443 match(Set dst ( OrReductionV src1 src2));
4444 match(Set dst (XorReductionV src1 src2));
4445 match(Set dst (MinReductionV src1 src2));
4446 match(Set dst (MaxReductionV src1 src2));
4447 effect(TEMP vtmp1, TEMP vtmp2);
4448 format %{ "vector_reduction_int $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4449 ins_encode %{
4450 int opcode = this->ideal_Opcode();
4451 int vlen = vector_length(this, $src2);
4452 __ reduceI(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4453 %}
4454 ins_pipe( pipe_slow );
4455 %}
4456
4457 // =======================Long Reduction==========================================
4458
4459 #ifdef _LP64
4460 instruct reductionL(rRegL dst, rRegL src1, vec src2, vec vtmp1, vec vtmp2) %{
4461 predicate(vector_element_basic_type(n->in(2)) == T_LONG &&
4462 vector_length(n->in(2)) < 8); // src2
4463 match(Set dst (AddReductionVL src1 src2));
4464 match(Set dst (MulReductionVL src1 src2));
4465 match(Set dst (AndReductionV src1 src2));
4466 match(Set dst ( OrReductionV src1 src2));
4467 match(Set dst (XorReductionV src1 src2));
4468 match(Set dst (MinReductionV src1 src2));
4469 match(Set dst (MaxReductionV src1 src2));
4470 effect(TEMP vtmp1, TEMP vtmp2);
4471 format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4472 ins_encode %{
4473 int opcode = this->ideal_Opcode();
4474 int vlen = vector_length(this, $src2);
4475 __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4476 %}
4477 ins_pipe( pipe_slow );
4478 %}
4479
4480 instruct reduction8L(rRegL dst, rRegL src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
4481 predicate(vector_element_basic_type(n->in(2)) == T_LONG &&
4482 vector_length(n->in(2)) == 8); // src2
4483 match(Set dst (AddReductionVL src1 src2));
4484 match(Set dst (MulReductionVL src1 src2));
4485 match(Set dst (AndReductionV src1 src2));
4486 match(Set dst ( OrReductionV src1 src2));
4487 match(Set dst (XorReductionV src1 src2));
4488 match(Set dst (MinReductionV src1 src2));
4489 match(Set dst (MaxReductionV src1 src2));
4490 effect(TEMP vtmp1, TEMP vtmp2);
4491 format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4492 ins_encode %{
4493 int opcode = this->ideal_Opcode();
4494 int vlen = vector_length(this, $src2);
4495 __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4496 %}
4497 ins_pipe( pipe_slow );
4498 %}
4499 #endif // _LP64
4500
4501 // =======================Float Reduction==========================================
4502
4503 instruct reductionF128(regF dst, vec src, vec vtmp) %{
4504 predicate(vector_length(n->in(2)) <= 4); // src
4505 match(Set dst (AddReductionVF dst src));
4506 match(Set dst (MulReductionVF dst src));
4507 effect(TEMP dst, TEMP vtmp);
4508 format %{ "vector_reduction_float $dst,$src ; using $vtmp as TEMP" %}
4509 ins_encode %{
4510 int opcode = this->ideal_Opcode();
4511 int vlen = vector_length(this, $src);
4512 __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
4513 %}
4514 ins_pipe( pipe_slow );
4515 %}
4516
4517 instruct reduction8F(regF dst, vec src, vec vtmp1, vec vtmp2) %{
4518 predicate(vector_length(n->in(2)) == 8); // src
4519 match(Set dst (AddReductionVF dst src));
4520 match(Set dst (MulReductionVF dst src));
4521 effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
4522 format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
4523 ins_encode %{
4524 int opcode = this->ideal_Opcode();
4525 int vlen = vector_length(this, $src);
4526 __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4527 %}
4528 ins_pipe( pipe_slow );
4529 %}
4530
4531 instruct reduction16F(regF dst, legVec src, legVec vtmp1, legVec vtmp2) %{
4532 predicate(vector_length(n->in(2)) == 16); // src
4533 match(Set dst (AddReductionVF dst src));
4534 match(Set dst (MulReductionVF dst src));
4535 effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
4536 format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
4537 ins_encode %{
4538 int opcode = this->ideal_Opcode();
4539 int vlen = vector_length(this, $src);
4540 __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4541 %}
4542 ins_pipe( pipe_slow );
4543 %}
4544
4545 // =======================Double Reduction==========================================
4546
4547 instruct reduction2D(regD dst, vec src, vec vtmp) %{
4548 predicate(vector_length(n->in(2)) == 2); // src
4549 match(Set dst (AddReductionVD dst src));
4550 match(Set dst (MulReductionVD dst src));
4551 effect(TEMP dst, TEMP vtmp);
4552 format %{ "vector_reduction_double $dst,$src ; using $vtmp as TEMP" %}
4553 ins_encode %{
4554 int opcode = this->ideal_Opcode();
4555 int vlen = vector_length(this, $src);
4556 __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
4557 %}
4558 ins_pipe( pipe_slow );
4559 %}
4560
4561 instruct reduction4D(regD dst, vec src, vec vtmp1, vec vtmp2) %{
4562 predicate(vector_length(n->in(2)) == 4); // src
4563 match(Set dst (AddReductionVD dst src));
4564 match(Set dst (MulReductionVD dst src));
4565 effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
4566 format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
4567 ins_encode %{
4568 int opcode = this->ideal_Opcode();
4569 int vlen = vector_length(this, $src);
4570 __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4571 %}
4572 ins_pipe( pipe_slow );
4573 %}
4574
4575 instruct reduction8D(regD dst, legVec src, legVec vtmp1, legVec vtmp2) %{
4576 predicate(vector_length(n->in(2)) == 8); // src
4577 match(Set dst (AddReductionVD dst src));
4578 match(Set dst (MulReductionVD dst src));
4579 effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
4580 format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
4581 ins_encode %{
4582 int opcode = this->ideal_Opcode();
4583 int vlen = vector_length(this, $src);
4584 __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4585 %}
4586 ins_pipe( pipe_slow );
4587 %}
4588
4589 // =======================Byte Reduction==========================================
4590
4591 #ifdef _LP64
4592 instruct reductionB(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
4593 predicate(vector_element_basic_type(n->in(2)) == T_BYTE &&
4594 vector_length(n->in(2)) <= 32); // src2
4595 match(Set dst (AddReductionVI src1 src2));
4596 match(Set dst (AndReductionV src1 src2));
4597 match(Set dst ( OrReductionV src1 src2));
4598 match(Set dst (XorReductionV src1 src2));
4599 match(Set dst (MinReductionV src1 src2));
4600 match(Set dst (MaxReductionV src1 src2));
4601 effect(TEMP vtmp1, TEMP vtmp2);
4602 format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4603 ins_encode %{
4604 int opcode = this->ideal_Opcode();
4605 int vlen = vector_length(this, $src2);
4606 __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4607 %}
4608 ins_pipe( pipe_slow );
4609 %}
4610
4611 instruct reduction64B(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
4612 predicate(vector_element_basic_type(n->in(2)) == T_BYTE &&
4613 vector_length(n->in(2)) == 64); // src2
4614 match(Set dst (AddReductionVI src1 src2));
4615 match(Set dst (AndReductionV src1 src2));
4616 match(Set dst ( OrReductionV src1 src2));
4617 match(Set dst (XorReductionV src1 src2));
4618 match(Set dst (MinReductionV src1 src2));
4619 match(Set dst (MaxReductionV src1 src2));
4620 effect(TEMP vtmp1, TEMP vtmp2);
4621 format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4622 ins_encode %{
4623 int opcode = this->ideal_Opcode();
4624 int vlen = vector_length(this, $src2);
4625 __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4626 %}
4627 ins_pipe( pipe_slow );
4628 %}
4629 #endif
4630
4631 // =======================Short Reduction==========================================
4632
4633 instruct reductionS(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
4634 predicate(vector_element_basic_type(n->in(2)) == T_SHORT &&
4635 vector_length(n->in(2)) <= 16); // src2
4636 match(Set dst (AddReductionVI src1 src2));
4637 match(Set dst (MulReductionVI src1 src2));
4638 match(Set dst (AndReductionV src1 src2));
4639 match(Set dst ( OrReductionV src1 src2));
4640 match(Set dst (XorReductionV src1 src2));
4641 match(Set dst (MinReductionV src1 src2));
4642 match(Set dst (MaxReductionV src1 src2));
4643 effect(TEMP vtmp1, TEMP vtmp2);
4644 format %{ "vector_reduction_short $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4645 ins_encode %{
4646 int opcode = this->ideal_Opcode();
4647 int vlen = vector_length(this, $src2);
4648 __ reduceS(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4649 %}
4650 ins_pipe( pipe_slow );
4651 %}
4652
4653 instruct reduction32S(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
4654 predicate(vector_element_basic_type(n->in(2)) == T_SHORT &&
4655 vector_length(n->in(2)) == 32); // src2
4656 match(Set dst (AddReductionVI src1 src2));
4657 match(Set dst (MulReductionVI src1 src2));
4658 match(Set dst (AndReductionV src1 src2));
4659 match(Set dst ( OrReductionV src1 src2));
4660 match(Set dst (XorReductionV src1 src2));
4661 match(Set dst (MinReductionV src1 src2));
4662 match(Set dst (MaxReductionV src1 src2));
4663 effect(TEMP vtmp1, TEMP vtmp2);
4664 format %{ "vector_reduction_short $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4665 ins_encode %{
4666 int opcode = this->ideal_Opcode();
4667 int vlen = vector_length(this, $src2);
4668 __ reduceS(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4669 %}
4670 ins_pipe( pipe_slow );
4671 %}
4672
4673 // =======================Mul Reduction==========================================
4674
4675 instruct mul_reductionB(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
4676 predicate(vector_element_basic_type(n->in(2)) == T_BYTE &&
4677 vector_length(n->in(2)) <= 32); // src2
4678 match(Set dst (MulReductionVI src1 src2));
4679 effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
4680 format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
4681 ins_encode %{
4682 int opcode = this->ideal_Opcode();
4683 int vlen = vector_length(this, $src2);
4684 __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4685 %}
4686 ins_pipe( pipe_slow );
4687 %}
4688
4689 instruct mul_reduction64B(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
4690 predicate(vector_element_basic_type(n->in(2)) == T_BYTE &&
4691 vector_length(n->in(2)) == 64); // src2
4692 match(Set dst (MulReductionVI src1 src2));
4693 effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
4694 format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
4695 ins_encode %{
4696 int opcode = this->ideal_Opcode();
4697 int vlen = vector_length(this, $src2);
4698 __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4699 %}
4700 ins_pipe( pipe_slow );
4701 %}
4702
4703 //--------------------Min/Max Float Reduction --------------------
4704 // Float Min/Max Reduction
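// Float min/max reductions cannot use a plain (v)minps/(v)maxps tree:
// Java requires min(-0.0f, +0.0f) == -0.0f and that a NaN in any lane
// propagates to the result, which the raw SSE semantics (returning the
// second operand for unordered inputs and for equal zeros) do not
// guarantee. The identity value checked in the predicate (+Infinity for
// min, -Infinity for max) lets the reduction start from a neutral element.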
4705 instruct minmax_reduction2F(legRegF dst, immF src1, legVec src2, legVec tmp,
4706 legVec atmp, legVec btmp, legVec xmm_1, rFlagsReg cr) %{
4707 predicate(vector_element_basic_type(n->in(2)) == T_FLOAT &&
4708 ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
4709 (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
4710 vector_length(n->in(2)) == 2);
4711 match(Set dst (MinReductionV src1 src2));
4712 match(Set dst (MaxReductionV src1 src2));
4713 effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
4714 format %{ "vector_minmax2F_reduction $dst,$src1,$src2 ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
4715 ins_encode %{
4716 assert(UseAVX > 0, "sanity");
4717
4718 int opcode = this->ideal_Opcode();
4719 int vlen = vector_length(this, $src2);
4720 __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
4721 $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
4722 %}
4723 ins_pipe( pipe_slow );
4724 %}
4725
4726 instruct minmax_reductionF(legRegF dst, immF src1, legVec src2, legVec tmp, legVec atmp,
4727 legVec btmp, legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
4728 predicate(vector_element_basic_type(n->in(2)) == T_FLOAT &&
4729 ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
4730 (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
4731 vector_length(n->in(2)) >= 4);
4732 match(Set dst (MinReductionV src1 src2));
4733 match(Set dst (MaxReductionV src1 src2));
4734 effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
4735 format %{ "vector_minmaxF_reduction $dst,$src1,$src2 ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
4736 ins_encode %{
4737 assert(UseAVX > 0, "sanity");
4738
4739 int opcode = this->ideal_Opcode();
4740 int vlen = vector_length(this, $src2);
4741 __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
4742 $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
4743 %}
4744 ins_pipe( pipe_slow );
4745 %}
4746
4747 instruct minmax_reduction2F_av(legRegF dst, legVec src, legVec tmp,
4748 legVec atmp, legVec btmp, legVec xmm_1, rFlagsReg cr) %{
4749 predicate(vector_element_basic_type(n->in(2)) == T_FLOAT &&
4750 vector_length(n->in(2)) == 2);
4751 match(Set dst (MinReductionV dst src));
4752 match(Set dst (MaxReductionV dst src));
4753 effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
4754 format %{ "vector_minmax2F_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
4755 ins_encode %{
4756 assert(UseAVX > 0, "sanity");
4757
4758 int opcode = this->ideal_Opcode();
4759 int vlen = vector_length(this, $src);
4760 __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
4761 $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
4762 %}
4763 ins_pipe( pipe_slow );
4764 %}
4765
4766
4767 instruct minmax_reductionF_av(legRegF dst, legVec src, legVec tmp,
4768 legVec atmp, legVec btmp, legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
4769 predicate(vector_element_basic_type(n->in(2)) == T_FLOAT &&
4770 vector_length(n->in(2)) >= 4);
4771 match(Set dst (MinReductionV dst src));
4772 match(Set dst (MaxReductionV dst src));
4773 effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
4774 format %{ "vector_minmaxF_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
4775 ins_encode %{
4776 assert(UseAVX > 0, "sanity");
4777
4778 int opcode = this->ideal_Opcode();
4779 int vlen = vector_length(this, $src);
4780 __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
4781 $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
4782 %}
4783 ins_pipe( pipe_slow );
4784 %}
4785
4786
4787 //--------------------Min/Max Double Reduction --------------------
4788 instruct minmax_reduction2D(legRegD dst, immD src1, legVec src2,
4789 legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, // TEMPs
4790 rFlagsReg cr) %{
4791 predicate(vector_element_basic_type(n->in(2)) == T_DOUBLE &&
4792 ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
4793 (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
4794 vector_length(n->in(2)) == 2);
4795 match(Set dst (MinReductionV src1 src2));
4796 match(Set dst (MaxReductionV src1 src2));
4797 effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
4798 format %{ "vector_minmax2D_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
4799 ins_encode %{
4800 assert(UseAVX > 0, "sanity");
4801
4802 int opcode = this->ideal_Opcode();
4803 int vlen = vector_length(this, $src2);
4804 __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
4805 $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
4806 %}
4807 ins_pipe( pipe_slow );
4808 %}
4809
4810 instruct minmax_reductionD(legRegD dst, immD src1, legVec src2,
4811 legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, legVec tmp5, // TEMPs
4812 rFlagsReg cr) %{
4813 predicate(vector_element_basic_type(n->in(2)) == T_DOUBLE &&
4814 ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
4815 (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
4816 vector_length(n->in(2)) >= 4);
4817 match(Set dst (MinReductionV src1 src2));
4818 match(Set dst (MaxReductionV src1 src2));
4819 effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
4820 format %{ "vector_minmaxD_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
4821 ins_encode %{
4822 assert(UseAVX > 0, "sanity");
4823
4824 int opcode = this->ideal_Opcode();
4825 int vlen = vector_length(this, $src2);
4826 __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
4827 $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
4828 %}
4829 ins_pipe( pipe_slow );
4830 %}
4831
4832
4833 instruct minmax_reduction2D_av(legRegD dst, legVec src,
4834 legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, // TEMPs
4835 rFlagsReg cr) %{
4836 predicate(vector_element_basic_type(n->in(2)) == T_DOUBLE &&
4837 vector_length(n->in(2)) == 2);
4838 match(Set dst (MinReductionV dst src));
4839 match(Set dst (MaxReductionV dst src));
4840 effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
4841 format %{ "vector_minmax2D_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
4842 ins_encode %{
4843 assert(UseAVX > 0, "sanity");
4844
4845 int opcode = this->ideal_Opcode();
4846 int vlen = vector_length(this, $src);
4847 __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
4848 $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
4849 %}
4850 ins_pipe( pipe_slow );
4851 %}
4852
4853 instruct minmax_reductionD_av(legRegD dst, legVec src,
4854 legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, legVec tmp5, // TEMPs
4855 rFlagsReg cr) %{
4856 predicate(vector_element_basic_type(n->in(2)) == T_DOUBLE &&
4857 vector_length(n->in(2)) >= 4);
4858 match(Set dst (MinReductionV dst src));
4859 match(Set dst (MaxReductionV dst src));
4860 effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
4861 format %{ "vector_minmaxD_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
4862 ins_encode %{
4863 assert(UseAVX > 0, "sanity");
4864
4865 int opcode = this->ideal_Opcode();
4866 int vlen = vector_length(this, $src);
4867 __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
4868 $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
4869 %}
4870 ins_pipe( pipe_slow );
4871 %}
4872
4873 // ====================VECTOR ARITHMETIC=======================================
4874
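// Each scalar-type/operation pair below follows the same three-rule shape:
// a destructive two-operand SSE rule (UseAVX == 0) where dst is both source
// and result, a three-operand AVX register rule, and an AVX rule that folds
// a vector load into the instruction's memory operand. In the AVX rules,
// vector_length_encoding(this) picks the 128-, 256- or 512-bit encoding
// from the node's vector length.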
4875 // --------------------------------- ADD --------------------------------------
4876
4877 // Bytes vector add
4878 instruct vaddB(vec dst, vec src) %{
4879 predicate(UseAVX == 0);
4880 match(Set dst (AddVB dst src));
4881 format %{ "paddb $dst,$src\t! add packedB" %}
4882 ins_encode %{
4883 __ paddb($dst$$XMMRegister, $src$$XMMRegister);
4884 %}
4885 ins_pipe( pipe_slow );
4886 %}
4887
4888 instruct vaddB_reg(vec dst, vec src1, vec src2) %{
4889 predicate(UseAVX > 0);
4890 match(Set dst (AddVB src1 src2));
4891 format %{ "vpaddb $dst,$src1,$src2\t! add packedB" %}
4892 ins_encode %{
4893 int vlen_enc = vector_length_encoding(this);
4894 __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
4895 %}
4896 ins_pipe( pipe_slow );
4897 %}
4898
4899 instruct vaddB_mem(vec dst, vec src, memory mem) %{
4900 predicate(UseAVX > 0);
4901 match(Set dst (AddVB src (LoadVector mem)));
4902 format %{ "vpaddb $dst,$src,$mem\t! add packedB" %}
4903 ins_encode %{
4904 int vlen_enc = vector_length_encoding(this);
4905 __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
4906 %}
4907 ins_pipe( pipe_slow );
4908 %}
4909
4910 // Shorts/Chars vector add
4911 instruct vaddS(vec dst, vec src) %{
4912 predicate(UseAVX == 0);
4913 match(Set dst (AddVS dst src));
4914 format %{ "paddw $dst,$src\t! add packedS" %}
4915 ins_encode %{
4916 __ paddw($dst$$XMMRegister, $src$$XMMRegister);
4917 %}
4918 ins_pipe( pipe_slow );
4919 %}
4920
4921 instruct vaddS_reg(vec dst, vec src1, vec src2) %{
4922 predicate(UseAVX > 0);
4923 match(Set dst (AddVS src1 src2));
4924 format %{ "vpaddw $dst,$src1,$src2\t! add packedS" %}
4925 ins_encode %{
4926 int vlen_enc = vector_length_encoding(this);
4927 __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
4928 %}
4929 ins_pipe( pipe_slow );
4930 %}
4931
4932 instruct vaddS_mem(vec dst, vec src, memory mem) %{
4933 predicate(UseAVX > 0);
4934 match(Set dst (AddVS src (LoadVector mem)));
4935 format %{ "vpaddw $dst,$src,$mem\t! add packedS" %}
4936 ins_encode %{
4937 int vlen_enc = vector_length_encoding(this);
4938 __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
4939 %}
4940 ins_pipe( pipe_slow );
4941 %}
4942
4943 // Integers vector add
4944 instruct vaddI(vec dst, vec src) %{
4945 predicate(UseAVX == 0);
4946 match(Set dst (AddVI dst src));
4947 format %{ "paddd $dst,$src\t! add packedI" %}
4948 ins_encode %{
4949 __ paddd($dst$$XMMRegister, $src$$XMMRegister);
4950 %}
4951 ins_pipe( pipe_slow );
4952 %}
4953
4954 instruct vaddI_reg(vec dst, vec src1, vec src2) %{
4955 predicate(UseAVX > 0);
4956 match(Set dst (AddVI src1 src2));
4957 format %{ "vpaddd $dst,$src1,$src2\t! add packedI" %}
4958 ins_encode %{
4959 int vlen_enc = vector_length_encoding(this);
4960 __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
4961 %}
4962 ins_pipe( pipe_slow );
4963 %}
4964
4965
4966 instruct vaddI_mem(vec dst, vec src, memory mem) %{
4967 predicate(UseAVX > 0);
4968 match(Set dst (AddVI src (LoadVector mem)));
4969 format %{ "vpaddd $dst,$src,$mem\t! add packedI" %}
4970 ins_encode %{
4971 int vlen_enc = vector_length_encoding(this);
4972 __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
4973 %}
4974 ins_pipe( pipe_slow );
4975 %}
4976
4977 // Longs vector add
4978 instruct vaddL(vec dst, vec src) %{
4979 predicate(UseAVX == 0);
4980 match(Set dst (AddVL dst src));
4981 format %{ "paddq $dst,$src\t! add packedL" %}
4982 ins_encode %{
4983 __ paddq($dst$$XMMRegister, $src$$XMMRegister);
4984 %}
4985 ins_pipe( pipe_slow );
4986 %}
4987
4988 instruct vaddL_reg(vec dst, vec src1, vec src2) %{
4989 predicate(UseAVX > 0);
4990 match(Set dst (AddVL src1 src2));
4991 format %{ "vpaddq $dst,$src1,$src2\t! add packedL" %}
4992 ins_encode %{
4993 int vlen_enc = vector_length_encoding(this);
4994 __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
4995 %}
4996 ins_pipe( pipe_slow );
4997 %}
4998
4999 instruct vaddL_mem(vec dst, vec src, memory mem) %{
5000 predicate(UseAVX > 0);
5001 match(Set dst (AddVL src (LoadVector mem)));
5002 format %{ "vpaddq $dst,$src,$mem\t! add packedL" %}
5003 ins_encode %{
5004 int vlen_enc = vector_length_encoding(this);
5005 __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5006 %}
5007 ins_pipe( pipe_slow );
5008 %}
5009
5010 // Floats vector add
5011 instruct vaddF(vec dst, vec src) %{
5012 predicate(UseAVX == 0);
5013 match(Set dst (AddVF dst src));
5014 format %{ "addps $dst,$src\t! add packedF" %}
5015 ins_encode %{
5016 __ addps($dst$$XMMRegister, $src$$XMMRegister);
5017 %}
5018 ins_pipe( pipe_slow );
5019 %}
5020
5021 instruct vaddF_reg(vec dst, vec src1, vec src2) %{
5022 predicate(UseAVX > 0);
5023 match(Set dst (AddVF src1 src2));
5024 format %{ "vaddps $dst,$src1,$src2\t! add packedF" %}
5025 ins_encode %{
5026 int vlen_enc = vector_length_encoding(this);
5027 __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5028 %}
5029 ins_pipe( pipe_slow );
5030 %}
5031
5032 instruct vaddF_mem(vec dst, vec src, memory mem) %{
5033 predicate(UseAVX > 0);
5034 match(Set dst (AddVF src (LoadVector mem)));
5035 format %{ "vaddps $dst,$src,$mem\t! add packedF" %}
5036 ins_encode %{
5037 int vlen_enc = vector_length_encoding(this);
5038 __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5039 %}
5040 ins_pipe( pipe_slow );
5041 %}
5042
5043 // Doubles vector add
5044 instruct vaddD(vec dst, vec src) %{
5045 predicate(UseAVX == 0);
5046 match(Set dst (AddVD dst src));
5047 format %{ "addpd $dst,$src\t! add packedD" %}
5048 ins_encode %{
5049 __ addpd($dst$$XMMRegister, $src$$XMMRegister);
5050 %}
5051 ins_pipe( pipe_slow );
5052 %}
5053
5054 instruct vaddD_reg(vec dst, vec src1, vec src2) %{
5055 predicate(UseAVX > 0);
5056 match(Set dst (AddVD src1 src2));
5057 format %{ "vaddpd $dst,$src1,$src2\t! add packedD" %}
5058 ins_encode %{
5059 int vlen_enc = vector_length_encoding(this);
5060 __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5061 %}
5062 ins_pipe( pipe_slow );
5063 %}
5064
5065 instruct vaddD_mem(vec dst, vec src, memory mem) %{
5066 predicate(UseAVX > 0);
5067 match(Set dst (AddVD src (LoadVector mem)));
5068 format %{ "vaddpd $dst,$src,$mem\t! add packedD" %}
5069 ins_encode %{
5070 int vlen_enc = vector_length_encoding(this);
5071 __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5072 %}
5073 ins_pipe( pipe_slow );
5074 %}
5075
5076 // --------------------------------- SUB --------------------------------------
5077
5078 // Bytes vector sub
5079 instruct vsubB(vec dst, vec src) %{
5080 predicate(UseAVX == 0);
5081 match(Set dst (SubVB dst src));
5082 format %{ "psubb $dst,$src\t! sub packedB" %}
5083 ins_encode %{
5084 __ psubb($dst$$XMMRegister, $src$$XMMRegister);
5085 %}
5086 ins_pipe( pipe_slow );
5087 %}
5088
5089 instruct vsubB_reg(vec dst, vec src1, vec src2) %{
5090 predicate(UseAVX > 0);
5091 match(Set dst (SubVB src1 src2));
5092 format %{ "vpsubb $dst,$src1,$src2\t! sub packedB" %}
5093 ins_encode %{
5094 int vlen_enc = vector_length_encoding(this);
5095 __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5096 %}
5097 ins_pipe( pipe_slow );
5098 %}
5099
5100 instruct vsubB_mem(vec dst, vec src, memory mem) %{
5101 predicate(UseAVX > 0);
5102 match(Set dst (SubVB src (LoadVector mem)));
5103 format %{ "vpsubb $dst,$src,$mem\t! sub packedB" %}
5104 ins_encode %{
5105 int vlen_enc = vector_length_encoding(this);
5106 __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5107 %}
5108 ins_pipe( pipe_slow );
5109 %}
5110
5111 // Shorts/Chars vector sub
5112 instruct vsubS(vec dst, vec src) %{
5113 predicate(UseAVX == 0);
5114 match(Set dst (SubVS dst src));
5115 format %{ "psubw $dst,$src\t! sub packedS" %}
5116 ins_encode %{
5117 __ psubw($dst$$XMMRegister, $src$$XMMRegister);
5118 %}
5119 ins_pipe( pipe_slow );
5120 %}
5121
5122
5123 instruct vsubS_reg(vec dst, vec src1, vec src2) %{
5124 predicate(UseAVX > 0);
5125 match(Set dst (SubVS src1 src2));
5126 format %{ "vpsubw $dst,$src1,$src2\t! sub packedS" %}
5127 ins_encode %{
5128 int vlen_enc = vector_length_encoding(this);
5129 __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5130 %}
5131 ins_pipe( pipe_slow );
5132 %}
5133
5134 instruct vsubS_mem(vec dst, vec src, memory mem) %{
5135 predicate(UseAVX > 0);
5136 match(Set dst (SubVS src (LoadVector mem)));
5137 format %{ "vpsubw $dst,$src,$mem\t! sub packedS" %}
5138 ins_encode %{
5139 int vlen_enc = vector_length_encoding(this);
5140 __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5141 %}
5142 ins_pipe( pipe_slow );
5143 %}
5144
5145 // Integers vector sub
5146 instruct vsubI(vec dst, vec src) %{
5147 predicate(UseAVX == 0);
5148 match(Set dst (SubVI dst src));
5149 format %{ "psubd $dst,$src\t! sub packedI" %}
5150 ins_encode %{
5151 __ psubd($dst$$XMMRegister, $src$$XMMRegister);
5152 %}
5153 ins_pipe( pipe_slow );
5154 %}
5155
5156 instruct vsubI_reg(vec dst, vec src1, vec src2) %{
5157 predicate(UseAVX > 0);
5158 match(Set dst (SubVI src1 src2));
5159 format %{ "vpsubd $dst,$src1,$src2\t! sub packedI" %}
5160 ins_encode %{
5161 int vlen_enc = vector_length_encoding(this);
5162 __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5163 %}
5164 ins_pipe( pipe_slow );
5165 %}
5166
5167 instruct vsubI_mem(vec dst, vec src, memory mem) %{
5168 predicate(UseAVX > 0);
5169 match(Set dst (SubVI src (LoadVector mem)));
5170 format %{ "vpsubd $dst,$src,$mem\t! sub packedI" %}
5171 ins_encode %{
5172 int vlen_enc = vector_length_encoding(this);
5173 __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5174 %}
5175 ins_pipe( pipe_slow );
5176 %}
5177
5178 // Longs vector sub
5179 instruct vsubL(vec dst, vec src) %{
5180 predicate(UseAVX == 0);
5181 match(Set dst (SubVL dst src));
5182 format %{ "psubq $dst,$src\t! sub packedL" %}
5183 ins_encode %{
5184 __ psubq($dst$$XMMRegister, $src$$XMMRegister);
5185 %}
5186 ins_pipe( pipe_slow );
5187 %}
5188
5189 instruct vsubL_reg(vec dst, vec src1, vec src2) %{
5190 predicate(UseAVX > 0);
5191 match(Set dst (SubVL src1 src2));
5192 format %{ "vpsubq $dst,$src1,$src2\t! sub packedL" %}
5193 ins_encode %{
5194 int vlen_enc = vector_length_encoding(this);
5195 __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5196 %}
5197 ins_pipe( pipe_slow );
5198 %}
5199
5200
5201 instruct vsubL_mem(vec dst, vec src, memory mem) %{
5202 predicate(UseAVX > 0);
5203 match(Set dst (SubVL src (LoadVector mem)));
5204 format %{ "vpsubq $dst,$src,$mem\t! sub packedL" %}
5205 ins_encode %{
5206 int vlen_enc = vector_length_encoding(this);
5207 __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5208 %}
5209 ins_pipe( pipe_slow );
5210 %}
5211
5212 // Floats vector sub
5213 instruct vsubF(vec dst, vec src) %{
5214 predicate(UseAVX == 0);
5215 match(Set dst (SubVF dst src));
5216 format %{ "subps $dst,$src\t! sub packedF" %}
5217 ins_encode %{
5218 __ subps($dst$$XMMRegister, $src$$XMMRegister);
5219 %}
5220 ins_pipe( pipe_slow );
5221 %}
5222
5223 instruct vsubF_reg(vec dst, vec src1, vec src2) %{
5224 predicate(UseAVX > 0);
5225 match(Set dst (SubVF src1 src2));
5226 format %{ "vsubps $dst,$src1,$src2\t! sub packedF" %}
5227 ins_encode %{
5228 int vlen_enc = vector_length_encoding(this);
5229 __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5230 %}
5231 ins_pipe( pipe_slow );
5232 %}
5233
5234 instruct vsubF_mem(vec dst, vec src, memory mem) %{
5235 predicate(UseAVX > 0);
5236 match(Set dst (SubVF src (LoadVector mem)));
5237 format %{ "vsubps $dst,$src,$mem\t! sub packedF" %}
5238 ins_encode %{
5239 int vlen_enc = vector_length_encoding(this);
5240 __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5241 %}
5242 ins_pipe( pipe_slow );
5243 %}
5244
5245 // Doubles vector sub
5246 instruct vsubD(vec dst, vec src) %{
5247 predicate(UseAVX == 0);
5248 match(Set dst (SubVD dst src));
5249 format %{ "subpd $dst,$src\t! sub packedD" %}
5250 ins_encode %{
5251 __ subpd($dst$$XMMRegister, $src$$XMMRegister);
5252 %}
5253 ins_pipe( pipe_slow );
5254 %}
5255
5256 instruct vsubD_reg(vec dst, vec src1, vec src2) %{
5257 predicate(UseAVX > 0);
5258 match(Set dst (SubVD src1 src2));
5259 format %{ "vsubpd $dst,$src1,$src2\t! sub packedD" %}
5260 ins_encode %{
5261 int vlen_enc = vector_length_encoding(this);
5262 __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5263 %}
5264 ins_pipe( pipe_slow );
5265 %}
5266
5267 instruct vsubD_mem(vec dst, vec src, memory mem) %{
5268 predicate(UseAVX > 0);
5269 match(Set dst (SubVD src (LoadVector mem)));
5270 format %{ "vsubpd $dst,$src,$mem\t! sub packedD" %}
5271 ins_encode %{
5272 int vlen_enc = vector_length_encoding(this);
5273 __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5274 %}
5275 ins_pipe( pipe_slow );
5276 %}
5277
5278 // --------------------------------- MUL --------------------------------------
5279
5280 // Byte vector mul
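// x86 has no byte-granularity multiply (there is no pmullb), so byte
// vectors are multiplied by sign-extending each operand to 16-bit lanes
// (pmovsxbw), multiplying with pmullw, masking each product down to its
// low byte and re-packing with packuswb. Truncating the 16-bit product to
// 8 bits yields exactly Java's wrap-around byte multiply.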
5281 instruct mulB_reg(vec dst, vec src1, vec src2, vec tmp, rRegI scratch) %{
5282 predicate(vector_length(n) == 4 ||
5283 vector_length(n) == 8);
5284 match(Set dst (MulVB src1 src2));
5285 effect(TEMP dst, TEMP tmp, TEMP scratch);
5286 format %{"vector_mulB $dst,$src1,$src2" %}
5287 ins_encode %{
5288 assert(UseSSE > 3, "required");
5289 __ pmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister);
5290 __ pmovsxbw($dst$$XMMRegister, $src2$$XMMRegister);
5291 __ pmullw($tmp$$XMMRegister, $dst$$XMMRegister);
5292 __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5293 __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
5294 __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
5295 %}
5296 ins_pipe( pipe_slow );
5297 %}
5298
5299 instruct mul16B_reg(vec dst, vec src1, vec src2, vec tmp1, vec tmp2, rRegI scratch) %{
5300 predicate(vector_length(n) == 16 && UseAVX <= 1);
5301 match(Set dst (MulVB src1 src2));
5302 effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
5303 format %{"vector_mulB $dst,$src1,$src2" %}
5304 ins_encode %{
5305 assert(UseSSE > 3, "required");
5306 __ pmovsxbw($tmp1$$XMMRegister, $src1$$XMMRegister);
5307 __ pmovsxbw($tmp2$$XMMRegister, $src2$$XMMRegister);
5308 __ pmullw($tmp1$$XMMRegister, $tmp2$$XMMRegister);
5309 __ pshufd($tmp2$$XMMRegister, $src1$$XMMRegister, 0xEE);
5310 __ pshufd($dst$$XMMRegister, $src2$$XMMRegister, 0xEE);
5311 __ pmovsxbw($tmp2$$XMMRegister, $tmp2$$XMMRegister);
5312 __ pmovsxbw($dst$$XMMRegister, $dst$$XMMRegister);
5313 __ pmullw($tmp2$$XMMRegister, $dst$$XMMRegister);
5314 __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5315 __ pand($tmp2$$XMMRegister, $dst$$XMMRegister);
5316 __ pand($dst$$XMMRegister, $tmp1$$XMMRegister);
5317 __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister);
5318 %}
5319 ins_pipe( pipe_slow );
5320 %}
5321
5322 instruct vmul16B_reg_avx(vec dst, vec src1, vec src2, vec tmp, rRegI scratch) %{
5323 predicate(vector_length(n) == 16 && UseAVX > 1);
5324 match(Set dst (MulVB src1 src2));
5325 effect(TEMP dst, TEMP tmp, TEMP scratch);
5326 format %{"vector_mulB $dst,$src1,$src2" %}
5327 ins_encode %{
5328 int vlen_enc = Assembler::AVX_256bit;
5329 __ vpmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister, vlen_enc);
5330 __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5331 __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5332 __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5333 __ vpand($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
5334 __ vextracti128_high($tmp$$XMMRegister, $dst$$XMMRegister);
5335 __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, 0);
5336 %}
5337 ins_pipe( pipe_slow );
5338 %}
5339
5340 instruct vmul32B_reg_avx(vec dst, vec src1, vec src2, vec tmp1, vec tmp2, rRegI scratch) %{
5341 predicate(vector_length(n) == 32);
5342 match(Set dst (MulVB src1 src2));
5343 effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
5344 format %{"vector_mulB $dst,$src1,$src2" %}
5345 ins_encode %{
5346 assert(UseAVX > 1, "required");
5347 int vlen_enc = Assembler::AVX_256bit;
5348 __ vextracti128_high($tmp1$$XMMRegister, $src1$$XMMRegister);
5349 __ vextracti128_high($dst$$XMMRegister, $src2$$XMMRegister);
5350 __ vpmovsxbw($tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
5351 __ vpmovsxbw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5352 __ vpmullw($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5353 __ vpmovsxbw($tmp2$$XMMRegister, $src1$$XMMRegister, vlen_enc);
5354 __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5355 __ vpmullw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5356 __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5357 __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5358 __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5359 __ vpand($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
5360 __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
5361 __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
5362 %}
5363 ins_pipe( pipe_slow );
5364 %}
5365
5366 instruct vmul64B_reg_avx(vec dst, vec src1, vec src2, vec tmp1, vec tmp2, rRegI scratch) %{
5367 predicate(vector_length(n) == 64);
5368 match(Set dst (MulVB src1 src2));
5369 effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
5370   format %{"vector_mulB $dst,$src1,$src2" %}
5371 ins_encode %{
5372 assert(UseAVX > 2, "required");
5373 int vlen_enc = Assembler::AVX_512bit;
5374 __ vextracti64x4_high($tmp1$$XMMRegister, $src1$$XMMRegister);
5375 __ vextracti64x4_high($dst$$XMMRegister, $src2$$XMMRegister);
5376 __ vpmovsxbw($tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
5377 __ vpmovsxbw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5378 __ vpmullw($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5379 __ vpmovsxbw($tmp2$$XMMRegister, $src1$$XMMRegister, vlen_enc);
5380 __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5381 __ vpmullw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5382 __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5383 __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5384 __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5385 __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5386 __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
5387 __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vlen_enc, $scratch$$Register);
5388 __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5389 %}
5390 ins_pipe( pipe_slow );
5391 %}
5392
5393 // Shorts/Chars vector mul
5394 instruct vmulS(vec dst, vec src) %{
5395 predicate(UseAVX == 0);
5396 match(Set dst (MulVS dst src));
5397 format %{ "pmullw $dst,$src\t! mul packedS" %}
5398 ins_encode %{
5399 __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
5400 %}
5401 ins_pipe( pipe_slow );
5402 %}
5403
5404 instruct vmulS_reg(vec dst, vec src1, vec src2) %{
5405 predicate(UseAVX > 0);
5406 match(Set dst (MulVS src1 src2));
5407 format %{ "vpmullw $dst,$src1,$src2\t! mul packedS" %}
5408 ins_encode %{
5409 int vlen_enc = vector_length_encoding(this);
5410 __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5411 %}
5412 ins_pipe( pipe_slow );
5413 %}
5414
5415 instruct vmulS_mem(vec dst, vec src, memory mem) %{
5416 predicate(UseAVX > 0);
5417 match(Set dst (MulVS src (LoadVector mem)));
5418 format %{ "vpmullw $dst,$src,$mem\t! mul packedS" %}
5419 ins_encode %{
5420 int vlen_enc = vector_length_encoding(this);
5421 __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5422 %}
5423 ins_pipe( pipe_slow );
5424 %}
5425
5426 // Integers vector mul
5427 instruct vmulI(vec dst, vec src) %{
5428 predicate(UseAVX == 0);
5429 match(Set dst (MulVI dst src));
5430 format %{ "pmulld $dst,$src\t! mul packedI" %}
5431 ins_encode %{
5432 assert(UseSSE > 3, "required");
5433 __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
5434 %}
5435 ins_pipe( pipe_slow );
5436 %}
5437
5438 instruct vmulI_reg(vec dst, vec src1, vec src2) %{
5439 predicate(UseAVX > 0);
5440 match(Set dst (MulVI src1 src2));
5441 format %{ "vpmulld $dst,$src1,$src2\t! mul packedI" %}
5442 ins_encode %{
5443 int vlen_enc = vector_length_encoding(this);
5444 __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5445 %}
5446 ins_pipe( pipe_slow );
5447 %}
5448
5449 instruct vmulI_mem(vec dst, vec src, memory mem) %{
5450 predicate(UseAVX > 0);
5451 match(Set dst (MulVI src (LoadVector mem)));
5452 format %{ "vpmulld $dst,$src,$mem\t! mul packedI" %}
5453 ins_encode %{
5454 int vlen_enc = vector_length_encoding(this);
5455 __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5456 %}
5457 ins_pipe( pipe_slow );
5458 %}
5459
5460 // Longs vector mul
5461 instruct vmulL_reg(vec dst, vec src1, vec src2) %{
5462 predicate(VM_Version::supports_avx512dq());
5463 match(Set dst (MulVL src1 src2));
5464 format %{ "vpmullq $dst,$src1,$src2\t! mul packedL" %}
5465 ins_encode %{
5466 assert(UseAVX > 2, "required");
5467 int vlen_enc = vector_length_encoding(this);
5468 __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5469 %}
5470 ins_pipe( pipe_slow );
5471 %}
5472
5473 instruct vmulL_mem(vec dst, vec src, memory mem) %{
5474 predicate(VM_Version::supports_avx512dq());
5475 match(Set dst (MulVL src (LoadVector mem)));
5476 format %{ "vpmullq $dst,$src,$mem\t! mul packedL" %}
5477 ins_encode %{
5478 assert(UseAVX > 2, "required");
5479 int vlen_enc = vector_length_encoding(this);
5480 __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5481 %}
5482 ins_pipe( pipe_slow );
5483 %}
5484
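// Without AVX512DQ there is no packed 64x64-bit multiply, so the long
// rules below compose it from 32-bit operations. Writing each lane as
// x = x_hi * 2^32 + x_lo, the low 64 bits of the product are
//   (x * y) mod 2^64 = ((x_hi*y_lo + x_lo*y_hi) << 32) + x_lo*y_lo
// In scalar terms (a sketch of the lowering, not code from this file):
//   uint64_t mul64lo(uint64_t x, uint64_t y) {
//     uint64_t xl = (uint32_t)x, xh = x >> 32;
//     uint64_t yl = (uint32_t)y, yh = y >> 32;
//     return ((xh * yl + xl * yh) << 32) + xl * yl;
//   }
// pshufd/pmulld/phaddd produce the cross terms, psllq 32 positions them,
// and pmuludq supplies the x_lo*y_lo term.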
5485 instruct mul2L_reg(vec dst, vec src2, vec tmp) %{
5486 predicate(vector_length(n) == 2 && !VM_Version::supports_avx512dq());
5487 match(Set dst (MulVL dst src2));
5488 effect(TEMP dst, TEMP tmp);
5489 format %{ "pshufd $tmp,$src2, 177\n\t"
5490 "pmulld $tmp,$dst\n\t"
5491 "phaddd $tmp,$tmp\n\t"
5492 "pmovzxdq $tmp,$tmp\n\t"
5493 "psllq $tmp, 32\n\t"
5494 "pmuludq $dst,$src2\n\t"
5495 "paddq $dst,$tmp\n\t! mul packed2L" %}
5496
5497 ins_encode %{
5498 assert(VM_Version::supports_sse4_1(), "required");
5499 int vlen_enc = Assembler::AVX_128bit;
5500 __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 177);
5501 __ pmulld($tmp$$XMMRegister, $dst$$XMMRegister);
5502 __ phaddd($tmp$$XMMRegister, $tmp$$XMMRegister);
5503 __ pmovzxdq($tmp$$XMMRegister, $tmp$$XMMRegister);
5504 __ psllq($tmp$$XMMRegister, 32);
5505 __ pmuludq($dst$$XMMRegister, $src2$$XMMRegister);
5506 __ paddq($dst$$XMMRegister, $tmp$$XMMRegister);
5507 %}
5508 ins_pipe( pipe_slow );
5509 %}
5510
5511 instruct vmul4L_reg_avx(vec dst, vec src1, vec src2, vec tmp, vec tmp1) %{
5512 predicate(vector_length(n) == 4 && !VM_Version::supports_avx512dq());
5513 match(Set dst (MulVL src1 src2));
5514 effect(TEMP tmp1, TEMP tmp);
5515   format %{ "vpshufd $tmp,$src2,177\n\t"
5516             "vpmulld $tmp,$src1,$tmp\n\t"
5517             "vphaddd $tmp,$tmp,$tmp\n\t"
5518             "vpmovzxdq $tmp,$tmp\n\t"
5519             "vpsllq $tmp,$tmp,32\n\t"
5520             "vpmuludq $tmp1,$src1,$src2\n\t"
5521             "vpaddq $dst,$tmp,$tmp1\t! mul packed4L" %}
5522 ins_encode %{
5523 int vlen_enc = Assembler::AVX_256bit;
5524 __ vpshufd($tmp$$XMMRegister, $src2$$XMMRegister, 177, vlen_enc);
5525 __ vpmulld($tmp$$XMMRegister, $src1$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
5526 __ vextracti128_high($tmp1$$XMMRegister, $tmp$$XMMRegister);
5527 __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
5528 __ vpmovzxdq($tmp$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
5529 __ vpsllq($tmp$$XMMRegister, $tmp$$XMMRegister, 32, vlen_enc);
5530 __ vpmuludq($tmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5531 __ vpaddq($dst$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
5532 %}
5533 ins_pipe( pipe_slow );
5534 %}
5535
5536 // Floats vector mul
5537 instruct vmulF(vec dst, vec src) %{
5538 predicate(UseAVX == 0);
5539 match(Set dst (MulVF dst src));
5540 format %{ "mulps $dst,$src\t! mul packedF" %}
5541 ins_encode %{
5542 __ mulps($dst$$XMMRegister, $src$$XMMRegister);
5543 %}
5544 ins_pipe( pipe_slow );
5545 %}
5546
5547 instruct vmulF_reg(vec dst, vec src1, vec src2) %{
5548 predicate(UseAVX > 0);
5549 match(Set dst (MulVF src1 src2));
5550 format %{ "vmulps $dst,$src1,$src2\t! mul packedF" %}
5551 ins_encode %{
5552 int vlen_enc = vector_length_encoding(this);
5553 __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5554 %}
5555 ins_pipe( pipe_slow );
5556 %}
5557
5558 instruct vmulF_mem(vec dst, vec src, memory mem) %{
5559 predicate(UseAVX > 0);
5560 match(Set dst (MulVF src (LoadVector mem)));
5561 format %{ "vmulps $dst,$src,$mem\t! mul packedF" %}
5562 ins_encode %{
5563 int vlen_enc = vector_length_encoding(this);
5564 __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5565 %}
5566 ins_pipe( pipe_slow );
5567 %}
5568
5569 // Doubles vector mul
5570 instruct vmulD(vec dst, vec src) %{
5571 predicate(UseAVX == 0);
5572 match(Set dst (MulVD dst src));
5573 format %{ "mulpd $dst,$src\t! mul packedD" %}
5574 ins_encode %{
5575 __ mulpd($dst$$XMMRegister, $src$$XMMRegister);
5576 %}
5577 ins_pipe( pipe_slow );
5578 %}
5579
5580 instruct vmulD_reg(vec dst, vec src1, vec src2) %{
5581 predicate(UseAVX > 0);
5582 match(Set dst (MulVD src1 src2));
5583 format %{ "vmulpd $dst,$src1,$src2\t! mul packedD" %}
5584 ins_encode %{
5585 int vlen_enc = vector_length_encoding(this);
5586 __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5587 %}
5588 ins_pipe( pipe_slow );
5589 %}
5590
5591 instruct vmulD_mem(vec dst, vec src, memory mem) %{
5592 predicate(UseAVX > 0);
5593 match(Set dst (MulVD src (LoadVector mem)));
5594 format %{ "vmulpd $dst,$src,$mem\t! mul packedD" %}
5595 ins_encode %{
5596 int vlen_enc = vector_length_encoding(this);
5597 __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5598 %}
5599 ins_pipe( pipe_slow );
5600 %}
5601
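// Vector conditional moves are lowered to an unmasked compare that leaves
// an all-ones/all-zeros mask in each lane (vcmpps/vcmppd) followed by a
// variable blend (vblendvps/vblendvpd) selecting lane-wise between the two
// sources under that mask.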
5602 instruct vcmov8F_reg(legVec dst, legVec src1, legVec src2, immI8 cop, cmpOp_vcmppd copnd) %{
5603 predicate(vector_length(n) == 8);
5604 match(Set dst (CMoveVF (Binary copnd cop) (Binary src1 src2)));
5605 effect(TEMP dst, USE src1, USE src2);
5606   format %{ "vcmpps.$copnd $dst, $src1, $src2 ! vcmovevf, cond=$cop\n\t"
5607             "vblendvps $dst,$src1,$src2,$dst ! vcmovevf"
5608   %}
5609 ins_encode %{
5610 assert(UseAVX > 0, "required");
5611
5612 int vlen_enc = Assembler::AVX_256bit;
5613 int cond = (Assembler::Condition)($copnd$$cmpcode);
5614 __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vlen_enc);
5615 __ vblendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5616 %}
5617 ins_pipe( pipe_slow );
5618 %}
5619
5620 instruct vcmov4D_reg(legVec dst, legVec src1, legVec src2, immI8 cop, cmpOp_vcmppd copnd) %{
5621 predicate(vector_length(n) == 4);
5622 match(Set dst (CMoveVD (Binary copnd cop) (Binary src1 src2)));
5623 effect(TEMP dst, USE src1, USE src2);
5624   format %{ "vcmppd.$copnd $dst, $src1, $src2 ! vcmovevd, cond=$cop\n\t"
5625             "vblendvpd $dst,$src1,$src2,$dst ! vcmovevd"
5626   %}
5627 ins_encode %{
5628 assert(UseAVX > 0, "required");
5629
5630 int vlen_enc = Assembler::AVX_256bit;
5631 int cond = (Assembler::Condition)($copnd$$cmpcode);
5632 __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vlen_enc);
5633 __ vblendvpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5634 %}
5635 ins_pipe( pipe_slow );
5636 %}
5637
5638 // --------------------------------- DIV --------------------------------------
5639
5640 // Floats vector div
5641 instruct vdivF(vec dst, vec src) %{
5642 predicate(UseAVX == 0);
5643 match(Set dst (DivVF dst src));
5644 format %{ "divps $dst,$src\t! div packedF" %}
5645 ins_encode %{
5646 __ divps($dst$$XMMRegister, $src$$XMMRegister);
5647 %}
5648 ins_pipe( pipe_slow );
5649 %}
5650
5651 instruct vdivF_reg(vec dst, vec src1, vec src2) %{
5652 predicate(UseAVX > 0);
5653 match(Set dst (DivVF src1 src2));
5654 format %{ "vdivps $dst,$src1,$src2\t! div packedF" %}
5655 ins_encode %{
5656 int vlen_enc = vector_length_encoding(this);
5657 __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5658 %}
5659 ins_pipe( pipe_slow );
5660 %}
5661
5662 instruct vdivF_mem(vec dst, vec src, memory mem) %{
5663 predicate(UseAVX > 0);
5664 match(Set dst (DivVF src (LoadVector mem)));
5665 format %{ "vdivps $dst,$src,$mem\t! div packedF" %}
5666 ins_encode %{
5667 int vlen_enc = vector_length_encoding(this);
5668 __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5669 %}
5670 ins_pipe( pipe_slow );
5671 %}
5672
5673 // Doubles vector div
5674 instruct vdivD(vec dst, vec src) %{
5675 predicate(UseAVX == 0);
5676 match(Set dst (DivVD dst src));
5677 format %{ "divpd $dst,$src\t! div packedD" %}
5678 ins_encode %{
5679 __ divpd($dst$$XMMRegister, $src$$XMMRegister);
5680 %}
5681 ins_pipe( pipe_slow );
5682 %}
5683
5684 instruct vdivD_reg(vec dst, vec src1, vec src2) %{
5685 predicate(UseAVX > 0);
5686 match(Set dst (DivVD src1 src2));
5687 format %{ "vdivpd $dst,$src1,$src2\t! div packedD" %}
5688 ins_encode %{
5689 int vlen_enc = vector_length_encoding(this);
5690 __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5691 %}
5692 ins_pipe( pipe_slow );
5693 %}
5694
5695 instruct vdivD_mem(vec dst, vec src, memory mem) %{
5696 predicate(UseAVX > 0);
5697 match(Set dst (DivVD src (LoadVector mem)));
5698 format %{ "vdivpd $dst,$src,$mem\t! div packedD" %}
5699 ins_encode %{
5700 int vlen_enc = vector_length_encoding(this);
5701 __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5702 %}
5703 ins_pipe( pipe_slow );
5704 %}
5705
5706 // ------------------------------ MinMax ---------------------------------------
5707
5708 // Byte, Short, Int vector Min/Max
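// The pminmax/vpminmax helpers dispatch on the element type to the
// matching instruction pair (pminsb/pmaxsb for bytes, pminsw/pmaxsw for
// shorts, pminsd/pmaxsd for ints). Packed signed min/max for longs only
// exists in AVX-512, so T_LONG is handled by the dedicated rules below.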
5709 instruct minmax_reg_sse(vec dst, vec src) %{
5710 predicate(is_integral_type(vector_element_basic_type(n)) && vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
5711 UseAVX == 0);
5712 match(Set dst (MinV dst src));
5713 match(Set dst (MaxV dst src));
5714 format %{ "vector_minmax $dst,$src\t! " %}
5715 ins_encode %{
5716 assert(UseSSE >= 4, "required");
5717
5718 int opcode = this->ideal_Opcode();
5719 BasicType elem_bt = vector_element_basic_type(this);
5720 __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister);
5721 %}
5722 ins_pipe( pipe_slow );
5723 %}
5724
5725 instruct vminmax_reg(vec dst, vec src1, vec src2) %{
5726 predicate(is_integral_type(vector_element_basic_type(n)) && vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
5727 UseAVX > 0);
5728 match(Set dst (MinV src1 src2));
5729 match(Set dst (MaxV src1 src2));
5730 format %{ "vector_minmax $dst,$src1,$src2\t! " %}
5731 ins_encode %{
5732 int opcode = this->ideal_Opcode();
5733 int vlen_enc = vector_length_encoding(this);
5734 BasicType elem_bt = vector_element_basic_type(this);
5735
5736 __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5737 %}
5738 ins_pipe( pipe_slow );
5739 %}
5740
5741 // Long vector Min/Max
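// With no packed 64-bit min/max before AVX-512 (vpminsq/vpmaxsq), the SSE
// and AVX2 forms fall back to a compare-and-blend sequence inside the masm
// helper. The SSE4.1 blendv instructions take their selection mask
// implicitly in xmm0, which is why the SSE rule pins its temporary to
// rxmm0.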
5742 instruct minmaxL_reg_sse(vec dst, vec src, rxmm0 tmp) %{
5743 predicate(vector_length_in_bytes(n) == 16 && vector_element_basic_type(n) == T_LONG &&
5744 UseAVX == 0);
5745 match(Set dst (MinV dst src));
5746 match(Set dst (MaxV src dst));
5747 effect(TEMP dst, TEMP tmp);
5748 format %{ "vector_minmaxL $dst,$src\t!using $tmp as TEMP" %}
5749 ins_encode %{
5750 assert(UseSSE >= 4, "required");
5751
5752 int opcode = this->ideal_Opcode();
5753 BasicType elem_bt = vector_element_basic_type(this);
5754 assert(elem_bt == T_LONG, "sanity");
5755
5756 __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister);
5757 %}
5758 ins_pipe( pipe_slow );
5759 %}
5760
5761 instruct vminmaxL_reg_avx(legVec dst, legVec src1, legVec src2) %{
5762 predicate(vector_length_in_bytes(n) <= 32 && vector_element_basic_type(n) == T_LONG &&
5763 UseAVX > 0 && !VM_Version::supports_avx512vl());
5764 match(Set dst (MinV src1 src2));
5765 match(Set dst (MaxV src1 src2));
5766 effect(TEMP dst);
5767 format %{ "vector_minmaxL $dst,$src1,$src2\t! " %}
5768 ins_encode %{
5769 int vlen_enc = vector_length_encoding(this);
5770 int opcode = this->ideal_Opcode();
5771 BasicType elem_bt = vector_element_basic_type(this);
5772 assert(elem_bt == T_LONG, "sanity");
5773
5774 __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5775 %}
5776 ins_pipe( pipe_slow );
5777 %}
5778
5779 instruct vminmaxL_reg_evex(vec dst, vec src1, vec src2) %{
5780 predicate((vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()) &&
5781 vector_element_basic_type(n) == T_LONG);
5782 match(Set dst (MinV src1 src2));
5783 match(Set dst (MaxV src1 src2));
5784   format %{ "vector_minmaxL $dst,$src1,$src2\t! " %}
5785 ins_encode %{
5786 assert(UseAVX > 2, "required");
5787
5788 int vlen_enc = vector_length_encoding(this);
5789 int opcode = this->ideal_Opcode();
5790 BasicType elem_bt = vector_element_basic_type(this);
5791 assert(elem_bt == T_LONG, "sanity");
5792
5793 __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5794 %}
5795 ins_pipe( pipe_slow );
5796 %}
5797
5798 // Float/Double vector Min/Max
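// As with the float reductions above, Java's NaN and signed-zero rules for
// min/max rule out a bare vminps/vmaxps: the helpers below order the
// operands with compares and blends, using the temporaries for the
// intermediate masks.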
5799 instruct minmaxFP_reg(legVec dst, legVec a, legVec b, legVec tmp, legVec atmp, legVec btmp) %{
5800 predicate(vector_length_in_bytes(n) <= 32 &&
5801 is_floating_point_type(vector_element_basic_type(n)) && // T_FLOAT, T_DOUBLE
5802 UseAVX > 0);
5803 match(Set dst (MinV a b));
5804 match(Set dst (MaxV a b));
5805 effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
5806 format %{ "vector_minmaxFP $dst,$a,$b\t!using $tmp, $atmp, $btmp as TEMP" %}
5807 ins_encode %{
5808 assert(UseAVX > 0, "required");
5809
5810 int opcode = this->ideal_Opcode();
5811 int vlen_enc = vector_length_encoding(this);
5812 BasicType elem_bt = vector_element_basic_type(this);
5813
5814 __ vminmax_fp(opcode, elem_bt,
5815 $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
5816 $tmp$$XMMRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vlen_enc);
5817 %}
5818 ins_pipe( pipe_slow );
5819 %}
5820
5821 instruct evminmaxFP_reg_eavx(vec dst, vec a, vec b, vec atmp, vec btmp) %{
5822 predicate(vector_length_in_bytes(n) == 64 &&
5823 is_floating_point_type(vector_element_basic_type(n))); // T_FLOAT, T_DOUBLE
5824 match(Set dst (MinV a b));
5825 match(Set dst (MaxV a b));
5826 effect(USE a, USE b, TEMP atmp, TEMP btmp);
5827 format %{ "vector_minmaxFP $dst,$a,$b\t!using $atmp, $btmp as TEMP" %}
5828 ins_encode %{
5829 assert(UseAVX > 2, "required");
5830
5831 int opcode = this->ideal_Opcode();
5832 int vlen_enc = vector_length_encoding(this);
5833 BasicType elem_bt = vector_element_basic_type(this);
5834
5835 KRegister ktmp = k1;
5836 __ evminmax_fp(opcode, elem_bt,
5837 $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
5838 ktmp, $atmp$$XMMRegister , $btmp$$XMMRegister, vlen_enc);
5839 %}
5840 ins_pipe( pipe_slow );
5841 %}
5842
5843 // --------------------------------- Sqrt --------------------------------------
5844
5845 instruct vsqrtF_reg(vec dst, vec src) %{
5846 match(Set dst (SqrtVF src));
5847 format %{ "vsqrtps $dst,$src\t! sqrt packedF" %}
5848 ins_encode %{
5849 assert(UseAVX > 0, "required");
5850 int vlen_enc = vector_length_encoding(this);
5851 __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
5852 %}
5853 ins_pipe( pipe_slow );
5854 %}
5855
5856 instruct vsqrtF_mem(vec dst, memory mem) %{
5857 match(Set dst (SqrtVF (LoadVector mem)));
5858 format %{ "vsqrtps $dst,$mem\t! sqrt packedF" %}
5859 ins_encode %{
5860 assert(UseAVX > 0, "required");
5861 int vlen_enc = vector_length_encoding(this);
5862 __ vsqrtps($dst$$XMMRegister, $mem$$Address, vlen_enc);
5863 %}
5864 ins_pipe( pipe_slow );
5865 %}
5866
5867 // Doubles vector sqrt
5868 instruct vsqrtD_reg(vec dst, vec src) %{
5869 match(Set dst (SqrtVD src));
5870 format %{ "vsqrtpd $dst,$src\t! sqrt packedD" %}
5871 ins_encode %{
5872 assert(UseAVX > 0, "required");
5873 int vlen_enc = vector_length_encoding(this);
5874 __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
5875 %}
5876 ins_pipe( pipe_slow );
5877 %}
5878
5879 instruct vsqrtD_mem(vec dst, memory mem) %{
5880 match(Set dst (SqrtVD (LoadVector mem)));
5881 format %{ "vsqrtpd $dst,$mem\t! sqrt packedD" %}
5882 ins_encode %{
5883 assert(UseAVX > 0, "required");
5884 int vlen_enc = vector_length_encoding(this);
5885 __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vlen_enc);
5886 %}
5887 ins_pipe( pipe_slow );
5888 %}
5889
5890 // ------------------------------ Shift ---------------------------------------
5891
5892 // Left and right shift count vectors are the same on x86
5893 // (only lowest bits of xmm reg are used for count).
5894 instruct vshiftcnt(vec dst, rRegI cnt) %{
5895 match(Set dst (LShiftCntV cnt));
5896 match(Set dst (RShiftCntV cnt));
5897 format %{ "movdl $dst,$cnt\t! load shift count" %}
5898 ins_encode %{
5899 __ movdl($dst$$XMMRegister, $cnt$$Register);
5900 %}
5901 ins_pipe( pipe_slow );
5902 %}
5903
5904 // Byte vector shift
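// As with byte multiply, SSE/AVX have no byte-granularity shifts: only
// 16-, 32- and 64-bit lane shifts exist. Bytes are therefore widened to
// words (sign-extended for all but logical right shifts, so an arithmetic
// right shift sees the correct sign bits), shifted as words, masked back
// to their low bytes and re-packed.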
5905 instruct vshiftB(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
5906 predicate(vector_length(n) <= 8 && VectorNode::is_vshift_cnt(n->in(2)));
5907 match(Set dst ( LShiftVB src shift));
5908 match(Set dst ( RShiftVB src shift));
5909 match(Set dst (URShiftVB src shift));
5910 effect(TEMP dst, USE src, USE shift, TEMP tmp, TEMP scratch);
5911 format %{"vector_byte_shift $dst,$src,$shift" %}
5912 ins_encode %{
5913 assert(UseSSE > 3, "required");
5914 int opcode = this->ideal_Opcode();
5915 bool sign = (opcode != Op_URShiftVB);
5916 __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister);
5917 __ vshiftw(opcode, $tmp$$XMMRegister, $shift$$XMMRegister);
5918 __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5919 __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
5920 __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
5921 %}
5922 ins_pipe( pipe_slow );
5923 %}
5924
5925 instruct vshift16B(vec dst, vec src, vec shift, vec tmp1, vec tmp2, rRegI scratch) %{
5926 predicate(vector_length(n) == 16 && VectorNode::is_vshift_cnt(n->in(2)) &&
5927 UseAVX <= 1);
5928 match(Set dst ( LShiftVB src shift));
5929 match(Set dst ( RShiftVB src shift));
5930 match(Set dst (URShiftVB src shift));
5931 effect(TEMP dst, USE src, USE shift, TEMP tmp1, TEMP tmp2, TEMP scratch);
5932 format %{"vector_byte_shift $dst,$src,$shift" %}
5933 ins_encode %{
5934 assert(UseSSE > 3, "required");
5935 int opcode = this->ideal_Opcode();
5936 bool sign = (opcode != Op_URShiftVB);
5937 __ vextendbw(sign, $tmp1$$XMMRegister, $src$$XMMRegister);
5938 __ vshiftw(opcode, $tmp1$$XMMRegister, $shift$$XMMRegister);
5939 __ pshufd($tmp2$$XMMRegister, $src$$XMMRegister, 0xE);
5940 __ vextendbw(sign, $tmp2$$XMMRegister, $tmp2$$XMMRegister);
5941 __ vshiftw(opcode, $tmp2$$XMMRegister, $shift$$XMMRegister);
5942 __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5943 __ pand($tmp2$$XMMRegister, $dst$$XMMRegister);
5944 __ pand($dst$$XMMRegister, $tmp1$$XMMRegister);
5945 __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister);
5946 %}
5947 ins_pipe( pipe_slow );
5948 %}
5949
5950 instruct vshift16B_avx(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
5951 predicate(vector_length(n) == 16 && VectorNode::is_vshift_cnt(n->in(2)) &&
5952 UseAVX > 1);
5953 match(Set dst ( LShiftVB src shift));
5954 match(Set dst ( RShiftVB src shift));
5955 match(Set dst (URShiftVB src shift));
5956 effect(TEMP dst, TEMP tmp, TEMP scratch);
5957 format %{"vector_byte_shift $dst,$src,$shift" %}
5958 ins_encode %{
5959 int opcode = this->ideal_Opcode();
5960 bool sign = (opcode != Op_URShiftVB);
5961 int vlen_enc = Assembler::AVX_256bit;
5962 __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
5963 __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
5964 __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, $scratch$$Register);
5965 __ vextracti128_high($dst$$XMMRegister, $tmp$$XMMRegister);
5966 __ vpackuswb($dst$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, 0);
5967 %}
5968 ins_pipe( pipe_slow );
5969 %}
5970
5971 instruct vshift32B_avx(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
5972 predicate(vector_length(n) == 32 && VectorNode::is_vshift_cnt(n->in(2)));
5973 match(Set dst ( LShiftVB src shift));
5974 match(Set dst ( RShiftVB src shift));
5975 match(Set dst (URShiftVB src shift));
5976 effect(TEMP dst, TEMP tmp, TEMP scratch);
5977 format %{"vector_byte_shift $dst,$src,$shift" %}
5978 ins_encode %{
5979 assert(UseAVX > 1, "required");
5980 int opcode = this->ideal_Opcode();
5981 bool sign = (opcode != Op_URShiftVB);
5982 int vlen_enc = Assembler::AVX_256bit;
5983 __ vextracti128_high($tmp$$XMMRegister, $src$$XMMRegister);
5984 __ vextendbw(sign, $tmp$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
5985 __ vextendbw(sign, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
5986 __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
5987 __ vshiftw(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $shift$$XMMRegister, vlen_enc);
5988 __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, $scratch$$Register);
5989 __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, $scratch$$Register);
5990 __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
5991 __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
5992 %}
5993 ins_pipe( pipe_slow );
5994 %}
5995
5996 instruct vshift64B_avx(vec dst, vec src, vec shift, vec tmp1, vec tmp2, rRegI scratch) %{
5997 predicate(vector_length(n) == 64 && VectorNode::is_vshift_cnt(n->in(2)));
5998 match(Set dst ( LShiftVB src shift));
5999   match(Set dst ( RShiftVB src shift));
6000 match(Set dst (URShiftVB src shift));
6001 effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
6002 format %{"vector_byte_shift $dst,$src,$shift" %}
6003 ins_encode %{
6004 assert(UseAVX > 2, "required");
6005 int opcode = this->ideal_Opcode();
6006 bool sign = (opcode != Op_URShiftVB);
6007 int vlen_enc = Assembler::AVX_512bit;
6008 __ vextracti64x4($tmp1$$XMMRegister, $src$$XMMRegister, 1);
6009 __ vextendbw(sign, $tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
6010 __ vextendbw(sign, $tmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
6011 __ vshiftw(opcode, $tmp1$$XMMRegister, $tmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6012 __ vshiftw(opcode, $tmp2$$XMMRegister, $tmp2$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6013 __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
6014 __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6015 __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6016 __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6017 __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
6018 __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vlen_enc, $scratch$$Register);
6019 __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6020 %}
6021 ins_pipe( pipe_slow );
6022 %}
6023
6024 // A logical right shift of shorts produces an incorrect Java result for
6025 // negative data, because Java code converts a short value into an int with
6026 // sign extension before shifting. Char vectors are fine, since chars are
6027 // unsigned values.
6028 // Shorts/Chars vector shift
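// Example: for short s = -1, Java evaluates (s >>> 3) on the sign-extended
// int 0xFFFFFFFF, giving 0x1FFFFFFF, which narrows back to a short as -1;
// a 16-bit psrlw lane would instead compute 0xFFFF >>> 3 = 0x1FFF (8191).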
6029 instruct vshiftS(vec dst, vec src, vec shift) %{
6030 predicate(VectorNode::is_vshift_cnt(n->in(2)));
6031 match(Set dst ( LShiftVS src shift));
6032 match(Set dst ( RShiftVS src shift));
6033 match(Set dst (URShiftVS src shift));
6034 effect(TEMP dst, USE src, USE shift);
6035 format %{ "vshiftw $dst,$src,$shift\t! shift packedS" %}
6036 ins_encode %{
6037 int opcode = this->ideal_Opcode();
6038 if (UseAVX > 0) {
6039 int vlen_enc = vector_length_encoding(this);
6040 __ vshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6041 } else {
6042 int vlen = vector_length(this);
6043 if (vlen == 2) {
6044 __ movflt($dst$$XMMRegister, $src$$XMMRegister);
6045 __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6046 } else if (vlen == 4) {
6047 __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
6048 __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6049 } else {
6050 assert (vlen == 8, "sanity");
6051 __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6052 __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6053 }
6054 }
6055 %}
6056 ins_pipe( pipe_slow );
6057 %}
6058
6059 // Integers vector left shift
6060 instruct vshiftI(vec dst, vec src, vec shift) %{
6061 predicate(VectorNode::is_vshift_cnt(n->in(2)));
6062 match(Set dst ( LShiftVI src shift));
6063 match(Set dst ( RShiftVI src shift));
6064 match(Set dst (URShiftVI src shift));
6065 effect(TEMP dst, USE src, USE shift);
6066 format %{ "vshiftd $dst,$src,$shift\t! shift packedI" %}
6067 ins_encode %{
6068 int opcode = this->ideal_Opcode();
6069 if (UseAVX > 0) {
6070 int vlen_enc = vector_length_encoding(this);
6071 __ vshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6072 } else {
6073 int vlen = vector_length(this);
6074 if (vlen == 2) {
6075 __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
6076 __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6077 } else {
6078 assert(vlen == 4, "sanity");
6079 __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6080 __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6081 }
6082 }
6083 %}
6084 ins_pipe( pipe_slow );
6085 %}
6086
6087 // Longs vector shift
6088 instruct vshiftL(vec dst, vec src, vec shift) %{
6089 predicate(VectorNode::is_vshift_cnt(n->in(2)));
6090 match(Set dst ( LShiftVL src shift));
6091 match(Set dst (URShiftVL src shift));
6092 effect(TEMP dst, USE src, USE shift);
6093 format %{ "vshiftq $dst,$src,$shift\t! shift packedL" %}
6094 ins_encode %{
6095 int opcode = this->ideal_Opcode();
6096 if (UseAVX > 0) {
6097 int vlen_enc = vector_length_encoding(this);
6098 __ vshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6099 } else {
6100 assert(vector_length(this) == 2, "sanity");
6101 __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6102 __ vshiftq(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6103 }
6104 %}
6105 ins_pipe( pipe_slow );
6106 %}
6107
6108 // -------------------ArithmeticRightShift -----------------------------------
6109 // Long vector arithmetic right shift
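// SSE/AVX2 lack a 64-bit arithmetic right shift, so it is emulated via the
// identity (x >> n) == ((x >>> n) ^ m) - m, where m = (0x8000... >>> n):
// the logical shift moves the sign bit to position 63-n, and the xor/sub
// pair smears it across the upper n bits. E.g. for x = -2, n = 1:
// x >>> 1 = 0x7FFF...F, m = 0x4000...0, xor gives 0x3FFF...F, and
// subtracting m gives 0xFFFF...F = -1, the expected result.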
6110 instruct vshiftL_arith_reg(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
6111 predicate(VectorNode::is_vshift_cnt(n->in(2)) && UseAVX <= 2);
6112 match(Set dst (RShiftVL src shift));
6113 effect(TEMP dst, TEMP tmp, TEMP scratch);
6114 format %{ "vshiftq $dst,$src,$shift" %}
6115 ins_encode %{
6116 uint vlen = vector_length(this);
6117 if (vlen == 2) {
6118 assert(UseSSE >= 2, "required");
6119 __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6120 __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
6121 __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), $scratch$$Register);
6122 __ psrlq($tmp$$XMMRegister, $shift$$XMMRegister);
6123 __ pxor($dst$$XMMRegister, $tmp$$XMMRegister);
6124 __ psubq($dst$$XMMRegister, $tmp$$XMMRegister);
6125 } else {
6126 assert(vlen == 4, "sanity");
6127 assert(UseAVX > 1, "required");
6128 int vlen_enc = Assembler::AVX_256bit;
6129 __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6130 __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), $scratch$$Register);
6131 __ vpsrlq($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6132 __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
6133 __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
6134 }
6135 %}
6136 ins_pipe( pipe_slow );
6137 %}
6138
6139 instruct vshiftL_arith_reg_evex(vec dst, vec src, vec shift) %{
6140 predicate(VectorNode::is_vshift_cnt(n->in(2)) && UseAVX > 2);
6141 match(Set dst (RShiftVL src shift));
6142 format %{ "vshiftq $dst,$src,$shift" %}
6143 ins_encode %{
6144 int vlen_enc = vector_length_encoding(this);
6145 __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6146 %}
6147 ins_pipe( pipe_slow );
6148 %}
6149
6150 // ------------------- Variable Shift -----------------------------
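// AVX2 only provides per-element ("variable") shifts at dword and qword
// granularity (vpsllvd/vpsrlvd/vpsravd, vpsllvq/vpsrlvq); AVX512BW adds
// the word forms. No byte-granular variable shift exists at all, so the
// "nobw" instructs below widen byte/short lanes, shift at a wider
// granularity, then mask and re-pack, while the "_evex_bw" variants use
// the word-sized instructions directly.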
6151 // Byte variable shift
6152 instruct vshift8B_var_nobw(vec dst, vec src, vec shift, vec vtmp, rRegP scratch) %{
6153 predicate(vector_length(n) <= 8 &&
6154 !VectorNode::is_vshift_cnt(n->in(2)) &&
6155 !VM_Version::supports_avx512bw());
6156 match(Set dst ( LShiftVB src shift));
6157 match(Set dst ( RShiftVB src shift));
6158 match(Set dst (URShiftVB src shift));
6159 effect(TEMP dst, TEMP vtmp, TEMP scratch);
6160 format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp, $scratch as TEMP" %}
6161 ins_encode %{
6162 assert(UseAVX >= 2, "required");
6163
6164 int opcode = this->ideal_Opcode();
6165 int vlen_enc = Assembler::AVX_128bit;
6166 __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister, $scratch$$Register);
6167 __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
6168 %}
6169 ins_pipe( pipe_slow );
6170 %}
6171
6172 instruct vshift16B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, rRegP scratch) %{
6173 predicate(vector_length(n) == 16 &&
6174 !VectorNode::is_vshift_cnt(n->in(2)) &&
6175 !VM_Version::supports_avx512bw());
6176 match(Set dst ( LShiftVB src shift));
6177 match(Set dst ( RShiftVB src shift));
6178 match(Set dst (URShiftVB src shift));
6179 effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
6180 format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 and $scratch as TEMP" %}
6181 ins_encode %{
6182 assert(UseAVX >= 2, "required");
6183
6184 int opcode = this->ideal_Opcode();
6185 int vlen_enc = Assembler::AVX_128bit;
6186 // Shift lower half and get word result in dst
6187 __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister, $scratch$$Register);
6188
6189 // Shift upper half and get word result in vtmp1
6190 __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
6191 __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
6192 __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister, $scratch$$Register);
6193
6194 // Merge and down convert the two word results to byte in dst
6195 __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
6196 %}
6197 ins_pipe( pipe_slow );
6198 %}
6199
6200 instruct vshift32B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, vec vtmp3, vec vtmp4, rRegP scratch) %{
6201 predicate(vector_length(n) == 32 &&
6202 !VectorNode::is_vshift_cnt(n->in(2)) &&
6203 !VM_Version::supports_avx512bw());
6204 match(Set dst ( LShiftVB src shift));
6205 match(Set dst ( RShiftVB src shift));
6206 match(Set dst (URShiftVB src shift));
6207 effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP vtmp4, TEMP scratch);
6208 format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2, $vtmp3, $vtmp4 and $scratch as TEMP" %}
6209 ins_encode %{
6210 assert(UseAVX >= 2, "required");
6211
6212 int opcode = this->ideal_Opcode();
6213 int vlen_enc = Assembler::AVX_128bit;
6214 // Process lower 128 bits and get result in dst
6215 __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister, $scratch$$Register);
6216 __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
6217 __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
6218 __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister, $scratch$$Register);
6219 __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
6220
6221 // Process higher 128 bits and get result in vtmp3
6222 __ vextracti128_high($vtmp1$$XMMRegister, $src$$XMMRegister);
6223 __ vextracti128_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
6224 __ varshiftbw(opcode, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp4$$XMMRegister, $scratch$$Register);
6225 __ vpshufd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, 0xE, 0);
6226 __ vpshufd($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, 0xE, 0);
6227 __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister, $scratch$$Register);
6228 __ vpackuswb($vtmp1$$XMMRegister, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, 0);
6229
6230 // Merge the two results in dst
6231 __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
6232 %}
6233 ins_pipe( pipe_slow );
6234 %}
6235
6236 instruct vshiftB_var_evex_bw(vec dst, vec src, vec shift, vec vtmp, rRegP scratch) %{
6237 predicate(vector_length(n) <= 32 &&
6238 !VectorNode::is_vshift_cnt(n->in(2)) &&
6239 VM_Version::supports_avx512bw());
6240 match(Set dst ( LShiftVB src shift));
6241 match(Set dst ( RShiftVB src shift));
6242 match(Set dst (URShiftVB src shift));
6243 effect(TEMP dst, TEMP vtmp, TEMP scratch);
6244 format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp, $scratch as TEMP" %}
6245 ins_encode %{
6246 assert(UseAVX > 2, "required");
6247
6248 int opcode = this->ideal_Opcode();
6249 int vlen_enc = vector_length_encoding(this);
6250 __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister, $scratch$$Register);
6251 %}
6252 ins_pipe( pipe_slow );
6253 %}
6254
6255 instruct vshift64B_var_evex_bw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, rRegP scratch) %{
6256 predicate(vector_length(n) == 64 &&
6257 !VectorNode::is_vshift_cnt(n->in(2)) &&
6258 VM_Version::supports_avx512bw());
6259 match(Set dst ( LShiftVB src shift));
6260 match(Set dst ( RShiftVB src shift));
6261 match(Set dst (URShiftVB src shift));
6262 effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
6263 format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 and $scratch as TEMP" %}
6264 ins_encode %{
6265 assert(UseAVX > 2, "required");
6266
6267 int opcode = this->ideal_Opcode();
6268 int vlen_enc = Assembler::AVX_256bit;
6269 __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister, $scratch$$Register);
6270 __ vextracti64x4_high($vtmp1$$XMMRegister, $src$$XMMRegister);
6271 __ vextracti64x4_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
6272 __ evarshiftb(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister, $scratch$$Register);
6273 __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
6274 %}
6275 ins_pipe( pipe_slow );
6276 %}
6277
6278 // Short variable shift
6279 instruct vshift8S_var_nobw(vec dst, vec src, vec shift, vec vtmp, rRegP scratch) %{
6280 predicate(vector_length(n) <= 8 &&
6281 !VectorNode::is_vshift_cnt(n->in(2)) &&
6282 !VM_Version::supports_avx512bw());
6283 match(Set dst ( LShiftVS src shift));
6284 match(Set dst ( RShiftVS src shift));
6285 match(Set dst (URShiftVS src shift));
6286 effect(TEMP dst, TEMP vtmp, TEMP scratch);
6287 format %{ "vector_varshift_short $dst, $src, $shift\t!" %}
6288 ins_encode %{
6289 assert(UseAVX >= 2, "required");
6290
6291 int opcode = this->ideal_Opcode();
6292 bool sign = (opcode != Op_URShiftVS);
6293 int vlen_enc = Assembler::AVX_256bit;
6294 __ vextendwd(sign, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6295 __ vpmovzxwd($vtmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6296 __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
6297 __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);
6298 __ vextracti128_high($vtmp$$XMMRegister, $dst$$XMMRegister);
6299 __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
6300 %}
6301 ins_pipe( pipe_slow );
6302 %}
6303
6304 instruct vshift16S_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, rRegP scratch) %{
6305 predicate(vector_length(n) == 16 &&
6306 !VectorNode::is_vshift_cnt(n->in(2)) &&
6307 !VM_Version::supports_avx512bw());
6308 match(Set dst ( LShiftVS src shift));
6309 match(Set dst ( RShiftVS src shift));
6310 match(Set dst (URShiftVS src shift));
6311 effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
6312 format %{ "vector_varshift_short $dst, $src, $shift\t!" %}
6313 ins_encode %{
6314 assert(UseAVX >= 2, "required");
6315
6316 int opcode = this->ideal_Opcode();
6317 bool sign = (opcode != Op_URShiftVS);
6318 int vlen_enc = Assembler::AVX_256bit;
6319 // Shift lower half, with result in vtmp2 using vtmp1 as TEMP
6320 __ vextendwd(sign, $vtmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
6321 __ vpmovzxwd($vtmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6322 __ varshiftd(opcode, $vtmp2$$XMMRegister, $vtmp2$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
6323 __ vpand($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);
6324
6325 // Shift upper half, with result in dst using vtmp1 as TEMP
6326 __ vextracti128_high($dst$$XMMRegister, $src$$XMMRegister);
6327 __ vextracti128_high($vtmp1$$XMMRegister, $shift$$XMMRegister);
6328 __ vextendwd(sign, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6329 __ vpmovzxwd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
6330 __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
6331 __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);
6332
6333 // Merge lower and upper half result into dst
6334 __ vpackusdw($dst$$XMMRegister, $vtmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6335 __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
6336 %}
6337 ins_pipe( pipe_slow );
6338 %}
6339
6340 instruct vshift16S_var_evex_bw(vec dst, vec src, vec shift) %{
6341 predicate(!VectorNode::is_vshift_cnt(n->in(2)) &&
6342 VM_Version::supports_avx512bw());
6343 match(Set dst ( LShiftVS src shift));
6344 match(Set dst ( RShiftVS src shift));
6345 match(Set dst (URShiftVS src shift));
6346 format %{ "vector_varshift_short $dst,$src,$shift\t!" %}
6347 ins_encode %{
6348 assert(UseAVX > 2, "required");
6349
6350 int opcode = this->ideal_Opcode();
6351 int vlen_enc = vector_length_encoding(this);
6352 if (!VM_Version::supports_avx512vl()) {
6353 vlen_enc = Assembler::AVX_512bit;
6354 }
6355 __ varshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6356 %}
6357 ins_pipe( pipe_slow );
6358 %}
6359
6360 // Integer variable shift
6361 instruct vshiftI_var(vec dst, vec src, vec shift) %{
6362 predicate(!VectorNode::is_vshift_cnt(n->in(2)));
6363 match(Set dst ( LShiftVI src shift));
6364 match(Set dst ( RShiftVI src shift));
6365 match(Set dst (URShiftVI src shift));
6366 format %{ "vector_varshift_int $dst,$src,$shift\t!" %}
6367 ins_encode %{
6368 assert(UseAVX >= 2, "required");
6369
6370 int opcode = this->ideal_Opcode();
6371 int vlen_enc = vector_length_encoding(this);
6372 __ varshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6373 %}
6374 ins_pipe( pipe_slow );
6375 %}
6376
6377 // Long variable shift
6378 instruct vshiftL_var(vec dst, vec src, vec shift) %{
6379 predicate(!VectorNode::is_vshift_cnt(n->in(2)));
6380 match(Set dst ( LShiftVL src shift));
6381 match(Set dst (URShiftVL src shift));
6382 format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
6383 ins_encode %{
6384 assert(UseAVX >= 2, "required");
6385
6386 int opcode = this->ideal_Opcode();
6387 int vlen_enc = vector_length_encoding(this);
6388 __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6389 %}
6390 ins_pipe( pipe_slow );
6391 %}
6392
6393 // Long variable arithmetic right shift
6394 instruct vshiftL_arith_var(vec dst, vec src, vec shift, vec vtmp) %{
6395 predicate(vector_length(n) <= 4 &&
6396 !VectorNode::is_vshift_cnt(n->in(2)) &&
6397 UseAVX == 2);
6398 match(Set dst (RShiftVL src shift));
6399 effect(TEMP dst, TEMP vtmp);
6400 format %{ "vector_varshift_long $dst,$src,$shift\n\t! using $vtmp as TEMP" %}
6401 ins_encode %{
6402 int opcode = this->ideal_Opcode();
6403 int vlen_enc = vector_length_encoding(this);
6404 __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc,
6405 $vtmp$$XMMRegister);
6406 %}
6407 ins_pipe( pipe_slow );
6408 %}
6409
6410 instruct vshiftL_arith_var_evex(vec dst, vec src, vec shift) %{
6411 predicate(!VectorNode::is_vshift_cnt(n->in(2)) &&
6412 UseAVX > 2);
6413 match(Set dst (RShiftVL src shift));
6414 format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
6415 ins_encode %{
6416 int opcode = this->ideal_Opcode();
6417 int vlen_enc = vector_length_encoding(this);
6418 __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6419 %}
6420 ins_pipe( pipe_slow );
6421 %}
6422
6423 // --------------------------------- AND --------------------------------------
6424
6425 instruct vand(vec dst, vec src) %{
6426 predicate(UseAVX == 0);
6427 match(Set dst (AndV dst src));
6428 format %{ "pand $dst,$src\t! and vectors" %}
6429 ins_encode %{
6430 __ pand($dst$$XMMRegister, $src$$XMMRegister);
6431 %}
6432 ins_pipe( pipe_slow );
6433 %}
6434
6435 instruct vand_reg(vec dst, vec src1, vec src2) %{
6436 predicate(UseAVX > 0);
6437 match(Set dst (AndV src1 src2));
6438 format %{ "vpand $dst,$src1,$src2\t! and vectors" %}
6439 ins_encode %{
6440 int vlen_enc = vector_length_encoding(this);
6441 __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
6442 %}
6443 ins_pipe( pipe_slow );
6444 %}
6445
6446 instruct vand_mem(vec dst, vec src, memory mem) %{
6447 predicate(UseAVX > 0);
6448 match(Set dst (AndV src (LoadVector mem)));
6449 format %{ "vpand $dst,$src,$mem\t! and vectors" %}
6450 ins_encode %{
6451 int vlen_enc = vector_length_encoding(this);
6452 __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
6453 %}
6454 ins_pipe( pipe_slow );
6455 %}
6456
6457 // --------------------------------- OR ---------------------------------------
6458
6459 instruct vor(vec dst, vec src) %{
6460 predicate(UseAVX == 0);
6461 match(Set dst (OrV dst src));
6462 format %{ "por $dst,$src\t! or vectors" %}
6463 ins_encode %{
6464 __ por($dst$$XMMRegister, $src$$XMMRegister);
6465 %}
6466 ins_pipe( pipe_slow );
6467 %}
6468
6469 instruct vor_reg(vec dst, vec src1, vec src2) %{
6470 predicate(UseAVX > 0);
6471 match(Set dst (OrV src1 src2));
6472 format %{ "vpor $dst,$src1,$src2\t! or vectors" %}
6473 ins_encode %{
6474 int vlen_enc = vector_length_encoding(this);
6475 __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
6476 %}
6477 ins_pipe( pipe_slow );
6478 %}
6479
6480 instruct vor_mem(vec dst, vec src, memory mem) %{
6481 predicate(UseAVX > 0);
6482 match(Set dst (OrV src (LoadVector mem)));
6483 format %{ "vpor $dst,$src,$mem\t! or vectors" %}
6484 ins_encode %{
6485 int vlen_enc = vector_length_encoding(this);
6486 __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
6487 %}
6488 ins_pipe( pipe_slow );
6489 %}
6490
6491 // --------------------------------- XOR --------------------------------------
6492
6493 instruct vxor(vec dst, vec src) %{
6494 predicate(UseAVX == 0);
6495 match(Set dst (XorV dst src));
6496 format %{ "pxor $dst,$src\t! xor vectors" %}
6497 ins_encode %{
6498 __ pxor($dst$$XMMRegister, $src$$XMMRegister);
6499 %}
6500 ins_pipe( pipe_slow );
6501 %}
6502
6503 instruct vxor_reg(vec dst, vec src1, vec src2) %{
6504 predicate(UseAVX > 0);
6505 match(Set dst (XorV src1 src2));
6506 format %{ "vpxor $dst,$src1,$src2\t! xor vectors" %}
6507 ins_encode %{
6508 int vlen_enc = vector_length_encoding(this);
6509 __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
6510 %}
6511 ins_pipe( pipe_slow );
6512 %}
6513
6514 instruct vxor_mem(vec dst, vec src, memory mem) %{
6515 predicate(UseAVX > 0);
6516 match(Set dst (XorV src (LoadVector mem)));
6517 format %{ "vpxor $dst,$src,$mem\t! xor vectors" %}
6518 ins_encode %{
6519 int vlen_enc = vector_length_encoding(this);
6520 __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
6521 %}
6522 ins_pipe( pipe_slow );
6523 %}
6524
6525 // --------------------------------- VectorCast --------------------------------------
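// Upcasts (dst element wider than src) sign-extend with vpmovsx* and, for
// floating-point targets, convert afterwards with vcvtdq2ps/vcvtdq2pd.
// Downcasts either truncate directly with the AVX-512 evpmov* forms or,
// pre-AVX-512, mask each element down to the target range and re-pack
// (vpackusdw/vpackuswb).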
6526
6527 instruct vcastBtoX(vec dst, vec src) %{
6528 match(Set dst (VectorCastB2X src));
6529 format %{ "vector_cast_b2x $dst,$src\t!" %}
6530 ins_encode %{
6531 assert(UseAVX > 0, "required");
6532
6533 BasicType to_elem_bt = vector_element_basic_type(this);
6534 int vlen_enc = vector_length_encoding(this);
6535 switch (to_elem_bt) {
6536 case T_SHORT:
6537 __ vpmovsxbw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6538 break;
6539 case T_INT:
6540 __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6541 break;
6542 case T_FLOAT:
6543 __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6544 __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6545 break;
6546 case T_LONG:
6547 __ vpmovsxbq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6548 break;
6549 case T_DOUBLE:
6550 __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6551 __ vcvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6552 break;
6553
6554 default: assert(false, "%s", type2name(to_elem_bt));
6555 }
6556 %}
6557 ins_pipe( pipe_slow );
6558 %}
6559
6560 instruct castStoX(vec dst, vec src, rRegP scratch) %{
6561 predicate(UseAVX <= 2 &&
6562 vector_length(n->in(1)) <= 8 && // src
6563 vector_element_basic_type(n) == T_BYTE);
6564 effect(TEMP scratch);
6565 match(Set dst (VectorCastS2X src));
6566 format %{ "vector_cast_s2x $dst,$src\t! using $scratch as TEMP" %}
6567 ins_encode %{
6568 assert(UseAVX > 0, "required");
6569
6570 __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), 0, $scratch$$Register);
6571 __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
6572 %}
6573 ins_pipe( pipe_slow );
6574 %}
6575
6576 instruct vcastStoX(vec dst, vec src, vec vtmp, rRegP scratch) %{
6577 predicate(UseAVX <= 2 &&
6578 vector_length(n->in(1)) == 16 && // src
6579 vector_element_basic_type(n) == T_BYTE);
6580 effect(TEMP dst, TEMP vtmp, TEMP scratch);
6581 match(Set dst (VectorCastS2X src));
6582 format %{ "vector_cast_s2x $dst,$src\t! using $vtmp, $scratch as TEMP" %}
6583 ins_encode %{
6584 assert(UseAVX > 0, "required");
6585
6586 int vlen_enc = vector_length_encoding(vector_length_in_bytes(this, $src));
6587 __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, $scratch$$Register);
6588 __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
6589 __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
6590 %}
6591 ins_pipe( pipe_slow );
6592 %}
6593
6594 instruct vcastStoX_evex(vec dst, vec src) %{
6595 predicate(UseAVX > 2 ||
6596 (vector_length_in_bytes(n) >= vector_length_in_bytes(n->in(1)))); // dst >= src
6597 match(Set dst (VectorCastS2X src));
6598 format %{ "vector_cast_s2x $dst,$src\t!" %}
6599 ins_encode %{
6600 BasicType to_elem_bt = vector_element_basic_type(this);
6601 int src_vlen_enc = vector_length_encoding(this, $src);
6602 int vlen_enc = vector_length_encoding(this);
6603 switch (to_elem_bt) {
6604 case T_BYTE:
6605 if (!VM_Version::supports_avx512vl()) {
6606 src_vlen_enc = Assembler::AVX_512bit;
6607 }
6608 __ evpmovwb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
6609 break;
6610 case T_INT:
6611 __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6612 break;
6613 case T_FLOAT:
6614 __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6615 __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6616 break;
6617 case T_LONG:
6618 __ vpmovsxwq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6619 break;
6620 case T_DOUBLE:
6621 __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6622 __ vcvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6623 break;
6624 default:
6625 ShouldNotReachHere();
6626 }
6627 %}
6628 ins_pipe( pipe_slow );
6629 %}
6630
6631 instruct castItoX(vec dst, vec src, rRegP scratch) %{
6632 predicate(UseAVX <= 2 &&
6633 (vector_length_in_bytes(n->in(1)) <= 16) &&
6634 (vector_length_in_bytes(n) < vector_length_in_bytes(n->in(1)))); // dst < src
6635 match(Set dst (VectorCastI2X src));
6636 format %{ "vector_cast_i2x $dst,$src\t! using $scratch as TEMP" %}
6637 effect(TEMP scratch);
6638 ins_encode %{
6639 assert(UseAVX > 0, "required");
6640
6641 BasicType to_elem_bt = vector_element_basic_type(this);
6642 int vlen_enc = vector_length_encoding(this, $src);
6643
6644 if (to_elem_bt == T_BYTE) {
6645 __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, $scratch$$Register);
6646 __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6647 __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6648 } else {
6649 assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
6650 __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);
6651 __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6652 }
6653 %}
6654 ins_pipe( pipe_slow );
6655 %}
6656
6657 instruct vcastItoX(vec dst, vec src, vec vtmp, rRegP scratch) %{
6658 predicate(UseAVX <= 2 &&
6659 (vector_length_in_bytes(n->in(1)) == 32) &&
6660 (vector_length_in_bytes(n) < vector_length_in_bytes(n->in(1)))); // dst < src
6661 match(Set dst (VectorCastI2X src));
6662 format %{ "vector_cast_i2x $dst,$src\t! using $vtmp and $scratch as TEMP" %}
6663 effect(TEMP dst, TEMP vtmp, TEMP scratch);
6664 ins_encode %{
6665 assert(UseAVX > 0, "required");
6666
6667 BasicType to_elem_bt = vector_element_basic_type(this);
6668 int vlen_enc = vector_length_encoding(this, $src);
6669
6670 if (to_elem_bt == T_BYTE) {
6671 __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, $scratch$$Register);
6672 __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
6673 __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6674 __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
6675 } else {
6676 assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
6677 __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);
6678 __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
6679 __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6680 }
6681 %}
6682 ins_pipe( pipe_slow );
6683 %}
6684
6685 instruct vcastItoX_evex(vec dst, vec src) %{
6686 predicate(UseAVX > 2 ||
6687 (vector_length_in_bytes(n) >= vector_length_in_bytes(n->in(1)))); // dst >= src
6688 match(Set dst (VectorCastI2X src));
6689 format %{ "vector_cast_i2x $dst,$src\t!" %}
6690 ins_encode %{
6691 assert(UseAVX > 0, "required");
6692
6693 BasicType dst_elem_bt = vector_element_basic_type(this);
6694 int src_vlen_enc = vector_length_encoding(this, $src);
6695 int dst_vlen_enc = vector_length_encoding(this);
6696 switch (dst_elem_bt) {
6697 case T_BYTE:
6698 if (!VM_Version::supports_avx512vl()) {
6699 src_vlen_enc = Assembler::AVX_512bit;
6700 }
6701 __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
6702 break;
6703 case T_SHORT:
6704 if (!VM_Version::supports_avx512vl()) {
6705 src_vlen_enc = Assembler::AVX_512bit;
6706 }
6707 __ evpmovdw($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
6708 break;
6709 case T_FLOAT:
6710 __ vcvtdq2ps($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
6711 break;
6712 case T_LONG:
6713 __ vpmovsxdq($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
6714 break;
6715 case T_DOUBLE:
6716 __ vcvtdq2pd($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
6717 break;
6718 default:
6719 ShouldNotReachHere();
6720 }
6721 %}
6722 ins_pipe( pipe_slow );
6723 %}
6724
6725 instruct vcastLtoBS(vec dst, vec src, rRegP scratch) %{
6726 predicate((vector_element_basic_type(n) == T_BYTE || vector_element_basic_type(n) == T_SHORT) &&
6727 UseAVX <= 2);
6728 match(Set dst (VectorCastL2X src));
6729 effect(TEMP scratch);
6730 format %{ "vector_cast_l2x $dst,$src\t! using $scratch as TEMP" %}
6731 ins_encode %{
6732 assert(UseAVX > 0, "required");
6733
6734 int vlen = vector_length_in_bytes(this, $src);
6735 BasicType to_elem_bt = vector_element_basic_type(this);
6736 AddressLiteral mask_addr = (to_elem_bt == T_BYTE) ? ExternalAddress(vector_int_to_byte_mask())
6737 : ExternalAddress(vector_int_to_short_mask());
6738 if (vlen <= 16) {
6739 __ vpshufd($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_128bit);
6740 __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, $scratch$$Register);
6741 __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
6742 } else {
6743 assert(vlen <= 32, "required");
6744 __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_256bit);
6745 __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, Assembler::AVX_256bit);
6746 __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, $scratch$$Register);
6747 __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
6748 }
6749 if (to_elem_bt == T_BYTE) {
6750 __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
6751 }
6752 %}
6753 ins_pipe( pipe_slow );
6754 %}
6755
6756 instruct vcastLtoX_evex(vec dst, vec src) %{
6757 predicate(UseAVX > 2 ||
6758 (vector_element_basic_type(n) == T_INT ||
6759 vector_element_basic_type(n) == T_FLOAT ||
6760 vector_element_basic_type(n) == T_DOUBLE));
6761 match(Set dst (VectorCastL2X src));
6762 format %{ "vector_cast_l2x $dst,$src\t!" %}
6763 ins_encode %{
6764 BasicType to_elem_bt = vector_element_basic_type(this);
6765 int vlen = vector_length_in_bytes(this, $src);
6766 int vlen_enc = vector_length_encoding(this, $src);
6767 switch (to_elem_bt) {
6768 case T_BYTE:
6769 if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
6770 vlen_enc = Assembler::AVX_512bit;
6771 }
6772 __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6773 break;
6774 case T_SHORT:
6775 if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
6776 vlen_enc = Assembler::AVX_512bit;
6777 }
6778 __ evpmovqw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6779 break;
6780 case T_INT:
6781 if (vlen == 8) {
6782 if ($dst$$XMMRegister != $src$$XMMRegister) {
6783 __ movflt($dst$$XMMRegister, $src$$XMMRegister);
6784 }
6785 } else if (vlen == 16) {
6786 __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 8);
6787 } else if (vlen == 32) {
6788 if (UseAVX > 2) {
6789 if (!VM_Version::supports_avx512vl()) {
6790 vlen_enc = Assembler::AVX_512bit;
6791 }
6792 __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6793 } else {
6794 __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, vlen_enc);
6795 __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, vlen_enc);
6796 }
6797 } else { // vlen == 64
6798 __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6799 }
6800 break;
6801 case T_FLOAT:
6802 assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
6803 __ evcvtqq2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6804 break;
6805 case T_DOUBLE:
6806 assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
6807 __ evcvtqq2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6808 break;
6809
6810 default: assert(false, "%s", type2name(to_elem_bt));
6811 }
6812 %}
6813 ins_pipe( pipe_slow );
6814 %}
6815
6816 instruct vcastFtoD_reg(vec dst, vec src) %{
6817 predicate(vector_element_basic_type(n) == T_DOUBLE);
6818 match(Set dst (VectorCastF2X src));
6819 format %{ "vector_cast_f2x $dst,$src\t!" %}
6820 ins_encode %{
6821 int vlen_enc = vector_length_encoding(this);
6822 __ vcvtps2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6823 %}
6824 ins_pipe( pipe_slow );
6825 %}
6826
6827 instruct vcastDtoF_reg(vec dst, vec src) %{
6828 predicate(vector_element_basic_type(n) == T_FLOAT);
6829 match(Set dst (VectorCastD2X src));
6830 format %{ "vector_cast_d2x $dst,$src\t!" %}
6831 ins_encode %{
6832 int vlen_enc = vector_length_encoding(this, $src);
6833 __ vcvtpd2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6834 %}
6835 ins_pipe( pipe_slow );
6836 %}
6837
6838 // --------------------------------- VectorMaskCmp --------------------------------------
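// For vectors up to 32 bytes the AVX compares below produce the mask
// vector directly. The 64-byte (AVX-512) forms instead set an opmask
// register; since C2 performs no k-register allocation here, k2 serves as
// a hardcoded temporary, and the opmask is expanded back into a vector by
// a masked load of the all-bits-set constant: lanes where the predicate
// held become -1, all others 0.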
6839
6840 instruct vcmpFD(legVec dst, legVec src1, legVec src2, immI8 cond) %{
6841 predicate(vector_length_in_bytes(n->in(1)->in(1)) >= 8 && // src1
6842 vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
6843 is_floating_point_type(vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
6844 match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
6845 format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %}
6846 ins_encode %{
6847 int vlen_enc = vector_length_encoding(this, $src1);
6848 Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
6849 if (vector_element_basic_type(this, $src1) == T_FLOAT) {
6850 __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
6851 } else {
6852 __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
6853 }
6854 %}
6855 ins_pipe( pipe_slow );
6856 %}
6857
6858 instruct evcmpFD(vec dst, vec src1, vec src2, immI8 cond, rRegP scratch) %{
6859 predicate(vector_length_in_bytes(n->in(1)->in(1)) == 64 && // src1
6860 is_floating_point_type(vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
6861 match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
6862 effect(TEMP scratch);
6863 format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $scratch as TEMP" %}
6864 ins_encode %{
6865 int vlen_enc = Assembler::AVX_512bit;
6866 Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
6867 KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
6868 KRegister mask = k0; // The comparison itself is not being masked.
6869 if (vector_element_basic_type(this, $src1) == T_FLOAT) {
6870 __ evcmpps(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
6871 __ evmovdqul($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vlen_enc, $scratch$$Register);
6872 } else {
6873 __ evcmppd(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
6874 __ evmovdquq($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vlen_enc, $scratch$$Register);
6875 }
6876 %}
6877 ins_pipe( pipe_slow );
6878 %}
6879
6880 instruct vcmp(legVec dst, legVec src1, legVec src2, immI8 cond, rRegP scratch) %{
6881 predicate(vector_length_in_bytes(n->in(1)->in(1)) >= 8 && // src1
6882 vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
6883 is_integral_type(vector_element_basic_type(n->in(1)->in(1)))); // src1
6884 match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
6885 effect(TEMP scratch);
6886 format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $scratch as TEMP" %}
6887 ins_encode %{
6888 int vlen_enc = vector_length_encoding(this, $src1);
6889 Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
6890 Assembler::Width ww = widthForType(vector_element_basic_type(this, $src1));
6891 __ vpcmpCCW($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, ww, vlen_enc, $scratch$$Register);
6892 %}
6893 ins_pipe( pipe_slow );
6894 %}
6895
6896 instruct evcmp(vec dst, vec src1, vec src2, immI8 cond, rRegP scratch) %{
6897 predicate(vector_length_in_bytes(n->in(1)->in(1)) == 64 && // src1
6898 is_integral_type(vector_element_basic_type(n->in(1)->in(1)))); // src1
6899 match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
6900 effect(TEMP scratch);
6901 format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $scratch as TEMP" %}
6902 ins_encode %{
6903 assert(UseAVX > 2, "required");
6904
6905 int vlen_enc = Assembler::AVX_512bit;
6906 Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
6907 KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
6908 KRegister mask = k0; // The comparison itself is not being masked.
6909 bool merge = false;
6910 BasicType src1_elem_bt = vector_element_basic_type(this, $src1);
6911
6912 switch (src1_elem_bt) {
6913 case T_BYTE: {
6914 __ evpcmpb(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
6915 __ evmovdqub($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, $scratch$$Register);
6916 break;
6917 }
6918 case T_SHORT: {
6919 __ evpcmpw(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
6920 __ evmovdquw($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, $scratch$$Register);
6921 break;
6922 }
6923 case T_INT: {
6924 __ evpcmpd(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
6925 __ evmovdqul($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, $scratch$$Register);
6926 break;
6927 }
6928 case T_LONG: {
6929 __ evpcmpq(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
6930 __ evmovdquq($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, $scratch$$Register);
6931 break;
6932 }
6933
6934 default: assert(false, "%s", type2name(src1_elem_bt));
6935 }
6936 %}
6937 ins_pipe( pipe_slow );
6938 %}
6939
6940 // Extract
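// For sources up to 16 bytes the element is read straight out of the
// register (get_elem). For 32/64-byte sources, get_lane first copies the
// 128-bit lane containing the element into vtmp, then get_elem selects
// the element within that lane.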
6941
6942 instruct extractI(rRegI dst, legVec src, immU8 idx) %{
6943 predicate(vector_length_in_bytes(n->in(1)) <= 16); // src
6944 match(Set dst (ExtractI src idx));
6945 match(Set dst (ExtractS src idx));
6946 #ifdef _LP64
6947 match(Set dst (ExtractB src idx));
6948 #endif
6949 ins_encode %{
6950 assert($idx$$constant < (int)vector_length(this, $src), "out of bounds");
6951
6952 BasicType elem_bt = vector_element_basic_type(this, $src);
6953 __ get_elem(elem_bt, $dst$$Register, $src$$XMMRegister, $idx$$constant);
6954 %}
6955 ins_pipe( pipe_slow );
6956 %}
6957
6958 instruct vextractI(rRegI dst, legVec src, immI idx, legVec vtmp) %{
6959 predicate(vector_length_in_bytes(n->in(1)) == 32 || // src
6960 vector_length_in_bytes(n->in(1)) == 64); // src
6961 match(Set dst (ExtractI src idx));
6962 match(Set dst (ExtractS src idx));
6963 #ifdef _LP64
6964 match(Set dst (ExtractB src idx));
6965 #endif
6966 effect(TEMP vtmp);
6967 ins_encode %{
6968 assert($idx$$constant < (int)vector_length(this, $src), "out of bounds");
6969
6970 BasicType elem_bt = vector_element_basic_type(this, $src);
6971 XMMRegister lane_xmm = __ get_lane(elem_bt, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
6972 __ get_elem(elem_bt, $dst$$Register, lane_xmm, $idx$$constant);
6973 %}
6974 ins_pipe( pipe_slow );
6975 %}
6976
6977 #ifdef _LP64
6978 instruct extractL(rRegL dst, legVec src, immU8 idx) %{
6979 predicate(vector_length(n->in(1)) <= 2); // src
6980 match(Set dst (ExtractL src idx));
6981 ins_encode %{
6982 assert(UseSSE >= 4, "required");
6983 assert($idx$$constant < (int)vector_length(this, $src), "out of bounds");
6984
6985 __ get_elem(T_LONG, $dst$$Register, $src$$XMMRegister, $idx$$constant);
6986 %}
6987 ins_pipe( pipe_slow );
6988 %}
6989
6990 instruct vextractL(rRegL dst, legVec src, immU8 idx, legVec vtmp) %{
6991 predicate(vector_length(n->in(1)) == 4 || // src
6992 vector_length(n->in(1)) == 8); // src
6993 match(Set dst (ExtractL src idx));
6994 effect(TEMP vtmp);
6995 ins_encode %{
6996 assert($idx$$constant < (int)vector_length(this, $src), "out of bounds");
6997
6998 XMMRegister lane_reg = __ get_lane(T_LONG, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
6999 __ get_elem(T_LONG, $dst$$Register, lane_reg, $idx$$constant);
7000 %}
7001 ins_pipe( pipe_slow );
7002 %}
7003 #endif
7004
7005 instruct extractF(legRegF dst, legVec src, immU8 idx, rRegI tmp, legVec vtmp) %{
7006 predicate(vector_length(n->in(1)) <= 4);
7007 match(Set dst (ExtractF src idx));
7008 effect(TEMP dst, TEMP tmp, TEMP vtmp);
7009 ins_encode %{
7010 assert($idx$$constant < (int)vector_length(this, $src), "out of bounds");
7011
7012 __ get_elem(T_FLOAT, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant, $tmp$$Register, $vtmp$$XMMRegister);
7013 %}
7014 ins_pipe( pipe_slow );
7015 %}
7016
7017 instruct vextractF(legRegF dst, legVec src, immU8 idx, rRegI tmp, legVec vtmp) %{
7018 predicate(vector_length(n->in(1)/*src*/) == 8 ||
7019 vector_length(n->in(1)/*src*/) == 16);
7020 match(Set dst (ExtractF src idx));
7021 effect(TEMP tmp, TEMP vtmp);
7022 ins_encode %{
7023 assert($idx$$constant < (int)vector_length(this, $src), "out of bounds");
7024
7025 XMMRegister lane_reg = __ get_lane(T_FLOAT, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
7026 __ get_elem(T_FLOAT, $dst$$XMMRegister, lane_reg, $idx$$constant, $tmp$$Register);
7027 %}
7028 ins_pipe( pipe_slow );
7029 %}
7030
7031 instruct extractD(legRegD dst, legVec src, immU8 idx) %{
7032 predicate(vector_length(n->in(1)) == 2); // src
7033 match(Set dst (ExtractD src idx));
7034 ins_encode %{
7035 assert($idx$$constant < (int)vector_length(this, $src), "out of bounds");
7036
7037 __ get_elem(T_DOUBLE, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant);
7038 %}
7039 ins_pipe( pipe_slow );
7040 %}
7041
7042 instruct vextractD(legRegD dst, legVec src, immU8 idx, legVec vtmp) %{
7043 predicate(vector_length(n->in(1)) == 4 || // src
7044 vector_length(n->in(1)) == 8); // src
7045 match(Set dst (ExtractD src idx));
7046 effect(TEMP vtmp);
7047 ins_encode %{
7048 assert($idx$$constant < (int)vector_length(this, $src), "out of bounds");
7049
7050 XMMRegister lane_reg = __ get_lane(T_DOUBLE, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
7051 __ get_elem(T_DOUBLE, $dst$$XMMRegister, lane_reg, $idx$$constant);
7052 %}
7053 ins_pipe( pipe_slow );
7054 %}
7055
7056 // --------------------------------- Vector Blend --------------------------------------
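// A blend picks, per lane, src1 where the mask lane is 0 and src2 where it
// is -1. The SSE4.1 pblendvb implicitly reads its mask from xmm0, hence
// the hardcoded rxmm0 temp below; AVX takes an explicit mask operand, and
// the 64-byte form first converts the vector mask into opmask k2 and then
// issues a masked blend (evpblend).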
7057
7058 instruct blendvp(vec dst, vec src, vec mask, rxmm0 tmp) %{
7059 predicate(UseAVX == 0);
7060 match(Set dst (VectorBlend (Binary dst src) mask));
7061 format %{ "vector_blend $dst,$src,$mask\t! using $tmp as TEMP" %}
7062 effect(TEMP tmp);
7063 ins_encode %{
7064 assert(UseSSE >= 4, "required");
7065
7066 if ($mask$$XMMRegister != $tmp$$XMMRegister) {
7067 __ movdqu($tmp$$XMMRegister, $mask$$XMMRegister);
7068 }
7069 __ pblendvb($dst$$XMMRegister, $src$$XMMRegister); // uses xmm0 as mask
7070 %}
7071 ins_pipe( pipe_slow );
7072 %}
7073
7074 instruct vblendvpI(legVec dst, legVec src1, legVec src2, legVec mask) %{
7075 predicate(UseAVX > 0 &&
7076 vector_length_in_bytes(n) <= 32 &&
7077 is_integral_type(vector_element_basic_type(n)));
7078 match(Set dst (VectorBlend (Binary src1 src2) mask));
7079 format %{ "vector_blend $dst,$src1,$src2,$mask\t!" %}
7080 ins_encode %{
7081 int vlen_enc = vector_length_encoding(this);
7082 __ vpblendvb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
7083 %}
7084 ins_pipe( pipe_slow );
7085 %}
7086
7087 instruct vblendvpFD(legVec dst, legVec src1, legVec src2, legVec mask) %{
7088 predicate(UseAVX > 0 &&
7089 vector_length_in_bytes(n) <= 32 &&
7090 !is_integral_type(vector_element_basic_type(n)));
7091 match(Set dst (VectorBlend (Binary src1 src2) mask));
7092 format %{ "vector_blend $dst,$src1,$src2,$mask\t!" %}
7093 ins_encode %{
7094 int vlen_enc = vector_length_encoding(this);
7095 __ vblendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
7096 %}
7097 ins_pipe( pipe_slow );
7098 %}
7099
7100 instruct evblendvp64(vec dst, vec src1, vec src2, vec mask, rRegP scratch) %{
7101 predicate(vector_length_in_bytes(n) == 64);
7102 match(Set dst (VectorBlend (Binary src1 src2) mask));
7103 format %{ "vector_blend $dst,$src1,$src2,$mask\t! using $scratch and k2 as TEMP" %}
7104 effect(TEMP scratch);
7105 ins_encode %{
7106 int vlen_enc = Assembler::AVX_512bit;
7107 BasicType elem_bt = vector_element_basic_type(this);
7108 KRegister ktmp = k2;
7109 __ evpcmp(elem_bt, ktmp, k0, $mask$$XMMRegister, ExternalAddress(vector_all_bits_set()), Assembler::eq, vlen_enc, $scratch$$Register);
7110 __ evpblend(elem_bt, $dst$$XMMRegister, ktmp, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
7111 %}
7112 ins_pipe( pipe_slow );
7113 %}
7114
7115 // --------------------------------- ABS --------------------------------------
7116 // a = |a|
7117 instruct vabsB_reg(vec dst, vec src) %{
7118 match(Set dst (AbsVB src));
7119 format %{ "vabsb $dst,$src\t# $dst = |$src| abs packedB" %}
7120 ins_encode %{
7121 uint vlen = vector_length(this);
7122 if (vlen <= 16) {
7123 __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
7124 } else {
7125 int vlen_enc = vector_length_encoding(this);
7126 __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7127 }
7128 %}
7129 ins_pipe( pipe_slow );
7130 %}
7131
7132 instruct vabsS_reg(vec dst, vec src) %{
7133 match(Set dst (AbsVS src));
7134 format %{ "vabsw $dst,$src\t# $dst = |$src| abs packedS" %}
7135 ins_encode %{
7136 uint vlen = vector_length(this);
7137 if (vlen <= 8) {
7138 __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
7139 } else {
7140 int vlen_enc = vector_length_encoding(this);
7141 __ vpabsw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7142 }
7143 %}
7144 ins_pipe( pipe_slow );
7145 %}
7146
7147 instruct vabsI_reg(vec dst, vec src) %{
7148 match(Set dst (AbsVI src));
7149 format %{ "pabsd $dst,$src\t# $dst = |$src| abs packedI" %}
7150 ins_encode %{
7151 uint vlen = vector_length(this);
7152 if (vlen <= 4) {
7153 __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
7154 } else {
7155 int vlen_enc = vector_length_encoding(this);
7156 __ vpabsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7157 }
7158 %}
7159 ins_pipe( pipe_slow );
7160 %}
7161
7162 instruct vabsL_reg(vec dst, vec src) %{
7163 match(Set dst (AbsVL src));
7164 format %{ "evpabsq $dst,$src\t# $dst = |$src| abs packedL" %}
7165 ins_encode %{
7166 assert(UseAVX > 2, "required");
7167 int vlen_enc = vector_length_encoding(this);
7168 if (!VM_Version::supports_avx512vl()) {
7169 vlen_enc = Assembler::AVX_512bit;
7170 }
7171 __ evpabsq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7172 %}
7173 ins_pipe( pipe_slow );
7174 %}
7175
7176 // --------------------------------- ABSNEG --------------------------------------
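// vabsnegf/vabsnegd implement floating-point abs and neg bitwise,
// presumably by AND-ing with a sign-clearing constant (abs) or XOR-ing
// with a sign-bit constant (neg) addressed via $scratch; the "[mask]" in
// the formats refers to that constant. E.g. -1.0f (0xBF800000) AND
// 0x7FFFFFFF == 0x3F800000 == 1.0f.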
7177
7178 instruct vabsnegF(vec dst, vec src, rRegI scratch) %{
7179 predicate(vector_length(n) != 4); // handled by 1-operand instruction vabsneg4F
7180 match(Set dst (AbsVF src));
7181 match(Set dst (NegVF src));
7182 effect(TEMP scratch);
7183 format %{ "vabsnegf $dst,$src,[mask]\t# absneg packedF" %}
7184 ins_cost(150);
7185 ins_encode %{
7186 int opcode = this->ideal_Opcode();
7187 int vlen = vector_length(this);
7188 if (vlen == 2) {
7189 __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister, $scratch$$Register);
7190 } else {
7191 assert(vlen == 8 || vlen == 16, "required");
7192 int vlen_enc = vector_length_encoding(this);
7193 __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc, $scratch$$Register);
7194 }
7195 %}
7196 ins_pipe( pipe_slow );
7197 %}
7198
7199 instruct vabsneg4F(vec dst, rRegI scratch) %{
7200 predicate(vector_length(n) == 4);
7201 match(Set dst (AbsVF dst));
7202 match(Set dst (NegVF dst));
7203 effect(TEMP scratch);
7204 format %{ "vabsnegf $dst,[mask]\t# absneg packed4F" %}
7205 ins_cost(150);
7206 ins_encode %{
7207 int opcode = this->ideal_Opcode();
7208 __ vabsnegf(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $scratch$$Register);
7209 %}
7210 ins_pipe( pipe_slow );
7211 %}
7212
7213 instruct vabsnegD(vec dst, vec src, rRegI scratch) %{
7214 match(Set dst (AbsVD src));
7215 match(Set dst (NegVD src));
7216 effect(TEMP scratch);
7217 format %{ "vabsnegd $dst,$src,[mask]\t# absneg packedD" %}
7218 ins_encode %{
7219 int opcode = this->ideal_Opcode();
7220 uint vlen = vector_length(this);
7221 if (vlen == 2) {
7222 assert(UseSSE >= 2, "required");
7223 __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $scratch$$Register);
7224 } else {
7225 int vlen_enc = vector_length_encoding(this);
7226 __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc, $scratch$$Register);
7227 }
7228 %}
7229 ins_pipe( pipe_slow );
7230 %}
7231
7232 //------------------------------------- VectorTest --------------------------------------------
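// ptest/vptest set ZF if (src1 & src2) == 0 and CF if (~src1 & src2) == 0.
// Assuming C2 passes the mask in src1 and all-ones (alltrue) or the mask
// itself (anytrue) in src2: CF set means every mask bit is set, hence
// alltrue maps to carrySet; ZF clear means some bit is set, hence anytrue
// maps to notZero. The >32-byte paths derive the same answers from an
// AVX-512 byte compare followed by kortest/ktest on the opmask.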
7233
7234 #ifdef _LP64
7235 instruct vptest_alltrue(rRegI dst, legVec src1, legVec src2, rFlagsReg cr) %{
7236 predicate(static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow);
7237 match(Set dst (VectorTest src1 src2 ));
7238 effect(KILL cr);
7239 format %{ "vector_test $dst,$src1, $src2\t! using $cr as TEMP" %}
7240 ins_encode %{
7241 int vlen = vector_length_in_bytes(this, $src1);
7242 int vlen_enc = vector_length_encoding(vlen);
7243 if (vlen <= 32) {
7244 if (UseAVX == 0) {
7245 assert(vlen <= 16, "required");
7246 __ ptest($src1$$XMMRegister, $src2$$XMMRegister);
7247 } else {
7248 __ vptest($src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
7249 }
7250 } else {
7251 KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
7252 __ evpcmpeqb(ktmp, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
7253 __ kortestql(ktmp, ktmp);
7254 }
7255 __ setb(Assembler::carrySet, $dst$$Register);
7256 __ movzbl($dst$$Register, $dst$$Register);
7257 %}
7258 ins_pipe( pipe_slow );
7259 %}
7260
7261 instruct vptest_anytrue(rRegI dst, legVec src1, legVec src2, rFlagsReg cr) %{
7262 predicate(static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::ne);
7263 match(Set dst (VectorTest src1 src2 ));
7264 effect(KILL cr);
7265 format %{ "vector_test_any_true $dst,$src1,$src2\t! using $cr as TEMP" %}
7266 ins_encode %{
7267 int vlen = vector_length_in_bytes(this, $src1);
7268 int vlen_enc = vector_length_encoding(vlen);
7269 if (vlen <= 32) {
7270 if (UseAVX == 0) {
7271 assert(vlen <= 16, "required");
7272 __ ptest($src1$$XMMRegister, $src2$$XMMRegister);
7273 } else {
7274 __ vptest($src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
7275 }
7276 } else {
7277 KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
7278 __ evpcmpeqb(ktmp, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
7279 __ ktestql(ktmp, ktmp);
7280 }
7281 __ setb(Assembler::notZero, $dst$$Register);
7282 __ movzbl($dst$$Register, $dst$$Register);
7283 %}
7284 ins_pipe( pipe_slow );
7285 %}
7286 #endif
7287
7288 //------------------------------------- LoadMask --------------------------------------------
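// VectorLoadMask is the inverse of VectorStoreMask below: it widens the
// packed one-byte-per-lane 0/1 representation back into full-width 0/-1
// lanes (see load_vector_mask in the C2 macro assembler).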
7289
7290 instruct loadMask(vec dst, vec src) %{
7291 match(Set dst (VectorLoadMask src));
7292 effect(TEMP dst);
7293 format %{ "vector_loadmask_byte $dst,$src\n\t" %}
7294 ins_encode %{
7295 int vlen_in_bytes = vector_length_in_bytes(this);
7296 BasicType elem_bt = vector_element_basic_type(this);
7297
7298 __ load_vector_mask($dst$$XMMRegister, $src$$XMMRegister, vlen_in_bytes, elem_bt);
7299 %}
7300 ins_pipe( pipe_slow );
7301 %}
7302
7303 //------------------------------------- StoreMask --------------------------------------------
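// VectorStoreMask narrows a lane mask (0 or -1 per element, $size bytes
// wide) to one byte of 0 or 1 per element. E.g. an int lane 0xFFFFFFFF:
// packssdw saturates it to 0xFFFF, packsswb to 0xFF, and pabsb maps that
// to 0x01; a 0 lane stays 0 throughout. The evex forms truncate with
// evpmov* instead of packing.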
7304
7305 instruct storeMask1B(vec dst, vec src, immI_1 size) %{
7306 predicate(vector_length(n) < 64 || VM_Version::supports_avx512vlbw());
7307 match(Set dst (VectorStoreMask src size));
7308 format %{ "vector_store_mask $dst,$src\t!" %}
7309 ins_encode %{
7310 assert(UseSSE >= 3, "required");
7311 if (vector_length_in_bytes(this) <= 16) {
7312 __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
7313 } else {
7314 assert(UseAVX >= 2, "required");
7315 int src_vlen_enc = vector_length_encoding(this, $src);
7316 __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
7317 }
7318 %}
7319 ins_pipe( pipe_slow );
7320 %}
7321
7322 instruct storeMask2B(vec dst, vec src, immI_2 size) %{
7323 predicate(vector_length(n) <= 8);
7324 match(Set dst (VectorStoreMask src size));
7325 format %{ "vector_store_mask $dst,$src\n\t" %}
7326 ins_encode %{
7327 assert(UseSSE >= 3, "required");
7328 __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
7329 __ packsswb($dst$$XMMRegister, $dst$$XMMRegister);
7330 %}
7331 ins_pipe( pipe_slow );
7332 %}
7333
7334 instruct vstoreMask2B(vec dst, vec src, immI_2 size) %{
7335 predicate(vector_length(n) == 16 && !VM_Version::supports_avx512bw());
7336 match(Set dst (VectorStoreMask src size));
7337 effect(TEMP dst);
7338 format %{ "vector_store_mask $dst,$src\t!" %}
7339 ins_encode %{
7340 int vlen_enc = Assembler::AVX_128bit;
7341 __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
7342 __ vpacksswb($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7343 __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7344 %}
7345 ins_pipe( pipe_slow );
7346 %}
7347
7348 instruct vstoreMask2B_evex(vec dst, vec src, immI_2 size) %{
7349 predicate(VM_Version::supports_avx512bw());
7350 match(Set dst (VectorStoreMask src size));
7351 format %{ "vector_store_mask $dst,$src\t!" %}
7352 ins_encode %{
7353 int src_vlen_enc = vector_length_encoding(this, $src);
7354 int dst_vlen_enc = vector_length_encoding(this);
7355 __ evpmovwb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
7356 __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
7357 %}
7358 ins_pipe( pipe_slow );
7359 %}
7360
7361 instruct storeMask4B(vec dst, vec src, immI_4 size) %{
7362 predicate (vector_length(n) <= 4 && UseAVX <= 2);
7363 match(Set dst (VectorStoreMask src size));
7364 format %{ "vector_store_mask $dst,$src\t!" %}
7365 ins_encode %{
7366 assert(UseSSE >= 3, "required");
7367 __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
7368 __ packssdw($dst$$XMMRegister, $dst$$XMMRegister);
7369 __ packsswb($dst$$XMMRegister, $dst$$XMMRegister);
7370 %}
7371 ins_pipe( pipe_slow );
7372 %}
7373
7374 instruct vstoreMask4B(vec dst, vec src, immI_4 size) %{
7375 predicate(vector_length(n) == 8 && UseAVX <= 2);
7376 match(Set dst (VectorStoreMask src size));
7377 format %{ "vector_store_mask $dst,$src\t!" %}
7378 effect(TEMP dst);
7379 ins_encode %{
7380 int vlen_enc = Assembler::AVX_128bit;
7381 __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
7382 __ vpackssdw($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7383 __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7384 __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7385 %}
7386 ins_pipe( pipe_slow );
7387 %}
7388
7389 instruct vstoreMask4B_evex(vec dst, vec src, immI_4 size) %{
7390 predicate(UseAVX > 2);
7391 match(Set dst (VectorStoreMask src size));
7392 format %{ "vector_store_mask $dst,$src\t!" %}
7393 ins_encode %{
7394 int src_vlen_enc = vector_length_encoding(this, $src);
7395 int dst_vlen_enc = vector_length_encoding(this);
7396 if (!VM_Version::supports_avx512vl()) {
7397 src_vlen_enc = Assembler::AVX_512bit;
7398 }
7399 __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
7400 __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
7401 %}
7402 ins_pipe( pipe_slow );
7403 %}
7404
7405 instruct storeMask8B(vec dst, vec src, immI_8 size) %{
7406 predicate(vector_length(n) == 2 && UseAVX <= 2);
7407 match(Set dst (VectorStoreMask src size));
7408 format %{ "vector_store_mask $dst,$src\t!" %}
7409 ins_encode %{
7410 assert(UseSSE >= 3, "required");
7411 __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x8);
7412 __ packssdw($dst$$XMMRegister, $dst$$XMMRegister);
7413 __ packsswb($dst$$XMMRegister, $dst$$XMMRegister);
7414 __ pabsb($dst$$XMMRegister, $dst$$XMMRegister);
7415 %}
7416 ins_pipe( pipe_slow );
7417 %}
7418
7419 instruct storeMask8B_avx(vec dst, vec src, immI_8 size, legVec vtmp) %{
7420 predicate(vector_length(n) == 4 && UseAVX <= 2);
7421 match(Set dst (VectorStoreMask src size));
7422 format %{ "vector_store_mask $dst,$src\t! using $vtmp as TEMP" %}
7423 effect(TEMP dst, TEMP vtmp);
7424 ins_encode %{
7425 int vlen_enc = Assembler::AVX_128bit;
7426 __ vpshufps($dst$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 0x88, Assembler::AVX_256bit);
7427 __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
7428 __ vblendps($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0xC, vlen_enc);
7429 __ vpackssdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7430 __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7431 __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7432 %}
7433 ins_pipe( pipe_slow );
7434 %}
7435
7436 instruct vstoreMask8B_evex(vec dst, vec src, immI_8 size) %{
7437 predicate(UseAVX > 2);
7438 match(Set dst (VectorStoreMask src size));
7439 format %{ "vector_store_mask $dst,$src\t!" %}
7440 ins_encode %{
7441 int src_vlen_enc = vector_length_encoding(this, $src);
7442 int dst_vlen_enc = vector_length_encoding(this);
7443 if (!VM_Version::supports_avx512vl()) {
7444 src_vlen_enc = Assembler::AVX_512bit;
7445 }
7446 __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
7447 __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
7448 %}
7449 ins_pipe( pipe_slow );
7450 %}

//-------------------------------- Load Iota Indices ----------------------------------

instruct loadIotaIndices(vec dst, immI_0 src, rRegP scratch) %{
  predicate(vector_element_basic_type(n) == T_BYTE);
  match(Set dst (VectorLoadConst src));
  effect(TEMP scratch);
  format %{ "vector_load_iota $dst CONSTANT_MEMORY\t! load iota indices" %}
  ins_encode %{
    int vlen_in_bytes = vector_length_in_bytes(this);
    __ load_iota_indices($dst$$XMMRegister, $scratch$$Register, vlen_in_bytes);
  %}
  ins_pipe( pipe_slow );
%}
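
// Note: "iota indices" are simply the constant byte sequence 0, 1, 2, ...,
// vlen-1, loaded from constant memory. A sketch of the equivalent constant
// (illustrative names, not part of this file):
//
//   // e.g. for a 16-byte vector, dst = { 0, 1, 2, ..., 15 }
//   for (int i = 0; i < vlen_in_bytes; i++) {
//     dst_bytes[i] = (int8_t)i;
//   }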

//-------------------------------- Rearrange ----------------------------------

// LoadShuffle/Rearrange for Byte

instruct loadShuffleB(vec dst) %{
  predicate(vector_element_basic_type(n) == T_BYTE);
  match(Set dst (VectorLoadShuffle dst));
  format %{ "vector_load_shuffle $dst, $dst" %}
  ins_encode %{
    // empty
  %}
  ins_pipe( pipe_slow );
%}

instruct rearrangeB(vec dst, vec shuffle) %{
  predicate(vector_element_basic_type(n) == T_BYTE &&
            vector_length(n) < 32);
  match(Set dst (VectorRearrange dst shuffle));
  format %{ "vector_rearrange $dst, $shuffle, $dst" %}
  ins_encode %{
    assert(UseSSE >= 4, "required");
    __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
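
// Note: a scalar sketch of the VectorRearrange semantics implemented by the
// byte shuffles in this section (names illustrative; shuffle indices are
// assumed to be in range for the vector length):
//
//   for (int i = 0; i < num_lanes; i++) {
//     dst[i] = src[shuffle[i]];   // gather lanes by the index vector
//   }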

instruct rearrangeB_avx(vec dst, vec src, vec shuffle) %{
  predicate(vector_element_basic_type(n) == T_BYTE &&
            vector_length(n) == 32 && !VM_Version::supports_avx512_vbmi());
  match(Set dst (VectorRearrange src shuffle));
  format %{ "vector_rearrange $dst, $shuffle, $src" %}
  ins_encode %{
    __ vpshufb($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, Assembler::AVX_256bit);
  %}
  ins_pipe( pipe_slow );
%}

instruct rearrangeB_evex(vec dst, vec src, vec shuffle) %{
  predicate(vector_element_basic_type(n) == T_BYTE &&
            vector_length(n) >= 32 && VM_Version::supports_avx512_vbmi());
  match(Set dst (VectorRearrange src shuffle));
  format %{ "vector_rearrange $dst, $shuffle, $src" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vpermb($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// LoadShuffle/Rearrange for Short

instruct loadShuffleS(vec dst, vec src, vec vtmp, rRegP scratch) %{
  predicate(vector_element_basic_type(n) == T_SHORT &&
            vector_length(n) <= 8 && !VM_Version::supports_avx512bw()); // NB! aligned with rearrangeS
  match(Set dst (VectorLoadShuffle src));
  effect(TEMP dst, TEMP vtmp, TEMP scratch);
  format %{ "vector_load_shuffle $dst, $src\t! using $vtmp and $scratch as TEMP" %}
  ins_encode %{
    // Create a byte shuffle mask from the short shuffle mask;
    // only a byte shuffle instruction is available on these platforms.

    // Multiply each shuffle index by two to get a byte index
    __ pmovzxbw($vtmp$$XMMRegister, $src$$XMMRegister);
    __ psllw($vtmp$$XMMRegister, 1);

    // Duplicate to create two copies of each byte index
    __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
    __ psllw($dst$$XMMRegister, 8);
    __ por($dst$$XMMRegister, $vtmp$$XMMRegister);

    // Add one to the second copy to get the alternate byte index
    __ movdqu($vtmp$$XMMRegister, ExternalAddress(vector_short_shufflemask()), $scratch$$Register);
    __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
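
// Note: a worked example of the mask expansion above (assumptions: little-
// endian byte order within each 16-bit lane, and the constant at
// vector_short_shufflemask() holding 0x0100 repeated per word). A short
// shuffle index k must select source bytes 2k (low) and 2k+1 (high):
//
//   short index k = 3
//   after pmovzxbw + psllw(1):  word lane = 0x0006   (2k)
//   after the duplicate/or:     word lane = 0x0606   (both bytes 2k)
//   after paddb of 0x0100:      word lane = 0x0706   (bytes 6 and 7)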

instruct rearrangeS(vec dst, vec shuffle) %{
  predicate(vector_element_basic_type(n) == T_SHORT &&
            vector_length(n) <= 8 && !VM_Version::supports_avx512bw());
  match(Set dst (VectorRearrange dst shuffle));
  format %{ "vector_rearrange $dst, $shuffle, $dst" %}
  ins_encode %{
    assert(UseSSE >= 4, "required");
    __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct loadShuffleS_evex(vec dst, vec src) %{
  predicate(vector_element_basic_type(n) == T_SHORT &&
            VM_Version::supports_avx512bw());
  match(Set dst (VectorLoadShuffle src));
  format %{ "vector_load_shuffle $dst, $src" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    if (!VM_Version::supports_avx512vl()) {
      vlen_enc = Assembler::AVX_512bit;
    }
    __ vpmovzxbw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct rearrangeS_evex(vec dst, vec src, vec shuffle) %{
  predicate(vector_element_basic_type(n) == T_SHORT &&
            VM_Version::supports_avx512bw());
  match(Set dst (VectorRearrange src shuffle));
  format %{ "vector_rearrange $dst, $shuffle, $src" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    if (!VM_Version::supports_avx512vl()) {
      vlen_enc = Assembler::AVX_512bit;
    }
    __ vpermw($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// LoadShuffle/Rearrange for Integer and Float

instruct loadShuffleI(vec dst, vec src, vec vtmp, rRegP scratch) %{
  predicate((vector_element_basic_type(n) == T_INT || vector_element_basic_type(n) == T_FLOAT) &&
            vector_length(n) == 4 && UseAVX < 2);
  match(Set dst (VectorLoadShuffle src));
  effect(TEMP dst, TEMP vtmp, TEMP scratch);
  format %{ "vector_load_shuffle $dst, $src\t! using $vtmp and $scratch as TEMP" %}
  ins_encode %{
    assert(UseSSE >= 4, "required");

    // Create a byte shuffle mask from the int shuffle mask;
    // only a byte shuffle instruction is available on these platforms.

    // Duplicate each shuffle index and multiply it by 4
    __ pmovzxbd($vtmp$$XMMRegister, $src$$XMMRegister);
    __ pshuflw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
    __ pshufhw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
    __ psllw($vtmp$$XMMRegister, 2);

    // Duplicate again to create four copies of each byte index
    __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
    __ psllw($dst$$XMMRegister, 8);
    __ por($vtmp$$XMMRegister, $dst$$XMMRegister);

    // Add 3,2,1,0 to the copies to get the alternate byte indices
    __ movdqu($dst$$XMMRegister, ExternalAddress(vector_int_shufflemask()), $scratch$$Register);
    __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
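
// Note: a worked example of the int-mask expansion above (assumption: the
// constant at vector_int_shufflemask() holds 0x03020100 repeated per dword,
// little-endian). An int shuffle index k must select source bytes 4k..4k+3:
//
//   int index k = 2
//   after pmovzxbd + pshuflw/pshufhw(0xA0) + psllw(2):  dword = 0x00080008
//   after the duplicate/or:                             dword = 0x08080808
//   after paddb of 0x03020100:                          dword = 0x0B0A0908
//                                                       (bytes 8, 9, 10, 11)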

instruct rearrangeI(vec dst, vec shuffle) %{
  predicate((vector_element_basic_type(n) == T_INT || vector_element_basic_type(n) == T_FLOAT) &&
            vector_length(n) == 4 && UseAVX < 2);
  match(Set dst (VectorRearrange dst shuffle));
  format %{ "vector_rearrange $dst, $shuffle, $dst" %}
  ins_encode %{
    assert(UseSSE >= 4, "required");
    __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct loadShuffleI_avx(vec dst, vec src) %{
  predicate((vector_element_basic_type(n) == T_INT || vector_element_basic_type(n) == T_FLOAT) &&
            UseAVX >= 2);
  match(Set dst (VectorLoadShuffle src));
  format %{ "vector_load_shuffle $dst, $src" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vpmovzxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct rearrangeI_avx(vec dst, vec src, vec shuffle) %{
  predicate((vector_element_basic_type(n) == T_INT || vector_element_basic_type(n) == T_FLOAT) &&
            UseAVX >= 2);
  match(Set dst (VectorRearrange src shuffle));
  format %{ "vector_rearrange $dst, $shuffle, $src" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    if (vlen_enc == Assembler::AVX_128bit) {
      vlen_enc = Assembler::AVX_256bit;
    }
    __ vpermd($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// LoadShuffle/Rearrange for Long and Double

instruct loadShuffleL(vec dst, vec src, vec vtmp, rRegP scratch) %{
  predicate(is_double_word_type(vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
            vector_length(n) < 8 && !VM_Version::supports_avx512vl());
  match(Set dst (VectorLoadShuffle src));
  effect(TEMP dst, TEMP vtmp, TEMP scratch);
  format %{ "vector_load_shuffle $dst, $src\t! using $vtmp and $scratch as TEMP" %}
  ins_encode %{
    assert(UseAVX >= 2, "required");

    int vlen_enc = vector_length_encoding(this);
    // Create a double word shuffle mask from the long shuffle mask;
    // only a double word shuffle instruction is available on these platforms.

    // Multiply each shuffle index by two to get a double word index
    __ vpmovzxbq($vtmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
    __ vpsllq($vtmp$$XMMRegister, $vtmp$$XMMRegister, 1, vlen_enc);

    // Duplicate each double word shuffle index
    __ vpsllq($dst$$XMMRegister, $vtmp$$XMMRegister, 32, vlen_enc);
    __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);

    // Add one to the second copy to get the alternate double word index
    __ vpaddd($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_long_shufflemask()), vlen_enc, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}
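
// Note: a worked example of the long-mask expansion above (assumption: the
// constant at vector_long_shufflemask() holds the dword pair {0, 1} per
// qword, low dword first). A long shuffle index k must select double words
// 2k and 2k+1, which the vpermd in rearrangeL below then consumes:
//
//   long index k = 1
//   after vpmovzxbq + vpsllq(1):  qword = 0x00000000'00000002
//   after the duplicate/or:       qword = 0x00000002'00000002
//   after vpaddd of {0, 1}:       qword = 0x00000003'00000002
//                                 (dword indices 2 and 3)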

instruct rearrangeL(vec dst, vec src, vec shuffle) %{
  predicate(is_double_word_type(vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
            vector_length(n) < 8 && !VM_Version::supports_avx512vl());
  match(Set dst (VectorRearrange src shuffle));
  format %{ "vector_rearrange $dst, $shuffle, $src" %}
  ins_encode %{
    assert(UseAVX >= 2, "required");

    int vlen_enc = vector_length_encoding(this);
    __ vpermd($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct loadShuffleL_evex(vec dst, vec src) %{
  predicate(is_double_word_type(vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
            (vector_length(n) == 8 || VM_Version::supports_avx512vl()));
  match(Set dst (VectorLoadShuffle src));
  format %{ "vector_load_shuffle $dst, $src" %}
  ins_encode %{
    assert(UseAVX > 2, "required");

    int vlen_enc = vector_length_encoding(this);
    __ vpmovzxbq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct rearrangeL_evex(vec dst, vec src, vec shuffle) %{
  predicate(is_double_word_type(vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
            (vector_length(n) == 8 || VM_Version::supports_avx512vl()));
  match(Set dst (VectorRearrange src shuffle));
  format %{ "vector_rearrange $dst, $shuffle, $src" %}
  ins_encode %{
    assert(UseAVX > 2, "required");

    int vlen_enc = vector_length_encoding(this);
    if (vlen_enc == Assembler::AVX_128bit) {
      vlen_enc = Assembler::AVX_256bit;
    }
    __ vpermq($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// --------------------------------- FMA --------------------------------------
// a * b + c

instruct vfmaF_reg(vec a, vec b, vec c) %{
  match(Set c (FmaVF c (Binary a b)));
  format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
  ins_cost(150);
  ins_encode %{
    assert(UseFMA, "not enabled");
    int vlen_enc = vector_length_encoding(this);
    __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vfmaF_mem(vec a, memory b, vec c) %{
  match(Set c (FmaVF c (Binary a (LoadVector b))));
  format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
  ins_cost(150);
  ins_encode %{
    assert(UseFMA, "not enabled");
    int vlen_enc = vector_length_encoding(this);
    __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vfmaD_reg(vec a, vec b, vec c) %{
  match(Set c (FmaVD c (Binary a b)));
  format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
  ins_cost(150);
  ins_encode %{
    assert(UseFMA, "not enabled");
    int vlen_enc = vector_length_encoding(this);
    __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vfmaD_mem(vec a, memory b, vec c) %{
  match(Set c (FmaVD c (Binary a (LoadVector b))));
  format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
  ins_cost(150);
  ins_encode %{
    assert(UseFMA, "not enabled");
    int vlen_enc = vector_length_encoding(this);
    __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// --------------------------------- Vector Multiply Add --------------------------------------

instruct vmuladdS2I_reg_sse(vec dst, vec src1) %{
  predicate(UseAVX == 0);
  match(Set dst (MulAddVS2VI dst src1));
  format %{ "pmaddwd $dst,$src1\t! muladd packedStoI" %}
  ins_encode %{
    __ pmaddwd($dst$$XMMRegister, $src1$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vmuladdS2I_reg_avx(vec dst, vec src1, vec src2) %{
  predicate(UseAVX > 0);
  match(Set dst (MulAddVS2VI src1 src2));
  format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packedStoI" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
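
// Note: a scalar sketch of the pmaddwd semantics used by MulAddVS2VI above:
// adjacent pairs of signed 16-bit products are summed into one 32-bit lane
// (illustrative names only):
//
//   for (int i = 0; i < num_int_lanes; i++) {
//     dst_int[i] = (int32_t)src1[2*i]     * src2[2*i]
//                + (int32_t)src1[2*i + 1] * src2[2*i + 1];
//   }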

// --------------------------------- Vector Multiply Add Add ----------------------------------

instruct vmuladdaddS2I_reg(vec dst, vec src1, vec src2) %{
  predicate(VM_Version::supports_avx512_vnni());
  match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
  format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packedStoI" %}
  ins_encode %{
    assert(UseAVX > 2, "required");
    int vlen_enc = vector_length_encoding(this);
    __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
  ins_cost(10);
%}
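
// Note: with AVX512_VNNI, evpdpwssd fuses the pmaddwd pattern above with the
// accumulating add into a single instruction. A scalar sketch per 32-bit
// lane (illustrative names; the accumulate wraps rather than saturates):
//
//   dst_int[i] += (int32_t)src1[2*i]     * src2[2*i]
//               + (int32_t)src1[2*i + 1] * src2[2*i + 1];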

// --------------------------------- PopCount --------------------------------------

instruct vpopcountI(vec dst, vec src) %{
  match(Set dst (PopCountVI src));
  format %{ "vpopcntd $dst,$src\t! vector popcount packedI" %}
  ins_encode %{
    assert(UsePopCountInstruction, "not enabled");

    int vlen_enc = vector_length_encoding(this);
    __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
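
// Note: a scalar sketch of the per-lane semantics of vpopcntd above
// (__builtin_popcount is an illustrative GCC/Clang stand-in):
//
//   for (int i = 0; i < num_int_lanes; i++) {
//     dst[i] = __builtin_popcount((uint32_t)src[i]);   // count set bits
//   }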

// --------------------------------- Bitwise Ternary Logic ----------------------------------

instruct vpternlog(vec dst, vec src2, vec src3, immU8 func) %{
  match(Set dst (MacroLogicV (Binary dst src2) (Binary src3 func)));
  effect(TEMP dst);
  format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
  ins_encode %{
    int vector_len = vector_length_encoding(this);
    __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
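
// Note: the 8-bit immediate $func is a truth table: at every bit position,
// the bits of dst, src2 and src3 form a 3-bit index into $func. A scalar
// sketch per bit (illustrative; e.g. func == 0x96 computes a ^ b ^ c):
//
//   int ternlog_bit(int a, int b, int c, uint8_t func) {
//     return (func >> ((a << 2) | (b << 1) | c)) & 1;
//   }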

instruct vpternlog_mem(vec dst, vec src2, memory src3, immU8 func) %{
  match(Set dst (MacroLogicV (Binary dst src2) (Binary (LoadVector src3) func)));
  effect(TEMP dst);
|