  return NativeJump::instruction_size;
}

#ifdef _LP64
static uint size_deopt_handler() {
  // three 5 byte instructions plus one move for unreachable address.
  return 15+3;
}
#else
static uint size_deopt_handler() {
  // NativeCall instruction size is the same as NativeJump.
  // The exception handler starts out as a jump and can be patched to
  // a call by deoptimization. (4932387)
  // Note that this value is also credited (in output.cpp) to
  // the size of the code section.
  return 5 + NativeJump::instruction_size; // pushl(); jmp;
}
#endif
};

class Node::PD {
public:
  enum NodeFlags {
    Flag_intel_jcc_erratum = Node::_last_flag << 1,
    _last_flag = Flag_intel_jcc_erratum
  };
};


inline uint vector_length(const Node* n) {
  const TypeVect* vt = n->bottom_type()->is_vect();
  return vt->length();
}

inline uint vector_length(const MachNode* use, MachOper* opnd) {
  uint def_idx = use->operand_index(opnd);
  Node* def = use->in(def_idx);
  return def->bottom_type()->is_vect()->length();
}

inline uint vector_length_in_bytes(const Node* n) {
  const TypeVect* vt = n->bottom_type()->is_vect();
  return vt->length_in_bytes();
}

inline uint vector_length_in_bytes(const MachNode* use, MachOper* opnd) {
  uint def_idx = use->operand_index(opnd);
  Node* def = use->in(def_idx);
// ...
    case 32: return Assembler::AVX_256bit;
    case 64: return Assembler::AVX_512bit;

    default: {
      ShouldNotReachHere();
      return Assembler::AVX_NoVec;
    }
  }
}

static inline Assembler::AvxVectorLen vector_length_encoding(const Node* n) {
  return vector_length_encoding(vector_length_in_bytes(n));
}

static inline Assembler::AvxVectorLen vector_length_encoding(const MachNode* use, MachOper* opnd) {
  uint def_idx = use->operand_index(opnd);
  Node* def = use->in(def_idx);
  return vector_length_encoding(def);
}

%} // end source_hpp

source %{

#include "opto/addnode.hpp"
#include "c2_intelJccErratum_x86.hpp"

void PhaseOutput::pd_perform_mach_node_analysis() {
  if (VM_Version::has_intel_jcc_erratum()) {
    int extra_padding = IntelJccErratum::tag_affected_machnodes(C, C->cfg(), C->regalloc());
    _buf_sizes._code += extra_padding;
  }
}

int MachNode::pd_alignment_required() const {
  if (VM_Version::has_intel_jcc_erratum() && IntelJccErratum::is_jcc_erratum_branch(this)) {
    // Conservatively add worst case padding. We assume that relocInfo::addr_unit() is 1 on x86.
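    // (Background, an assumption based on the published erratum description,
    // not a statement from this source: the erratum concerns jcc and fused
    // cmp/test+jcc instructions that cross or end at a 32-byte boundary, so
    // requesting largest_jcc_size() + 1 bytes of alignment lets the output
    // phase pad any tagged branch to fit within one aligned 32-byte window.)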
    return IntelJccErratum::largest_jcc_size() + 1;
  } else {
    return 1;
// ...
    case Op_CacheWB:
    case Op_CacheWBPreSync:
    case Op_CacheWBPostSync:
      if (!VM_Version::supports_data_cache_line_flush()) {
        return false;
      }
      break;
    case Op_ExtractB:
    case Op_ExtractL:
    case Op_ExtractI:
    case Op_RoundDoubleMode:
      if (UseSSE < 4) {
        return false;
      }
      break;
    case Op_RoundDoubleModeV:
      if (VM_Version::supports_avx() == false) {
        return false; // 128bit vroundpd is not available
      }
      break;
    case Op_MacroLogicV:
      if (UseAVX < 3 || !UseVectorMacroLogic) {
        return false;
      }
      break;
    case Op_VLShiftV:
    case Op_VRShiftV:
    case Op_VURShiftV:
    case Op_LoadVectorGather:
      if (UseAVX < 2) {
        return false;
      }
      break;
    case Op_FmaVD:
    case Op_FmaVF:
      if (!UseFMA) {
        return false;
      }
      break;
#ifndef _LP64
    case Op_AddReductionVF:
    case Op_AddReductionVD:
    case Op_MulReductionVF:
    case Op_MulReductionVD:
      if (UseSSE < 1) { // requires at least SSE
        return false;
      }
      break;
    case Op_MulAddVS2VI:
    case Op_RShiftVL:
    case Op_AbsVD:
    case Op_NegVD:
      if (UseSSE < 2) {
        return false;
      }
      break;
#endif // !LP64
  }
  return true; // Match rules are supported by default.
}

//------------------------------------------------------------------------

// Identify extra cases that we might want to provide match rules for vector nodes and
// other intrinsics guarded with vector length (vlen) and element type (bt).
const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) {
  if (!match_rule_supported(opcode)) {
    return false;
  }
  // Matcher::vector_size_supported() restricts vector sizes in the following way (see Matcher::vector_width_in_bytes):
  //   * SSE2 supports 128bit vectors for all types;
  //   * AVX1 supports 256bit vectors only for FLOAT and DOUBLE types;
  //   * AVX2 supports 256bit vectors for all types;
  //   * AVX512F supports 512bit vectors only for INT, FLOAT, and DOUBLE types;
  //   * AVX512BW supports 512bit vectors for BYTE, SHORT, and CHAR types.
  // There's also a limit on minimum vector size supported: 2 elements (or 4 bytes for BYTE).
  // And MaxVectorSize is taken into account as well.
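  // Worked example (illustrative, not from the original comment): bt == T_FLOAT
  // with vlen == 16 asks for a 512-bit vector, so vector_size_supported()
  // requires AVX512F, and the Op_AbsVF/Op_NegVF case below additionally
  // rejects the combination when AVX512DQ is absent.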
  if (!vector_size_supported(bt, vlen)) {
    return false;
  }
  // Special cases which require vector length follow:
  //   * implementation limitations
  //   * some 512bit vector operations on FLOAT and DOUBLE types require AVX512DQ
  //   * 128bit vroundpd instruction is present only in AVX1
  int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
  switch (opcode) {
    case Op_AbsVF:
    case Op_NegVF:
      if ((vlen == 16) && (VM_Version::supports_avx512dq() == false)) {
        return false; // 512bit vandps and vxorps are not available
      }
      break;
    case Op_AbsVD:
    case Op_NegVD:
    case Op_MulVL:
      if ((vlen == 8) && (VM_Version::supports_avx512dq() == false)) {
        return false; // 512bit vpmullq, vandpd and vxorpd are not available
// ...
    case MoveVec2Leg_rule:
    case MoveLeg2Vec_rule:
      return true;
    default:
      return false;
  }
}

bool Matcher::is_generic_vector(MachOper* opnd) {
  switch (opnd->opcode()) {
    case VEC:
    case LEGVEC:
      return true;
    default:
      return false;
  }
}

//------------------------------------------------------------------------

const bool Matcher::has_predicated_vectors(void) {
  bool ret_value = false;
  if (UseAVX > 2) {
    ret_value = VM_Version::supports_avx512vl();
  }

  return ret_value;
}

const int Matcher::float_pressure(int default_pressure_threshold) {
  int float_pressure_threshold = default_pressure_threshold;
#ifdef _LP64
  if (UseAVX > 2) {
    // Increase pressure threshold on machines with AVX3 which have
    // 2x more XMM registers.
    float_pressure_threshold = default_pressure_threshold * 2;
  }
#endif
  return float_pressure_threshold;
}
// ...
%}

instruct ReplD_zero(vec dst, immD0 zero) %{
  match(Set dst (ReplicateD zero));
  format %{ "replicateD $dst,$zero" %}
  ins_encode %{
    uint vlen = vector_length(this);
    if (vlen == 2) {
      __ xorpd($dst$$XMMRegister, $dst$$XMMRegister);
    } else {
      int vlen_enc = vector_length_encoding(this);
      __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); // 512bit vxorps requires AVX512DQ
    }
  %}
  ins_pipe( fpu_reg_reg );
%}

// ====================VECTOR INSERT=======================================

instruct insert(vec dst, rRegI val, immU8 idx) %{
  predicate(vector_length_in_bytes(n) >= 8 &&
            vector_length_in_bytes(n) <= 16);
  match(Set dst (VectorInsert (Binary dst val) idx));
  format %{ "vector_insert $dst,$val,$idx" %}
  ins_encode %{
    assert(UseSSE >= 4, "required");

    BasicType elem_bt = vector_element_basic_type(this);

    assert(is_integral_type(elem_bt), "");
    assert($idx$$constant < (int)vector_length(this), "out of bounds");

    __ insert(elem_bt, $dst$$XMMRegister, $val$$Register, $idx$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct insert32(vec dst, vec src, rRegI val, immU8 idx, vec vtmp) %{
  predicate(vector_length_in_bytes(n) == 32);
  match(Set dst (VectorInsert (Binary src val) idx));
  effect(TEMP vtmp);
  format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
  ins_encode %{
    int vlen_enc = Assembler::AVX_256bit;
    BasicType elem_bt = vector_element_basic_type(this);
    int elem_per_lane = 16/type2aelembytes(elem_bt);
    int log2epr = log2(elem_per_lane);

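    // (Descriptive note, added: the constant index is decomposed into a
    // 128-bit lane number (y_idx) and an element position within that lane
    // (x_idx). For example, with T_INT elements elem_per_lane == 4, so
    // idx == 6 gives y_idx == 1, x_idx == 2.)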
    assert(is_integral_type(elem_bt), "sanity");
    assert($idx$$constant < (int)vector_length(this), "out of bounds");

    uint x_idx = $idx$$constant & right_n_bits(log2epr);
    uint y_idx = ($idx$$constant >> log2epr) & 1;
    __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
    __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
    __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, y_idx);
  %}
  ins_pipe( pipe_slow );
%}

instruct insert64(vec dst, vec src, rRegI val, immU8 idx, legVec vtmp) %{
  predicate(vector_length_in_bytes(n) == 64);
  match(Set dst (VectorInsert (Binary src val) idx));
  effect(TEMP vtmp);
  format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
  ins_encode %{
    assert(UseAVX > 2, "sanity");

    BasicType elem_bt = vector_element_basic_type(this);
    int elem_per_lane = 16/type2aelembytes(elem_bt);
    int log2epr = log2(elem_per_lane);

    assert(is_integral_type(elem_bt), "");
    assert($idx$$constant < (int)vector_length(this), "out of bounds");

    uint x_idx = $idx$$constant & right_n_bits(log2epr);
    uint y_idx = ($idx$$constant >> log2epr) & 3;
    __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
    __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
    __ vinserti32x4($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, y_idx);
  %}
  ins_pipe( pipe_slow );
%}

#ifdef _LP64
instruct insert2L(vec dst, rRegL val, immU8 idx) %{
  predicate(vector_length(n) == 2);
  match(Set dst (VectorInsert (Binary dst val) idx));
  format %{ "vector_insert $dst,$val,$idx" %}
  ins_encode %{
    assert(UseSSE >= 4, "required");
    assert(vector_element_basic_type(this) == T_LONG, "");
    assert($idx$$constant < (int)vector_length(this), "out of bounds");

    __ pinsrq($dst$$XMMRegister, $val$$Register, $idx$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct insert4L(vec dst, vec src, rRegL val, immU8 idx, vec vtmp) %{
  predicate(vector_length(n) == 4);
  match(Set dst (VectorInsert (Binary src val) idx));
  effect(TEMP vtmp);
  format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
  ins_encode %{
    assert(vector_element_basic_type(this) == T_LONG, "");
    assert($idx$$constant < (int)vector_length(this), "out of bounds");

    uint x_idx = $idx$$constant & right_n_bits(1);
    uint y_idx = ($idx$$constant >> 1) & 1;
    int vlen_enc = Assembler::AVX_256bit;
    __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
    __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
    __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, y_idx);
  %}
  ins_pipe( pipe_slow );
%}

instruct insert8L(vec dst, vec src, rRegL val, immU8 idx, legVec vtmp) %{
  predicate(vector_length(n) == 8);
  match(Set dst (VectorInsert (Binary src val) idx));
  effect(TEMP vtmp);
  format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
  ins_encode %{
    assert(vector_element_basic_type(this) == T_LONG, "sanity");
    assert($idx$$constant < (int)vector_length(this), "out of bounds");

    uint x_idx = $idx$$constant & right_n_bits(1);
    uint y_idx = ($idx$$constant >> 1) & 3;
    __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
    __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
    __ vinserti32x4($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, y_idx);
  %}
  ins_pipe( pipe_slow );
%}
#endif

instruct insertF(vec dst, regF val, immU8 idx) %{
  predicate(vector_length(n) >= 2 &&
            vector_length(n) <= 4);
  match(Set dst (VectorInsert (Binary dst val) idx));
  format %{ "vector_insert $dst,$val,$idx" %}
  ins_encode %{
    assert(UseSSE >= 4, "sanity");

    assert(vector_element_basic_type(this) == T_FLOAT, "sanity");
    assert($idx$$constant < (int)vector_length(this), "out of bounds");

    __ insertps($dst$$XMMRegister, $val$$XMMRegister, $idx$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct vinsertF(vec dst, vec src, regF val, immU8 idx, vec vtmp) %{
  predicate(vector_length(n) >= 8);
  match(Set dst (VectorInsert (Binary src val) idx));
  effect(TEMP vtmp);
  format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
  ins_encode %{
    assert(vector_element_basic_type(this) == T_FLOAT, "sanity");
    assert($idx$$constant < (int)vector_length(this), "out of bounds");

    int vlen = vector_length(this);
    uint x_idx = $idx$$constant & right_n_bits(2);
    if (vlen == 8) {
      uint y_idx = ($idx$$constant >> 2) & 1;
      int vlen_enc = Assembler::AVX_256bit;
      __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
      __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx);
      __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, y_idx);
    } else {
      assert(vlen == 16, "sanity");
      uint y_idx = ($idx$$constant >> 2) & 3;
      __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
      __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx);
      __ vinserti32x4($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, y_idx);
    }
  %}
  ins_pipe( pipe_slow );
%}

#ifdef _LP64
instruct insert2D(vec dst, regD val, immU8 idx, rRegL tmp) %{
  predicate(vector_length(n) == 2);
  match(Set dst (VectorInsert (Binary dst val) idx));
  effect(TEMP tmp);
  format %{ "vector_insert $dst,$val,$idx\t!using $tmp as TEMP" %}
  ins_encode %{
    assert(UseSSE >= 4, "sanity");
    assert(vector_element_basic_type(this) == T_DOUBLE, "sanity");
    assert($idx$$constant < (int)vector_length(this), "out of bounds");

    __ movq($tmp$$Register, $val$$XMMRegister);
    __ pinsrq($dst$$XMMRegister, $tmp$$Register, $idx$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct insert4D(vec dst, vec src, regD val, immU8 idx, rRegL tmp, vec vtmp) %{
  predicate(vector_length(n) == 4);
  match(Set dst (VectorInsert (Binary src val) idx));
  effect(TEMP vtmp, TEMP tmp);
  format %{ "vector_insert $dst,$src,$val,$idx\t!using $tmp, $vtmp as TEMP" %}
  ins_encode %{
    assert(vector_element_basic_type(this) == T_DOUBLE, "sanity");
    assert($idx$$constant < (int)vector_length(this), "out of bounds");

    uint x_idx = $idx$$constant & right_n_bits(1);
    uint y_idx = ($idx$$constant >> 1) & 1;
    int vlen_enc = Assembler::AVX_256bit;
    __ movq($tmp$$Register, $val$$XMMRegister);
    __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
    __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
    __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, y_idx);
  %}
  ins_pipe( pipe_slow );
%}

instruct insert8D(vec dst, vec src, regD val, immI idx, rRegL tmp, legVec vtmp) %{
  predicate(vector_length(n) == 8);
  match(Set dst (VectorInsert (Binary src val) idx));
  effect(TEMP tmp, TEMP vtmp);
  format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
  ins_encode %{
    assert(vector_element_basic_type(this) == T_DOUBLE, "sanity");
    assert($idx$$constant < (int)vector_length(this), "out of bounds");

    uint x_idx = $idx$$constant & right_n_bits(1);
    uint y_idx = ($idx$$constant >> 1) & 3;
    __ movq($tmp$$Register, $val$$XMMRegister);
    __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
    __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
    __ vinserti32x4($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, y_idx);
  %}
  ins_pipe( pipe_slow );
%}
#endif

// ====================REDUCTION ARITHMETIC=======================================

// =======================Int Reduction==========================================

instruct reductionI(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
  predicate(vector_element_basic_type(n->in(2)) == T_INT &&
            vector_length(n->in(2)) < 16); // src2
  match(Set dst (AddReductionVI src1 src2));
  match(Set dst (MulReductionVI src1 src2));
  match(Set dst (AndReductionV src1 src2));
  match(Set dst ( OrReductionV src1 src2));
  match(Set dst (XorReductionV src1 src2));
  match(Set dst (MinReductionV src1 src2));
  match(Set dst (MaxReductionV src1 src2));
  effect(TEMP vtmp1, TEMP vtmp2);
// ...
  %}
  ins_pipe( pipe_slow );
%}

// ------------------------------ Shift ---------------------------------------

// Left and right shift count vectors are the same on x86
// (only lowest bits of xmm reg are used for count).
instruct vshiftcnt(vec dst, rRegI cnt) %{
  match(Set dst (LShiftCntV cnt));
  match(Set dst (RShiftCntV cnt));
  format %{ "movdl $dst,$cnt\t! load shift count" %}
  ins_encode %{
    __ movdl($dst$$XMMRegister, $cnt$$Register);
  %}
  ins_pipe( pipe_slow );
%}

// Byte vector shift
instruct vshiftB(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
  predicate(vector_length(n) <= 8);
  match(Set dst ( LShiftVB src shift));
  match(Set dst ( RShiftVB src shift));
  match(Set dst (URShiftVB src shift));
  effect(TEMP dst, USE src, USE shift, TEMP tmp, TEMP scratch);
  format %{ "vector_byte_shift $dst,$src,$shift" %}
  ins_encode %{
    assert(UseSSE > 3, "required");
    int opcode = this->ideal_Opcode();
    bool sign = (opcode == Op_URShiftVB) ? false : true;
    __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister);
    __ vshiftw(opcode, $tmp$$XMMRegister, $shift$$XMMRegister);
    __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
    __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
    __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vshift16B(vec dst, vec src, vec shift, vec tmp1, vec tmp2, rRegI scratch) %{
  predicate(vector_length(n) == 16 && UseAVX <= 1);
  match(Set dst ( LShiftVB src shift));
  match(Set dst ( RShiftVB src shift));
  match(Set dst (URShiftVB src shift));
  effect(TEMP dst, USE src, USE shift, TEMP tmp1, TEMP tmp2, TEMP scratch);
  format %{ "vector_byte_shift $dst,$src,$shift" %}
  ins_encode %{
    assert(UseSSE > 3, "required");
    int opcode = this->ideal_Opcode();
    bool sign = (opcode == Op_URShiftVB) ? false : true;
    __ vextendbw(sign, $tmp1$$XMMRegister, $src$$XMMRegister);
    __ vshiftw(opcode, $tmp1$$XMMRegister, $shift$$XMMRegister);
    __ pshufd($tmp2$$XMMRegister, $src$$XMMRegister, 0xE);
    __ vextendbw(sign, $tmp2$$XMMRegister, $tmp2$$XMMRegister);
    __ vshiftw(opcode, $tmp2$$XMMRegister, $shift$$XMMRegister);
    __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
    __ pand($tmp2$$XMMRegister, $dst$$XMMRegister);
    __ pand($dst$$XMMRegister, $tmp1$$XMMRegister);
    __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vshift16B_avx(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
  predicate(vector_length(n) == 16 && UseAVX > 1);
  match(Set dst ( LShiftVB src shift));
  match(Set dst ( RShiftVB src shift));
  match(Set dst (URShiftVB src shift));
  effect(TEMP dst, TEMP tmp, TEMP scratch);
  format %{ "vector_byte_shift $dst,$src,$shift" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    bool sign = (opcode == Op_URShiftVB) ? false : true;
    int vlen_enc = Assembler::AVX_256bit;
    __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
    __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
    __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, $scratch$$Register);
    __ vextracti128_high($dst$$XMMRegister, $tmp$$XMMRegister);
    __ vpackuswb($dst$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, 0);
  %}
  ins_pipe( pipe_slow );
%}

instruct vshift32B_avx(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
  predicate(vector_length(n) == 32);
  match(Set dst ( LShiftVB src shift));
  match(Set dst ( RShiftVB src shift));
  match(Set dst (URShiftVB src shift));
  effect(TEMP dst, TEMP tmp, TEMP scratch);
  format %{ "vector_byte_shift $dst,$src,$shift" %}
  ins_encode %{
    assert(UseAVX > 1, "required");
    int opcode = this->ideal_Opcode();
    bool sign = (opcode == Op_URShiftVB) ? false : true;
    int vlen_enc = Assembler::AVX_256bit;
    __ vextracti128_high($tmp$$XMMRegister, $src$$XMMRegister);
    __ vextendbw(sign, $tmp$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
    __ vextendbw(sign, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
    __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
    __ vshiftw(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $shift$$XMMRegister, vlen_enc);
    __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, $scratch$$Register);
    __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, $scratch$$Register);
    __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
    __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vshift64B_avx(vec dst, vec src, vec shift, vec tmp1, vec tmp2, rRegI scratch) %{
  predicate(vector_length(n) == 64);
  match(Set dst ( LShiftVB src shift));
  match(Set dst (RShiftVB src shift));
  match(Set dst (URShiftVB src shift));
  effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
  format %{ "vector_byte_shift $dst,$src,$shift" %}
  ins_encode %{
    assert(UseAVX > 2, "required");
    int opcode = this->ideal_Opcode();
    bool sign = (opcode == Op_URShiftVB) ? false : true;
    int vlen_enc = Assembler::AVX_512bit;
    __ vextracti64x4($tmp1$$XMMRegister, $src$$XMMRegister, 1);
    __ vextendbw(sign, $tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
    __ vextendbw(sign, $tmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
    __ vshiftw(opcode, $tmp1$$XMMRegister, $tmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
    __ vshiftw(opcode, $tmp2$$XMMRegister, $tmp2$$XMMRegister, $shift$$XMMRegister, vlen_enc);
    __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
    __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
    __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
    __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
    __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
    __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vlen_enc, $scratch$$Register);
    __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// Shorts vector logical right shift produces incorrect Java result
// for negative data because Java code converts short values into ints with
// sign extension before a shift. But char vectors are fine since chars are
// unsigned values.
// Shorts/Chars vector left shift
instruct vshiftS(vec dst, vec src, vec shift) %{
  match(Set dst ( LShiftVS src shift));
  match(Set dst ( RShiftVS src shift));
  match(Set dst (URShiftVS src shift));
  effect(TEMP dst, USE src, USE shift);
  format %{ "vshiftw $dst,$src,$shift\t! shift packedS" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    if (UseAVX > 0) {
      int vlen_enc = vector_length_encoding(this);
      __ vshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
    } else {
      int vlen = vector_length(this);
      if (vlen == 2) {
        __ movflt($dst$$XMMRegister, $src$$XMMRegister);
        __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
      } else if (vlen == 4) {
        __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
        __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
      } else {
        assert (vlen == 8, "sanity");
        __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
        __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
      }
    }
  %}
  ins_pipe( pipe_slow );
%}

// Integers vector left shift
instruct vshiftI(vec dst, vec src, vec shift) %{
  match(Set dst ( LShiftVI src shift));
  match(Set dst ( RShiftVI src shift));
  match(Set dst (URShiftVI src shift));
  effect(TEMP dst, USE src, USE shift);
  format %{ "vshiftd $dst,$src,$shift\t! shift packedI" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    if (UseAVX > 0) {
      int vlen_enc = vector_length_encoding(this);
      __ vshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
    } else {
      int vlen = vector_length(this);
      if (vlen == 2) {
        __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
        __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
      } else {
        assert(vlen == 4, "sanity");
        __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
        __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
      }
    }
  %}
  ins_pipe( pipe_slow );
%}

// Longs vector shift
instruct vshiftL(vec dst, vec src, vec shift) %{
  match(Set dst ( LShiftVL src shift));
  match(Set dst (URShiftVL src shift));
  effect(TEMP dst, USE src, USE shift);
  format %{ "vshiftq $dst,$src,$shift\t! shift packedL" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    if (UseAVX > 0) {
      int vlen_enc = vector_length_encoding(this);
      __ vshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
    } else {
      assert(vector_length(this) == 2, "");
      __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
      __ vshiftq(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
    }
  %}
  ins_pipe( pipe_slow );
%}

// -------------------ArithmeticRightShift -----------------------------------
// Long vector arithmetic right shift
instruct vshiftL_arith_reg(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
  predicate(UseAVX <= 2);
  match(Set dst (RShiftVL src shift));
  effect(TEMP dst, TEMP tmp, TEMP scratch);
  format %{ "vshiftq $dst,$src,$shift" %}
  ins_encode %{
    uint vlen = vector_length(this);
    if (vlen == 2) {
      assert(UseSSE >= 2, "required");
      // There is no packed 64-bit arithmetic right shift before AVX-512, so
      // emulate it as ((x >>> s) ^ m) - m, where m = (sign-bit mask >>> s)
      // re-creates the shifted-in sign bits.
      __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
      __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
      __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), $scratch$$Register);
      __ psrlq($tmp$$XMMRegister, $shift$$XMMRegister);
      __ pxor($dst$$XMMRegister, $tmp$$XMMRegister);
      __ psubq($dst$$XMMRegister, $tmp$$XMMRegister);
    } else {
      assert(vlen == 4, "sanity");
      assert(UseAVX > 1, "required");
      int vlen_enc = Assembler::AVX_256bit;
      __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
      __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), $scratch$$Register);
      __ vpsrlq($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
      __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
      __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
    }
  %}
  ins_pipe( pipe_slow );
%}

instruct vshiftL_arith_reg_evex(vec dst, vec src, vec shift) %{
  predicate(UseAVX > 2);
  match(Set dst (RShiftVL src shift));
  format %{ "vshiftq $dst,$src,$shift" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// ------------------- Variable Shift -----------------------------
// Byte variable shift
instruct vshift8B_var_nobw(vec dst, vec src, vec shift, vec vtmp, rRegP scratch) %{
  predicate(vector_length(n) <= 8 && vector_element_basic_type(n) == T_BYTE &&
            !VM_Version::supports_avx512bw());
  match(Set dst ( VLShiftV src shift));
  match(Set dst ( VRShiftV src shift));
  match(Set dst (VURShiftV src shift));
  effect(TEMP dst, TEMP vtmp, TEMP scratch);
  format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp, $scratch as TEMP" %}
  ins_encode %{
    assert(UseAVX >= 2, "required");

    int opcode = this->ideal_Opcode();
    int vlen_enc = Assembler::AVX_128bit;
    __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister, $scratch$$Register);
    __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
  %}
  ins_pipe( pipe_slow );
%}

instruct vshift16B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, rRegP scratch) %{
  predicate(vector_length(n) == 16 && vector_element_basic_type(n) == T_BYTE &&
            !VM_Version::supports_avx512bw());
  match(Set dst ( VLShiftV src shift));
  match(Set dst ( VRShiftV src shift));
  match(Set dst (VURShiftV src shift));
  effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
  format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 and $scratch as TEMP" %}
  ins_encode %{
    assert(UseAVX >= 2, "required");

    int opcode = this->ideal_Opcode();
    int vlen_enc = Assembler::AVX_128bit;
    // Shift lower half and get word result in dst
    __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister, $scratch$$Register);

    // Shift upper half and get word result in vtmp1
    __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
    __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
    __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister, $scratch$$Register);

    // Merge and down convert the two word results to byte in dst
    __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
  %}
  ins_pipe( pipe_slow );
%}

instruct vshift32B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, vec vtmp3, vec vtmp4, rRegP scratch) %{
  predicate(vector_length(n) == 32 && vector_element_basic_type(n) == T_BYTE &&
            !VM_Version::supports_avx512bw());
  match(Set dst ( VLShiftV src shift));
  match(Set dst ( VRShiftV src shift));
  match(Set dst (VURShiftV src shift));
  effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP vtmp4, TEMP scratch);
  format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2, $vtmp3, $vtmp4 and $scratch as TEMP" %}
  ins_encode %{
    assert(UseAVX >= 2, "required");

    int opcode = this->ideal_Opcode();
    int vlen_enc = Assembler::AVX_128bit;
    // Process lower 128 bits and get result in dst
    __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister, $scratch$$Register);
    __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
    __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
    __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister, $scratch$$Register);
    __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);

    // Process higher 128 bits and get result in vtmp3
    __ vextracti128_high($vtmp1$$XMMRegister, $src$$XMMRegister);
    __ vextracti128_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
    __ varshiftbw(opcode, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp4$$XMMRegister, $scratch$$Register);
    __ vpshufd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, 0xE, 0);
    __ vpshufd($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, 0xE, 0);
    __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister, $scratch$$Register);
    __ vpackuswb($vtmp1$$XMMRegister, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, 0);

    // Merge the two results in dst
    __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
  %}
  ins_pipe( pipe_slow );
%}

instruct vshiftB_var_evex_bw(vec dst, vec src, vec shift, vec vtmp, rRegP scratch) %{
  predicate(vector_length(n) <= 32 && vector_element_basic_type(n) == T_BYTE &&
            VM_Version::supports_avx512bw());
  match(Set dst ( VLShiftV src shift));
  match(Set dst ( VRShiftV src shift));
  match(Set dst (VURShiftV src shift));
  effect(TEMP dst, TEMP vtmp, TEMP scratch);
  format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp, $scratch as TEMP" %}
  ins_encode %{
    assert(UseAVX > 2, "required");

    int opcode = this->ideal_Opcode();
    int vlen_enc = vector_length_encoding(this);
    __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct vshift64B_var_evex_bw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, rRegP scratch) %{
  predicate(vector_length(n) == 64 && vector_element_basic_type(n) == T_BYTE &&
            VM_Version::supports_avx512bw());
  match(Set dst ( VLShiftV src shift));
  match(Set dst ( VRShiftV src shift));
  match(Set dst (VURShiftV src shift));
  effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
  format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 and $scratch as TEMP" %}
  ins_encode %{
    assert(UseAVX > 2, "required");

    int opcode = this->ideal_Opcode();
    int vlen_enc = Assembler::AVX_256bit;
    __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister, $scratch$$Register);
    __ vextracti64x4_high($vtmp1$$XMMRegister, $src$$XMMRegister);
    __ vextracti64x4_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
    __ evarshiftb(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister, $scratch$$Register);
    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
  %}
  ins_pipe( pipe_slow );
%}

// Short variable shift
instruct vshift8S_var_nobw(vec dst, vec src, vec shift, vec vtmp, rRegP scratch) %{
  predicate(vector_length(n) <= 8 && vector_element_basic_type(n) == T_SHORT &&
            !VM_Version::supports_avx512bw());
  match(Set dst (VLShiftV src shift));
  match(Set dst (VRShiftV src shift));
  match(Set dst (VURShiftV src shift));
  effect(TEMP dst, TEMP vtmp, TEMP scratch);
  format %{ "vector_var_shift_left_short $dst, $src, $shift\n\t" %}
  ins_encode %{
    assert(UseAVX >= 2, "required");

    int opcode = this->ideal_Opcode();
    bool sign = (opcode == Op_VURShiftV) ? false : true;
    int vlen_enc = Assembler::AVX_256bit;
    __ vextendwd(sign, $dst$$XMMRegister, $src$$XMMRegister, 1);
    __ vpmovzxwd($vtmp$$XMMRegister, $shift$$XMMRegister, 1);
    __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
    __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);
    __ vextracti128_high($vtmp$$XMMRegister, $dst$$XMMRegister);
    __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
  %}
  ins_pipe( pipe_slow );
%}

instruct vshift16S_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, rRegP scratch) %{
  predicate(vector_length(n) == 16 && vector_element_basic_type(n) == T_SHORT &&
            !VM_Version::supports_avx512bw());
  match(Set dst (VLShiftV src shift));
  match(Set dst (VRShiftV src shift));
  match(Set dst (VURShiftV src shift));
  effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
  format %{ "vector_var_shift_left_short $dst, $src, $shift\n\t" %}
  ins_encode %{
    assert(UseAVX >= 2, "required");

    int opcode = this->ideal_Opcode();
    bool sign = (opcode == Op_VURShiftV) ? false : true;
    int vlen_enc = Assembler::AVX_256bit;
    // Shift lower half, with result in vtmp2 using vtmp1 as TEMP
    __ vextendwd(sign, $vtmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
    __ vpmovzxwd($vtmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
    __ varshiftd(opcode, $vtmp2$$XMMRegister, $vtmp2$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
    __ vpand($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);

    // Shift upper half, with result in dst using vtmp1 as TEMP
    __ vextracti128_high($dst$$XMMRegister, $src$$XMMRegister);
    __ vextracti128_high($vtmp1$$XMMRegister, $shift$$XMMRegister);
    __ vextendwd(sign, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
    __ vpmovzxwd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
    __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
    __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);

    // Merge lower and upper half result into dst
    __ vpackusdw($dst$$XMMRegister, $vtmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
    __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vshift16S_var_evex_bw(vec dst, vec src, vec shift) %{
  predicate(vector_element_basic_type(n) == T_SHORT &&
            VM_Version::supports_avx512bw());
  match(Set dst (VLShiftV src shift));
  match(Set dst (VRShiftV src shift));
  match(Set dst (VURShiftV src shift));
  format %{ "vector_varshift_short $dst,$src,$shift\t!" %}
  ins_encode %{
    assert(UseAVX > 2, "required");

    int opcode = this->ideal_Opcode();
    int vlen_enc = vector_length_encoding(this);
    if (!VM_Version::supports_avx512vl()) {
      vlen_enc = Assembler::AVX_512bit;
    }
    __ varshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// Integer variable shift
instruct vshiftI_var(vec dst, vec src, vec shift) %{
  predicate(vector_element_basic_type(n) == T_INT);
  match(Set dst ( VLShiftV src shift));
  match(Set dst ( VRShiftV src shift));
  match(Set dst (VURShiftV src shift));
  format %{ "vector_varshift_int $dst,$src,$shift\t!" %}
  ins_encode %{
    assert(UseAVX >= 2, "required");

    int opcode = this->ideal_Opcode();
    int vlen_enc = vector_length_encoding(this);
    __ varshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// Long variable shift
instruct vshiftL_var(vec dst, vec src, vec shift) %{
  predicate(vector_element_basic_type(n) == T_LONG);
  match(Set dst ( VLShiftV src shift));
  match(Set dst (VURShiftV src shift));
  format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
  ins_encode %{
    assert(UseAVX >= 2, "required");

    int opcode = this->ideal_Opcode();
    int vlen_enc = vector_length_encoding(this);
    __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// Long variable arithmetic right shift
instruct vshiftL_arith_var(vec dst, vec src, vec shift, vec vtmp) %{
  predicate(vector_length(n) <= 4 && vector_element_basic_type(n) == T_LONG &&
            UseAVX == 2);
  match(Set dst (VRShiftV src shift));
  effect(TEMP dst, TEMP vtmp);
  format %{ "vector_varshift_long $dst,$src,$shift\n\t! using $vtmp as TEMP" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    int vlen_enc = vector_length_encoding(this);
    __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc,
                 $vtmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vshiftL_arith_var_evex(vec dst, vec src, vec shift) %{
  predicate(vector_element_basic_type(n) == T_LONG &&
            UseAVX > 2);
  match(Set dst (VRShiftV src shift));
  format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    int vlen_enc = vector_length_encoding(this);
    __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// --------------------------------- AND --------------------------------------

instruct vand(vec dst, vec src) %{
  predicate(UseAVX == 0);
  match(Set dst (AndV dst src));
  format %{ "pand $dst,$src\t! and vectors" %}
  ins_encode %{
    __ pand($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
// ...
  match(Set dst (VectorCastD2X src));
  format %{ "vector_cast_d2x $dst,$src\t!" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this, $src);
    __ vcvtpd2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// --------------------------------- VectorMaskCmp --------------------------------------

instruct vcmpFD(legVec dst, legVec src1, legVec src2, immI8 cond) %{
  predicate(vector_length_in_bytes(n->in(1)->in(1)) >= 8 &&  // src1
            vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
            is_floating_point_type(vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
  match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
  format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this, $src1);
    Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
    if (vector_element_basic_type(this, $src1) == T_FLOAT) {
      __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
    } else {
      __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
    }
  %}
  ins_pipe( pipe_slow );
%}

instruct evcmpFD(vec dst, vec src1, vec src2, immI8 cond, rRegP scratch) %{
  predicate(vector_length_in_bytes(n->in(1)->in(1)) == 64 && // src1
            is_floating_point_type(vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
  match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
  effect(TEMP scratch);
  format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $scratch as TEMP" %}
  ins_encode %{
    int vlen_enc = Assembler::AVX_512bit;
    Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
    KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
    KRegister mask = k0; // The comparison itself is not being masked.
    if (vector_element_basic_type(this, $src1) == T_FLOAT) {
      __ evcmpps(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
      __ evmovdqul($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vlen_enc, $scratch$$Register);
    } else {
      __ evcmppd(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
// ...

instruct vabsnegD(vec dst, vec src, rRegI scratch) %{
  match(Set dst (AbsVD src));
  match(Set dst (NegVD src));
  effect(TEMP scratch);
  format %{ "vabsnegd $dst,$src,[mask]\t# absneg packedD" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    uint vlen = vector_length(this);
    if (vlen == 2) {
      assert(UseSSE >= 2, "required");
      __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $scratch$$Register);
    } else {
      int vlen_enc = vector_length_encoding(this);
      __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc, $scratch$$Register);
    }
  %}
  ins_pipe( pipe_slow );
%}

//------------------------------------- NOT --------------------------------------------

instruct vnotB(vec dst, vec src) %{
  predicate(UseAVX == 0);
  match(Set dst (NotV src));
  effect(TEMP dst);
  format %{ "vector_not $dst,$src\t!" %}
  ins_encode %{
    int vlen = vector_length_in_bytes(this);
    switch(vlen) {
      default:
        assert(0, "Incorrect vector length");
        break;
      case 4: {
        __ movdl($dst$$XMMRegister, ExternalAddress(vector_all_bits_set()));
        __ pxor($dst$$XMMRegister, $src$$XMMRegister);
      } break;
      case 8: {
        __ movq($dst$$XMMRegister, ExternalAddress(vector_all_bits_set()));
        __ pxor($dst$$XMMRegister, $src$$XMMRegister);
      } break;
      case 16: {
        __ movdqu($dst$$XMMRegister, ExternalAddress(vector_all_bits_set()));
        __ pxor($dst$$XMMRegister, $src$$XMMRegister);
      } break;
    }
  %}
  ins_pipe( pipe_slow );
%}

instruct vnotB_reg(vec dst, vec src, rRegP scratch) %{
  predicate(UseAVX > 0);
  match(Set dst (NotV src));
  effect(TEMP scratch);
  format %{ "vector_not $dst,$src\t! using $scratch as rRegP" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vpxor($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_all_bits_set()), vlen_enc, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

//------------------------------------- VectorTest --------------------------------------------

#ifdef _LP64
instruct vptest_alltrue(rRegI dst, legVec src1, legVec src2, rFlagsReg cr) %{
  predicate(static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow);
  match(Set dst (VectorTest src1 src2 ));
  effect(KILL cr);
  format %{ "vector_test $dst,$src1, $src2\t! using $cr as TEMP" %}
  ins_encode %{
    int vlen = vector_length_in_bytes(this, $src1);
    int vlen_enc = vector_length_encoding(vlen);
    if (vlen <= 32) {
      if (UseAVX == 0) {
        assert(vlen <= 16, "required");
        __ ptest($src1$$XMMRegister, $src2$$XMMRegister);
      } else {
        __ vptest($src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
      }
    } else {
      KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
// ...

//------------------------------------------------------------------------

  return NativeJump::instruction_size;
}

#ifdef _LP64
static uint size_deopt_handler() {
  // three 5 byte instructions plus one move for unreachable address.
  return 15+3;
}
#else
static uint size_deopt_handler() {
  // NativeCall instruction size is the same as NativeJump.
  // The exception handler starts out as a jump and can be patched to
  // a call by deoptimization. (4932387)
  // Note that this value is also credited (in output.cpp) to
  // the size of the code section.
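  // (Size sketch, assuming the usual x86 encodings: pushl imm32 is 5 bytes
  // and NativeJump::instruction_size is a 5-byte jmp rel32, so the handler
  // reserves 10 bytes in total.)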
  return 5 + NativeJump::instruction_size; // pushl(); jmp;
}
#endif
};


inline uint vector_length(const Node* n) {
  const TypeVect* vt = n->bottom_type()->is_vect();
  return vt->length();
}

inline uint vector_length(const MachNode* use, MachOper* opnd) {
  uint def_idx = use->operand_index(opnd);
  Node* def = use->in(def_idx);
  return def->bottom_type()->is_vect()->length();
}

inline uint vector_length_in_bytes(const Node* n) {
  const TypeVect* vt = n->bottom_type()->is_vect();
  return vt->length_in_bytes();
}

inline uint vector_length_in_bytes(const MachNode* use, MachOper* opnd) {
  uint def_idx = use->operand_index(opnd);
  Node* def = use->in(def_idx);
// ...
    case 32: return Assembler::AVX_256bit;
    case 64: return Assembler::AVX_512bit;

    default: {
      ShouldNotReachHere();
      return Assembler::AVX_NoVec;
    }
  }
}

static inline Assembler::AvxVectorLen vector_length_encoding(const Node* n) {
  return vector_length_encoding(vector_length_in_bytes(n));
}

static inline Assembler::AvxVectorLen vector_length_encoding(const MachNode* use, MachOper* opnd) {
  uint def_idx = use->operand_index(opnd);
  Node* def = use->in(def_idx);
  return vector_length_encoding(def);
}

class Node::PD {
public:
  enum NodeFlags {
    Flag_intel_jcc_erratum = Node::_last_flag << 1,
    _last_flag = Flag_intel_jcc_erratum
  };
};

%} // end source_hpp

source %{

#include "opto/addnode.hpp"
#include "c2_intelJccErratum_x86.hpp"

void PhaseOutput::pd_perform_mach_node_analysis() {
  if (VM_Version::has_intel_jcc_erratum()) {
    int extra_padding = IntelJccErratum::tag_affected_machnodes(C, C->cfg(), C->regalloc());
    _buf_sizes._code += extra_padding;
  }
}

int MachNode::pd_alignment_required() const {
  if (VM_Version::has_intel_jcc_erratum() && IntelJccErratum::is_jcc_erratum_branch(this)) {
    // Conservatively add worst case padding. We assume that relocInfo::addr_unit() is 1 on x86.
    return IntelJccErratum::largest_jcc_size() + 1;
  } else {
    return 1;
// ...
    case Op_CacheWB:
    case Op_CacheWBPreSync:
    case Op_CacheWBPostSync:
      if (!VM_Version::supports_data_cache_line_flush()) {
        return false;
      }
      break;
    case Op_ExtractB:
    case Op_ExtractL:
    case Op_ExtractI:
    case Op_RoundDoubleMode:
      if (UseSSE < 4) {
        return false;
      }
      break;
    case Op_RoundDoubleModeV:
      if (VM_Version::supports_avx() == false) {
        return false; // 128bit vroundpd is not available
      }
      break;
    case Op_LoadVectorGather:
      if (UseAVX < 2) {
        return false;
      }
      break;
    case Op_FmaVD:
    case Op_FmaVF:
      if (!UseFMA) {
        return false;
      }
      break;
    case Op_MacroLogicV:
      if (UseAVX < 3 || !UseVectorMacroLogic) {
        return false;
      }
      break;
#ifndef _LP64
    case Op_AddReductionVF:
    case Op_AddReductionVD:
    case Op_MulReductionVF:
    case Op_MulReductionVD:
      if (UseSSE < 1) { // requires at least SSE
        return false;
      }
      break;
    case Op_MulAddVS2VI:
    case Op_RShiftVL:
    case Op_AbsVD:
    case Op_NegVD:
      if (UseSSE < 2) {
        return false;
      }
      break;
#endif // !LP64
  }
  return true; // Match rules are supported by default.
}

//------------------------------------------------------------------------

// Identify extra cases that we might want to provide match rules for vector nodes and
// other intrinsics guarded with vector length (vlen) and element type (bt).
const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) {
  if (!match_rule_supported(opcode)) {
    return false;
  }
  // Matcher::vector_size_supported() restricts vector sizes in the following way (see Matcher::vector_width_in_bytes):
  //   * SSE2 supports 128bit vectors for all types;
  //   * AVX1 supports 256bit vectors only for FLOAT and DOUBLE types;
  //   * AVX2 supports 256bit vectors for all types;
  //   * AVX512F supports 512bit vectors only for INT, FLOAT, and DOUBLE types;
  //   * AVX512BW supports 512bit vectors for BYTE, SHORT, and CHAR types.
  // There's also a limit on minimum vector size supported: 2 elements (or 4 bytes for BYTE).
  // And MaxVectorSize is taken into account as well.
  if (!vector_size_supported(bt, vlen)) {
    return false;
  }
  // Special cases which require vector length follow:
  //   * implementation limitations
  //   * some 512bit vector operations on FLOAT and DOUBLE types require AVX512DQ
  //   * 128bit vroundpd instruction is present only in AVX1
  int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
  switch (opcode) {
    case Op_AbsVF:
    case Op_NegVF:
      if ((vlen == 16) && (VM_Version::supports_avx512dq() == false)) {
        return false; // 512bit vandps and vxorps are not available
      }
      break;
    case Op_AbsVD:
    case Op_NegVD:
    case Op_MulVL:
      if ((vlen == 8) && (VM_Version::supports_avx512dq() == false)) {
        return false; // 512bit vpmullq, vandpd and vxorpd are not available
// ...
    case MoveVec2Leg_rule:
    case MoveLeg2Vec_rule:
      return true;
    default:
      return false;
  }
}

bool Matcher::is_generic_vector(MachOper* opnd) {
  switch (opnd->opcode()) {
    case VEC:
    case LEGVEC:
      return true;
    default:
      return false;
  }
}

//------------------------------------------------------------------------

bool Matcher::supports_vector_variable_shifts(void) {
  return (UseAVX >= 2);
}

const bool Matcher::has_predicated_vectors(void) {
  bool ret_value = false;
  if (UseAVX > 2) {
    ret_value = VM_Version::supports_avx512vl();
  }

  return ret_value;
}

const int Matcher::float_pressure(int default_pressure_threshold) {
  int float_pressure_threshold = default_pressure_threshold;
#ifdef _LP64
  if (UseAVX > 2) {
    // Increase pressure threshold on machines with AVX3 which have
    // 2x more XMM registers.
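    // (Descriptive note, added: AVX-512 in 64-bit mode adds xmm16-xmm31,
    // doubling the XMM register file from 16 to 32 registers.)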
1813 float_pressure_threshold = default_pressure_threshold * 2; 1814 } 1815 #endif 1816 return float_pressure_threshold; 1817 } 4176 %} 4177 4178 instruct ReplD_zero(vec dst, immD0 zero) %{ 4179 match(Set dst (ReplicateD zero)); 4180 format %{ "replicateD $dst,$zero" %} 4181 ins_encode %{ 4182 uint vlen = vector_length(this); 4183 if (vlen == 2) { 4184 __ xorpd($dst$$XMMRegister, $dst$$XMMRegister); 4185 } else { 4186 int vlen_enc = vector_length_encoding(this); 4187 __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); // 512bit vxorps requires AVX512DQ 4188 } 4189 %} 4190 ins_pipe( fpu_reg_reg ); 4191 %} 4192 4193 // ====================VECTOR INSERT======================================= 4194 4195 instruct insert(vec dst, rRegI val, immU8 idx) %{ 4196 predicate(vector_length_in_bytes(n) < 32); 4197 match(Set dst (VectorInsert (Binary dst val) idx)); 4198 format %{ "vector_insert $dst,$val,$idx" %} 4199 ins_encode %{ 4200 assert(UseSSE >= 4, "required"); 4201 assert(vector_length_in_bytes(this) >= 8, "required"); 4202 4203 BasicType elem_bt = vector_element_basic_type(this); 4204 4205 assert(is_integral_type(elem_bt), ""); 4206 assert($idx$$constant < (int)vector_length(this), "out of bounds"); 4207 4208 __ insert(elem_bt, $dst$$XMMRegister, $val$$Register, $idx$$constant); 4209 %} 4210 ins_pipe( pipe_slow ); 4211 %} 4212 4213 instruct insert32(vec dst, vec src, rRegI val, immU8 idx, vec vtmp) %{ 4214 predicate(vector_length_in_bytes(n) == 32); 4215 match(Set dst (VectorInsert (Binary src val) idx)); 4216 effect(TEMP vtmp); 4217 format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %} 4218 ins_encode %{ 4219 int vlen_enc = Assembler::AVX_256bit; 4220 BasicType elem_bt = vector_element_basic_type(this); 4221 int elem_per_lane = 16/type2aelembytes(elem_bt); 4222 int log2epr = log2(elem_per_lane); 4223 4224 assert(is_integral_type(elem_bt), "sanity"); 4225 assert($idx$$constant < (int)vector_length(this), "out of bounds"); 4226 4227 uint x_idx = $idx$$constant & right_n_bits(log2epr); 4228 uint y_idx = ($idx$$constant >> log2epr) & 1; 4229 __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx); 4230 __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx); 4231 __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx); 4232 %} 4233 ins_pipe( pipe_slow ); 4234 %} 4235 4236 instruct insert64(vec dst, vec src, rRegI val, immU8 idx, legVec vtmp) %{ 4237 predicate(vector_length_in_bytes(n) == 64); 4238 match(Set dst (VectorInsert (Binary src val) idx)); 4239 effect(TEMP vtmp); 4240 format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %} 4241 ins_encode %{ 4242 assert(UseAVX > 2, "sanity"); 4243 4244 BasicType elem_bt = vector_element_basic_type(this); 4245 int elem_per_lane = 16/type2aelembytes(elem_bt); 4246 int log2epr = log2(elem_per_lane); 4247 4248 assert(is_integral_type(elem_bt), ""); 4249 assert($idx$$constant < (int)vector_length(this), "out of bounds"); 4250 4251 uint x_idx = $idx$$constant & right_n_bits(log2epr); 4252 uint y_idx = ($idx$$constant >> log2epr) & 3; 4253 __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx); 4254 __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx); 4255 __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx); 4256 %} 4257 ins_pipe( pipe_slow ); 4258 %} 4259 4260 #ifdef _LP64 4261 instruct insert2L(vec dst, rRegL val, immU8 idx) %{ 4262 predicate(vector_length(n) == 2); 4263 match(Set 
#ifdef _LP64
instruct insert2L(vec dst, rRegL val, immU8 idx) %{
  predicate(vector_length(n) == 2);
  match(Set dst (VectorInsert (Binary dst val) idx));
  format %{ "vector_insert $dst,$val,$idx" %}
  ins_encode %{
    assert(UseSSE >= 4, "required");
    assert(vector_element_basic_type(this) == T_LONG, "");
    assert($idx$$constant < (int)vector_length(this), "out of bounds");

    __ pinsrq($dst$$XMMRegister, $val$$Register, $idx$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct insert4L(vec dst, vec src, rRegL val, immU8 idx, vec vtmp) %{
  predicate(vector_length(n) == 4);
  match(Set dst (VectorInsert (Binary src val) idx));
  effect(TEMP vtmp);
  format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
  ins_encode %{
    assert(vector_element_basic_type(this) == T_LONG, "");
    assert($idx$$constant < (int)vector_length(this), "out of bounds");

    uint x_idx = $idx$$constant & right_n_bits(1);
    uint y_idx = ($idx$$constant >> 1) & 1;
    int vlen_enc = Assembler::AVX_256bit;
    __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
    __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
    __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
  %}
  ins_pipe( pipe_slow );
%}

instruct insert8L(vec dst, vec src, rRegL val, immU8 idx, legVec vtmp) %{
  predicate(vector_length(n) == 8);
  match(Set dst (VectorInsert (Binary src val) idx));
  effect(TEMP vtmp);
  format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
  ins_encode %{
    assert(vector_element_basic_type(this) == T_LONG, "sanity");
    assert($idx$$constant < (int)vector_length(this), "out of bounds");

    uint x_idx = $idx$$constant & right_n_bits(1);
    uint y_idx = ($idx$$constant >> 1) & 3;
    __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
    __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
    __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
  %}
  ins_pipe( pipe_slow );
%}
#endif

instruct insertF(vec dst, regF val, immU8 idx) %{
  predicate(vector_length(n) < 8);
  match(Set dst (VectorInsert (Binary dst val) idx));
  format %{ "vector_insert $dst,$val,$idx" %}
  ins_encode %{
    assert(UseSSE >= 4, "sanity");

    assert(vector_element_basic_type(this) == T_FLOAT, "sanity");
    assert($idx$$constant < (int)vector_length(this), "out of bounds");

    __ insertps($dst$$XMMRegister, $val$$XMMRegister, $idx$$constant);
  %}
  ins_pipe( pipe_slow );
%}
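// For 8- and 16-float vectors the target 128-bit lane is extracted below,
// patched with vinsertps, and written back. AVX2's vextracti128/vinserti128
// can only address the two lanes of a ymm register, so the 16-float case uses
// the EVEX vextracti32x4/vinserti32x4 forms, which reach all four zmm lanes.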
instruct vinsertF(vec dst, vec src, regF val, immU8 idx, vec vtmp) %{
  predicate(vector_length(n) >= 8);
  match(Set dst (VectorInsert (Binary src val) idx));
  effect(TEMP vtmp);
  format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
  ins_encode %{
    assert(vector_element_basic_type(this) == T_FLOAT, "sanity");
    assert($idx$$constant < (int)vector_length(this), "out of bounds");

    int vlen = vector_length(this);
    uint x_idx = $idx$$constant & right_n_bits(2);
    if (vlen == 8) {
      uint y_idx = ($idx$$constant >> 2) & 1;
      int vlen_enc = Assembler::AVX_256bit;
      __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
      __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx);
      __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
    } else {
      assert(vlen == 16, "sanity");
      uint y_idx = ($idx$$constant >> 2) & 3;
      __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
      __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx);
      __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
    }
  %}
  ins_pipe( pipe_slow );
%}

#ifdef _LP64
instruct insert2D(vec dst, regD val, immU8 idx, rRegL tmp) %{
  predicate(vector_length(n) == 2);
  match(Set dst (VectorInsert (Binary dst val) idx));
  effect(TEMP tmp);
  format %{ "vector_insert $dst,$val,$idx\t!using $tmp as TEMP" %}
  ins_encode %{
    assert(UseSSE >= 4, "sanity");
    assert(vector_element_basic_type(this) == T_DOUBLE, "sanity");
    assert($idx$$constant < (int)vector_length(this), "out of bounds");

    __ movq($tmp$$Register, $val$$XMMRegister);
    __ pinsrq($dst$$XMMRegister, $tmp$$Register, $idx$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct insert4D(vec dst, vec src, regD val, immU8 idx, rRegL tmp, vec vtmp) %{
  predicate(vector_length(n) == 4);
  match(Set dst (VectorInsert (Binary src val) idx));
  effect(TEMP vtmp, TEMP tmp);
  format %{ "vector_insert $dst,$src,$val,$idx\t!using $tmp, $vtmp as TEMP" %}
  ins_encode %{
    assert(vector_element_basic_type(this) == T_DOUBLE, "sanity");
    assert($idx$$constant < (int)vector_length(this), "out of bounds");

    uint x_idx = $idx$$constant & right_n_bits(1);
    uint y_idx = ($idx$$constant >> 1) & 1;
    int vlen_enc = Assembler::AVX_256bit;
    __ movq($tmp$$Register, $val$$XMMRegister);
    __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
    __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
    __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
  %}
  ins_pipe( pipe_slow );
%}

instruct insert8D(vec dst, vec src, regD val, immI idx, rRegL tmp, legVec vtmp) %{
  predicate(vector_length(n) == 8);
  match(Set dst (VectorInsert (Binary src val) idx));
  effect(TEMP tmp, TEMP vtmp);
  format %{ "vector_insert $dst,$src,$val,$idx\t!using $tmp, $vtmp as TEMP" %}
  ins_encode %{
    assert(vector_element_basic_type(this) == T_DOUBLE, "sanity");
    assert($idx$$constant < (int)vector_length(this), "out of bounds");

    uint x_idx = $idx$$constant & right_n_bits(1);
    uint y_idx = ($idx$$constant >> 1) & 3;
    __ movq($tmp$$Register, $val$$XMMRegister);
    __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
    __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
    __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
  %}
  ins_pipe( pipe_slow );
%}
#endif
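// Note: there is no direct insert for a double element (insertps is
// float-only), so the double rules above move the raw bits of $val into a
// GPR with movq and reuse the 64-bit integer insert path (pinsrq/vpinsrq).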
// ====================REDUCTION ARITHMETIC=======================================

// =======================Int Reduction==========================================

instruct reductionI(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
  predicate(vector_element_basic_type(n->in(2)) == T_INT &&
            vector_length(n->in(2)) < 16); // src2
  match(Set dst (AddReductionVI src1 src2));
  match(Set dst (MulReductionVI src1 src2));
  match(Set dst (AndReductionV src1 src2));
  match(Set dst ( OrReductionV src1 src2));
  match(Set dst (XorReductionV src1 src2));
  match(Set dst (MinReductionV src1 src2));
  match(Set dst (MaxReductionV src1 src2));
  effect(TEMP vtmp1, TEMP vtmp2);

  %}
  ins_pipe( pipe_slow );
%}

// ------------------------------ Shift ---------------------------------------

// Left and right shift count vectors are the same on x86
// (only lowest bits of xmm reg are used for count).
instruct vshiftcnt(vec dst, rRegI cnt) %{
  match(Set dst (LShiftCntV cnt));
  match(Set dst (RShiftCntV cnt));
  format %{ "movdl $dst,$cnt\t! load shift count" %}
  ins_encode %{
    __ movdl($dst$$XMMRegister, $cnt$$Register);
  %}
  ins_pipe( pipe_slow );
%}

// Byte vector shift
instruct vshiftB(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
  predicate(vector_length(n) <= 8 && VectorNode::is_vshift_cnt(n->in(2)));
  match(Set dst ( LShiftVB src shift));
  match(Set dst ( RShiftVB src shift));
  match(Set dst (URShiftVB src shift));
  effect(TEMP dst, USE src, USE shift, TEMP tmp, TEMP scratch);
  format %{ "vector_byte_shift $dst,$src,$shift" %}
  ins_encode %{
    assert(UseSSE > 3, "required");
    int opcode = this->ideal_Opcode();
    bool sign = (opcode != Op_URShiftVB);
    __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister);
    __ vshiftw(opcode, $tmp$$XMMRegister, $shift$$XMMRegister);
    __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
    __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
    __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vshift16B(vec dst, vec src, vec shift, vec tmp1, vec tmp2, rRegI scratch) %{
  predicate(vector_length(n) == 16 && VectorNode::is_vshift_cnt(n->in(2)) &&
            UseAVX <= 1);
  match(Set dst ( LShiftVB src shift));
  match(Set dst ( RShiftVB src shift));
  match(Set dst (URShiftVB src shift));
  effect(TEMP dst, USE src, USE shift, TEMP tmp1, TEMP tmp2, TEMP scratch);
  format %{ "vector_byte_shift $dst,$src,$shift" %}
  ins_encode %{
    assert(UseSSE > 3, "required");
    int opcode = this->ideal_Opcode();
    bool sign = (opcode != Op_URShiftVB);
    __ vextendbw(sign, $tmp1$$XMMRegister, $src$$XMMRegister);
    __ vshiftw(opcode, $tmp1$$XMMRegister, $shift$$XMMRegister);
    __ pshufd($tmp2$$XMMRegister, $src$$XMMRegister, 0xE);
    __ vextendbw(sign, $tmp2$$XMMRegister, $tmp2$$XMMRegister);
    __ vshiftw(opcode, $tmp2$$XMMRegister, $shift$$XMMRegister);
    __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
    __ pand($tmp2$$XMMRegister, $dst$$XMMRegister);
    __ pand($dst$$XMMRegister, $tmp1$$XMMRegister);
    __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
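// With AVX2 the 16 bytes fit one 256-bit word vector: a single vextendbw,
// one vshiftw and one mask replace the two-halves SSE sequence above; the
// high 128 bits are then folded back with vextracti128_high/vpackuswb.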
instruct vshift16B_avx(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
  predicate(vector_length(n) == 16 && VectorNode::is_vshift_cnt(n->in(2)) &&
            UseAVX > 1);
  match(Set dst ( LShiftVB src shift));
  match(Set dst ( RShiftVB src shift));
  match(Set dst (URShiftVB src shift));
  effect(TEMP dst, TEMP tmp, TEMP scratch);
  format %{ "vector_byte_shift $dst,$src,$shift" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    bool sign = (opcode != Op_URShiftVB);
    int vlen_enc = Assembler::AVX_256bit;
    __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
    __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
    __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, $scratch$$Register);
    __ vextracti128_high($dst$$XMMRegister, $tmp$$XMMRegister);
    __ vpackuswb($dst$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, 0);
  %}
  ins_pipe( pipe_slow );
%}

instruct vshift32B_avx(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
  predicate(vector_length(n) == 32 && VectorNode::is_vshift_cnt(n->in(2)));
  match(Set dst ( LShiftVB src shift));
  match(Set dst ( RShiftVB src shift));
  match(Set dst (URShiftVB src shift));
  effect(TEMP dst, TEMP tmp, TEMP scratch);
  format %{ "vector_byte_shift $dst,$src,$shift" %}
  ins_encode %{
    assert(UseAVX > 1, "required");
    int opcode = this->ideal_Opcode();
    bool sign = (opcode != Op_URShiftVB);
    int vlen_enc = Assembler::AVX_256bit;
    __ vextracti128_high($tmp$$XMMRegister, $src$$XMMRegister);
    __ vextendbw(sign, $tmp$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
    __ vextendbw(sign, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
    __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
    __ vshiftw(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $shift$$XMMRegister, vlen_enc);
    __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, $scratch$$Register);
    __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, $scratch$$Register);
    __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
    __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vshift64B_avx(vec dst, vec src, vec shift, vec tmp1, vec tmp2, rRegI scratch) %{
  predicate(vector_length(n) == 64 && VectorNode::is_vshift_cnt(n->in(2)));
  match(Set dst ( LShiftVB src shift));
  match(Set dst ( RShiftVB src shift));
  match(Set dst (URShiftVB src shift));
  effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
  format %{ "vector_byte_shift $dst,$src,$shift" %}
  ins_encode %{
    assert(UseAVX > 2, "required");
    int opcode = this->ideal_Opcode();
    bool sign = (opcode != Op_URShiftVB);
    int vlen_enc = Assembler::AVX_512bit;
    __ vextracti64x4($tmp1$$XMMRegister, $src$$XMMRegister, 1);
    __ vextendbw(sign, $tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
    __ vextendbw(sign, $tmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
    __ vshiftw(opcode, $tmp1$$XMMRegister, $tmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
    __ vshiftw(opcode, $tmp2$$XMMRegister, $tmp2$$XMMRegister, $shift$$XMMRegister, vlen_enc);
    __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
    __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
    __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
    __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
    __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
    __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vlen_enc, $scratch$$Register);
    __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// Shorts vector logical right shift produces incorrect Java result
// for negative data because Java code converts short values into ints with
// sign extension before a shift. But char vectors are fine since chars are
// unsigned values.
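// For example, with short s = (short)0x8000, Java evaluates s >>> 1 as
// 0xFFFF8000 >>> 1 = 0x7FFFC000, which narrows to (short)0xC000, whereas a
// 16-bit psrlw would produce 0x4000. For char c = '\uFFFF', c >>> 1 is
// 0x0000FFFF >>> 1 = 0x7FFF, which a 16-bit logical shift reproduces exactly.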
// Shorts/Chars vector shift
instruct vshiftS(vec dst, vec src, vec shift) %{
  predicate(VectorNode::is_vshift_cnt(n->in(2)));
  match(Set dst ( LShiftVS src shift));
  match(Set dst ( RShiftVS src shift));
  match(Set dst (URShiftVS src shift));
  effect(TEMP dst, USE src, USE shift);
  format %{ "vshiftw $dst,$src,$shift\t! shift packedS" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    if (UseAVX > 0) {
      int vlen_enc = vector_length_encoding(this);
      __ vshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
    } else {
      int vlen = vector_length(this);
      if (vlen == 2) {
        __ movflt($dst$$XMMRegister, $src$$XMMRegister);
        __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
      } else if (vlen == 4) {
        __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
        __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
      } else {
        assert(vlen == 8, "sanity");
        __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
        __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
      }
    }
  %}
  ins_pipe( pipe_slow );
%}

// Integers vector shift
instruct vshiftI(vec dst, vec src, vec shift) %{
  predicate(VectorNode::is_vshift_cnt(n->in(2)));
  match(Set dst ( LShiftVI src shift));
  match(Set dst ( RShiftVI src shift));
  match(Set dst (URShiftVI src shift));
  effect(TEMP dst, USE src, USE shift);
  format %{ "vshiftd $dst,$src,$shift\t! shift packedI" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    if (UseAVX > 0) {
      int vlen_enc = vector_length_encoding(this);
      __ vshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
    } else {
      int vlen = vector_length(this);
      if (vlen == 2) {
        __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
        __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
      } else {
        assert(vlen == 4, "sanity");
        __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
        __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
      }
    }
  %}
  ins_pipe( pipe_slow );
%}
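// In the SSE paths above, src is copied into dst first because the legacy
// two-operand shifts overwrite their operand; movflt/movdbl move just the
// 4 or 8 payload bytes of the narrower vectors before shifting in place.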
shift packedL" %} 6094 ins_encode %{ 6095 int opcode = this->ideal_Opcode(); 6096 if (UseAVX > 0) { 6097 int vlen_enc = vector_length_encoding(this); 6098 __ vshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc); 6099 } else { 6100 assert(vector_length(this) == 2, ""); 6101 __ movdqu($dst$$XMMRegister, $src$$XMMRegister); 6102 __ vshiftq(opcode, $dst$$XMMRegister, $shift$$XMMRegister); 6103 } 6104 %} 6105 ins_pipe( pipe_slow ); 6106 %} 6107 6108 // -------------------ArithmeticRightShift ----------------------------------- 6109 // Long vector arithmetic right shift 6110 instruct vshiftL_arith_reg(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{ 6111 predicate(VectorNode::is_vshift_cnt(n->in(2)) && UseAVX <= 2); 6112 match(Set dst (RShiftVL src shift)); 6113 effect(TEMP dst, TEMP tmp, TEMP scratch); 6114 format %{ "vshiftq $dst,$src,$shift" %} 6115 ins_encode %{ 6116 uint vlen = vector_length(this); 6117 if (vlen == 2) { 6118 assert(UseSSE >= 2, "required"); 6119 __ movdqu($dst$$XMMRegister, $src$$XMMRegister); 6120 __ psrlq($dst$$XMMRegister, $shift$$XMMRegister); 6121 __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), $scratch$$Register); 6122 __ psrlq($tmp$$XMMRegister, $shift$$XMMRegister); 6123 __ pxor($dst$$XMMRegister, $tmp$$XMMRegister); 6124 __ psubq($dst$$XMMRegister, $tmp$$XMMRegister); 6125 } else { 6126 assert(vlen == 4, "sanity"); 6127 assert(UseAVX > 1, "required"); 6128 int vlen_enc = Assembler::AVX_256bit; 6129 __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc); 6130 __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), $scratch$$Register); 6131 __ vpsrlq($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc); 6132 __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc); 6133 __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc); 6134 } 6135 %} 6136 ins_pipe( pipe_slow ); 6137 %} 6138 6139 instruct vshiftL_arith_reg_evex(vec dst, vec src, vec shift) %{ 6140 predicate(VectorNode::is_vshift_cnt(n->in(2)) && UseAVX > 2); 6141 match(Set dst (RShiftVL src shift)); 6142 format %{ "vshiftq $dst,$src,$shift" %} 6143 ins_encode %{ 6144 int vlen_enc = vector_length_encoding(this); 6145 __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc); 6146 %} 6147 ins_pipe( pipe_slow ); 6148 %} 6149 6150 // ------------------- Variable Shift ----------------------------- 6151 // Byte variable shift 6152 instruct vshift8B_var_nobw(vec dst, vec src, vec shift, vec vtmp, rRegP scratch) %{ 6153 predicate(vector_length(n) <= 8 && 6154 !VectorNode::is_vshift_cnt(n->in(2)) && 6155 !VM_Version::supports_avx512bw()); 6156 match(Set dst ( LShiftVB src shift)); 6157 match(Set dst ( RShiftVB src shift)); 6158 match(Set dst (URShiftVB src shift)); 6159 effect(TEMP dst, TEMP vtmp, TEMP scratch); 6160 format %{ "vector_varshift_byte $dst, $src, $shift\n\t! 
instruct vshiftL_arith_reg_evex(vec dst, vec src, vec shift) %{
  predicate(VectorNode::is_vshift_cnt(n->in(2)) && UseAVX > 2);
  match(Set dst (RShiftVL src shift));
  format %{ "vshiftq $dst,$src,$shift" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// ------------------- Variable Shift -----------------------------
// Byte variable shift
instruct vshift8B_var_nobw(vec dst, vec src, vec shift, vec vtmp, rRegP scratch) %{
  predicate(vector_length(n) <= 8 &&
            !VectorNode::is_vshift_cnt(n->in(2)) &&
            !VM_Version::supports_avx512bw());
  match(Set dst ( LShiftVB src shift));
  match(Set dst ( RShiftVB src shift));
  match(Set dst (URShiftVB src shift));
  effect(TEMP dst, TEMP vtmp, TEMP scratch);
  format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp, $scratch as TEMP" %}
  ins_encode %{
    assert(UseAVX >= 2, "required");

    int opcode = this->ideal_Opcode();
    int vlen_enc = Assembler::AVX_128bit;
    __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister, $scratch$$Register);
    __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
  %}
  ins_pipe( pipe_slow );
%}

instruct vshift16B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, rRegP scratch) %{
  predicate(vector_length(n) == 16 &&
            !VectorNode::is_vshift_cnt(n->in(2)) &&
            !VM_Version::supports_avx512bw());
  match(Set dst ( LShiftVB src shift));
  match(Set dst ( RShiftVB src shift));
  match(Set dst (URShiftVB src shift));
  effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
  format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 and $scratch as TEMP" %}
  ins_encode %{
    assert(UseAVX >= 2, "required");

    int opcode = this->ideal_Opcode();
    int vlen_enc = Assembler::AVX_128bit;
    // Shift lower half and get word result in dst
    __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister, $scratch$$Register);

    // Shift upper half and get word result in vtmp1
    __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
    __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
    __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister, $scratch$$Register);

    // Merge and down convert the two word results to byte in dst
    __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
  %}
  ins_pipe( pipe_slow );
%}
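// The 32-byte case repeats the sequence above on each 128-bit half of the
// ymm register and recombines the two results with vinserti128.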
instruct vshift32B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, vec vtmp3, vec vtmp4, rRegP scratch) %{
  predicate(vector_length(n) == 32 &&
            !VectorNode::is_vshift_cnt(n->in(2)) &&
            !VM_Version::supports_avx512bw());
  match(Set dst ( LShiftVB src shift));
  match(Set dst ( RShiftVB src shift));
  match(Set dst (URShiftVB src shift));
  effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP vtmp4, TEMP scratch);
  format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2, $vtmp3, $vtmp4 and $scratch as TEMP" %}
  ins_encode %{
    assert(UseAVX >= 2, "required");

    int opcode = this->ideal_Opcode();
    int vlen_enc = Assembler::AVX_128bit;
    // Process lower 128 bits and get result in dst
    __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister, $scratch$$Register);
    __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
    __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
    __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister, $scratch$$Register);
    __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);

    // Process higher 128 bits and get result in vtmp3
    __ vextracti128_high($vtmp1$$XMMRegister, $src$$XMMRegister);
    __ vextracti128_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
    __ varshiftbw(opcode, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp4$$XMMRegister, $scratch$$Register);
    __ vpshufd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, 0xE, 0);
    __ vpshufd($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, 0xE, 0);
    __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister, $scratch$$Register);
    __ vpackuswb($vtmp1$$XMMRegister, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, 0);

    // Merge the two results in dst
    __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
  %}
  ins_pipe( pipe_slow );
%}

instruct vshiftB_var_evex_bw(vec dst, vec src, vec shift, vec vtmp, rRegP scratch) %{
  predicate(vector_length(n) <= 32 &&
            !VectorNode::is_vshift_cnt(n->in(2)) &&
            VM_Version::supports_avx512bw());
  match(Set dst ( LShiftVB src shift));
  match(Set dst ( RShiftVB src shift));
  match(Set dst (URShiftVB src shift));
  effect(TEMP dst, TEMP vtmp, TEMP scratch);
  format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp, $scratch as TEMP" %}
  ins_encode %{
    assert(UseAVX > 2, "required");

    int opcode = this->ideal_Opcode();
    int vlen_enc = vector_length_encoding(this);
    __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct vshift64B_var_evex_bw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, rRegP scratch) %{
  predicate(vector_length(n) == 64 &&
            !VectorNode::is_vshift_cnt(n->in(2)) &&
            VM_Version::supports_avx512bw());
  match(Set dst ( LShiftVB src shift));
  match(Set dst ( RShiftVB src shift));
  match(Set dst (URShiftVB src shift));
  effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
  format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 and $scratch as TEMP" %}
  ins_encode %{
    assert(UseAVX > 2, "required");

    int opcode = this->ideal_Opcode();
    int vlen_enc = Assembler::AVX_256bit;
    __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister, $scratch$$Register);
    __ vextracti64x4_high($vtmp1$$XMMRegister, $src$$XMMRegister);
    __ vextracti64x4_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
    __ evarshiftb(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister, $scratch$$Register);
    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
  %}
  ins_pipe( pipe_slow );
%}
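// Without AVX512BW there is no variable word shift either: the short rules
// below widen to dwords, shift with the AVX2 variable dword shifts
// (vpsllvd/vpsravd/vpsrlvd, via varshiftd), mask the results back to 16 bits
// and re-pack them with vpackusdw.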
// Short variable shift
instruct vshift8S_var_nobw(vec dst, vec src, vec shift, vec vtmp, rRegP scratch) %{
  predicate(vector_length(n) <= 8 &&
            !VectorNode::is_vshift_cnt(n->in(2)) &&
            !VM_Version::supports_avx512bw());
  match(Set dst ( LShiftVS src shift));
  match(Set dst ( RShiftVS src shift));
  match(Set dst (URShiftVS src shift));
  effect(TEMP dst, TEMP vtmp, TEMP scratch);
  format %{ "vector_varshift_short $dst, $src, $shift\n\t" %}
  ins_encode %{
    assert(UseAVX >= 2, "required");

    int opcode = this->ideal_Opcode();
    bool sign = (opcode != Op_URShiftVS);
    int vlen_enc = Assembler::AVX_256bit;
    __ vextendwd(sign, $dst$$XMMRegister, $src$$XMMRegister, 1);
    __ vpmovzxwd($vtmp$$XMMRegister, $shift$$XMMRegister, 1);
    __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
    __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);
    __ vextracti128_high($vtmp$$XMMRegister, $dst$$XMMRegister);
    __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
  %}
  ins_pipe( pipe_slow );
%}

instruct vshift16S_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, rRegP scratch) %{
  predicate(vector_length(n) == 16 &&
            !VectorNode::is_vshift_cnt(n->in(2)) &&
            !VM_Version::supports_avx512bw());
  match(Set dst ( LShiftVS src shift));
  match(Set dst ( RShiftVS src shift));
  match(Set dst (URShiftVS src shift));
  effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
  format %{ "vector_varshift_short $dst, $src, $shift\n\t" %}
  ins_encode %{
    assert(UseAVX >= 2, "required");

    int opcode = this->ideal_Opcode();
    bool sign = (opcode != Op_URShiftVS);
    int vlen_enc = Assembler::AVX_256bit;
    // Shift lower half, with result in vtmp2 using vtmp1 as TEMP
    __ vextendwd(sign, $vtmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
    __ vpmovzxwd($vtmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
    __ varshiftd(opcode, $vtmp2$$XMMRegister, $vtmp2$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
    __ vpand($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);

    // Shift upper half, with result in dst using vtmp1 as TEMP
    __ vextracti128_high($dst$$XMMRegister, $src$$XMMRegister);
    __ vextracti128_high($vtmp1$$XMMRegister, $shift$$XMMRegister);
    __ vextendwd(sign, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
    __ vpmovzxwd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
    __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
    __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);

    // Merge lower and upper half result into dst
    __ vpackusdw($dst$$XMMRegister, $vtmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
    __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vshift16S_var_evex_bw(vec dst, vec src, vec shift) %{
  predicate(!VectorNode::is_vshift_cnt(n->in(2)) &&
            VM_Version::supports_avx512bw());
  match(Set dst ( LShiftVS src shift));
  match(Set dst ( RShiftVS src shift));
  match(Set dst (URShiftVS src shift));
  format %{ "vector_varshift_short $dst,$src,$shift\t!" %}
  ins_encode %{
    assert(UseAVX > 2, "required");

    int opcode = this->ideal_Opcode();
    int vlen_enc = vector_length_encoding(this);
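    // Without AVX512VL the 128- and 256-bit EVEX forms of the variable word
    // shift are unavailable, so fall back to the 512-bit encoding; lanes
    // beyond the logical vector length are simply ignored.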
    if (!VM_Version::supports_avx512vl()) {
      vlen_enc = Assembler::AVX_512bit;
    }
    __ varshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// Integer variable shift
instruct vshiftI_var(vec dst, vec src, vec shift) %{
  predicate(!VectorNode::is_vshift_cnt(n->in(2)));
  match(Set dst ( LShiftVI src shift));
  match(Set dst ( RShiftVI src shift));
  match(Set dst (URShiftVI src shift));
  format %{ "vector_varshift_int $dst,$src,$shift\t!" %}
  ins_encode %{
    assert(UseAVX >= 2, "required");

    int opcode = this->ideal_Opcode();
    int vlen_enc = vector_length_encoding(this);
    __ varshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// Long variable shift
instruct vshiftL_var(vec dst, vec src, vec shift) %{
  predicate(!VectorNode::is_vshift_cnt(n->in(2)));
  match(Set dst ( LShiftVL src shift));
  match(Set dst (URShiftVL src shift));
  format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
  ins_encode %{
    assert(UseAVX >= 2, "required");

    int opcode = this->ideal_Opcode();
    int vlen_enc = vector_length_encoding(this);
    __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// Long variable right shift arithmetic
instruct vshiftL_arith_var(vec dst, vec src, vec shift, vec vtmp) %{
  predicate(vector_length(n) <= 4 &&
            !VectorNode::is_vshift_cnt(n->in(2)) &&
            UseAVX == 2);
  match(Set dst (RShiftVL src shift));
  effect(TEMP dst, TEMP vtmp);
  format %{ "vector_varshift_long $dst,$src,$shift\n\t! using $vtmp as TEMP" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    int vlen_enc = vector_length_encoding(this);
    __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc,
                 $vtmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vshiftL_arith_var_evex(vec dst, vec src, vec shift) %{
  predicate(!VectorNode::is_vshift_cnt(n->in(2)) &&
            UseAVX > 2);
  match(Set dst (RShiftVL src shift));
  format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    int vlen_enc = vector_length_encoding(this);
    __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// --------------------------------- AND --------------------------------------

instruct vand(vec dst, vec src) %{
  predicate(UseAVX == 0);
  match(Set dst (AndV dst src));
  format %{ "pand $dst,$src\t! and vectors" %}
  ins_encode %{
    __ pand($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

  match(Set dst (VectorCastD2X src));
  format %{ "vector_cast_d2x $dst,$src\t!" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this, $src);
    __ vcvtpd2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// --------------------------------- VectorMaskCmp --------------------------------------

instruct vcmpFD(legVec dst, legVec src1, legVec src2, immI8 cond) %{
  predicate(vector_length_in_bytes(n->in(1)->in(1)) >= 8 && // src1
            vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
            is_floating_point_type(vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
  match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
  format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this, $src1);
    Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
    if (vector_element_basic_type(this, $src1) == T_FLOAT) {
      __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
    } else {
      __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
    }
  %}
  ins_pipe( pipe_slow );
%}
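// For 512-bit vectors the compare below produces a k-register mask; it is
// expanded back into a 0/-1 vector per lane by a masked load of the
// vector_all_bits_set() constant.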
instruct evcmpFD(vec dst, vec src1, vec src2, immI8 cond, rRegP scratch) %{
  predicate(vector_length_in_bytes(n->in(1)->in(1)) == 64 && // src1
            is_floating_point_type(vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
  match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
  effect(TEMP scratch);
  format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $scratch as TEMP" %}
  ins_encode %{
    int vlen_enc = Assembler::AVX_512bit;
    Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
    KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
    KRegister mask = k0; // The comparison itself is not being masked.
    if (vector_element_basic_type(this, $src1) == T_FLOAT) {
      __ evcmpps(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
      __ evmovdqul($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vlen_enc, $scratch$$Register);
    } else {
      __ evcmppd(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);

instruct vabsnegD(vec dst, vec src, rRegI scratch) %{
  match(Set dst (AbsVD src));
  match(Set dst (NegVD src));
  effect(TEMP scratch);
  format %{ "vabsnegd $dst,$src,[mask]\t# absneg packedD" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    uint vlen = vector_length(this);
    if (vlen == 2) {
      assert(UseSSE >= 2, "required");
      __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $scratch$$Register);
    } else {
      int vlen_enc = vector_length_encoding(this);
      __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc, $scratch$$Register);
    }
  %}
  ins_pipe( pipe_slow );
%}

//------------------------------------- VectorTest --------------------------------------------

#ifdef _LP64
instruct vptest_alltrue(rRegI dst, legVec src1, legVec src2, rFlagsReg cr) %{
  predicate(static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow);
  match(Set dst (VectorTest src1 src2 ));
  effect(KILL cr);
  format %{ "vector_test $dst,$src1, $src2\t! using $cr as TEMP" %}
  ins_encode %{
    int vlen = vector_length_in_bytes(this, $src1);
    int vlen_enc = vector_length_encoding(vlen);
    if (vlen <= 32) {
      if (UseAVX == 0) {
        assert(vlen <= 16, "required");
        __ ptest($src1$$XMMRegister, $src2$$XMMRegister);
      } else {
        __ vptest($src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
      }
    } else {
      KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.