src/hotspot/cpu/x86/x86.ad

1149     return NativeJump::instruction_size;
1150   }
1151 
1152 #ifdef _LP64
1153   static uint size_deopt_handler() {
1154     // three 5 byte instructions plus one move for unreachable address.
1155     return 15+3;
1156   }
1157 #else
1158   static uint size_deopt_handler() {
1159     // NativeCall instruction size is the same as NativeJump.
1160     // exception handler starts out as a jump and can be patched to
1161     // a call by deoptimization.  (4932387)
1162     // Note that this value is also credited (in output.cpp) to
1163     // the size of the code section.
1164     return 5 + NativeJump::instruction_size; // pushl(); jmp;
1165   }
1166 #endif
1167 };
1168 
1169 class Node::PD {
1170 public:
1171   enum NodeFlags {
1172     Flag_intel_jcc_erratum = Node::_last_flag << 1,
1173     _last_flag             = Flag_intel_jcc_erratum
1174   };
1175 };
1176 
1177 
1178 inline uint vector_length(const Node* n) {
1179   const TypeVect* vt = n->bottom_type()->is_vect();
1180   return vt->length();
1181 }
1182 
1183 inline uint vector_length(const MachNode* use, MachOper* opnd) {
1184   uint def_idx = use->operand_index(opnd);
1185   Node* def = use->in(def_idx);
1186   return def->bottom_type()->is_vect()->length();
1187 }
1188 
1189 inline uint vector_length_in_bytes(const Node* n) {
1190   const TypeVect* vt = n->bottom_type()->is_vect();
1191   return vt->length_in_bytes();
1192 }
1193 
1194 inline uint vector_length_in_bytes(const MachNode* use, MachOper* opnd) {
1195   uint def_idx = use->operand_index(opnd);
1196   Node* def = use->in(def_idx);


1215     case 32: return Assembler::AVX_256bit;
1216     case 64: return Assembler::AVX_512bit;
1217 
1218     default: {
1219       ShouldNotReachHere();
1220       return Assembler::AVX_NoVec;
1221     }
1222   }
1223 }
1224 
1225 static inline Assembler::AvxVectorLen vector_length_encoding(const Node* n) {
1226   return vector_length_encoding(vector_length_in_bytes(n));
1227 }
1228 
1229 static inline Assembler::AvxVectorLen vector_length_encoding(const MachNode* use, MachOper* opnd) {
1230   uint def_idx = use->operand_index(opnd);
1231   Node* def = use->in(def_idx);
1232   return vector_length_encoding(def);
1233 }
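// Usage sketch (illustrative only): inside an ins_encode block these helpers are
// typically combined as
//   int vlen_enc = vector_length_encoding(this);        // from this node's vector type
//   int vlen_enc = vector_length_encoding(this, $src);  // from the type of the $src operand
// so 16-, 32- and 64-byte vectors select AVX_128bit, AVX_256bit and AVX_512bit respectively.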
1234 








1235 %} // end source_hpp
1236 
1237 source %{
1238 
1239 #include "opto/addnode.hpp"
1240 #include "c2_intelJccErratum_x86.hpp"
1241 
1242 void PhaseOutput::pd_perform_mach_node_analysis() {
1243   if (VM_Version::has_intel_jcc_erratum()) {
1244     int extra_padding = IntelJccErratum::tag_affected_machnodes(C, C->cfg(), C->regalloc());
1245     _buf_sizes._code += extra_padding;
1246   }
1247 }
1248 
1249 int MachNode::pd_alignment_required() const {
1250   if (VM_Version::has_intel_jcc_erratum() && IntelJccErratum::is_jcc_erratum_branch(this)) {
1251     // Conservatively add worst case padding. We assume that relocInfo::addr_unit() is 1 on x86.
1252     return IntelJccErratum::largest_jcc_size() + 1;
1253   } else {
1254     return 1;


1488     case Op_CacheWB:
1489     case Op_CacheWBPreSync:
1490     case Op_CacheWBPostSync:
1491       if (!VM_Version::supports_data_cache_line_flush()) {
1492         return false;
1493       }
1494       break;
1495     case Op_ExtractB:
1496     case Op_ExtractL:
1497     case Op_ExtractI:
1498     case Op_RoundDoubleMode:
1499       if (UseSSE < 4) {
1500         return false;
1501       }
1502       break;
1503     case Op_RoundDoubleModeV:
1504       if (VM_Version::supports_avx() == false) {
1505         return false; // 128bit vroundpd is not available
1506       }
1507       break;
1508     case Op_MacroLogicV:
1509       if (UseAVX < 3 || !UseVectorMacroLogic) {
1510         return false;
1511       }
1512       break;
1513     case Op_VLShiftV:
1514     case Op_VRShiftV:
1515     case Op_VURShiftV:
1516     case Op_LoadVectorGather:
1517       if (UseAVX < 2) {
1518         return false;
1519       }
1520       break;
1521     case Op_FmaVD:
1522     case Op_FmaVF:
1523       if (!UseFMA) {
1524         return false;
1525       }
1526       break;





1527 #ifndef _LP64
1528     case Op_AddReductionVF:
1529     case Op_AddReductionVD:
1530     case Op_MulReductionVF:
1531     case Op_MulReductionVD:
1532       if (UseSSE < 1) { // requires at least SSE
1533         return false;
1534       }
1535       break;
1536     case Op_MulAddVS2VI:
1537     case Op_RShiftVL:
1538     case Op_AbsVD:
1539     case Op_NegVD:
1540       if (UseSSE < 2) {
1541         return false;
1542       }
1543       break;
1544 #endif // !LP64
1545   }
1546   return true;  // Match rules are supported by default.
1547 }
1548 
1549 //------------------------------------------------------------------------
1550 
1551 // Identify extra cases that we might want to provide match rules for: vector nodes and
1552 // other intrinsics guarded by vector length (vlen) and element type (bt).
1553 const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) {
1554   if (!match_rule_supported(opcode)) {
1555     return false;
1556   }
1557   // Matcher::vector_size_supported() restricts vector sizes in the following way (see Matcher::vector_width_in_bytes):
1558   //   * SSE2 supports 128bit vectors for all types;
1559   //   * AVX1 supports 256bit vectors only for FLOAT and DOUBLE types;
1560   //   * AVX2 supports 256bit vectors for all types;
1561   //   * AVX512F supports 512bit vectors only for INT, FLOAT, and DOUBLE types;
1562   //   * AVX512BW supports 512bit vectors for BYTE, SHORT, and CHAR types.
1563   // There's also a limit on minimum vector size supported: 2 elements (or 4 bytes for BYTE).
1564   // And MaxVectorSize is taken into account as well.
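  // For example (illustrative), a 32-element T_SHORT vector yields
  // size_in_bits = 32 * 2 * 8 = 512 below, which per the AVX512BW note above
  // is only usable when AVX512BW is available.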
1565 
1566   if (!vector_size_supported(bt, vlen)) {
1567     return false;
1568   }
1569   // Special cases which require vector length follow:
1570   //   * implementation limitations
1571   //   * some 512bit vector operations on FLOAT and DOUBLE types require AVX512DQ
1572   //   * 128bit vroundpd instruction is present only in AVX1
1573   int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
1574   switch (opcode) {
1575     case Op_AbsVF:
1576     case Op_NegVF:
1577       if ((vlen == 16) && (VM_Version::supports_avx512dq() == false)) {
1578         return false; // 512bit vandps and vxorps are not available
1579       }
1580       break;
1581     case Op_AbsVD:
1582     case Op_NegVD:
1583     case Op_MulVL:
1584       if ((vlen == 8) && (VM_Version::supports_avx512dq() == false)) {
1585         return false; // 512bit vpmullq, vandpd and vxorpd are not available


1778     case MoveVec2Leg_rule:
1779     case MoveLeg2Vec_rule:
1780       return true;
1781     default:
1782       return false;
1783   }
1784 }
1785 
1786 bool Matcher::is_generic_vector(MachOper* opnd) {
1787   switch (opnd->opcode()) {
1788     case VEC:
1789     case LEGVEC:
1790       return true;
1791     default:
1792       return false;
1793   }
1794 }
1795 
1796 //------------------------------------------------------------------------
1797 




1798 const bool Matcher::has_predicated_vectors(void) {
1799   bool ret_value = false;
1800   if (UseAVX > 2) {
1801     ret_value = VM_Version::supports_avx512vl();
1802   }
1803 
1804   return ret_value;
1805 }
1806 
1807 const int Matcher::float_pressure(int default_pressure_threshold) {
1808   int float_pressure_threshold = default_pressure_threshold;
1809 #ifdef _LP64
1810   if (UseAVX > 2) {
1811     // Increase pressure threshold on machines with AVX3 which have
1812     // 2x more XMM registers.
1813     float_pressure_threshold = default_pressure_threshold * 2;
1814   }
1815 #endif
1816   return float_pressure_threshold;
1817 }
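// Note: in 64-bit mode AVX-512 (UseAVX > 2) exposes xmm16-xmm31 in addition to
// xmm0-xmm15, doubling the XMM register file, which is why the float pressure
// threshold above is doubled as well.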


4176 %}
4177 
4178 instruct ReplD_zero(vec dst, immD0 zero) %{
4179   match(Set dst (ReplicateD zero));
4180   format %{ "replicateD $dst,$zero" %}
4181   ins_encode %{
4182     uint vlen = vector_length(this);
4183     if (vlen == 2) {
4184       __ xorpd($dst$$XMMRegister, $dst$$XMMRegister);
4185     } else {
4186       int vlen_enc = vector_length_encoding(this);
4187       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); // 512bit vxorps requires AVX512DQ
4188     }
4189   %}
4190   ins_pipe( fpu_reg_reg );
4191 %}
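// Note: xor-ing a register with itself is the standard x86 zeroing idiom; it breaks the
// dependency on the old register value and avoids a constant load. The integer vpxor form
// is used for wide vectors because, as noted above, 512-bit vxorps requires AVX512DQ.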
4192 
4193 // ====================VECTOR INSERT=======================================
4194 
4195 instruct insert(vec dst, rRegI val, immU8 idx) %{
4196   predicate(vector_length_in_bytes(n) >=  8 &&
4197             vector_length_in_bytes(n) <= 16);
4198   match(Set dst (VectorInsert (Binary dst val) idx));
4199   format %{ "vector_insert $dst,$val,$idx" %}
4200   ins_encode %{
4201     assert(UseSSE >= 4, "required");

4202 
4203     BasicType elem_bt = vector_element_basic_type(this);
4204 
4205     assert(is_integral_type(elem_bt), "");
4206     assert($idx$$constant < (int)vector_length(this), "out of bounds");
4207 
4208     __ insert(elem_bt, $dst$$XMMRegister, $val$$Register, $idx$$constant);
4209   %}
4210   ins_pipe( pipe_slow );
4211 %}
4212 
4213 instruct insert32(vec dst, vec src, rRegI val, immU8 idx, vec vtmp) %{
4214   predicate(vector_length_in_bytes(n) == 32);
4215   match(Set dst (VectorInsert (Binary src val) idx));
4216   effect(TEMP vtmp);
4217   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4218   ins_encode %{
4219     int vlen_enc = Assembler::AVX_256bit;
4220     BasicType elem_bt = vector_element_basic_type(this);
4221     int elem_per_lane = 16/type2aelembytes(elem_bt);
4222     int log2epr = log2(elem_per_lane);
4223 
4224     assert(is_integral_type(elem_bt), "sanity");
4225     assert($idx$$constant < (int)vector_length(this), "out of bounds");
4226 
4227     uint x_idx = $idx$$constant & right_n_bits(log2epr);
4228     uint y_idx = ($idx$$constant >> log2epr) & 1;
4229     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4230     __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
4231     __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4232   %}
4233   ins_pipe( pipe_slow );
4234 %}
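// Index decomposition sketch (illustrative): for a 32-byte vector of T_INT,
// elem_per_lane = 16/4 = 4 and log2epr = 2, so element index 5 splits into
// x_idx = 5 & 3 = 1 (position within a 128-bit lane) and y_idx = (5 >> 2) & 1 = 1
// (which 128-bit lane is extracted, updated and re-inserted).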
4235 
4236 instruct insert64(vec dst, vec src, rRegI val, immU8 idx, legVec vtmp) %{
4237   predicate(vector_length_in_bytes(n) == 64);
4238   match(Set dst (VectorInsert (Binary src val) idx));
4239   effect(TEMP vtmp);
4240   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4241   ins_encode %{
4242     assert(UseAVX > 2, "sanity");
4243 
4244     BasicType elem_bt = vector_element_basic_type(this);
4245     int elem_per_lane = 16/type2aelembytes(elem_bt);
4246     int log2epr = log2(elem_per_lane);
4247 
4248     assert(is_integral_type(elem_bt), "");
4249     assert($idx$$constant < (int)vector_length(this), "out of bounds");
4250 
4251     uint x_idx = $idx$$constant & right_n_bits(log2epr);
4252     uint y_idx = ($idx$$constant >> log2epr) & 3;
4253     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4254     __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
4255     __ vinserti32x4($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4256   %}
4257   ins_pipe( pipe_slow );
4258 %}
4259 
4260 #ifdef _LP64
4261 instruct insert2L(vec dst, rRegL val, immU8 idx) %{
4262   predicate(vector_length(n) == 2);
4263   match(Set dst (VectorInsert (Binary dst val) idx));
4264   format %{ "vector_insert $dst,$val,$idx" %}
4265   ins_encode %{
4266     assert(UseSSE >= 4, "required");
4267     assert(vector_element_basic_type(this) == T_LONG, "");
4268     assert($idx$$constant < (int)vector_length(this), "out of bounds");
4269 
4270     __ pinsrq($dst$$XMMRegister, $val$$Register, $idx$$constant);
4271   %}
4272   ins_pipe( pipe_slow );
4273 %}
4274 
4275 instruct insert4L(vec dst, vec src, rRegL val, immU8 idx, vec vtmp) %{
4276   predicate(vector_length(n) == 4);
4277   match(Set dst (VectorInsert (Binary src val) idx));
4278   effect(TEMP vtmp);
4279   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4280   ins_encode %{
4281     assert(vector_element_basic_type(this) == T_LONG, "");
4282     assert($idx$$constant < (int)vector_length(this), "out of bounds");
4283 
4284     uint x_idx = $idx$$constant & right_n_bits(1);
4285     uint y_idx = ($idx$$constant >> 1) & 1;
4286     int vlen_enc = Assembler::AVX_256bit;
4287     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4288     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
4289     __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4290   %}
4291   ins_pipe( pipe_slow );
4292 %}
4293 
4294 instruct insert8L(vec dst, vec src, rRegL val, immU8 idx, legVec vtmp) %{
4295   predicate(vector_length(n) == 8);
4296   match(Set dst (VectorInsert (Binary src val) idx));
4297   effect(TEMP vtmp);
4298   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4299   ins_encode %{
4300     assert(vector_element_basic_type(this) == T_LONG, "sanity");
4301     assert($idx$$constant < (int)vector_length(this), "out of bounds");
4302 
4303     uint x_idx = $idx$$constant & right_n_bits(1);
4304     uint y_idx = ($idx$$constant >> 1) & 3;
4305     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4306     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
4307     __ vinserti32x4($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4308   %}
4309   ins_pipe( pipe_slow );
4310 %}
4311 #endif
4312 
4313 instruct insertF(vec dst, regF val, immU8 idx) %{
4314   predicate(vector_length(n) >= 2 &&
4315             vector_length(n) <= 4);
4316   match(Set dst (VectorInsert (Binary dst val) idx));
4317   format %{ "vector_insert $dst,$val,$idx" %}
4318   ins_encode %{
4319     assert(UseSSE >= 4, "sanity");
4320 
4321     assert(vector_element_basic_type(this) == T_FLOAT, "sanity");
4322     assert($idx$$constant < (int)vector_length(this), "out of bounds");
4323 
4324     __ insertps($dst$$XMMRegister, $val$$XMMRegister, $idx$$constant);
4325   %}
4326   ins_pipe( pipe_slow );
4327 %}
4328 
4329 instruct vinsertF(vec dst, vec src, regF val, immU8 idx, vec vtmp) %{
4330   predicate(vector_length(n) >= 8);
4331   match(Set dst (VectorInsert (Binary src val) idx));
4332   effect(TEMP vtmp);
4333   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4334   ins_encode %{
4335     assert(vector_element_basic_type(this) == T_FLOAT, "sanity");
4336     assert($idx$$constant < (int)vector_length(this), "out of bounds");
4337 
4338     int vlen = vector_length(this);
4339     uint x_idx = $idx$$constant & right_n_bits(2);
4340     if (vlen == 8) {
4341       uint y_idx = ($idx$$constant >> 2) & 1;
4342       int vlen_enc = Assembler::AVX_256bit;
4343       __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4344       __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx);
4345       __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4346     } else {
4347       assert(vlen == 16, "sanity");
4348       uint y_idx = ($idx$$constant >> 2) & 3;
4349       __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4350       __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx);
4351       __ vinserti32x4($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4352     }
4353   %}
4354   ins_pipe( pipe_slow );
4355 %}
4356 
4357 #ifdef _LP64
4358 instruct insert2D(vec dst, regD val, immU8 idx, rRegL tmp) %{
4359   predicate(vector_length(n) == 2);
4360   match(Set dst (VectorInsert (Binary dst val) idx));
4361   effect(TEMP tmp);
4362   format %{ "vector_insert $dst,$val,$idx\t!using $tmp as TEMP" %}
4363   ins_encode %{
4364     assert(UseSSE >= 4, "sanity");
4365     assert(vector_element_basic_type(this) == T_DOUBLE, "sanity");
4366     assert($idx$$constant < (int)vector_length(this), "out of bounds");
4367 
4368     __ movq($tmp$$Register, $val$$XMMRegister);
4369     __ pinsrq($dst$$XMMRegister, $tmp$$Register, $idx$$constant);
4370   %}
4371   ins_pipe( pipe_slow );
4372 %}
4373 
4374 instruct insert4D(vec dst, vec src, regD val, immU8 idx, rRegL tmp, vec vtmp) %{
4375   predicate(vector_length(n) == 4);
4376   match(Set dst (VectorInsert (Binary src val) idx));
4377   effect(TEMP vtmp, TEMP tmp);
4378   format %{ "vector_insert $dst,$src,$val,$idx\t!using $tmp, $vtmp as TEMP" %}
4379   ins_encode %{
4380     assert(vector_element_basic_type(this) == T_DOUBLE, "sanity");
4381     assert($idx$$constant < (int)vector_length(this), "out of bounds");
4382 
4383     uint x_idx = $idx$$constant & right_n_bits(1);
4384     uint y_idx = ($idx$$constant >> 1) & 1;
4385     int vlen_enc = Assembler::AVX_256bit;
4386     __ movq($tmp$$Register, $val$$XMMRegister);
4387     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4388     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
4389     __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4390   %}
4391   ins_pipe( pipe_slow );
4392 %}
4393 
4394 instruct insert8D(vec dst, vec src, regD val, immI idx, rRegL tmp, legVec vtmp) %{
4395   predicate(vector_length(n) == 8);
4396   match(Set dst (VectorInsert (Binary src val) idx));
4397   effect(TEMP tmp, TEMP vtmp);
4398   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4399   ins_encode %{
4400     assert(vector_element_basic_type(this) == T_DOUBLE, "sanity");
4401     assert($idx$$constant < (int)vector_length(this), "out of bounds");
4402 
4403     uint x_idx = $idx$$constant & right_n_bits(1);
4404     uint y_idx = ($idx$$constant >> 1) & 3;
4405     __ movq($tmp$$Register, $val$$XMMRegister);
4406     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4407     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
4408     __ vinserti32x4($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4409   %}
4410   ins_pipe( pipe_slow );
4411 %}
4412 #endif
4413 
4414 // ====================REDUCTION ARITHMETIC=======================================
4415 
4416 // =======================Int Reduction==========================================
4417 
4418 instruct reductionI(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
4419   predicate(vector_element_basic_type(n->in(2)) == T_INT &&
4420             vector_length(n->in(2)) < 16); // src2
4421   match(Set dst (AddReductionVI src1 src2));
4422   match(Set dst (MulReductionVI src1 src2));
4423   match(Set dst (AndReductionV  src1 src2));
4424   match(Set dst ( OrReductionV  src1 src2));
4425   match(Set dst (XorReductionV  src1 src2));
4426   match(Set dst (MinReductionV  src1 src2));
4427   match(Set dst (MaxReductionV  src1 src2));
4428   effect(TEMP vtmp1, TEMP vtmp2);


5887   %}
5888   ins_pipe( pipe_slow );
5889 %}
5890 
5891 // ------------------------------ Shift ---------------------------------------
5892 
5893 // Left and right shift count vectors are the same on x86
5894 // (only lowest bits of xmm reg are used for count).
5895 instruct vshiftcnt(vec dst, rRegI cnt) %{
5896   match(Set dst (LShiftCntV cnt));
5897   match(Set dst (RShiftCntV cnt));
5898   format %{ "movdl    $dst,$cnt\t! load shift count" %}
5899   ins_encode %{
5900     __ movdl($dst$$XMMRegister, $cnt$$Register);
5901   %}
5902   ins_pipe( pipe_slow );
5903 %}
5904 
5905 // Byte vector shift
5906 instruct vshiftB(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
5907   predicate(vector_length(n) <= 8);
5908   match(Set dst ( LShiftVB src shift));
5909   match(Set dst ( RShiftVB src shift));
5910   match(Set dst (URShiftVB src shift));
5911   effect(TEMP dst, USE src, USE shift, TEMP tmp, TEMP scratch);
5912   format %{"vector_byte_shift $dst,$src,$shift" %}
5913   ins_encode %{
5914     assert(UseSSE > 3, "required");
5915     int opcode = this->ideal_Opcode();
5916     bool sign = (opcode == Op_URShiftVB) ? false : true;
5917     __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister);
5918     __ vshiftw(opcode, $tmp$$XMMRegister, $shift$$XMMRegister);
5919     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5920     __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
5921     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
5922   %}
5923   ins_pipe( pipe_slow );
5924 %}
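// There is no byte-granular SSE/AVX shift instruction, so the byte-shift rules above and
// below widen bytes to words (vextendbw), shift the words, mask the results back into
// byte range with vector_short_to_byte_mask() and re-pack them with packuswb/vpackuswb.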
5925 
5926 instruct vshift16B(vec dst, vec src, vec shift, vec tmp1, vec tmp2, rRegI scratch) %{
5927   predicate(vector_length(n) == 16 && UseAVX <= 1);

5928   match(Set dst ( LShiftVB src shift));
5929   match(Set dst ( RShiftVB src shift));
5930   match(Set dst (URShiftVB src shift));
5931   effect(TEMP dst, USE src, USE shift, TEMP tmp1, TEMP tmp2, TEMP scratch);
5932   format %{"vector_byte_shift $dst,$src,$shift" %}
5933   ins_encode %{
5934     assert(UseSSE > 3, "required");
5935     int opcode = this->ideal_Opcode();
5936     bool sign = (opcode == Op_URShiftVB) ? false : true;
5937     __ vextendbw(sign, $tmp1$$XMMRegister, $src$$XMMRegister);
5938     __ vshiftw(opcode, $tmp1$$XMMRegister, $shift$$XMMRegister);
5939     __ pshufd($tmp2$$XMMRegister, $src$$XMMRegister, 0xE);
5940     __ vextendbw(sign, $tmp2$$XMMRegister, $tmp2$$XMMRegister);
5941     __ vshiftw(opcode, $tmp2$$XMMRegister, $shift$$XMMRegister);
5942     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5943     __ pand($tmp2$$XMMRegister, $dst$$XMMRegister);
5944     __ pand($dst$$XMMRegister, $tmp1$$XMMRegister);
5945     __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister);
5946   %}
5947   ins_pipe( pipe_slow );
5948 %}
5949 
5950 instruct vshift16B_avx(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
5951   predicate(vector_length(n) == 16 && UseAVX > 1);

5952   match(Set dst ( LShiftVB src shift));
5953   match(Set dst ( RShiftVB src shift));
5954   match(Set dst (URShiftVB src shift));
5955   effect(TEMP dst, TEMP tmp, TEMP scratch);
5956   format %{"vector_byte_shift $dst,$src,$shift" %}
5957   ins_encode %{
5958     int opcode = this->ideal_Opcode();
5959     bool sign = (opcode == Op_URShiftVB) ? false : true;
5960     int vlen_enc = Assembler::AVX_256bit;
5961     __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
5962     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
5963     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, $scratch$$Register);
5964     __ vextracti128_high($dst$$XMMRegister, $tmp$$XMMRegister);
5965     __ vpackuswb($dst$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, 0);
5966   %}
5967   ins_pipe( pipe_slow );
5968 %}
5969 
5970 instruct vshift32B_avx(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
5971   predicate(vector_length(n) == 32);
5972   match(Set dst ( LShiftVB src shift));
5973   match(Set dst ( RShiftVB src shift));
5974   match(Set dst (URShiftVB src shift));
5975   effect(TEMP dst, TEMP tmp, TEMP scratch);
5976   format %{"vector_byte_shift $dst,$src,$shift" %}
5977   ins_encode %{
5978     assert(UseAVX > 1, "required");
5979     int opcode = this->ideal_Opcode();
5980     bool sign = (opcode == Op_URShiftVB) ? false : true;
5981     int vlen_enc = Assembler::AVX_256bit;
5982     __ vextracti128_high($tmp$$XMMRegister, $src$$XMMRegister);
5983     __ vextendbw(sign, $tmp$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
5984     __ vextendbw(sign, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
5985     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
5986     __ vshiftw(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $shift$$XMMRegister, vlen_enc);
5987     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, $scratch$$Register);
5988     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, $scratch$$Register);
5989     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
5990     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
5991   %}
5992   ins_pipe( pipe_slow );
5993 %}
5994 
5995 instruct vshift64B_avx(vec dst, vec src, vec shift, vec tmp1, vec tmp2, rRegI scratch) %{
5996   predicate(vector_length(n) == 64);
5997   match(Set dst ( LShiftVB src shift));
5998   match(Set dst  (RShiftVB src shift));
5999   match(Set dst (URShiftVB src shift));
6000   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
6001   format %{"vector_byte_shift $dst,$src,$shift" %}
6002   ins_encode %{
6003     assert(UseAVX > 2, "required");
6004     int opcode = this->ideal_Opcode();
6005     bool sign = (opcode == Op_URShiftVB) ? false : true;
6006     int vlen_enc = Assembler::AVX_512bit;
6007     __ vextracti64x4($tmp1$$XMMRegister, $src$$XMMRegister, 1);
6008     __ vextendbw(sign, $tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
6009     __ vextendbw(sign, $tmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
6010     __ vshiftw(opcode, $tmp1$$XMMRegister, $tmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6011     __ vshiftw(opcode, $tmp2$$XMMRegister, $tmp2$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6012     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
6013     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6014     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6015     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6016     __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
6017     __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vlen_enc, $scratch$$Register);
6018     __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6019   %}
6020   ins_pipe( pipe_slow );
6021 %}
6022 
6023 // A logical right shift of a short vector produces an incorrect Java result
6024 // for negative data because Java code converts short values into ints with
6025 // sign extension before the shift. Char vectors are fine, though, since chars
6026 // are unsigned values.
6027 // Shorts/Chars vector left shift
6028 instruct vshiftS(vec dst, vec src, vec shift) %{

6029   match(Set dst ( LShiftVS src shift));
6030   match(Set dst ( RShiftVS src shift));
6031   match(Set dst (URShiftVS src shift));
6032   effect(TEMP dst, USE src, USE shift);
6033   format %{ "vshiftw  $dst,$src,$shift\t! shift packedS" %}
6034   ins_encode %{
6035     int opcode = this->ideal_Opcode();
6036     if (UseAVX > 0) {
6037       int vlen_enc = vector_length_encoding(this);
6038       __ vshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6039     } else {
6040       int vlen = vector_length(this);
6041       if (vlen == 2) {
6042         __ movflt($dst$$XMMRegister, $src$$XMMRegister);
6043         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6044       } else if (vlen == 4) {
6045         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
6046         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6047       } else {
6048         assert (vlen == 8, "sanity");
6049         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6050         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6051       }
6052     }
6053   %}
6054   ins_pipe( pipe_slow );
6055 %}
6056 
6057 // Integers vector left shift
6058 instruct vshiftI(vec dst, vec src, vec shift) %{

6059   match(Set dst ( LShiftVI src shift));
6060   match(Set dst ( RShiftVI src shift));
6061   match(Set dst (URShiftVI src shift));
6062   effect(TEMP dst, USE src, USE shift);
6063   format %{ "vshiftd  $dst,$src,$shift\t! shift packedI" %}
6064   ins_encode %{
6065     int opcode = this->ideal_Opcode();
6066     if (UseAVX > 0) {
6067       int vlen_enc = vector_length_encoding(this);
6068       __ vshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6069     } else {
6070       int vlen = vector_length(this);
6071       if (vlen == 2) {
6072         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
6073         __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6074       } else {
6075         assert(vlen == 4, "sanity");
6076         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6077         __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6078       }
6079     }
6080   %}
6081   ins_pipe( pipe_slow );
6082 %}
6083 
6084 // Longs vector shift
6085 instruct vshiftL(vec dst, vec src, vec shift) %{

6086   match(Set dst ( LShiftVL src shift));
6087   match(Set dst (URShiftVL src shift));
6088   effect(TEMP dst, USE src, USE shift);
6089   format %{ "vshiftq  $dst,$src,$shift\t! shift packedL" %}
6090   ins_encode %{
6091     int opcode = this->ideal_Opcode();
6092     if (UseAVX > 0) {
6093       int vlen_enc = vector_length_encoding(this);
6094       __ vshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6095     } else {
6096       assert(vector_length(this) == 2, "");
6097       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6098       __ vshiftq(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6099     }
6100   %}
6101   ins_pipe( pipe_slow );
6102 %}
6103 
6104 // -------------------ArithmeticRightShift -----------------------------------
6105 // Long vector arithmetic right shift
6106 instruct vshiftL_arith_reg(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
6107   predicate(UseAVX <= 2);
6108   match(Set dst (RShiftVL src shift));
6109   effect(TEMP dst, TEMP tmp, TEMP scratch);
6110   format %{ "vshiftq $dst,$src,$shift" %}
6111   ins_encode %{
6112     uint vlen = vector_length(this);
6113     if (vlen == 2) {
6114       assert(UseSSE >= 2, "required");
6115       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6116       __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
6117       __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), $scratch$$Register);
6118       __ psrlq($tmp$$XMMRegister, $shift$$XMMRegister);
6119       __ pxor($dst$$XMMRegister, $tmp$$XMMRegister);
6120       __ psubq($dst$$XMMRegister, $tmp$$XMMRegister);
6121     } else {
6122       assert(vlen == 4, "sanity");
6123       assert(UseAVX > 1, "required");
6124       int vlen_enc = Assembler::AVX_256bit;
6125       __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6126       __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), $scratch$$Register);
6127       __ vpsrlq($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6128       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
6129       __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
6130     }
6131   %}
6132   ins_pipe( pipe_slow );
6133 %}
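// Sketch of the fix-up used above: SSE/AVX2 lack a 64-bit arithmetic right shift, so it
// is emulated with a logical shift plus a sign correction:
//   m = sign_mask >>> s;            // 0x8000_0000_0000_0000 shifted logically by s
//   result = ((x >>> s) ^ m) - m;   // xor/sub re-extends the original sign bit
// which is exactly the psrlq/pxor/psubq (and vpsrlq/vpxor/vpsubq) sequence above.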
6134 
6135 instruct vshiftL_arith_reg_evex(vec dst, vec src, vec shift) %{
6136   predicate(UseAVX > 2);
6137   match(Set dst (RShiftVL src shift));
6138   format %{ "vshiftq $dst,$src,$shift" %}
6139   ins_encode %{
6140     int vlen_enc = vector_length_encoding(this);
6141     __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6142   %}
6143   ins_pipe( pipe_slow );
6144 %}
6145 
6146 // ------------------- Variable Shift -----------------------------
6147 // Byte variable shift
6148 instruct vshift8B_var_nobw(vec dst, vec src, vec shift, vec vtmp, rRegP scratch) %{
6149   predicate(vector_length(n) <= 8 && vector_element_basic_type(n) == T_BYTE &&

6150             !VM_Version::supports_avx512bw());
6151   match(Set dst ( VLShiftV src shift));
6152   match(Set dst ( VRShiftV src shift));
6153   match(Set dst (VURShiftV src shift));
6154   effect(TEMP dst, TEMP vtmp, TEMP scratch);
6155   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp, $scratch as TEMP" %}
6156   ins_encode %{
6157     assert(UseAVX >= 2, "required");
6158 
6159     int opcode = this->ideal_Opcode();
6160     int vlen_enc = Assembler::AVX_128bit;
6161     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister, $scratch$$Register);
6162     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
6163   %}
6164   ins_pipe( pipe_slow );
6165 %}
6166 
6167 instruct vshift16B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, rRegP scratch) %{
6168   predicate(vector_length(n) == 16 && vector_element_basic_type(n) == T_BYTE &&

6169             !VM_Version::supports_avx512bw());
6170   match(Set dst ( VLShiftV src shift));
6171   match(Set dst ( VRShiftV src shift));
6172   match(Set dst (VURShiftV src shift));
6173   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
6174   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 and $scratch as TEMP" %}
6175   ins_encode %{
6176     assert(UseAVX >= 2, "required");
6177 
6178     int opcode = this->ideal_Opcode();
6179     int vlen_enc = Assembler::AVX_128bit;
6180     // Shift lower half and get word result in dst
6181     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister, $scratch$$Register);
6182 
6183     // Shift upper half and get word result in vtmp1
6184     __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
6185     __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
6186     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister, $scratch$$Register);
6187 
6188     // Merge and down convert the two word results to byte in dst
6189     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
6190   %}
6191   ins_pipe( pipe_slow );
6192 %}
6193 
6194 instruct vshift32B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, vec vtmp3, vec vtmp4, rRegP scratch) %{
6195   predicate(vector_length(n) == 32 && vector_element_basic_type(n) == T_BYTE &&

6196             !VM_Version::supports_avx512bw());
6197   match(Set dst ( VLShiftV src shift));
6198   match(Set dst ( VRShiftV src shift));
6199   match(Set dst (VURShiftV src shift));
6200   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP vtmp4, TEMP scratch);
6201   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2, $vtmp3, $vtmp4 and $scratch as TEMP" %}
6202   ins_encode %{
6203     assert(UseAVX >= 2, "required");
6204 
6205     int opcode = this->ideal_Opcode();
6206     int vlen_enc = Assembler::AVX_128bit;
6207     // Process lower 128 bits and get result in dst
6208     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister, $scratch$$Register);
6209     __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
6210     __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
6211     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister, $scratch$$Register);
6212     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
6213 
6214     // Process higher 128 bits and get result in vtmp3
6215     __ vextracti128_high($vtmp1$$XMMRegister, $src$$XMMRegister);
6216     __ vextracti128_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
6217     __ varshiftbw(opcode, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp4$$XMMRegister, $scratch$$Register);
6218     __ vpshufd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, 0xE, 0);
6219     __ vpshufd($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, 0xE, 0);
6220     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister, $scratch$$Register);
6221     __ vpackuswb($vtmp1$$XMMRegister, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, 0);
6222 
6223     // Merge the two results in dst
6224     __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
6225   %}
6226   ins_pipe( pipe_slow );
6227 %}
6228 
6229 instruct vshiftB_var_evex_bw(vec dst, vec src, vec shift, vec vtmp, rRegP scratch) %{
6230   predicate(vector_length(n) <= 32 && vector_element_basic_type(n) == T_BYTE &&

6231             VM_Version::supports_avx512bw());
6232   match(Set dst ( VLShiftV src shift));
6233   match(Set dst ( VRShiftV src shift));
6234   match(Set dst (VURShiftV src shift));
6235   effect(TEMP dst, TEMP vtmp, TEMP scratch);
6236   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp, $scratch as TEMP" %}
6237   ins_encode %{
6238     assert(UseAVX > 2, "required");
6239 
6240     int opcode = this->ideal_Opcode();
6241     int vlen_enc = vector_length_encoding(this);
6242     __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister, $scratch$$Register);
6243   %}
6244   ins_pipe( pipe_slow );
6245 %}
6246 
6247 instruct vshift64B_var_evex_bw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, rRegP scratch) %{
6248   predicate(vector_length(n) == 64 && vector_element_basic_type(n) == T_BYTE &&

6249             VM_Version::supports_avx512bw());
6250   match(Set dst ( VLShiftV src shift));
6251   match(Set dst ( VRShiftV src shift));
6252   match(Set dst (VURShiftV src shift));
6253   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
6254   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 and $scratch as TEMP" %}
6255   ins_encode %{
6256     assert(UseAVX > 2, "required");
6257 
6258     int opcode = this->ideal_Opcode();
6259     int vlen_enc = Assembler::AVX_256bit;
6260     __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister, $scratch$$Register);
6261     __ vextracti64x4_high($vtmp1$$XMMRegister, $src$$XMMRegister);
6262     __ vextracti64x4_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
6263     __ evarshiftb(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister, $scratch$$Register);
6264     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
6265   %}
6266   ins_pipe( pipe_slow );
6267 %}
6268 
6269 // Short variable shift
6270 instruct vshift8S_var_nobw(vec dst, vec src, vec shift, vec vtmp, rRegP scratch) %{
6271   predicate(vector_length(n) <= 8 && vector_element_basic_type(n) == T_SHORT &&

6272             !VM_Version::supports_avx512bw());
6273   match(Set dst (VLShiftV  src shift));
6274   match(Set dst (VRShiftV  src shift));
6275   match(Set dst (VURShiftV src shift));
6276   effect(TEMP dst, TEMP vtmp, TEMP scratch);
6277   format %{ "vector_varshift_short $dst, $src, $shift\n\t" %}
6278   ins_encode %{
6279     assert(UseAVX >= 2, "required");
6280 
6281     int opcode = this->ideal_Opcode();
6282     bool sign = (opcode == Op_VURShiftV) ? false : true;
6283     int vlen_enc = Assembler::AVX_256bit;
6284     __ vextendwd(sign, $dst$$XMMRegister, $src$$XMMRegister, 1);
6285     __ vpmovzxwd($vtmp$$XMMRegister, $shift$$XMMRegister, 1);
6286     __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
6287     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);
6288     __ vextracti128_high($vtmp$$XMMRegister, $dst$$XMMRegister);
6289     __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
6290   %}
6291   ins_pipe( pipe_slow );
6292 %}
6293 
6294 instruct vshift16S_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, rRegP scratch) %{
6295   predicate(vector_length(n) == 16 && vector_element_basic_type(n) == T_SHORT &&

6296             !VM_Version::supports_avx512bw());
6297   match(Set dst (VLShiftV  src shift));
6298   match(Set dst (VRShiftV  src shift));
6299   match(Set dst (VURShiftV src shift));
6300   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
6301   format %{ "vector_varshift_short $dst, $src, $shift\n\t" %}
6302   ins_encode %{
6303     assert(UseAVX >= 2, "required");
6304 
6305     int opcode = this->ideal_Opcode();
6306     bool sign = (opcode == Op_VURShiftV) ? false : true;
6307     int vlen_enc = Assembler::AVX_256bit;
6308     // Shift lower half, with result in vtmp2 using vtmp1 as TEMP
6309     __ vextendwd(sign, $vtmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
6310     __ vpmovzxwd($vtmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6311     __ varshiftd(opcode, $vtmp2$$XMMRegister, $vtmp2$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
6312     __ vpand($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);
6313 
6314     // Shift upper half, with result in dst using vtmp1 as TEMP
6315     __ vextracti128_high($dst$$XMMRegister, $src$$XMMRegister);
6316     __ vextracti128_high($vtmp1$$XMMRegister, $shift$$XMMRegister);
6317     __ vextendwd(sign, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6318     __ vpmovzxwd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
6319     __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
6320     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);
6321 
6322     // Merge lower and upper half result into dst
6323     __ vpackusdw($dst$$XMMRegister, $vtmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6324     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
6325   %}
6326   ins_pipe( pipe_slow );
6327 %}
6328 
6329 instruct vshift16S_var_evex_bw(vec dst, vec src, vec shift) %{
6330   predicate(vector_element_basic_type(n) == T_SHORT &&
6331             VM_Version::supports_avx512bw());
6332   match(Set dst (VLShiftV src shift));
6333   match(Set dst (VRShiftV src shift));
6334   match(Set dst (VURShiftV src shift));
6335   format %{ "vector_varshift_short $dst,$src,$shift\t!" %}
6336   ins_encode %{
6337     assert(UseAVX > 2, "required");
6338 
6339     int opcode = this->ideal_Opcode();
6340     int vlen_enc = vector_length_encoding(this);
6341     if (!VM_Version::supports_avx512vl()) {
6342       vlen_enc = Assembler::AVX_512bit;
6343     }
6344     __ varshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6345   %}
6346   ins_pipe( pipe_slow );
6347 %}
6348 
6349 // Integer variable shift
6350 instruct vshiftI_var(vec dst, vec src, vec shift) %{
6351   predicate(vector_element_basic_type(n) == T_INT);
6352   match(Set dst ( VLShiftV src shift));
6353   match(Set dst ( VRShiftV src shift));
6354   match(Set dst (VURShiftV src shift));
6355   format %{ "vector_varshift_int $dst,$src,$shift\t!" %}
6356   ins_encode %{
6357     assert(UseAVX >= 2, "required");
6358 
6359     int opcode = this->ideal_Opcode();
6360     int vlen_enc = vector_length_encoding(this);
6361     __ varshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6362   %}
6363   ins_pipe( pipe_slow );
6364 %}
6365 
6366 // Long variable shift
6367 instruct vshiftL_var(vec dst, vec src, vec shift) %{
6368   predicate(vector_element_basic_type(n) == T_LONG);
6369   match(Set dst ( VLShiftV src shift));
6370   match(Set dst (VURShiftV src shift));
6371   format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
6372   ins_encode %{
6373     assert(UseAVX >= 2, "required");
6374 
6375     int opcode = this->ideal_Opcode();
6376     int vlen_enc = vector_length_encoding(this);
6377     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6378   %}
6379   ins_pipe( pipe_slow );
6380 %}
6381 
6382 // Long variable arithmetic right shift
6383 instruct vshiftL_arith_var(vec dst, vec src, vec shift, vec vtmp) %{
6384   predicate(vector_length(n) <= 4 && vector_element_basic_type(n) == T_LONG &&

6385             UseAVX == 2);
6386   match(Set dst (VRShiftV src shift));
6387   effect(TEMP dst, TEMP vtmp);
6388   format %{ "vector_varshift_long  $dst,$src,$shift\n\t! using $vtmp as TEMP" %}
6389   ins_encode %{
6390     int opcode = this->ideal_Opcode();
6391     int vlen_enc = vector_length_encoding(this);
6392     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc,
6393                  $vtmp$$XMMRegister);
6394   %}
6395   ins_pipe( pipe_slow );
6396 %}
6397 
6398 instruct vshiftL_arith_var_evex(vec dst, vec src, vec shift) %{
6399   predicate(vector_element_basic_type(n) == T_LONG &&
6400             UseAVX > 2);
6401   match(Set dst (VRShiftV src shift));
6402   format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
6403   ins_encode %{
6404     int opcode = this->ideal_Opcode();
6405     int vlen_enc = vector_length_encoding(this);
6406     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6407   %}
6408   ins_pipe( pipe_slow );
6409 %}
6410 
6411 // --------------------------------- AND --------------------------------------
6412 
6413 instruct vand(vec dst, vec src) %{
6414   predicate(UseAVX == 0);
6415   match(Set dst (AndV dst src));
6416   format %{ "pand    $dst,$src\t! and vectors" %}
6417   ins_encode %{
6418     __ pand($dst$$XMMRegister, $src$$XMMRegister);
6419   %}
6420   ins_pipe( pipe_slow );
6421 %}


6817   match(Set dst (VectorCastD2X src));
6818   format %{ "vector_cast_d2x  $dst,$src\t!" %}
6819   ins_encode %{
6820     int vlen_enc = vector_length_encoding(this, $src);
6821     __ vcvtpd2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6822   %}
6823   ins_pipe( pipe_slow );
6824 %}
6825 
6826 // --------------------------------- VectorMaskCmp --------------------------------------
6827 
6828 instruct vcmpFD(legVec dst, legVec src1, legVec src2, immI8 cond) %{
6829   predicate(vector_length_in_bytes(n->in(1)->in(1)) >=  8 && // src1
6830             vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
6831             is_floating_point_type(vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
6832   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
6833   format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %}
6834   ins_encode %{
6835     int vlen_enc = vector_length_encoding(this, $src1);
6836     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
6837     if (vector_element_basic_type(this, $src1) == T_FLOAT)
6838       __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
6839     else
6840       __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);

6841   %}
6842   ins_pipe( pipe_slow );
6843 %}
6844 
6845 instruct evcmpFD(vec dst, vec src1, vec src2, immI8 cond, rRegP scratch) %{
6846   predicate(vector_length_in_bytes(n->in(1)->in(1)) == 64 && // src1
6847             is_floating_point_type(vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
6848   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
6849   effect(TEMP scratch);
6850   format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $scratch as TEMP" %}
6851   ins_encode %{
6852     int vlen_enc = Assembler::AVX_512bit;
6853     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
6854     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
6855     KRegister mask = k0; // The comparison itself is not being masked.
6856     if (vector_element_basic_type(this, $src1) == T_FLOAT) {
6857       __ evcmpps(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
6858       __ evmovdqul($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vlen_enc, $scratch$$Register);
6859     } else {
6860       __ evcmppd(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);


7199 
7200 instruct vabsnegD(vec dst, vec src, rRegI scratch) %{
7201   match(Set dst (AbsVD  src));
7202   match(Set dst (NegVD  src));
7203   effect(TEMP scratch);
7204   format %{ "vabsnegd $dst,$src,[mask]\t# absneg packedD" %}
7205   ins_encode %{
7206     int opcode = this->ideal_Opcode();
7207     uint vlen = vector_length(this);
7208     if (vlen == 2) {
7209       assert(UseSSE >= 2, "required");
7210       __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $scratch$$Register);
7211     } else {
7212       int vlen_enc = vector_length_encoding(this);
7213       __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc, $scratch$$Register);
7214     }
7215   %}
7216   ins_pipe( pipe_slow );
7217 %}
7218 
7219 //------------------------------------- NOT --------------------------------------------
7220 
7221 instruct vnotB(vec dst, vec src) %{
7222   predicate(UseAVX == 0);
7223   match(Set dst (NotV src));
7224   effect(TEMP dst);
7225   format %{ "vector_not  $dst,$src\t!" %}
7226   ins_encode %{
7227     int vlen = vector_length_in_bytes(this);
7228     switch(vlen) {
7229       default:
7230         assert(0, "Incorrect vector length");
7231         break;
7232       case 4: {
7233         __ movdl($dst$$XMMRegister, ExternalAddress(vector_all_bits_set()));
7234         __ pxor($dst$$XMMRegister, $src$$XMMRegister);
7235       } break;
7236       case 8: {
7237         __ movq($dst$$XMMRegister, ExternalAddress(vector_all_bits_set()));
7238         __ pxor($dst$$XMMRegister, $src$$XMMRegister);
7239       } break;
7240       case 16: {
7241         __ movdqu($dst$$XMMRegister, ExternalAddress(vector_all_bits_set()));
7242         __ pxor($dst$$XMMRegister, $src$$XMMRegister);
7243       } break;
7244     }
7245   %}
7246   ins_pipe( pipe_slow );
7247 %}
7248 
7249 instruct vnotB_reg(vec dst, vec src, rRegP scratch) %{
7250   predicate(UseAVX > 0);
7251   match(Set dst (NotV src));
7252   effect(TEMP scratch);
7253   format %{ "vector_not  $dst,$src\t! using $scratch as rRegP" %}
7254   ins_encode %{
7255     int vlen_enc = vector_length_encoding(this);
7256     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_all_bits_set()), vlen_enc, $scratch$$Register);
7257   %}
7258   ins_pipe( pipe_slow );
7259 %}
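// Both NotV rules implement ~x as x ^ all_ones (there is no vector NOT instruction);
// the all-ones constant comes from vector_all_bits_set(), and the AVX form folds the
// memory operand directly into vpxor.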
7260 
7261 //------------------------------------- VectorTest --------------------------------------------
7262 
7263 #ifdef _LP64
7264 instruct vptest_alltrue(rRegI dst, legVec src1, legVec src2, rFlagsReg cr) %{
7265   predicate(static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow);
7266   match(Set dst (VectorTest src1 src2 ));
7267   effect(KILL cr);
7268   format %{ "vector_test $dst,$src1, $src2\t! using $cr as TEMP" %}
7269   ins_encode %{
7270     int vlen = vector_length_in_bytes(this, $src1);
7271     int vlen_enc = vector_length_encoding(vlen);
7272     if (vlen <= 32) {
7273       if (UseAVX == 0) {
7274         assert(vlen <= 16, "required");
7275         __ ptest($src1$$XMMRegister, $src2$$XMMRegister);
7276       } else {
7277         __ vptest($src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
7278       }
7279     } else {
7280       KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.




1149     return NativeJump::instruction_size;
1150   }
1151 
1152 #ifdef _LP64
1153   static uint size_deopt_handler() {
1154     // three 5 byte instructions plus one move for unreachable address.
1155     return 15+3;
1156   }
1157 #else
1158   static uint size_deopt_handler() {
1159     // NativeCall instruction size is the same as NativeJump.
1160     // exception handler starts out as a jump and can be patched to
1161     // a call by deoptimization.  (4932387)
1162     // Note that this value is also credited (in output.cpp) to
1163     // the size of the code section.
1164     return 5 + NativeJump::instruction_size; // pushl(); jmp;
1165   }
1166 #endif
1167 };
1168 








1169 
1170 inline uint vector_length(const Node* n) {
1171   const TypeVect* vt = n->bottom_type()->is_vect();
1172   return vt->length();
1173 }
1174 
1175 inline uint vector_length(const MachNode* use, MachOper* opnd) {
1176   uint def_idx = use->operand_index(opnd);
1177   Node* def = use->in(def_idx);
1178   return def->bottom_type()->is_vect()->length();
1179 }
1180 
1181 inline uint vector_length_in_bytes(const Node* n) {
1182   const TypeVect* vt = n->bottom_type()->is_vect();
1183   return vt->length_in_bytes();
1184 }
1185 
1186 inline uint vector_length_in_bytes(const MachNode* use, MachOper* opnd) {
1187   uint def_idx = use->operand_index(opnd);
1188   Node* def = use->in(def_idx);


1207     case 32: return Assembler::AVX_256bit;
1208     case 64: return Assembler::AVX_512bit;
1209 
1210     default: {
1211       ShouldNotReachHere();
1212       return Assembler::AVX_NoVec;
1213     }
1214   }
1215 }
1216 
1217 static inline Assembler::AvxVectorLen vector_length_encoding(const Node* n) {
1218   return vector_length_encoding(vector_length_in_bytes(n));
1219 }
1220 
1221 static inline Assembler::AvxVectorLen vector_length_encoding(const MachNode* use, MachOper* opnd) {
1222   uint def_idx = use->operand_index(opnd);
1223   Node* def = use->in(def_idx);
1224   return vector_length_encoding(def);
1225 }
1226 
1227 class Node::PD {
1228 public:
1229   enum NodeFlags {
1230     Flag_intel_jcc_erratum = Node::_last_flag << 1,
1231     _last_flag             = Flag_intel_jcc_erratum
1232   };
1233 };
1234 
1235 %} // end source_hpp
1236 
1237 source %{
1238 
1239 #include "opto/addnode.hpp"
1240 #include "c2_intelJccErratum_x86.hpp"
1241 
1242 void PhaseOutput::pd_perform_mach_node_analysis() {
1243   if (VM_Version::has_intel_jcc_erratum()) {
1244     int extra_padding = IntelJccErratum::tag_affected_machnodes(C, C->cfg(), C->regalloc());
1245     _buf_sizes._code += extra_padding;
1246   }
1247 }
1248 
1249 int MachNode::pd_alignment_required() const {
1250   if (VM_Version::has_intel_jcc_erratum() && IntelJccErratum::is_jcc_erratum_branch(this)) {
1251     // Conservatively add worst case padding. We assume that relocInfo::addr_unit() is 1 on x86.
1252     return IntelJccErratum::largest_jcc_size() + 1;
1253   } else {
1254     return 1;


1488     case Op_CacheWB:
1489     case Op_CacheWBPreSync:
1490     case Op_CacheWBPostSync:
1491       if (!VM_Version::supports_data_cache_line_flush()) {
1492         return false;
1493       }
1494       break;
1495     case Op_ExtractB:
1496     case Op_ExtractL:
1497     case Op_ExtractI:
1498     case Op_RoundDoubleMode:
1499       if (UseSSE < 4) {
1500         return false;
1501       }
1502       break;
1503     case Op_RoundDoubleModeV:
1504       if (VM_Version::supports_avx() == false) {
1505         return false; // 128bit vroundpd is not available
1506       }
1507       break;








1508     case Op_LoadVectorGather:
1509       if (UseAVX < 2) {
1510         return false;
1511       }
1512       break;
1513     case Op_FmaVD:
1514     case Op_FmaVF:
1515       if (!UseFMA) {
1516         return false;
1517       }
1518       break;
1519     case Op_MacroLogicV:
1520       if (UseAVX < 3 || !UseVectorMacroLogic) {
1521         return false;
1522       }
1523       break;
1524 #ifndef _LP64
1525     case Op_AddReductionVF:
1526     case Op_AddReductionVD:
1527     case Op_MulReductionVF:
1528     case Op_MulReductionVD:
1529       if (UseSSE < 1) { // requires at least SSE
1530         return false;
1531       }
1532       break;
1533     case Op_MulAddVS2VI:
1534     case Op_RShiftVL:
1535     case Op_AbsVD:
1536     case Op_NegVD:
1537       if (UseSSE < 2) {
1538         return false;
1539       }
1540       break;
1541 #endif // !LP64
1542   }
1543   return true;  // Match rules are supported by default.
1544 }
1545 
1546 //------------------------------------------------------------------------
1547 
1548 // Identify extra cases that we might want to provide match rules for: vector nodes and
1549 // other intrinsics guarded by vector length (vlen) and element type (bt).
1550 const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) {
1551   if (!match_rule_supported(opcode)) {
1552     return false;
1553   }
1554   // Matcher::vector_size_supported() restricts vector sizes in the following way (see Matcher::vector_width_in_bytes):
1555   //   * SSE2 supports 128bit vectors for all types;
1556   //   * AVX1 supports 256bit vectors only for FLOAT and DOUBLE types;
1557   //   * AVX2 supports 256bit vectors for all types;
1558   //   * AVX512F supports 512bit vectors only for INT, FLOAT, and DOUBLE types;
1559   //   * AVX512BW supports 512bit vectors for BYTE, SHORT, and CHAR types.
1560   // There's also a limit on minimum vector size supported: 2 elements (or 4 bytes for BYTE).
1561   // And MaxVectorSize is taken into account as well.

1562   if (!vector_size_supported(bt, vlen)) {
1563     return false;
1564   }
1565   // Special cases which require vector length follow:
1566   //   * implementation limitations
1567   //   * some 512bit vector operations on FLOAT and DOUBLE types require AVX512DQ
1568   //   * 128bit vroundpd instruction is present only in AVX1
1569   int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
1570   switch (opcode) {
1571     case Op_AbsVF:
1572     case Op_NegVF:
1573       if ((vlen == 16) && (VM_Version::supports_avx512dq() == false)) {
1574         return false; // 512bit vandps and vxorps are not available
1575       }
1576       break;
1577     case Op_AbsVD:
1578     case Op_NegVD:
1579     case Op_MulVL:
1580       if ((vlen == 8) && (VM_Version::supports_avx512dq() == false)) {
1581         return false; // 512bit vpmullq, vandpd and vxorpd are not available


1774     case MoveVec2Leg_rule:
1775     case MoveLeg2Vec_rule:
1776       return true;
1777     default:
1778       return false;
1779   }
1780 }
1781 
1782 bool Matcher::is_generic_vector(MachOper* opnd) {
1783   switch (opnd->opcode()) {
1784     case VEC:
1785     case LEGVEC:
1786       return true;
1787     default:
1788       return false;
1789   }
1790 }
1791 
1792 //------------------------------------------------------------------------
1793 
1794 bool Matcher::supports_vector_variable_shifts(void) {
1795   return (UseAVX >= 2);
1796 }
1797 
1798 const bool Matcher::has_predicated_vectors(void) {
1799   bool ret_value = false;
1800   if (UseAVX > 2) {
1801     ret_value = VM_Version::supports_avx512vl();
1802   }
1803 
1804   return ret_value;
1805 }
1806 
1807 const int Matcher::float_pressure(int default_pressure_threshold) {
1808   int float_pressure_threshold = default_pressure_threshold;
1809 #ifdef _LP64
1810   if (UseAVX > 2) {
1811     // Increase pressure threshold on machines with AVX3 which have
1812     // 2x more XMM registers.
1813     float_pressure_threshold = default_pressure_threshold * 2;
1814   }
1815 #endif
1816   return float_pressure_threshold;
1817 }


4176 %}
4177 
4178 instruct ReplD_zero(vec dst, immD0 zero) %{
4179   match(Set dst (ReplicateD zero));
4180   format %{ "replicateD $dst,$zero" %}
4181   ins_encode %{
4182     uint vlen = vector_length(this);
4183     if (vlen == 2) {
4184       __ xorpd($dst$$XMMRegister, $dst$$XMMRegister);
4185     } else {
4186       int vlen_enc = vector_length_encoding(this);
4187       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); // 512bit vxorps requires AVX512DQ
4188     }
4189   %}
4190   ins_pipe( fpu_reg_reg );
4191 %}
4192 
4193 // ====================VECTOR INSERT=======================================
4194 
4195 instruct insert(vec dst, rRegI val, immU8 idx) %{
4196   predicate(vector_length_in_bytes(n) < 32);

4197   match(Set dst (VectorInsert (Binary dst val) idx));
4198   format %{ "vector_insert $dst,$val,$idx" %}
4199   ins_encode %{
4200     assert(UseSSE >= 4, "required");
4201     assert(vector_length_in_bytes(this) >= 8, "required");
4202 
4203     BasicType elem_bt = vector_element_basic_type(this);
4204 
4205     assert(is_integral_type(elem_bt), "");
4206     assert($idx$$constant < (int)vector_length(this), "out of bounds");
4207 
4208     __ insert(elem_bt, $dst$$XMMRegister, $val$$Register, $idx$$constant);
4209   %}
4210   ins_pipe( pipe_slow );
4211 %}
4212 
4213 instruct insert32(vec dst, vec src, rRegI val, immU8 idx, vec vtmp) %{
4214   predicate(vector_length_in_bytes(n) == 32);
4215   match(Set dst (VectorInsert (Binary src val) idx));
4216   effect(TEMP vtmp);
4217   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4218   ins_encode %{
4219     int vlen_enc = Assembler::AVX_256bit;
4220     BasicType elem_bt = vector_element_basic_type(this);
4221     int elem_per_lane = 16/type2aelembytes(elem_bt);
4222     int log2epr = log2(elem_per_lane);
4223 
4224     assert(is_integral_type(elem_bt), "sanity");
4225     assert($idx$$constant < (int)vector_length(this), "out of bounds");
4226 
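         // Split the element index into the position within a 128-bit lane (x_idx)
         // and the index of that lane within the 256-bit vector (y_idx).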
4227     uint x_idx = $idx$$constant & right_n_bits(log2epr);
4228     uint y_idx = ($idx$$constant >> log2epr) & 1;
4229     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4230     __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
4231     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4232   %}
4233   ins_pipe( pipe_slow );
4234 %}
4235 
4236 instruct insert64(vec dst, vec src, rRegI val, immU8 idx, legVec vtmp) %{
4237   predicate(vector_length_in_bytes(n) == 64);
4238   match(Set dst (VectorInsert (Binary src val) idx));
4239   effect(TEMP vtmp);
4240   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4241   ins_encode %{
4242     assert(UseAVX > 2, "sanity");
4243 
4244     BasicType elem_bt = vector_element_basic_type(this);
4245     int elem_per_lane = 16/type2aelembytes(elem_bt);
4246     int log2epr = log2(elem_per_lane);
4247 
4248     assert(is_integral_type(elem_bt), "");
4249     assert($idx$$constant < (int)vector_length(this), "out of bounds");
4250 
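         // x_idx selects the element within a 128-bit lane, y_idx one of the four
         // 128-bit lanes of the 512-bit vector.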
4251     uint x_idx = $idx$$constant & right_n_bits(log2epr);
4252     uint y_idx = ($idx$$constant >> log2epr) & 3;
4253     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4254     __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
4255     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4256   %}
4257   ins_pipe( pipe_slow );
4258 %}
4259 
4260 #ifdef _LP64
4261 instruct insert2L(vec dst, rRegL val, immU8 idx) %{
4262   predicate(vector_length(n) == 2);
4263   match(Set dst (VectorInsert (Binary dst val) idx));
4264   format %{ "vector_insert $dst,$val,$idx" %}
4265   ins_encode %{
4266     assert(UseSSE >= 4, "required");
4267     assert(vector_element_basic_type(this) == T_LONG, "");
4268     assert($idx$$constant < (int)vector_length(this), "out of bounds");
4269 
4270     __ pinsrq($dst$$XMMRegister, $val$$Register, $idx$$constant);
4271   %}
4272   ins_pipe( pipe_slow );
4273 %}
4274 
4275 instruct insert4L(vec dst, vec src, rRegL val, immU8 idx, vec vtmp) %{
4276   predicate(vector_length(n) == 4);
4277   match(Set dst (VectorInsert (Binary src val) idx));
4278   effect(TEMP vtmp);
4279   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4280   ins_encode %{
4281     assert(vector_element_basic_type(this) == T_LONG, "");
4282     assert($idx$$constant < (int)vector_length(this), "out of bounds");
4283 
4284     uint x_idx = $idx$$constant & right_n_bits(1);
4285     uint y_idx = ($idx$$constant >> 1) & 1;
4286     int vlen_enc = Assembler::AVX_256bit;
4287     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4288     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
4289     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4290   %}
4291   ins_pipe( pipe_slow );
4292 %}
4293 
4294 instruct insert8L(vec dst, vec src, rRegL val, immU8 idx, legVec vtmp) %{
4295   predicate(vector_length(n) == 8);
4296   match(Set dst (VectorInsert (Binary src val) idx));
4297   effect(TEMP vtmp);
4298   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4299   ins_encode %{
4300     assert(vector_element_basic_type(this) == T_LONG, "sanity");
4301     assert($idx$$constant < (int)vector_length(this), "out of bounds");
4302 
4303     uint x_idx = $idx$$constant & right_n_bits(1);
4304     uint y_idx = ($idx$$constant >> 1) & 3;
4305     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4306     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
4307     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4308   %}
4309   ins_pipe( pipe_slow );
4310 %}
4311 #endif
4312 
4313 instruct insertF(vec dst, regF val, immU8 idx) %{
4314   predicate(vector_length(n) < 8);

4315   match(Set dst (VectorInsert (Binary dst val) idx));
4316   format %{ "vector_insert $dst,$val,$idx" %}
4317   ins_encode %{
4318     assert(UseSSE >= 4, "sanity");
4319 
4320     assert(vector_element_basic_type(this) == T_FLOAT, "sanity");
4321     assert($idx$$constant < (int)vector_length(this), "out of bounds");
4322 
4323     __ insertps($dst$$XMMRegister, $val$$XMMRegister, $idx$$constant);
4324   %}
4325   ins_pipe( pipe_slow );
4326 %}
4327 
4328 instruct vinsertF(vec dst, vec src, regF val, immU8 idx, vec vtmp) %{
4329   predicate(vector_length(n) >= 8);
4330   match(Set dst (VectorInsert (Binary src val) idx));
4331   effect(TEMP vtmp);
4332   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4333   ins_encode %{
4334     assert(vector_element_basic_type(this) == T_FLOAT, "sanity");
4335     assert($idx$$constant < (int)vector_length(this), "out of bounds");
4336 
4337     int vlen = vector_length(this);
4338     uint x_idx = $idx$$constant & right_n_bits(2);
4339     if (vlen == 8) {
4340       uint y_idx = ($idx$$constant >> 2) & 1;
4341       int vlen_enc = Assembler::AVX_256bit;
4342       __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4343       __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx);
4344       __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4345     } else {
4346       assert(vlen == 16, "sanity");
4347       uint y_idx = ($idx$$constant >> 2) & 3;
4348       __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4349       __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx);
4350       __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4351     }
4352   %}
4353   ins_pipe( pipe_slow );
4354 %}
4355 
4356 #ifdef _LP64
4357 instruct insert2D(vec dst, regD val, immU8 idx, rRegL tmp) %{
4358   predicate(vector_length(n) == 2);
4359   match(Set dst (VectorInsert (Binary dst val) idx));
4360   effect(TEMP tmp);
4361   format %{ "vector_insert $dst,$val,$idx\t!using $tmp as TEMP" %}
4362   ins_encode %{
4363     assert(UseSSE >= 4, "sanity");
4364     assert(vector_element_basic_type(this) == T_DOUBLE, "sanity");
4365     assert($idx$$constant < (int)vector_length(this), "out of bounds");
4366 
4367     __ movq($tmp$$Register, $val$$XMMRegister);
4368     __ pinsrq($dst$$XMMRegister, $tmp$$Register, $idx$$constant);
4369   %}
4370   ins_pipe( pipe_slow );
4371 %}
4372 
4373 instruct insert4D(vec dst, vec src, regD val, immU8 idx, rRegL tmp, vec vtmp) %{
4374   predicate(vector_length(n) == 4);
4375   match(Set dst (VectorInsert (Binary src val) idx));
4376   effect(TEMP vtmp, TEMP tmp);
4377   format %{ "vector_insert $dst,$src,$val,$idx\t!using $tmp, $vtmp as TEMP" %}
4378   ins_encode %{
4379     assert(vector_element_basic_type(this) == T_DOUBLE, "sanity");
4380     assert($idx$$constant < (int)vector_length(this), "out of bounds");
4381 
4382     uint x_idx = $idx$$constant & right_n_bits(1);
4383     uint y_idx = ($idx$$constant >> 1) & 1;
4384     int vlen_enc = Assembler::AVX_256bit;
4385     __ movq($tmp$$Register, $val$$XMMRegister);
4386     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4387     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
4388     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4389   %}
4390   ins_pipe( pipe_slow );
4391 %}
4392 
4393 instruct insert8D(vec dst, vec src, regD val, immI idx, rRegL tmp, legVec vtmp) %{
4394   predicate(vector_length(n) == 8);
4395   match(Set dst (VectorInsert (Binary src val) idx));
4396   effect(TEMP tmp, TEMP vtmp);
4397   format %{ "vector_insert $dst,$src,$val,$idx\t!using $tmp, $vtmp as TEMP" %}
4398   ins_encode %{
4399     assert(vector_element_basic_type(this) == T_DOUBLE, "sanity");
4400     assert($idx$$constant < (int)vector_length(this), "out of bounds");
4401 
4402     uint x_idx = $idx$$constant & right_n_bits(1);
4403     uint y_idx = ($idx$$constant >> 1) & 3;
4404     __ movq($tmp$$Register, $val$$XMMRegister);
4405     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4406     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
4407     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4408   %}
4409   ins_pipe( pipe_slow );
4410 %}
4411 #endif
4412 
4413 // ====================REDUCTION ARITHMETIC=======================================
4414 
4415 // =======================Int Reduction==========================================
4416 
4417 instruct reductionI(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
4418   predicate(vector_element_basic_type(n->in(2)) == T_INT &&
4419             vector_length(n->in(2)) < 16); // src2
4420   match(Set dst (AddReductionVI src1 src2));
4421   match(Set dst (MulReductionVI src1 src2));
4422   match(Set dst (AndReductionV  src1 src2));
4423   match(Set dst ( OrReductionV  src1 src2));
4424   match(Set dst (XorReductionV  src1 src2));
4425   match(Set dst (MinReductionV  src1 src2));
4426   match(Set dst (MaxReductionV  src1 src2));
4427   effect(TEMP vtmp1, TEMP vtmp2);


5886   %}
5887   ins_pipe( pipe_slow );
5888 %}
5889 
5890 // ------------------------------ Shift ---------------------------------------
5891 
5892 // Left and right shift count vectors are the same on x86
5893 // (only lowest bits of xmm reg are used for count).
5894 instruct vshiftcnt(vec dst, rRegI cnt) %{
5895   match(Set dst (LShiftCntV cnt));
5896   match(Set dst (RShiftCntV cnt));
5897   format %{ "movdl    $dst,$cnt\t! load shift count" %}
5898   ins_encode %{
5899     __ movdl($dst$$XMMRegister, $cnt$$Register);
5900   %}
5901   ins_pipe( pipe_slow );
5902 %}
5903 
5904 // Byte vector shift
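     // x86 has no byte-granular vector shift instructions, so bytes are widened to
     // words, shifted as words, masked back to byte range and re-packed.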
5905 instruct vshiftB(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
5906   predicate(vector_length(n) <= 8 && VectorNode::is_vshift_cnt(n->in(2)));
5907   match(Set dst ( LShiftVB src shift));
5908   match(Set dst ( RShiftVB src shift));
5909   match(Set dst (URShiftVB src shift));
5910   effect(TEMP dst, USE src, USE shift, TEMP tmp, TEMP scratch);
5911   format %{"vector_byte_shift $dst,$src,$shift" %}
5912   ins_encode %{
5913     assert(UseSSE > 3, "required");
5914     int opcode = this->ideal_Opcode();
5915     bool sign = (opcode != Op_URShiftVB);
5916     __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister);
5917     __ vshiftw(opcode, $tmp$$XMMRegister, $shift$$XMMRegister);
5918     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5919     __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
5920     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
5921   %}
5922   ins_pipe( pipe_slow );
5923 %}
5924 
5925 instruct vshift16B(vec dst, vec src, vec shift, vec tmp1, vec tmp2, rRegI scratch) %{
5926   predicate(vector_length(n) == 16 && VectorNode::is_vshift_cnt(n->in(2)) &&
5927             UseAVX <= 1);
5928   match(Set dst ( LShiftVB src shift));
5929   match(Set dst ( RShiftVB src shift));
5930   match(Set dst (URShiftVB src shift));
5931   effect(TEMP dst, USE src, USE shift, TEMP tmp1, TEMP tmp2, TEMP scratch);
5932   format %{"vector_byte_shift $dst,$src,$shift" %}
5933   ins_encode %{
5934     assert(UseSSE > 3, "required");
5935     int opcode = this->ideal_Opcode();
5936     bool sign = (opcode != Op_URShiftVB);
5937     __ vextendbw(sign, $tmp1$$XMMRegister, $src$$XMMRegister);
5938     __ vshiftw(opcode, $tmp1$$XMMRegister, $shift$$XMMRegister);
5939     __ pshufd($tmp2$$XMMRegister, $src$$XMMRegister, 0xE);
5940     __ vextendbw(sign, $tmp2$$XMMRegister, $tmp2$$XMMRegister);
5941     __ vshiftw(opcode, $tmp2$$XMMRegister, $shift$$XMMRegister);
5942     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5943     __ pand($tmp2$$XMMRegister, $dst$$XMMRegister);
5944     __ pand($dst$$XMMRegister, $tmp1$$XMMRegister);
5945     __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister);
5946   %}
5947   ins_pipe( pipe_slow );
5948 %}
5949 
5950 instruct vshift16B_avx(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
5951   predicate(vector_length(n) == 16 && VectorNode::is_vshift_cnt(n->in(2)) &&
5952             UseAVX > 1);
5953   match(Set dst ( LShiftVB src shift));
5954   match(Set dst ( RShiftVB src shift));
5955   match(Set dst (URShiftVB src shift));
5956   effect(TEMP dst, TEMP tmp, TEMP scratch);
5957   format %{"vector_byte_shift $dst,$src,$shift" %}
5958   ins_encode %{
5959     int opcode = this->ideal_Opcode();
5960     bool sign = (opcode != Op_URShiftVB);
5961     int vlen_enc = Assembler::AVX_256bit;
5962     __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
5963     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
5964     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, $scratch$$Register);
5965     __ vextracti128_high($dst$$XMMRegister, $tmp$$XMMRegister);
5966     __ vpackuswb($dst$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, 0);
5967   %}
5968   ins_pipe( pipe_slow );
5969 %}
5970 
5971 instruct vshift32B_avx(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
5972   predicate(vector_length(n) == 32 && VectorNode::is_vshift_cnt(n->in(2)));
5973   match(Set dst ( LShiftVB src shift));
5974   match(Set dst ( RShiftVB src shift));
5975   match(Set dst (URShiftVB src shift));
5976   effect(TEMP dst, TEMP tmp, TEMP scratch);
5977   format %{"vector_byte_shift $dst,$src,$shift" %}
5978   ins_encode %{
5979     assert(UseAVX > 1, "required");
5980     int opcode = this->ideal_Opcode();
5981     bool sign = (opcode != Op_URShiftVB);
5982     int vlen_enc = Assembler::AVX_256bit;
5983     __ vextracti128_high($tmp$$XMMRegister, $src$$XMMRegister);
5984     __ vextendbw(sign, $tmp$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
5985     __ vextendbw(sign, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
5986     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
5987     __ vshiftw(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $shift$$XMMRegister, vlen_enc);
5988     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, $scratch$$Register);
5989     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, $scratch$$Register);
5990     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
5991     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
5992   %}
5993   ins_pipe( pipe_slow );
5994 %}
5995 
5996 instruct vshift64B_avx(vec dst, vec src, vec shift, vec tmp1, vec tmp2, rRegI scratch) %{
5997   predicate(vector_length(n) == 64 && VectorNode::is_vshift_cnt(n->in(2)));
5998   match(Set dst ( LShiftVB src shift));
5999   match(Set dst  (RShiftVB src shift));
6000   match(Set dst (URShiftVB src shift));
6001   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
6002   format %{"vector_byte_shift $dst,$src,$shift" %}
6003   ins_encode %{
6004     assert(UseAVX > 2, "required");
6005     int opcode = this->ideal_Opcode();
6006     bool sign = (opcode != Op_URShiftVB);
6007     int vlen_enc = Assembler::AVX_512bit;
6008     __ vextracti64x4($tmp1$$XMMRegister, $src$$XMMRegister, 1);
6009     __ vextendbw(sign, $tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
6010     __ vextendbw(sign, $tmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
6011     __ vshiftw(opcode, $tmp1$$XMMRegister, $tmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6012     __ vshiftw(opcode, $tmp2$$XMMRegister, $tmp2$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6013     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
6014     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6015     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6016     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6017     __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
6018     __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vlen_enc, $scratch$$Register);
6019     __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6020   %}
6021   ins_pipe( pipe_slow );
6022 %}
6023 
6024 // Shorts vector logical right shift produces an incorrect Java result
6025 // for negative data because Java code converts the short value into an int with
6026 // sign extension before the shift. But char vectors are fine since chars are
6027 // unsigned values.
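     // For example, for a short value of -1 and a shift count of 2, Java evaluates
     // 0xFFFFFFFF >>> 2 = 0x3FFFFFFF, whose low 16 bits are 0xFFFF, while a 16-bit
     // vector logical shift of 0xFFFF would produce 0x3FFF.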
6028 // Shorts/Chars vector shift
6029 instruct vshiftS(vec dst, vec src, vec shift) %{
6030   predicate(VectorNode::is_vshift_cnt(n->in(2)));
6031   match(Set dst ( LShiftVS src shift));
6032   match(Set dst ( RShiftVS src shift));
6033   match(Set dst (URShiftVS src shift));
6034   effect(TEMP dst, USE src, USE shift);
6035   format %{ "vshiftw  $dst,$src,$shift\t! shift packedS" %}
6036   ins_encode %{
6037     int opcode = this->ideal_Opcode();
6038     if (UseAVX > 0) {
6039       int vlen_enc = vector_length_encoding(this);
6040       __ vshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6041     } else {
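            // The SSE shift is destructive, so first copy src into dst with a move
            // sized to the vector (4, 8 or 16 bytes), then shift dst in place.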
6042       int vlen = vector_length(this);
6043       if (vlen == 2) {
6044         __ movflt($dst$$XMMRegister, $src$$XMMRegister);
6045         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6046       } else if (vlen == 4) {
6047         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
6048         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6049       } else {
6050         assert (vlen == 8, "sanity");
6051         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6052         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6053       }
6054     }
6055   %}
6056   ins_pipe( pipe_slow );
6057 %}
6058 
6059 // Integers vector shift
6060 instruct vshiftI(vec dst, vec src, vec shift) %{
6061   predicate(VectorNode::is_vshift_cnt(n->in(2)));
6062   match(Set dst ( LShiftVI src shift));
6063   match(Set dst ( RShiftVI src shift));
6064   match(Set dst (URShiftVI src shift));
6065   effect(TEMP dst, USE src, USE shift);
6066   format %{ "vshiftd  $dst,$src,$shift\t! shift packedI" %}
6067   ins_encode %{
6068     int opcode = this->ideal_Opcode();
6069     if (UseAVX > 0) {
6070       int vlen_enc = vector_length_encoding(this);
6071       __ vshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6072     } else {
6073       int vlen = vector_length(this);
6074       if (vlen == 2) {
6075         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
6076         __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6077       } else {
6078         assert(vlen == 4, "sanity");
6079         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6080         __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6081       }
6082     }
6083   %}
6084   ins_pipe( pipe_slow );
6085 %}
6086 
6087 // Longs vector shift
6088 instruct vshiftL(vec dst, vec src, vec shift) %{
6089   predicate(VectorNode::is_vshift_cnt(n->in(2)));
6090   match(Set dst ( LShiftVL src shift));
6091   match(Set dst (URShiftVL src shift));
6092   effect(TEMP dst, USE src, USE shift);
6093   format %{ "vshiftq  $dst,$src,$shift\t! shift packedL" %}
6094   ins_encode %{
6095     int opcode = this->ideal_Opcode();
6096     if (UseAVX > 0) {
6097       int vlen_enc = vector_length_encoding(this);
6098       __ vshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6099     } else {
6100       assert(vector_length(this) == 2, "");
6101       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6102       __ vshiftq(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6103     }
6104   %}
6105   ins_pipe( pipe_slow );
6106 %}
6107 
6108 // -------------------ArithmeticRightShift -----------------------------------
6109 // Long vector arithmetic right shift
6110 instruct vshiftL_arith_reg(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
6111   predicate(VectorNode::is_vshift_cnt(n->in(2)) && UseAVX <= 2);
6112   match(Set dst (RShiftVL src shift));
6113   effect(TEMP dst, TEMP tmp, TEMP scratch);
6114   format %{ "vshiftq $dst,$src,$shift" %}
6115   ins_encode %{
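       // SSE2/AVX2 lack an arithmetic right shift for 64-bit lanes, so it is emulated:
       // shift both the value and a sign-bit mask logically by the same count, then
       // xor with and subtract the shifted mask to propagate the sign bits.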
6116     uint vlen = vector_length(this);
6117     if (vlen == 2) {
6118       assert(UseSSE >= 2, "required");
6119       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6120       __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
6121       __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), $scratch$$Register);
6122       __ psrlq($tmp$$XMMRegister, $shift$$XMMRegister);
6123       __ pxor($dst$$XMMRegister, $tmp$$XMMRegister);
6124       __ psubq($dst$$XMMRegister, $tmp$$XMMRegister);
6125     } else {
6126       assert(vlen == 4, "sanity");
6127       assert(UseAVX > 1, "required");
6128       int vlen_enc = Assembler::AVX_256bit;
6129       __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6130       __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), $scratch$$Register);
6131       __ vpsrlq($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6132       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
6133       __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
6134     }
6135   %}
6136   ins_pipe( pipe_slow );
6137 %}
6138 
6139 instruct vshiftL_arith_reg_evex(vec dst, vec src, vec shift) %{
6140   predicate(VectorNode::is_vshift_cnt(n->in(2)) && UseAVX > 2);
6141   match(Set dst (RShiftVL src shift));
6142   format %{ "vshiftq $dst,$src,$shift" %}
6143   ins_encode %{
6144     int vlen_enc = vector_length_encoding(this);
6145     __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6146   %}
6147   ins_pipe( pipe_slow );
6148 %}
6149 
6150 // ------------------- Variable Shift -----------------------------
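     // "Variable" shifts take a vector of per-element shift counts (the shift input is
     // not a LShiftCntV/RShiftCntV broadcast), in contrast to the forms above which use
     // a single count for all elements.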
6151 // Byte variable shift
6152 instruct vshift8B_var_nobw(vec dst, vec src, vec shift, vec vtmp, rRegP scratch) %{
6153   predicate(vector_length(n) <= 8 &&
6154             !VectorNode::is_vshift_cnt(n->in(2)) &&
6155             !VM_Version::supports_avx512bw());
6156   match(Set dst ( LShiftVB src shift));
6157   match(Set dst ( RShiftVB src shift));
6158   match(Set dst (URShiftVB src shift));
6159   effect(TEMP dst, TEMP vtmp, TEMP scratch);
6160   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp, $scratch as TEMP" %}
6161   ins_encode %{
6162     assert(UseAVX >= 2, "required");
6163 
6164     int opcode = this->ideal_Opcode();
6165     int vlen_enc = Assembler::AVX_128bit;
6166     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister, $scratch$$Register);
6167     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
6168   %}
6169   ins_pipe( pipe_slow );
6170 %}
6171 
6172 instruct vshift16B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, rRegP scratch) %{
6173   predicate(vector_length(n) == 16 &&
6174             !VectorNode::is_vshift_cnt(n->in(2)) &&
6175             !VM_Version::supports_avx512bw());
6176   match(Set dst ( LShiftVB src shift));
6177   match(Set dst ( RShiftVB src shift));
6178   match(Set dst (URShiftVB src shift));
6179   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
6180   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 and $scratch as TEMP" %}
6181   ins_encode %{
6182     assert(UseAVX >= 2, "required");
6183 
6184     int opcode = this->ideal_Opcode();
6185     int vlen_enc = Assembler::AVX_128bit;
6186     // Shift lower half and get word result in dst
6187     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister, $scratch$$Register);
6188 
6189     // Shift upper half and get word result in vtmp1
6190     __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
6191     __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
6192     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister, $scratch$$Register);
6193 
6194     // Merge and down convert the two word results to byte in dst
6195     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
6196   %}
6197   ins_pipe( pipe_slow );
6198 %}
6199 
6200 instruct vshift32B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, vec vtmp3, vec vtmp4, rRegP scratch) %{
6201   predicate(vector_length(n) == 32 &&
6202             !VectorNode::is_vshift_cnt(n->in(2)) &&
6203             !VM_Version::supports_avx512bw());
6204   match(Set dst ( LShiftVB src shift));
6205   match(Set dst ( RShiftVB src shift));
6206   match(Set dst (URShiftVB src shift));
6207   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP vtmp4, TEMP scratch);
6208   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2, $vtmp3, $vtmp4 and $scratch as TEMP" %}
6209   ins_encode %{
6210     assert(UseAVX >= 2, "required");
6211 
6212     int opcode = this->ideal_Opcode();
6213     int vlen_enc = Assembler::AVX_128bit;
6214     // Process lower 128 bits and get result in dst
6215     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister, $scratch$$Register);
6216     __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
6217     __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
6218     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister, $scratch$$Register);
6219     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
6220 
6221     // Process higher 128 bits and get result in vtmp3
6222     __ vextracti128_high($vtmp1$$XMMRegister, $src$$XMMRegister);
6223     __ vextracti128_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
6224     __ varshiftbw(opcode, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp4$$XMMRegister, $scratch$$Register);
6225     __ vpshufd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, 0xE, 0);
6226     __ vpshufd($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, 0xE, 0);
6227     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister, $scratch$$Register);
6228     __ vpackuswb($vtmp1$$XMMRegister, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, 0);
6229 
6230     // Merge the two results in dst
6231     __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
6232   %}
6233   ins_pipe( pipe_slow );
6234 %}
6235 
6236 instruct vshiftB_var_evex_bw(vec dst, vec src, vec shift, vec vtmp, rRegP scratch) %{
6237   predicate(vector_length(n) <= 32 &&
6238             !VectorNode::is_vshift_cnt(n->in(2)) &&
6239             VM_Version::supports_avx512bw());
6240   match(Set dst ( LShiftVB src shift));
6241   match(Set dst ( RShiftVB src shift));
6242   match(Set dst (URShiftVB src shift));
6243   effect(TEMP dst, TEMP vtmp, TEMP scratch);
6244   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp, $scratch as TEMP" %}
6245   ins_encode %{
6246     assert(UseAVX > 2, "required");
6247 
6248     int opcode = this->ideal_Opcode();
6249     int vlen_enc = vector_length_encoding(this);
6250     __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister, $scratch$$Register);
6251   %}
6252   ins_pipe( pipe_slow );
6253 %}
6254 
6255 instruct vshift64B_var_evex_bw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, rRegP scratch) %{
6256   predicate(vector_length(n) == 64 &&
6257             !VectorNode::is_vshift_cnt(n->in(2)) &&
6258             VM_Version::supports_avx512bw());
6259   match(Set dst ( LShiftVB src shift));
6260   match(Set dst ( RShiftVB src shift));
6261   match(Set dst (URShiftVB src shift));
6262   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
6263   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 and $scratch as TEMP" %}
6264   ins_encode %{
6265     assert(UseAVX > 2, "required");
6266 
6267     int opcode = this->ideal_Opcode();
6268     int vlen_enc = Assembler::AVX_256bit;
6269     __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister, $scratch$$Register);
6270     __ vextracti64x4_high($vtmp1$$XMMRegister, $src$$XMMRegister);
6271     __ vextracti64x4_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
6272     __ evarshiftb(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister, $scratch$$Register);
6273     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
6274   %}
6275   ins_pipe( pipe_slow );
6276 %}
6277 
6278 // Short variable shift
6279 instruct vshift8S_var_nobw(vec dst, vec src, vec shift, vec vtmp, rRegP scratch) %{
6280   predicate(vector_length(n) <= 8 &&
6281             !VectorNode::is_vshift_cnt(n->in(2)) &&
6282             !VM_Version::supports_avx512bw());
6283   match(Set dst ( LShiftVS src shift));
6284   match(Set dst ( RShiftVS src shift));
6285   match(Set dst (URShiftVS src shift));
6286   effect(TEMP dst, TEMP vtmp, TEMP scratch);
6287   format %{ "vector_varshift_short $dst, $src, $shift\t!" %}
6288   ins_encode %{
6289     assert(UseAVX >= 2, "required");
6290 
6291     int opcode = this->ideal_Opcode();
6292     bool sign = (opcode != Op_URShiftVS);
6293     int vlen_enc = Assembler::AVX_256bit;
6294     __ vextendwd(sign, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6295     __ vpmovzxwd($vtmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6296     __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
6297     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);
6298     __ vextracti128_high($vtmp$$XMMRegister, $dst$$XMMRegister);
6299     __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
6300   %}
6301   ins_pipe( pipe_slow );
6302 %}
6303 
6304 instruct vshift16S_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, rRegP scratch) %{
6305   predicate(vector_length(n) == 16 &&
6306             !VectorNode::is_vshift_cnt(n->in(2)) &&
6307             !VM_Version::supports_avx512bw());
6308   match(Set dst ( LShiftVS src shift));
6309   match(Set dst ( RShiftVS src shift));
6310   match(Set dst (URShiftVS src shift));
6311   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
6312   format %{ "vector_varshift_short $dst, $src, $shift\t!" %}
6313   ins_encode %{
6314     assert(UseAVX >= 2, "required");
6315 
6316     int opcode = this->ideal_Opcode();
6317     bool sign = (opcode != Op_URShiftVS);
6318     int vlen_enc = Assembler::AVX_256bit;
6319     // Shift lower half, with result in vtmp2 using vtmp1 as TEMP
6320     __ vextendwd(sign, $vtmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
6321     __ vpmovzxwd($vtmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6322     __ varshiftd(opcode, $vtmp2$$XMMRegister, $vtmp2$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
6323     __ vpand($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);
6324 
6325     // Shift upper half, with result in dst using vtmp1 as TEMP
6326     __ vextracti128_high($dst$$XMMRegister, $src$$XMMRegister);
6327     __ vextracti128_high($vtmp1$$XMMRegister, $shift$$XMMRegister);
6328     __ vextendwd(sign, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6329     __ vpmovzxwd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
6330     __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
6331     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);
6332 
6333     // Merge lower and upper half result into dst
6334     __ vpackusdw($dst$$XMMRegister, $vtmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6335     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
6336   %}
6337   ins_pipe( pipe_slow );
6338 %}
6339 
6340 instruct vshift16S_var_evex_bw(vec dst, vec src, vec shift) %{
6341   predicate(!VectorNode::is_vshift_cnt(n->in(2)) &&
6342             VM_Version::supports_avx512bw());
6343   match(Set dst ( LShiftVS src shift));
6344   match(Set dst ( RShiftVS src shift));
6345   match(Set dst (URShiftVS src shift));
6346   format %{ "vector_varshift_short $dst,$src,$shift\t!" %}
6347   ins_encode %{
6348     assert(UseAVX > 2, "required");
6349 
6350     int opcode = this->ideal_Opcode();
6351     int vlen_enc = vector_length_encoding(this);
6352     if (!VM_Version::supports_avx512vl()) {
6353       vlen_enc = Assembler::AVX_512bit;
6354     }
6355     __ varshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6356   %}
6357   ins_pipe( pipe_slow );
6358 %}
6359 
6360 // Integer variable shift
6361 instruct vshiftI_var(vec dst, vec src, vec shift) %{
6362   predicate(!VectorNode::is_vshift_cnt(n->in(2)));
6363   match(Set dst ( LShiftVI src shift));
6364   match(Set dst ( RShiftVI src shift));
6365   match(Set dst (URShiftVI src shift));
6366   format %{ "vector_varshift_int $dst,$src,$shift\t!" %}
6367   ins_encode %{
6368     assert(UseAVX >= 2, "required");
6369 
6370     int opcode = this->ideal_Opcode();
6371     int vlen_enc = vector_length_encoding(this);
6372     __ varshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6373   %}
6374   ins_pipe( pipe_slow );
6375 %}
6376 
6377 // Long variable shift
6378 instruct vshiftL_var(vec dst, vec src, vec shift) %{
6379   predicate(!VectorNode::is_vshift_cnt(n->in(2)));
6380   match(Set dst ( LShiftVL src shift));
6381   match(Set dst (URShiftVL src shift));
6382   format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
6383   ins_encode %{
6384     assert(UseAVX >= 2, "required");
6385 
6386     int opcode = this->ideal_Opcode();
6387     int vlen_enc = vector_length_encoding(this);
6388     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6389   %}
6390   ins_pipe( pipe_slow );
6391 %}
6392 
6393 // Long variable arithmetic right shift
6394 instruct vshiftL_arith_var(vec dst, vec src, vec shift, vec vtmp) %{
6395   predicate(vector_length(n) <= 4 &&
6396             !VectorNode::is_vshift_cnt(n->in(2)) &&
6397             UseAVX == 2);
6398   match(Set dst (RShiftVL src shift));
6399   effect(TEMP dst, TEMP vtmp);
6400   format %{ "vector_varshift_long  $dst,$src,$shift\n\t! using $vtmp as TEMP" %}
6401   ins_encode %{
6402     int opcode = this->ideal_Opcode();
6403     int vlen_enc = vector_length_encoding(this);
6404     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc,
6405                  $vtmp$$XMMRegister);
6406   %}
6407   ins_pipe( pipe_slow );
6408 %}
6409 
6410 instruct vshiftL_arith_var_evex(vec dst, vec src, vec shift) %{
6411   predicate(!VectorNode::is_vshift_cnt(n->in(2)) &&
6412             UseAVX > 2);
6413   match(Set dst (RShiftVL src shift));
6414   format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
6415   ins_encode %{
6416     int opcode = this->ideal_Opcode();
6417     int vlen_enc = vector_length_encoding(this);
6418     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6419   %}
6420   ins_pipe( pipe_slow );
6421 %}
6422 
6423 // --------------------------------- AND --------------------------------------
6424 
6425 instruct vand(vec dst, vec src) %{
6426   predicate(UseAVX == 0);
6427   match(Set dst (AndV dst src));
6428   format %{ "pand    $dst,$src\t! and vectors" %}
6429   ins_encode %{
6430     __ pand($dst$$XMMRegister, $src$$XMMRegister);
6431   %}
6432   ins_pipe( pipe_slow );
6433 %}


6829   match(Set dst (VectorCastD2X src));
6830   format %{ "vector_cast_d2x  $dst,$src\t!" %}
6831   ins_encode %{
6832     int vlen_enc = vector_length_encoding(this, $src);
6833     __ vcvtpd2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6834   %}
6835   ins_pipe( pipe_slow );
6836 %}
6837 
6838 // --------------------------------- VectorMaskCmp --------------------------------------
6839 
6840 instruct vcmpFD(legVec dst, legVec src1, legVec src2, immI8 cond) %{
6841   predicate(vector_length_in_bytes(n->in(1)->in(1)) >=  8 && // src1
6842             vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
6843             is_floating_point_type(vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
6844   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
6845   format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %}
6846   ins_encode %{
6847     int vlen_enc = vector_length_encoding(this, $src1);
6848     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
6849     if (vector_element_basic_type(this, $src1) == T_FLOAT) {
6850       __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
6851     } else {
6852       __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
6853     }
6854   %}
6855   ins_pipe( pipe_slow );
6856 %}
6857 
6858 instruct evcmpFD(vec dst, vec src1, vec src2, immI8 cond, rRegP scratch) %{
6859   predicate(vector_length_in_bytes(n->in(1)->in(1)) == 64 && // src1
6860             is_floating_point_type(vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
6861   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
6862   effect(TEMP scratch);
6863   format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $scratch as TEMP" %}
6864   ins_encode %{
6865     int vlen_enc = Assembler::AVX_512bit;
6866     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
6867     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
6868     KRegister mask = k0; // The comparison itself is not being masked.
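         // The compare writes its result into ktmp; the masked load below then turns the
         // k-register mask into a vector: selected lanes get all ones, the rest are zeroed.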
6869     if (vector_element_basic_type(this, $src1) == T_FLOAT) {
6870       __ evcmpps(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
6871       __ evmovdqul($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vlen_enc, $scratch$$Register);
6872     } else {
6873       __ evcmppd(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);


7212 
7213 instruct vabsnegD(vec dst, vec src, rRegI scratch) %{
7214   match(Set dst (AbsVD  src));
7215   match(Set dst (NegVD  src));
7216   effect(TEMP scratch);
7217   format %{ "vabsnegd $dst,$src,[mask]\t# absneg packedD" %}
7218   ins_encode %{
7219     int opcode = this->ideal_Opcode();
7220     uint vlen = vector_length(this);
7221     if (vlen == 2) {
7222       assert(UseSSE >= 2, "required");
7223       __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $scratch$$Register);
7224     } else {
7225       int vlen_enc = vector_length_encoding(this);
7226       __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc, $scratch$$Register);
7227     }
7228   %}
7229   ins_pipe( pipe_slow );
7230 %}
7231 
7232 //------------------------------------- VectorTest --------------------------------------------
7233 
7234 #ifdef _LP64
7235 instruct vptest_alltrue(rRegI dst, legVec src1, legVec src2, rFlagsReg cr) %{
7236   predicate(static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow);
7237   match(Set dst (VectorTest src1 src2 ));
7238   effect(KILL cr);
7239   format %{ "vector_test $dst,$src1, $src2\t! using $cr as TEMP" %}
7240   ins_encode %{
7241     int vlen = vector_length_in_bytes(this, $src1);
7242     int vlen_enc = vector_length_encoding(vlen);
7243     if (vlen <= 32) {
7244       if (UseAVX == 0) {
7245         assert(vlen <= 16, "required");
7246         __ ptest($src1$$XMMRegister, $src2$$XMMRegister);
7247       } else {
7248         __ vptest($src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
7249       }
7250     } else {
7251       KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.

