src/cpu/x86/vm/x86.ad

1699     case Op_SqrtVD:
1700       if (UseAVX < 1) // enabled for AVX only
1701         ret_value = false;
1702       break;
1703     case Op_CompareAndSwapL:
1704 #ifdef _LP64
1705     case Op_CompareAndSwapP:
1706 #endif
1707       if (!VM_Version::supports_cx8())
1708         ret_value = false;
1709       break;
1710     case Op_CMoveVD:
1711       if (UseAVX < 1 || UseAVX > 2)
1712         ret_value = false;
1713       break;
1714   }
1715 
1716   return ret_value;  // By default match rules are supported.
1717 }
1718 
1719 const int Matcher::float_pressure(int default_pressure_threshold) {
1720   int float_pressure_threshold = default_pressure_threshold;
1721 #ifdef _LP64
1722   if (UseAVX > 2) {
1723     // Increase pressure threshold on machines with AVX-512 (AVX3), which have
1724     // twice as many XMM registers.
1725     float_pressure_threshold = default_pressure_threshold * 2;
1726   }
1727 #endif
1728   return float_pressure_threshold;
1729 }
1730 
1731 // Max vector size in bytes. 0 if not supported.
1732 const int Matcher::vector_width_in_bytes(BasicType bt) {
1733   assert(is_java_primitive(bt), "only primitive type vectors");
1734   if (UseSSE < 2) return 0;
1735   // SSE2 supports 128bit vectors for all types.
1736   // AVX2 supports 256bit vectors for all types.
1737   // AVX-512/EVEX supports 512bit vectors for all types.
1738   int size = (UseAVX > 1) ? (1 << UseAVX) * 8 : 16;
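  // (1 << UseAVX) * 8 yields 32 bytes (256-bit) for UseAVX == 2 and 64 bytes (512-bit)
  // for UseAVX == 3; anything below AVX2 falls back to the 16-byte SSE2 width.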


1742   // Use flag to limit vector size.
1743   size = MIN2(size,(int)MaxVectorSize);
1744   // Minimum 2 values in vector (or 4 for bytes).
1745   switch (bt) {
1746   case T_DOUBLE:
1747   case T_LONG:
1748     if (size < 16) return 0;
1749     break;
1750   case T_FLOAT:
1751   case T_INT:
1752     if (size < 8) return 0;
1753     break;
1754   case T_BOOLEAN:
1755     if (size < 4) return 0;
1756     break;
1757   case T_CHAR:
1758     if (size < 4) return 0;
1759     break;
1760   case T_BYTE:
1761     if (size < 4) return 0;
1762     if ((size > 32) && !VM_Version::supports_avx512bw()) return 0;
1763     break;
1764   case T_SHORT:
1765     if (size < 4) return 0;
1766     if ((size > 16) && !VM_Version::supports_avx512bw()) return 0;
1767     break;
1768   default:
1769     ShouldNotReachHere();
1770   }
1771   return size;
1772 }
1773 
1774 // Limits on vector size (number of elements) loaded into vector.
1775 const int Matcher::max_vector_size(const BasicType bt) {
1776   return vector_width_in_bytes(bt)/type2aelembytes(bt);
1777 }
1778 const int Matcher::min_vector_size(const BasicType bt) {
1779   int max_size = max_vector_size(bt);
1780   // Min size which can be loaded into vector is 4 bytes.
1781   int size = (type2aelembytes(bt) == 1) ? 4 : 2;
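  // i.e. byte vectors need at least 4 elements, every other element type at least 2,
  // and never more than max_vector_size(bt).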
1782   return MIN2(size,max_size);
1783 }
1784 
1785 // Vector ideal reg corresponding to specified size in bytes
1786 const int Matcher::vector_ideal_reg(int size) {


1950         break;
1951       case Op_VecD:
1952         st->print("movq    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1953         break;
1954       case Op_VecX:
1955         st->print("movdqu  [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1956         break;
1957       case Op_VecY:
1958       case Op_VecZ:
1959         st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1960         break;
1961       default:
1962         ShouldNotReachHere();
1963       }
1964     }
1965 #endif
1966   }
1967   bool is_single_byte = false;
1968   int vec_len = 0;
1969   if ((UseAVX > 2) && (stack_offset != 0)) {
1970     switch (ireg) {
1971     case Op_VecS:
1972     case Op_VecD:
1973     case Op_VecX:
1974       break;
1975     case Op_VecY:
1976       vec_len = 1;
1977       break;
1978     case Op_VecZ:
1979       vec_len = 2;
1980       break;
1981     }
1982     is_single_byte = Assembler::query_compressed_disp_byte(stack_offset, true, vec_len, Assembler::EVEX_FVM, Assembler::EVEX_32bit, 0);
1983   }
1984   int offset_size = 0;
1985   int size = 5;
1986   if (UseAVX > 2) {
1987     if (!VM_Version::supports_avx512vl() && (vec_len == 2)) {
1988       offset_size = (stack_offset == 0) ? 0 : ((is_single_byte) ? 1 : 4);
1989       size += 2; // Need an additional two bytes for EVEX encoding
1990     } else if (!VM_Version::supports_avx512vl() && (vec_len < 2)) {
1991       offset_size = (stack_offset == 0) ? 0 : ((stack_offset <= 127) ? 1 : 4);
1992     } else {
1993       offset_size = (stack_offset == 0) ? 0 : ((is_single_byte) ? 1 : 4);
1994       size += 2; // Need an additional two bytes for EVEX encoding
1995     }
1996   } else {
1997     offset_size = (stack_offset == 0) ? 0 : ((stack_offset <= 127) ? 1 : 4);
1998   }
1999   // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
2000   return size+offset_size;
2001 }
2002 
2003 static inline jfloat replicate4_imm(int con, int width) {
2004   // Load a constant of "width" (in bytes) and replicate it to fill 32bit.
2005   assert(width == 1 || width == 2, "only byte or short types here");
2006   int bit_width = width * 8;
2007   jint val = con;
2008   val &= (1 << bit_width) - 1;  // mask off sign bits
2009   while(bit_width < 32) {
2010     val |= (val << bit_width);
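    // e.g. width == 1, con == 0xAB: 0xAB -> 0xABAB -> 0xABABABAB after two iterations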


2694   format %{ "vdivsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2695   ins_cost(150);
2696   ins_encode %{
2697     __ vdivsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2698   %}
2699   ins_pipe(pipe_slow);
2700 %}
2701 
2702 instruct absF_reg(regF dst) %{
2703   predicate((UseSSE>=1) && (UseAVX == 0));
2704   match(Set dst (AbsF dst));
2705   ins_cost(150);
2706   format %{ "andps   $dst, [0x7fffffff]\t# abs float by sign masking" %}
2707   ins_encode %{
2708     __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
2709   %}
2710   ins_pipe(pipe_slow);
2711 %}
2712 
2713 instruct absF_reg_reg(regF dst, regF src) %{
2714   predicate(UseAVX > 0);
2715   match(Set dst (AbsF src));
2716   ins_cost(150);
2717   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
2718   ins_encode %{
2719     int vector_len = 0;
2720     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
2721               ExternalAddress(float_signmask()), vector_len);
2722   %}
2723   ins_pipe(pipe_slow);
2724 %}
2725 
2726 instruct absD_reg(regD dst) %{
2727   predicate((UseSSE>=2) && (UseAVX == 0));
2728   match(Set dst (AbsD dst));
2729   ins_cost(150);
2730   format %{ "andpd   $dst, [0x7fffffffffffffff]\t"
2731             "# abs double by sign masking" %}
2732   ins_encode %{
2733     __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
2734   %}
2735   ins_pipe(pipe_slow);
2736 %}
2737 
2738 instruct absD_reg_reg(regD dst, regD src) %{
2739   predicate(UseAVX > 0);
2740   match(Set dst (AbsD src));
2741   ins_cost(150);
2742   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
2743             "# abs double by sign masking" %}
2744   ins_encode %{
2745     int vector_len = 0;
2746     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
2747               ExternalAddress(double_signmask()), vector_len);
2748   %}
2749   ins_pipe(pipe_slow);
2750 %}
2751 
2752 instruct negF_reg(regF dst) %{
2753   predicate((UseSSE>=1) && (UseAVX == 0));
2754   match(Set dst (NegF dst));
2755   ins_cost(150);
2756   format %{ "xorps   $dst, [0x80000000]\t# neg float by sign flipping" %}
2757   ins_encode %{
2758     __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
2759   %}
2760   ins_pipe(pipe_slow);
2761 %}
2762 
2763 instruct negF_reg_reg(regF dst, regF src) %{
2764   predicate(UseAVX > 0);
2765   match(Set dst (NegF src));
2766   ins_cost(150);
2767   format %{ "vnegatess  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
2768   ins_encode %{
2769     __ vnegatess($dst$$XMMRegister, $src$$XMMRegister,
2770                  ExternalAddress(float_signflip()));


4537 instruct rsadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
4538   predicate(UseSSE > 2 && UseAVX == 0);
4539   match(Set dst (AddReductionVI src1 src2));
4540   effect(TEMP tmp2, TEMP tmp);
4541   format %{ "movdqu  $tmp2,$src2\n\t"
4542             "phaddd  $tmp2,$tmp2\n\t"
4543             "movd    $tmp,$src1\n\t"
4544             "paddd   $tmp,$tmp2\n\t"
4545             "movd    $dst,$tmp\t! add reduction2I" %}
4546   ins_encode %{
4547     __ movdqu($tmp2$$XMMRegister, $src2$$XMMRegister);
4548     __ phaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister);
4549     __ movdl($tmp$$XMMRegister, $src1$$Register);
4550     __ paddd($tmp$$XMMRegister, $tmp2$$XMMRegister);
4551     __ movdl($dst$$Register, $tmp$$XMMRegister);
4552   %}
4553   ins_pipe( pipe_slow );
4554 %}
4555 
4556 instruct rvadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
4557   predicate(UseAVX > 0 && UseAVX < 3);
4558   match(Set dst (AddReductionVI src1 src2));
4559   effect(TEMP tmp, TEMP tmp2);
4560   format %{ "vphaddd  $tmp,$src2,$src2\n\t"
4561             "movd     $tmp2,$src1\n\t"
4562             "vpaddd   $tmp2,$tmp2,$tmp\n\t"
4563             "movd     $dst,$tmp2\t! add reduction2I" %}
4564   ins_encode %{
4565     int vector_len = 0;
4566     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
4567     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4568     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
4569     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4570   %}
4571   ins_pipe( pipe_slow );
4572 %}
4573 
4574 instruct rvadd2I_reduction_reg_evex(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
4575   predicate(UseAVX > 2);
4576   match(Set dst (AddReductionVI src1 src2));
4577   effect(TEMP tmp, TEMP tmp2);


4596   match(Set dst (AddReductionVI src1 src2));
4597   effect(TEMP tmp2, TEMP tmp);
4598   format %{ "movdqu  $tmp2,$src2\n\t"
4599             "phaddd  $tmp2,$tmp2\n\t"
4600             "phaddd  $tmp2,$tmp2\n\t"
4601             "movd    $tmp,$src1\n\t"
4602             "paddd   $tmp,$tmp2\n\t"
4603             "movd    $dst,$tmp\t! add reduction4I" %}
4604   ins_encode %{
4605     __ movdqu($tmp2$$XMMRegister, $src2$$XMMRegister);
4606     __ phaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister);
4607     __ phaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister);
4608     __ movdl($tmp$$XMMRegister, $src1$$Register);
4609     __ paddd($tmp$$XMMRegister, $tmp2$$XMMRegister);
4610     __ movdl($dst$$Register, $tmp$$XMMRegister);
4611   %}
4612   ins_pipe( pipe_slow );
4613 %}
4614 
4615 instruct rvadd4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
4616   predicate(UseAVX > 0 && UseAVX < 3);
4617   match(Set dst (AddReductionVI src1 src2));
4618   effect(TEMP tmp, TEMP tmp2);
4619   format %{ "vphaddd  $tmp,$src2,$src2\n\t"
4620             "vphaddd  $tmp,$tmp,$tmp2\n\t"
4621             "movd     $tmp2,$src1\n\t"
4622             "vpaddd   $tmp2,$tmp2,$tmp\n\t"
4623             "movd     $dst,$tmp2\t! add reduction4I" %}
4624   ins_encode %{
4625     int vector_len = 0;
4626     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
4627     __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4628     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4629     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
4630     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4631   %}
4632   ins_pipe( pipe_slow );
4633 %}
4634 
4635 instruct rvadd4I_reduction_reg_evex(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
4636   predicate(UseAVX > 2);


4640             "vpaddd  $tmp,$src2,$tmp2\n\t"
4641             "pshufd  $tmp2,$tmp,0x1\n\t"
4642             "vpaddd  $tmp,$tmp,$tmp2\n\t"
4643             "movd    $tmp2,$src1\n\t"
4644             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
4645             "movd    $dst,$tmp2\t! add reduction4I" %}
4646   ins_encode %{
4647     int vector_len = 0;
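    // pshufd 0xE moves the upper two ints into the low 64 bits and 0x1 then moves element 1
    // into lane 0, so two vpaddd folds reduce all four ints before the scalar $src1 is added.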
4648     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
4649     __ vpaddd($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4650     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
4651     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4652     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4653     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4654     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4655   %}
4656   ins_pipe( pipe_slow );
4657 %}
4658 
4659 instruct rvadd8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
4660   predicate(UseAVX > 0 && UseAVX < 3);
4661   match(Set dst (AddReductionVI src1 src2));
4662   effect(TEMP tmp, TEMP tmp2);
4663   format %{ "vphaddd  $tmp,$src2,$src2\n\t"
4664             "vphaddd  $tmp,$tmp,$tmp2\n\t"
4665             "vextracti128  $tmp2,$tmp\n\t"
4666             "vpaddd   $tmp,$tmp,$tmp2\n\t"
4667             "movd     $tmp2,$src1\n\t"
4668             "vpaddd   $tmp2,$tmp2,$tmp\n\t"
4669             "movd     $dst,$tmp2\t! add reduction8I" %}
4670   ins_encode %{
4671     int vector_len = 1;
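    // vphaddd sums pairs within each 128-bit lane; vextracti128h pulls the upper lane down
    // so vpaddd can combine the two lane sums before the scalar $src1 is added.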
4672     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
4673     __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4674     __ vextracti128h($tmp2$$XMMRegister, $tmp$$XMMRegister);
4675     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
4676     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4677     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4678     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4679   %}
4680   ins_pipe( pipe_slow );


4695             "movd    $dst,$tmp2\t! add reduction8I" %}
4696   ins_encode %{
4697     int vector_len = 0;
4698     __ vextracti128h($tmp$$XMMRegister, $src2$$XMMRegister);
4699     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
4700     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
4701     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4702     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
4703     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4704     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4705     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4706     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4707   %}
4708   ins_pipe( pipe_slow );
4709 %}
4710 
4711 instruct rvadd16I_reduction_reg_evex(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, regF tmp3) %{
4712   predicate(UseAVX > 2);
4713   match(Set dst (AddReductionVI src1 src2));
4714   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
4715   format %{ "vextracti64x4  $tmp3,$src2\n\t"
4716             "vpaddd  $tmp3,$tmp3,$src2\n\t"
4717             "vextracti128   $tmp,$tmp3\n\t"
4718             "vpaddd  $tmp,$tmp,$tmp3\n\t"
4719             "pshufd  $tmp2,$tmp,0xE\n\t"
4720             "vpaddd  $tmp,$tmp,$tmp2\n\t"
4721             "pshufd  $tmp2,$tmp,0x1\n\t"
4722             "vpaddd  $tmp,$tmp,$tmp2\n\t"
4723             "movd    $tmp2,$src1\n\t"
4724             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
4725             "movd    $dst,$tmp2\t! add reduction16I" %}
4726   ins_encode %{
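    // Fold 512 -> 256 -> 128 bits via vextracti64x4h/vextracti128h, then finish with the
    // same pshufd/vpaddd pattern as the 128-bit reduction before adding the scalar $src1.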
4727     __ vextracti64x4h($tmp3$$XMMRegister, $src2$$XMMRegister);
4728     __ vpaddd($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
4729     __ vextracti128h($tmp$$XMMRegister, $tmp3$$XMMRegister);
4730     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
4731     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
4732     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
4733     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
4734     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
4735     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4736     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
4737     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4738   %}
4739   ins_pipe( pipe_slow );
4740 %}
4741 
4742 #ifdef _LP64
4743 instruct rvadd2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, regF tmp, regF tmp2) %{
4744   predicate(UseAVX > 2);
4745   match(Set dst (AddReductionVL src1 src2));
4746   effect(TEMP tmp, TEMP tmp2);
4747   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
4748             "vpaddq  $tmp,$src2,$tmp2\n\t"
4749             "movdq   $tmp2,$src1\n\t"
4750             "vpaddq  $tmp2,$tmp,$tmp2\n\t"
4751             "movdq   $dst,$tmp2\t! add reduction2L" %}
4752   ins_encode %{
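    // pshufd 0xE moves the high long into lane 0 so vpaddq folds both longs together
    // before the scalar $src1 is added.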
4753     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
4754     __ vpaddq($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, 0);
4755     __ movdq($tmp2$$XMMRegister, $src1$$Register);
4756     __ vpaddq($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
4757     __ movdq($dst$$Register, $tmp2$$XMMRegister);
4758   %}
4759   ins_pipe( pipe_slow );
4760 %}
4761 
4762 instruct rvadd4L_reduction_reg(rRegL dst, rRegL src1, vecY src2, regF tmp, regF tmp2) %{
4763   predicate(UseAVX > 2);
4764   match(Set dst (AddReductionVL src1 src2));
4765   effect(TEMP tmp, TEMP tmp2);
4766   format %{ "vextracti64x2  $tmp,$src2, 0x1\n\t"
4767             "vpaddq  $tmp2,$tmp,$src2\n\t"
4768             "pshufd  $tmp,$tmp2,0xE\n\t"
4769             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
4770             "movdq   $tmp,$src1\n\t"
4771             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
4772             "movdq   $dst,$tmp2\t! add reduction4L" %}
4773   ins_encode %{
4774     __ vextracti64x2h($tmp$$XMMRegister, $src2$$XMMRegister, 0x1);
4775     __ vpaddq($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
4776     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
4777     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4778     __ movdq($tmp$$XMMRegister, $src1$$Register);
4779     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4780     __ movdq($dst$$Register, $tmp2$$XMMRegister);
4781   %}
4782   ins_pipe( pipe_slow );
4783 %}
4784 
4785 instruct rvadd8L_reduction_reg(rRegL dst, rRegL src1, vecZ src2, regF tmp, regF tmp2) %{
4786   predicate(UseAVX > 2);
4787   match(Set dst (AddReductionVL src1 src2));
4788   effect(TEMP tmp, TEMP tmp2);
4789   format %{ "vextracti64x4  $tmp2,$src2\n\t"
4790             "vpaddq  $tmp2,$tmp2,$src2\n\t"
4791             "vextracti128   $tmp,$tmp2\n\t"
4792             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
4793             "pshufd  $tmp,$tmp2,0xE\n\t"
4794             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
4795             "movdq   $tmp,$src1\n\t"
4796             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
4797             "movdq   $dst,$tmp2\t! add reduction8L" %}
4798   ins_encode %{
4799     __ vextracti64x4h($tmp2$$XMMRegister, $src2$$XMMRegister);
4800     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
4801     __ vextracti128h($tmp$$XMMRegister, $tmp2$$XMMRegister);
4802     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4803     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
4804     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4805     __ movdq($tmp$$XMMRegister, $src1$$Register);
4806     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4807     __ movdq($dst$$Register, $tmp2$$XMMRegister);
4808   %}
4809   ins_pipe( pipe_slow );
4810 %}
4811 #endif
4812 
4813 instruct rsadd2F_reduction_reg(regF dst, regF src1, vecD src2, regF tmp, regF tmp2) %{
4814   predicate(UseSSE >= 1 && UseAVX == 0);
4815   match(Set dst (AddReductionVF src1 src2));
4816   effect(TEMP tmp, TEMP tmp2);
4817   format %{ "movdqu  $tmp,$src1\n\t"
4818             "addss   $tmp,$src2\n\t"
4819             "pshufd  $tmp2,$src2,0x01\n\t"
4820             "addss   $tmp,$tmp2\n\t"
4821             "movdqu  $dst,$tmp\t! add reduction2F" %}
4822   ins_encode %{
4823     __ movdqu($tmp$$XMMRegister, $src1$$XMMRegister);
4824     __ addss($tmp$$XMMRegister, $src2$$XMMRegister);
4825     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x01);
4826     __ addss($tmp$$XMMRegister, $tmp2$$XMMRegister);
4827     __ movdqu($dst$$XMMRegister, $tmp$$XMMRegister);
4828   %}
4829   ins_pipe( pipe_slow );
4830 %}
4831 
4832 instruct rvadd2F_reduction_reg(regF dst, regF src1, vecD src2, regF tmp, regF tmp2) %{
4833   predicate(UseAVX > 0);
4834   match(Set dst (AddReductionVF src1 src2));
4835   effect(TEMP tmp2, TEMP tmp);
4836   format %{ "vaddss  $tmp2,$src1,$src2\n\t"
4837             "pshufd  $tmp,$src2,0x01\n\t"
4838             "vaddss  $dst,$tmp2,$tmp\t! add reduction2F" %}
4839   ins_encode %{
4840     __ vaddss($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
4841     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
4842     __ vaddss($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
4843   %}
4844   ins_pipe( pipe_slow );
4845 %}
4846 
4847 instruct rsadd4F_reduction_reg(regF dst, regF src1, vecX src2, regF tmp, regF tmp2) %{
4848   predicate(UseSSE >= 1 && UseAVX == 0);
4849   match(Set dst (AddReductionVF src1 src2));
4850   effect(TEMP tmp, TEMP tmp2);
4851   format %{ "movdqu  $tmp,$src1\n\t"
4852             "addss   $tmp,$src2\n\t"
4853             "pshufd  $tmp2,$src2,0x01\n\t"
4854             "addss   $tmp,$tmp2\n\t"
4855             "pshufd  $tmp2,$src2,0x02\n\t"
4856             "addss   $tmp,$tmp2\n\t"
4857             "pshufd  $tmp2,$src2,0x03\n\t"
4858             "addss   $tmp,$tmp2\n\t"
4859             "movdqu  $dst,$tmp\t! add reduction4F" %}
4860   ins_encode %{
4861     __ movdqu($tmp$$XMMRegister, $src1$$XMMRegister);
4862     __ addss($tmp$$XMMRegister, $src2$$XMMRegister);
4863     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x01);
4864     __ addss($tmp$$XMMRegister, $tmp2$$XMMRegister);
4865     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x02);
4866     __ addss($tmp$$XMMRegister, $tmp2$$XMMRegister);
4867     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x03);
4868     __ addss($tmp$$XMMRegister, $tmp2$$XMMRegister);
4869     __ movdqu($dst$$XMMRegister, $tmp$$XMMRegister);
4870   %}
4871   ins_pipe( pipe_slow );
4872 %}
4873 
4874 instruct rvadd4F_reduction_reg(regF dst, regF src1, vecX src2, regF tmp, regF tmp2) %{
4875   predicate(UseAVX > 0);
4876   match(Set dst (AddReductionVF src1 src2));
4877   effect(TEMP tmp, TEMP tmp2);
4878   format %{ "vaddss  $tmp2,$src1,$src2\n\t"
4879             "pshufd  $tmp,$src2,0x01\n\t"
4880             "vaddss  $tmp2,$tmp2,$tmp\n\t"
4881             "pshufd  $tmp,$src2,0x02\n\t"
4882             "vaddss  $tmp2,$tmp2,$tmp\n\t"
4883             "pshufd  $tmp,$src2,0x03\n\t"
4884             "vaddss  $dst,$tmp2,$tmp\t! add reduction4F" %}
4885   ins_encode %{
4886     __ vaddss($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
4887     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
4888     __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
4889     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
4890     __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
4891     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
4892     __ vaddss($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
4893   %}
4894   ins_pipe( pipe_slow );
4895 %}
4896 
4897 instruct radd8F_reduction_reg(regF dst, regF src1, vecY src2, regF tmp, regF tmp2, regF tmp3) %{
4898   predicate(UseAVX > 0);
4899   match(Set dst (AddReductionVF src1 src2));
4900   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
4901   format %{ "vaddss  $tmp2,$src1,$src2\n\t"
4902             "pshufd  $tmp,$src2,0x01\n\t"
4903             "vaddss  $tmp2,$tmp2,$tmp\n\t"
4904             "pshufd  $tmp,$src2,0x02\n\t"
4905             "vaddss  $tmp2,$tmp2,$tmp\n\t"
4906             "pshufd  $tmp,$src2,0x03\n\t"
4907             "vaddss  $tmp2,$tmp2,$tmp\n\t"
4908             "vextractf128  $tmp3,$src2\n\t"
4909             "vaddss  $tmp2,$tmp2,$tmp3\n\t"
4910             "pshufd  $tmp,$tmp3,0x01\n\t"
4911             "vaddss  $tmp2,$tmp2,$tmp\n\t"
4912             "pshufd  $tmp,$tmp3,0x02\n\t"
4913             "vaddss  $tmp2,$tmp2,$tmp\n\t"
4914             "pshufd  $tmp,$tmp3,0x03\n\t"
4915             "vaddss  $dst,$tmp2,$tmp\t! add reduction8F" %}
4916   ins_encode %{
4917     __ vaddss($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
4918     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
4919     __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
4920     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
4921     __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
4922     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
4923     __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
4924     __ vextractf128h($tmp3$$XMMRegister, $src2$$XMMRegister);
4925     __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister);
4926     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x01);
4927     __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
4928     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x02);
4929     __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
4930     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x03);
4931     __ vaddss($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
4932   %}
4933   ins_pipe( pipe_slow );
4934 %}
4935 
4936 instruct radd16F_reduction_reg(regF dst, regF src1, vecZ src2, regF tmp, regF tmp2, regF tmp3) %{
4937   predicate(UseAVX > 2);
4938   match(Set dst (AddReductionVF src1 src2));
4939   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
4940   format %{ "vaddss  $tmp2,$src1,$src2\n\t"
4941             "pshufd  $tmp,$src2,0x01\n\t"
4942             "vaddss  $tmp2,$tmp2,$tmp\n\t"
4943             "pshufd  $tmp,$src2,0x02\n\t"
4944             "vaddss  $tmp2,$tmp2,$tmp\n\t"
4945             "pshufd  $tmp,$src2,0x03\n\t"
4946             "vaddss  $tmp2,$tmp2,$tmp\n\t"
4947             "vextractf64x2  $tmp3,$src2, 0x1\n\t"
4948             "vaddss  $tmp2,$tmp2,$tmp3\n\t"
4949             "pshufd  $tmp,$tmp3,0x01\n\t"
4950             "vaddss  $tmp2,$tmp2,$tmp\n\t"
4951             "pshufd  $tmp,$tmp3,0x02\n\t"
4952             "vaddss  $tmp2,$tmp2,$tmp\n\t"
4953             "pshufd  $tmp,$tmp3,0x03\n\t"
4954             "vaddss  $tmp2,$tmp2,$tmp\n\t"
4955             "vextractf64x2  $tmp3,$src2, 0x2\n\t"
4956             "vaddss  $tmp2,$tmp2,$tmp3\n\t"
4957             "pshufd  $tmp,$tmp3,0x01\n\t"
4958             "vaddss  $tmp2,$tmp2,$tmp\n\t"
4959             "pshufd  $tmp,$tmp3,0x02\n\t"
4960             "vaddss  $tmp2,$tmp2,$tmp\n\t"
4961             "pshufd  $tmp,$tmp3,0x03\n\t"
4962             "vaddss  $tmp2,$tmp2,$tmp\n\t"
4963             "vextractf64x2  $tmp3,$src2, 0x3\n\t"
4964             "vaddss  $tmp2,$tmp2,$tmp3\n\t"
4965             "pshufd  $tmp,$tmp3,0x01\n\t"
4966             "vaddss  $tmp2,$tmp2,$tmp\n\t"
4967             "pshufd  $tmp,$tmp3,0x02\n\t"
4968             "vaddss  $tmp2,$tmp2,$tmp\n\t"
4969             "pshufd  $tmp,$tmp3,0x03\n\t"
4970             "vaddss  $dst,$tmp2,$tmp\t! add reduction16F" %}
4971   ins_encode %{
4972     __ vaddss($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
4973     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
4974     __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
4975     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
4976     __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
4977     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
4978     __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
4979     __ vextractf32x4h($tmp3$$XMMRegister, $src2$$XMMRegister, 0x1);
4980     __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister);
4981     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x01);
4982     __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
4983     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x02);
4984     __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
4985     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x03);
4986     __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
4987     __ vextractf32x4h($tmp3$$XMMRegister, $src2$$XMMRegister, 0x2);
4988     __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister);
4989     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x01);
4990     __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
4991     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x02);
4992     __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
4993     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x03);
4994     __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
4995     __ vextractf32x4h($tmp3$$XMMRegister, $src2$$XMMRegister, 0x3);
4996     __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister);
4997     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x01);
4998     __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
4999     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x02);
5000     __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5001     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x03);
5002     __ vaddss($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5003   %}
5004   ins_pipe( pipe_slow );
5005 %}
5006 
5007 instruct rsadd2D_reduction_reg(regD dst, regD src1, vecX src2, regD tmp) %{
5008   predicate(UseSSE >= 1 && UseAVX == 0);
5009   match(Set dst (AddReductionVD src1 src2));
5010   effect(TEMP tmp, TEMP dst);
5011   format %{ "movdqu  $tmp,$src1\n\t"
5012             "addsd   $tmp,$src2\n\t"
5013             "pshufd  $dst,$src2,0xE\n\t"
5014             "addsd   $dst,$tmp\t! add reduction2D" %}
5015   ins_encode %{
5016     __ movdqu($tmp$$XMMRegister, $src1$$XMMRegister);
5017     __ addsd($tmp$$XMMRegister, $src2$$XMMRegister);
5018     __ pshufd($dst$$XMMRegister, $src2$$XMMRegister, 0xE);
5019     __ addsd($dst$$XMMRegister, $tmp$$XMMRegister);
5020   %}
5021   ins_pipe( pipe_slow );
5022 %}
5023 
5024 instruct rvadd2D_reduction_reg(regD dst, regD src1, vecX src2, regD tmp, regD tmp2) %{
5025   predicate(UseAVX > 0);
5026   match(Set dst (AddReductionVD src1 src2));
5027   effect(TEMP tmp, TEMP tmp2);
5028   format %{ "vaddsd  $tmp2,$src1,$src2\n\t"
5029             "pshufd  $tmp,$src2,0xE\n\t"
5030             "vaddsd  $dst,$tmp2,$tmp\t! add reduction2D" %}
5031   ins_encode %{
5032     __ vaddsd($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
5033     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5034     __ vaddsd($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5035   %}
5036   ins_pipe( pipe_slow );
5037 %}
5038 
5039 instruct rvadd4D_reduction_reg(regD dst, regD src1, vecY src2, regD tmp, regD tmp2, regD tmp3) %{
5040   predicate(UseAVX > 0);
5041   match(Set dst (AddReductionVD src1 src2));
5042   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
5043   format %{ "vaddsd  $tmp2,$src1,$src2\n\t"
5044             "pshufd  $tmp,$src2,0xE\n\t"
5045             "vaddsd  $tmp2,$tmp2,$tmp\n\t"
5046             "vextractf128  $tmp3,$src2\n\t"
5047             "vaddsd  $tmp2,$tmp2,$tmp3\n\t"
5048             "pshufd  $tmp,$tmp3,0xE\n\t"
5049             "vaddsd  $dst,$tmp2,$tmp\t! add reduction4D" %}
5050   ins_encode %{
5051     __ vaddsd($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
5052     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5053     __ vaddsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5054     __ vextractf128h($tmp3$$XMMRegister, $src2$$XMMRegister);
5055     __ vaddsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister);
5056     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0xE);
5057     __ vaddsd($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5058   %}
5059   ins_pipe( pipe_slow );
5060 %}
5061 
5062 instruct rvadd8D_reduction_reg(regD dst, regD src1, vecZ src2, regD tmp, regD tmp2, regD tmp3) %{
5063   predicate(UseAVX > 2);
5064   match(Set dst (AddReductionVD src1 src2));
5065   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
5066   format %{ "vaddsd  $tmp2,$src1,$src2\n\t"
5067             "pshufd  $tmp,$src2,0xE\n\t"
5068             "vaddsd  $tmp2,$tmp2,$tmp\n\t"
5069             "vextractf64x2  $tmp3,$src2, 0x1\n\t"
5070             "vaddsd  $tmp2,$tmp2,$tmp3\n\t"
5071             "pshufd  $tmp,$tmp3,0xE\n\t"
5072             "vaddsd  $tmp2,$tmp2,$tmp\n\t"
5073             "vextractf64x2  $tmp3,$src2, 0x2\n\t"
5074             "vaddsd  $tmp2,$tmp2,$tmp3\n\t"
5075             "pshufd  $tmp,$tmp3,0xE\n\t"
5076             "vaddsd  $tmp2,$tmp2,$tmp\n\t"
5077             "vextractf64x2  $tmp3,$src2, 0x3\n\t"
5078             "vaddsd  $tmp2,$tmp2,$tmp3\n\t"
5079             "pshufd  $tmp,$tmp3,0xE\n\t"
5080             "vaddsd  $dst,$tmp2,$tmp\t! add reduction8D" %}
5081   ins_encode %{
5082     __ vaddsd($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
5083     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5084     __ vaddsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5085     __ vextractf64x2h($tmp3$$XMMRegister, $src2$$XMMRegister, 0x1);
5086     __ vaddsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister);
5087     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0xE);
5088     __ vaddsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5089     __ vextractf64x2h($tmp3$$XMMRegister, $src2$$XMMRegister, 0x2);
5090     __ vaddsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister);
5091     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0xE);
5092     __ vaddsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5093     __ vextractf64x2h($tmp3$$XMMRegister, $src2$$XMMRegister, 0x3);
5094     __ vaddsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister);
5095     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0xE);
5096     __ vaddsd($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5097   %}
5098   ins_pipe( pipe_slow );
5099 %}
5100 
5101 instruct rsmul2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
5102   predicate(UseSSE > 3 && UseAVX == 0);
5103   match(Set dst (MulReductionVI src1 src2));
5104   effect(TEMP tmp, TEMP tmp2);
5105   format %{ "pshufd  $tmp2,$src2,0x1\n\t"
5106             "pmulld  $tmp2,$src2\n\t"
5107             "movd    $tmp,$src1\n\t"
5108             "pmulld  $tmp2,$tmp\n\t"
5109             "movd    $dst,$tmp2\t! mul reduction2I" %}
5110   ins_encode %{
5111     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5112     __ pmulld($tmp2$$XMMRegister, $src2$$XMMRegister);
5113     __ movdl($tmp$$XMMRegister, $src1$$Register);
5114     __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
5115     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5116   %}


5199             "movd     $dst,$tmp2\t! mul reduction8I" %}
5200   ins_encode %{
5201     int vector_len = 0;
5202     __ vextracti128h($tmp$$XMMRegister, $src2$$XMMRegister);
5203     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
5204     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
5205     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5206     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5207     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5208     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5209     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5210     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5211   %}
5212   ins_pipe( pipe_slow );
5213 %}
5214 
5215 instruct rvmul16I_reduction_reg(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, regF tmp3) %{
5216   predicate(UseAVX > 2);
5217   match(Set dst (MulReductionVI src1 src2));
5218   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
5219   format %{ "vextracti64x4  $tmp3,$src2\n\t"
5220             "vpmulld  $tmp3,$tmp3,$src2\n\t"
5221             "vextracti128   $tmp,$tmp3\n\t"
5222             "vpmulld  $tmp,$tmp,$tmp3\n\t"
5223             "pshufd   $tmp2,$tmp,0xE\n\t"
5224             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5225             "pshufd   $tmp2,$tmp,0x1\n\t"
5226             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5227             "movd     $tmp2,$src1\n\t"
5228             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
5229             "movd     $dst,$tmp2\t! mul reduction16I" %}
5230   ins_encode %{
5231     __ vextracti64x4h($tmp3$$XMMRegister, $src2$$XMMRegister);
5232     __ vpmulld($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
5233     __ vextracti128h($tmp$$XMMRegister, $tmp3$$XMMRegister);
5234     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
5235     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
5236     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5237     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5238     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5239     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5240     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5241     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5242   %}
5243   ins_pipe( pipe_slow );
5244 %}
5245 
5246 #ifdef _LP64
5247 instruct rvmul2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, regF tmp, regF tmp2) %{
5248   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
5249   match(Set dst (MulReductionVL src1 src2));
5250   effect(TEMP tmp, TEMP tmp2);
5251   format %{ "pshufd   $tmp2,$src2,0xE\n\t"
5252             "vpmullq  $tmp,$src2,$tmp2\n\t"
5253             "movdq    $tmp2,$src1\n\t"
5254             "vpmullq  $tmp2,$tmp,$tmp2\n\t"
5255             "movdq    $dst,$tmp2\t! mul reduction2L" %}
5256   ins_encode %{
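    // pshufd 0xE moves the high long into lane 0; vpmullq (AVX512DQ) then multiplies the
    // two longs together before the scalar $src1 is multiplied in.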
5257     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5258     __ vpmullq($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, 0);
5259     __ movdq($tmp2$$XMMRegister, $src1$$Register);
5260     __ vpmullq($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5261     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5262   %}
5263   ins_pipe( pipe_slow );
5264 %}
5265 
5266 instruct rvmul4L_reduction_reg(rRegL dst, rRegL src1, vecY src2, regF tmp, regF tmp2) %{
5267   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
5268   match(Set dst (MulReductionVL src1 src2));
5269   effect(TEMP tmp, TEMP tmp2);
5270   format %{ "vextracti64x2  $tmp,$src2, 0x1\n\t"
5271             "vpmullq  $tmp2,$tmp,$src2\n\t"
5272             "pshufd   $tmp,$tmp2,0xE\n\t"
5273             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5274             "movdq    $tmp,$src1\n\t"
5275             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5276             "movdq    $dst,$tmp2\t! mul reduction4L" %}
5277   ins_encode %{
5278     __ vextracti64x2h($tmp$$XMMRegister, $src2$$XMMRegister, 0x1);
5279     __ vpmullq($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
5280     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5281     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5282     __ movdq($tmp$$XMMRegister, $src1$$Register);
5283     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5284     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5285   %}
5286   ins_pipe( pipe_slow );
5287 %}
5288 
5289 instruct rvmul8L_reduction_reg(rRegL dst, rRegL src1, vecZ src2, regF tmp, regF tmp2) %{
5290   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
5291   match(Set dst (MulReductionVL src1 src2));
5292   effect(TEMP tmp, TEMP tmp2);
5293   format %{ "vextracti64x4  $tmp2,$src2\n\t"
5294             "vpmullq  $tmp2,$tmp2,$src2\n\t"
5295             "vextracti128   $tmp,$tmp2\n\t"
5296             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5297             "pshufd   $tmp,$tmp2,0xE\n\t"
5298             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5299             "movdq    $tmp,$src1\n\t"
5300             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5301             "movdq    $dst,$tmp2\t! mul reduction8L" %}
5302   ins_encode %{
5303     __ vextracti64x4h($tmp2$$XMMRegister, $src2$$XMMRegister);
5304     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
5305     __ vextracti128h($tmp$$XMMRegister, $tmp2$$XMMRegister);
5306     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5307     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5308     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5309     __ movdq($tmp$$XMMRegister, $src1$$Register);
5310     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5311     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5312   %}
5313   ins_pipe( pipe_slow );
5314 %}
5315 #endif
5316 
5317 instruct rsmul2F_reduction(regF dst, regF src1, vecD src2, regF tmp, regF tmp2) %{
5318   predicate(UseSSE >= 1 && UseAVX == 0);
5319   match(Set dst (MulReductionVF src1 src2));
5320   effect(TEMP tmp, TEMP tmp2);
5321   format %{ "movdqu  $tmp,$src1\n\t"
5322             "mulss   $tmp,$src2\n\t"
5323             "pshufd  $tmp2,$src2,0x01\n\t"
5324             "mulss   $tmp,$tmp2\n\t"
5325             "movdqu  $dst,$tmp\t! mul reduction2F" %}
5326   ins_encode %{
5327     __ movdqu($tmp$$XMMRegister, $src1$$XMMRegister);
5328     __ mulss($tmp$$XMMRegister, $src2$$XMMRegister);
5329     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x01);
5330     __ mulss($tmp$$XMMRegister, $tmp2$$XMMRegister);
5331     __ movdqu($dst$$XMMRegister, $tmp$$XMMRegister);
5332   %}
5333   ins_pipe( pipe_slow );
5334 %}
5335 
5336 instruct rvmul2F_reduction_reg(regF dst, regF src1, vecD src2, regF tmp, regF tmp2) %{
5337   predicate(UseAVX > 0);
5338   match(Set dst (MulReductionVF src1 src2));
5339   effect(TEMP tmp, TEMP tmp2);
5340   format %{ "vmulss  $tmp2,$src1,$src2\n\t"
5341             "pshufd  $tmp,$src2,0x01\n\t"
5342             "vmulss  $dst,$tmp2,$tmp\t! mul reduction2F" %}
5343   ins_encode %{
5344     __ vmulss($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
5345     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5346     __ vmulss($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5347   %}
5348   ins_pipe( pipe_slow );
5349 %}
5350 
5351 instruct rsmul4F_reduction_reg(regF dst, regF src1, vecX src2, regF tmp, regF tmp2) %{
5352   predicate(UseSSE >= 1 && UseAVX == 0);
5353   match(Set dst (MulReductionVF src1 src2));
5354   effect(TEMP tmp, TEMP tmp2);
5355   format %{ "movdqu  $tmp,$src1\n\t"
5356             "mulss   $tmp,$src2\n\t"
5357             "pshufd  $tmp2,$src2,0x01\n\t"
5358             "mulss   $tmp,$tmp2\n\t"
5359             "pshufd  $tmp2,$src2,0x02\n\t"
5360             "mulss   $tmp,$tmp2\n\t"
5361             "pshufd  $tmp2,$src2,0x03\n\t"
5362             "mulss   $tmp,$tmp2\n\t"
5363             "movdqu  $dst,$tmp\t! mul reduction4F" %}
5364   ins_encode %{
5365     __ movdqu($tmp$$XMMRegister, $src1$$XMMRegister);
5366     __ mulss($tmp$$XMMRegister, $src2$$XMMRegister);
5367     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x01);
5368     __ mulss($tmp$$XMMRegister, $tmp2$$XMMRegister);
5369     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x02);
5370     __ mulss($tmp$$XMMRegister, $tmp2$$XMMRegister);
5371     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x03);
5372     __ mulss($tmp$$XMMRegister, $tmp2$$XMMRegister);
5373     __ movdqu($dst$$XMMRegister, $tmp$$XMMRegister);
5374   %}
5375   ins_pipe( pipe_slow );
5376 %}
5377 
5378 instruct rvmul4F_reduction_reg(regF dst, regF src1, vecX src2, regF tmp, regF tmp2) %{
5379   predicate(UseAVX > 0);
5380   match(Set dst (MulReductionVF src1 src2));
5381   effect(TEMP tmp, TEMP tmp2);
5382   format %{ "vmulss  $tmp2,$src1,$src2\n\t"
5383             "pshufd  $tmp,$src2,0x01\n\t"
5384             "vmulss  $tmp2,$tmp2,$tmp\n\t"
5385             "pshufd  $tmp,$src2,0x02\n\t"
5386             "vmulss  $tmp2,$tmp2,$tmp\n\t"
5387             "pshufd  $tmp,$src2,0x03\n\t"
5388             "vmulss  $dst,$tmp2,$tmp\t! mul reduction4F" %}
5389   ins_encode %{
5390     __ vmulss($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
5391     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5392     __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5393     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5394     __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5395     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5396     __ vmulss($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5397   %}
5398   ins_pipe( pipe_slow );
5399 %}
5400 
5401 instruct rvmul8F_reduction_reg(regF dst, regF src1, vecY src2, regF tmp, regF tmp2, regF tmp3) %{
5402   predicate(UseAVX > 0);
5403   match(Set dst (MulReductionVF src1 src2));
5404   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
5405   format %{ "vmulss  $tmp2,$src1,$src2\n\t"
5406             "pshufd  $tmp,$src2,0x01\n\t"
5407             "vmulss  $tmp2,$tmp2,$tmp\n\t"
5408             "pshufd  $tmp,$src2,0x02\n\t"
5409             "vmulss  $tmp2,$tmp2,$tmp\n\t"
5410             "pshufd  $tmp,$src2,0x03\n\t"
5411             "vmulss  $tmp2,$tmp2,$tmp\n\t"
5412             "vextractf128  $tmp3,$src2\n\t"
5413             "vmulss  $tmp2,$tmp2,$tmp3\n\t"
5414             "pshufd  $tmp,$tmp3,0x01\n\t"
5415             "vmulss  $tmp2,$tmp2,$tmp\n\t"
5416             "pshufd  $tmp,$tmp3,0x02\n\t"
5417             "vmulss  $tmp2,$tmp2,$tmp\n\t"
5418             "pshufd  $tmp,$tmp3,0x03\n\t"
5419             "vmulss  $dst,$tmp2,$tmp\t! mul reduction8F" %}
5420   ins_encode %{
5421     __ vmulss($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
5422     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5423     __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5424     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5425     __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5426     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5427     __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5428     __ vextractf128h($tmp3$$XMMRegister, $src2$$XMMRegister);
5429     __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister);
5430     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x01);
5431     __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5432     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x02);
5433     __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5434     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x03);
5435     __ vmulss($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5436   %}
5437   ins_pipe( pipe_slow );
5438 %}
5439 
5440 instruct rvmul16F_reduction_reg(regF dst, regF src1, vecZ src2, regF tmp, regF tmp2, regF tmp3) %{
5441   predicate(UseAVX > 2);
5442   match(Set dst (MulReductionVF src1 src2));
5443   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
5444   format %{ "vmulss  $tmp2,$src1,$src2\n\t"
5445             "pshufd  $tmp,$src2,0x01\n\t"
5446             "vmulss  $tmp2,$tmp2,$tmp\n\t"
5447             "pshufd  $tmp,$src2,0x02\n\t"
5448             "vmulss  $tmp2,$tmp2,$tmp\n\t"
5449             "pshufd  $tmp,$src2,0x03\n\t"
5450             "vmulss  $tmp2,$tmp2,$tmp\n\t"
5451             "vextractf32x4  $tmp3,$src2, 0x1\n\t"
5452             "vmulss  $tmp2,$tmp2,$tmp3\n\t"
5453             "pshufd  $tmp,$tmp3,0x01\n\t"
5454             "vmulss  $tmp2,$tmp2,$tmp\n\t"
5455             "pshufd  $tmp,$tmp3,0x02\n\t"
5456             "vmulss  $tmp2,$tmp2,$tmp\n\t"
5457             "pshufd  $tmp,$tmp3,0x03\n\t"
5458             "vmulss  $tmp2,$tmp2,$tmp\n\t"
5459             "vextractf32x4  $tmp3,$src2, 0x2\n\t"
5460             "vmulss  $tmp2,$tmp2,$tmp3\n\t"
5461             "pshufd  $tmp,$tmp3,0x01\n\t"
5462             "vmulss  $tmp2,$tmp2,$tmp\n\t"
5463             "pshufd  $tmp,$tmp3,0x02\n\t"
5464             "vmulss  $tmp2,$tmp2,$tmp\n\t"
5465             "pshufd  $tmp,$tmp3,0x03\n\t"
5466             "vmulss  $tmp2,$tmp2,$tmp\n\t"
5467             "vextractf32x4  $tmp3,$src2, 0x3\n\t"
5468             "vmulss  $tmp2,$tmp2,$tmp3\n\t"
5469             "pshufd  $tmp,$tmp3,0x01\n\t"
5470             "vmulss  $tmp2,$tmp2,$tmp\n\t"
5471             "pshufd  $tmp,$tmp3,0x02\n\t"
5472             "vmulss  $tmp2,$tmp2,$tmp\n\t"
5473             "pshufd  $tmp,$tmp3,0x03\n\t"
5474             "vmulss  $dst,$tmp2,$tmp\t! mul reduction16F" %}
5475   ins_encode %{
5476     __ vmulss($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
5477     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5478     __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5479     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5480     __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5481     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5482     __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5483     __ vextractf32x4h($tmp3$$XMMRegister, $src2$$XMMRegister, 0x1);
5484     __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister);
5485     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x01);
5486     __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5487     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x02);
5488     __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5489     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x03);
5490     __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5491     __ vextractf32x4h($tmp3$$XMMRegister, $src2$$XMMRegister, 0x2);
5492     __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister);
5493     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x01);
5494     __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5495     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x02);
5496     __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5497     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x03);
5498     __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5499     __ vextractf32x4h($tmp3$$XMMRegister, $src2$$XMMRegister, 0x3);
5500     __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister);
5501     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x01);
5502     __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5503     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x02);
5504     __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5505     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x03);
5506     __ vmulss($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5507   %}
5508   ins_pipe( pipe_slow );
5509 %}
5510 
5511 instruct rsmul2D_reduction_reg(regD dst, regD src1, vecX src2, regD tmp) %{
5512   predicate(UseSSE >= 1 && UseAVX == 0);
5513   match(Set dst (MulReductionVD src1 src2));
5514   effect(TEMP tmp, TEMP dst);
5515   format %{ "movdqu  $tmp,$src1\n\t"
5516             "mulsd   $tmp,$src2\n\t"
5517             "pshufd  $dst,$src2,0xE\n\t"
5518             "mulsd   $dst,$tmp\t! mul reduction2D" %}
5519   ins_encode %{
5520     __ movdqu($tmp$$XMMRegister, $src1$$XMMRegister);
5521     __ mulsd($tmp$$XMMRegister, $src2$$XMMRegister);
5522     __ pshufd($dst$$XMMRegister, $src2$$XMMRegister, 0xE);
5523     __ mulsd($dst$$XMMRegister, $tmp$$XMMRegister);
5524   %}
5525   ins_pipe( pipe_slow );
5526 %}
5527 
5528 instruct rvmul2D_reduction_reg(regD dst, regD src1, vecX src2, regD tmp, regD tmp2) %{
5529   predicate(UseAVX > 0);
5530   match(Set dst (MulReductionVD src1 src2));
5531   effect(TEMP tmp, TEMP tmp2);
5532   format %{ "vmulsd  $tmp2,$src1,$src2\n\t"
5533             "pshufd  $tmp,$src2,0xE\n\t"
5534             "vmulsd  $dst,$tmp2,$tmp\t! mul reduction2D" %}
5535   ins_encode %{
5536     __ vmulsd($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
5537     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5538     __ vmulsd($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5539   %}
5540   ins_pipe( pipe_slow );
5541 %}
5542 
5543 instruct rvmul4D_reduction_reg(regD dst, regD src1, vecY src2, regD tmp, regD tmp2, regD tmp3) %{
5544   predicate(UseAVX > 0);
5545   match(Set dst (MulReductionVD src1 src2));
5546   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
5547   format %{ "vmulsd  $tmp2,$src1,$src2\n\t"
5548             "pshufd  $tmp,$src2,0xE\n\t"
5549             "vmulsd  $tmp2,$tmp2,$tmp\n\t"
5550             "vextractf128  $tmp3,$src2\n\t"
5551             "vmulsd  $tmp2,$tmp2,$tmp3\n\t"
5552             "pshufd  $tmp,$tmp3,0xE\n\t"
5553             "vmulsd  $dst,$tmp2,$tmp\t! mul reduction4D" %}
5554   ins_encode %{
5555     __ vmulsd($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
5556     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5557     __ vmulsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5558     __ vextractf128h($tmp3$$XMMRegister, $src2$$XMMRegister);
5559     __ vmulsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister);
5560     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0xE);
5561     __ vmulsd($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5562   %}
5563   ins_pipe( pipe_slow );
5564 %}
5565 
5566 instruct rvmul8D_reduction_reg(regD dst, regD src1, vecZ src2, regD tmp, regD tmp2, regD tmp3) %{
5567   predicate(UseAVX > 2);
5568   match(Set dst (MulReductionVD src1 src2));
5569   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
5570   format %{ "vmulsd  $tmp2,$src1,$src2\n\t"
5571             "pshufd  $tmp,$src2,0xE\n\t"
5572             "vmulsd  $tmp2,$tmp2,$tmp\n\t"
5573             "vextractf64x2  $tmp3,$src2, 0x1\n\t"
5574             "vmulsd  $tmp2,$tmp2,$tmp3\n\t"
5575             "pshufd  $tmp,$tmp3,0xE\n\t"
5576             "vmulsd  $tmp2,$tmp2,$tmp\n\t"
5577             "vextractf64x2  $tmp3,$src2, 0x2\n\t"
5578             "vmulsd  $tmp2,$tmp2,$tmp3\n\t"
5579             "pshufd  $tmp,$tmp3,0xE\n\t"
5580             "vmulsd  $tmp2,$tmp2,$tmp\n\t"
5581             "vextractf64x2  $tmp3,$src2, 0x3\n\t"
5582             "vmulsd  $tmp2,$tmp2,$tmp3\n\t"
5583             "pshufd  $tmp,$tmp3,0xE\n\t"
5584             "vmulsd  $dst,$tmp2,$tmp\t! mul reduction8D" %}
5585   ins_encode %{
5586     __ vmulsd($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
5587     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5588     __ vmulsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5589     __ vextractf64x2h($tmp3$$XMMRegister, $src2$$XMMRegister, 0x1);
5590     __ vmulsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister);
5591     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0xE);
5592     __ vmulsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5593     __ vextractf64x2h($tmp3$$XMMRegister, $src2$$XMMRegister, 0x2);
5594     __ vmulsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister);
5595     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0xE);
5596     __ vmulsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5597     __ vextractf64x2h($tmp3$$XMMRegister, $src2$$XMMRegister, 0x3);
5598     __ vmulsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister);
5599     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0xE);
5600     __ vmulsd($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5601   %}
5602   ins_pipe( pipe_slow );
5603 %}
5604 
5605 // ====================VECTOR ARITHMETIC=======================================
5606 
5607 // --------------------------------- ADD --------------------------------------
5608 
5609 // Bytes vector add
5610 instruct vadd4B(vecS dst, vecS src) %{
5611   predicate(n->as_Vector()->length() == 4);
5612   match(Set dst (AddVB dst src));
5613   format %{ "paddb   $dst,$src\t! add packed4B" %}
5614   ins_encode %{
5615     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
5616   %}
5617   ins_pipe( pipe_slow );
5618 %}
5619 
5620 instruct vadd4B_reg(vecS dst, vecS src1, vecS src2) %{
5621   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
5622   match(Set dst (AddVB src1 src2));
5623   format %{ "vpaddb  $dst,$src1,$src2\t! add packed4B" %}
5624   ins_encode %{
5625     int vector_len = 0;
5626     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5627   %}
5628   ins_pipe( pipe_slow );
5629 %}
5630 
5631 instruct vadd4B_mem(vecS dst, vecS src, memory mem) %{
5632   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
5633   match(Set dst (AddVB src (LoadVector mem)));
5634   format %{ "vpaddb  $dst,$src,$mem\t! add packed4B" %}
5635   ins_encode %{
5636     int vector_len = 0;
5637     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5638   %}
5639   ins_pipe( pipe_slow );
5640 %}
5641 
5642 instruct vadd8B(vecD dst, vecD src) %{
5643   predicate(n->as_Vector()->length() == 8);
5644   match(Set dst (AddVB dst src));
5645   format %{ "paddb   $dst,$src\t! add packed8B" %}

5646   ins_encode %{
5647     __ paddb($dst$$XMMRegister, $src$$XMMRegister);

5648   %}
5649   ins_pipe( pipe_slow );
5650 %}
5651 
5652 instruct vadd8B_reg(vecD dst, vecD src1, vecD src2) %{
5653   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
5654   match(Set dst (AddVB src1 src2));
5655   format %{ "vpaddb  $dst,$src1,$src2\t! add packed8B" %}
5656   ins_encode %{
5657     int vector_len = 0;
5658     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5659   %}
5660   ins_pipe( pipe_slow );
5661 %}
5662 
5663 instruct vadd8B_mem(vecD dst, vecD src, memory mem) %{
5664   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
5665   match(Set dst (AddVB src (LoadVector mem)));
5666   format %{ "vpaddb  $dst,$src,$mem\t! add packed8B" %}
5667   ins_encode %{
5668     int vector_len = 0;
5669     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5670   %}
5671   ins_pipe( pipe_slow );
5672 %}
5673 
5674 instruct vadd16B(vecX dst, vecX src) %{
5675   predicate(n->as_Vector()->length() == 16);
5676   match(Set dst (AddVB dst src));
5677   format %{ "paddb   $dst,$src\t! add packed16B" %}
5678   ins_encode %{
5679     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
5680   %}
5681   ins_pipe( pipe_slow );
5682 %}
5683 
5684 instruct vadd16B_reg(vecX dst, vecX src1, vecX src2) %{
5685   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
5686   match(Set dst (AddVB src1 src2));
5687   format %{ "vpaddb  $dst,$src1,$src2\t! add packed16B" %}
5688   ins_encode %{
5689     int vector_len = 0;
5690     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5691   %}
5692   ins_pipe( pipe_slow );
5693 %}
5694 
5695 instruct vadd16B_mem(vecX dst, vecX src, memory mem) %{
5696   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
5697   match(Set dst (AddVB src (LoadVector mem)));
5698   format %{ "vpaddb  $dst,$src,$mem\t! add packed16B" %}
5699   ins_encode %{
5700     int vector_len = 0;
5701     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5702   %}
5703   ins_pipe( pipe_slow );
5704 %}
5705 
5706 instruct vadd32B_reg(vecY dst, vecY src1, vecY src2) %{
5707   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
5708   match(Set dst (AddVB src1 src2));
5709   format %{ "vpaddb  $dst,$src1,$src2\t! add packed32B" %}

5710   ins_encode %{
5711     int vector_len = 1;
5712     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5713   %}
5714   ins_pipe( pipe_slow );
5715 %}
5716 
5717 instruct vadd32B_mem(vecY dst, vecY src, memory mem) %{
5718   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
5719   match(Set dst (AddVB src (LoadVector mem)));
5720   format %{ "vpaddb  $dst,$src,$mem\t! add packed32B" %}
5721   ins_encode %{
5722     int vector_len = 1;
5723     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5724   %}
5725   ins_pipe( pipe_slow );
5726 %}
5727 
5728 instruct vadd64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
5729   predicate(UseAVX > 2 && n->as_Vector()->length() == 64);
5730   match(Set dst (AddVB src1 src2));
5731   format %{ "vpaddb  $dst,$src1,$src2\t! add packed64B" %}
5732   ins_encode %{
5733     int vector_len = 2;
5734     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5735   %}
5736   ins_pipe( pipe_slow );
5737 %}
5738 
5739 instruct vadd64B_mem(vecZ dst, vecZ src, memory mem) %{
5740   predicate(UseAVX > 2 && n->as_Vector()->length() == 64);
5741   match(Set dst (AddVB src (LoadVector mem)));
5742   format %{ "vpaddb  $dst,$src,$mem\t! add packed64B" %}

5743   ins_encode %{
5744     int vector_len = 2;
5745     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5746   %}
5747   ins_pipe( pipe_slow );
5748 %}
5749 
5750 // Shorts/Chars vector add
5751 instruct vadd2S(vecS dst, vecS src) %{
5752   predicate(n->as_Vector()->length() == 2);
5753   match(Set dst (AddVS dst src));
5754   format %{ "paddw   $dst,$src\t! add packed2S" %}
5755   ins_encode %{
5756     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
5757   %}
5758   ins_pipe( pipe_slow );
5759 %}
5760 
5761 instruct vadd2S_reg(vecS dst, vecS src1, vecS src2) %{
5762   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
5763   match(Set dst (AddVS src1 src2));
5764   format %{ "vpaddw  $dst,$src1,$src2\t! add packed2S" %}
5765   ins_encode %{
5766     int vector_len = 0;
5767     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5768   %}
5769   ins_pipe( pipe_slow );
5770 %}
5771 
5772 instruct vadd2S_mem(vecS dst, vecS src, memory mem) %{
5773   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
5774   match(Set dst (AddVS src (LoadVector mem)));
5775   format %{ "vpaddw  $dst,$src,$mem\t! add packed2S" %}
5776   ins_encode %{
5777     int vector_len = 0;
5778     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5779   %}
5780   ins_pipe( pipe_slow );
5781 %}
5782 
5783 instruct vadd4S(vecD dst, vecD src) %{
5784   predicate(n->as_Vector()->length() == 4);
5785   match(Set dst (AddVS dst src));
5786   format %{ "paddw   $dst,$src\t! add packed4S" %}

5787   ins_encode %{
5788     __ paddw($dst$$XMMRegister, $src$$XMMRegister);

5789   %}
5790   ins_pipe( pipe_slow );
5791 %}
5792 
5793 instruct vadd4S_reg(vecD dst, vecD src1, vecD src2) %{
5794   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
5795   match(Set dst (AddVS src1 src2));
5796   format %{ "vpaddw  $dst,$src1,$src2\t! add packed4S" %}
5797   ins_encode %{
5798     int vector_len = 0;
5799     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5800   %}
5801   ins_pipe( pipe_slow );
5802 %}
5803 
5804 instruct vadd4S_mem(vecD dst, vecD src, memory mem) %{
5805   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
5806   match(Set dst (AddVS src (LoadVector mem)));
5807   format %{ "vpaddw  $dst,$src,$mem\t! add packed4S" %}
5808   ins_encode %{
5809     int vector_len = 0;
5810     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5811   %}
5812   ins_pipe( pipe_slow );
5813 %}
5814 
5815 instruct vadd8S(vecX dst, vecX src) %{
5816   predicate(n->as_Vector()->length() == 8);
5817   match(Set dst (AddVS dst src));
5818   format %{ "paddw   $dst,$src\t! add packed8S" %}

5819   ins_encode %{
5820     __ paddw($dst$$XMMRegister, $src$$XMMRegister);

5821   %}
5822   ins_pipe( pipe_slow );
5823 %}
5824 
5825 instruct vadd8S_reg(vecX dst, vecX src1, vecX src2) %{
5826   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
5827   match(Set dst (AddVS src1 src2));
5828   format %{ "vpaddw  $dst,$src1,$src2\t! add packed8S" %}
5829   ins_encode %{
5830     int vector_len = 0;
5831     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5832   %}
5833   ins_pipe( pipe_slow );
5834 %}
5835 
5836 instruct vadd8S_mem(vecX dst, vecX src, memory mem) %{
5837   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
5838   match(Set dst (AddVS src (LoadVector mem)));
5839   format %{ "vpaddw  $dst,$src,$mem\t! add packed8S" %}
5840   ins_encode %{
5841     int vector_len = 0;
5842     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5843   %}
5844   ins_pipe( pipe_slow );
5845 %}
5846 
5847 instruct vadd16S_reg(vecY dst, vecY src1, vecY src2) %{
5848   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
5849   match(Set dst (AddVS src1 src2));
5850   format %{ "vpaddw  $dst,$src1,$src2\t! add packed16S" %}

5851   ins_encode %{
5852     int vector_len = 1;
5853     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5854   %}
5855   ins_pipe( pipe_slow );
5856 %}
5857 
5858 instruct vadd16S_mem(vecY dst, vecY src, memory mem) %{
5859   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
5860   match(Set dst (AddVS src (LoadVector mem)));
5861   format %{ "vpaddw  $dst,$src,$mem\t! add packed16S" %}
5862   ins_encode %{
5863     int vector_len = 1;
5864     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5865   %}
5866   ins_pipe( pipe_slow );
5867 %}
5868 
5869 instruct vadd32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
5870   predicate(UseAVX > 2 && n->as_Vector()->length() == 32);
5871   match(Set dst (AddVS src1 src2));
5872   format %{ "vpaddw  $dst,$src1,$src2\t! add packed32S" %}
5873   ins_encode %{
5874     int vector_len = 2;
5875     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5876   %}
5877   ins_pipe( pipe_slow );
5878 %}
5879 
5880 instruct vadd32S_mem(vecZ dst, vecZ src, memory mem) %{
5881   predicate(UseAVX > 2 && n->as_Vector()->length() == 32);
5882   match(Set dst (AddVS src (LoadVector mem)));
5883   format %{ "vpaddw  $dst,$src,$mem\t! add packed32S" %}

5884   ins_encode %{
5885     int vector_len = 2;
5886     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5887   %}
5888   ins_pipe( pipe_slow );
5889 %}
5890 
5891 // Integers vector add
5892 instruct vadd2I(vecD dst, vecD src) %{
5893   predicate(n->as_Vector()->length() == 2);
5894   match(Set dst (AddVI dst src));
5895   format %{ "paddd   $dst,$src\t! add packed2I" %}
5896   ins_encode %{
5897     __ paddd($dst$$XMMRegister, $src$$XMMRegister);

5898   %}
5899   ins_pipe( pipe_slow );
5900 %}
5901 
5902 instruct vadd2I_reg(vecD dst, vecD src1, vecD src2) %{
5903   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
5904   match(Set dst (AddVI src1 src2));
5905   format %{ "vpaddd  $dst,$src1,$src2\t! add packed2I" %}
5906   ins_encode %{
5907     int vector_len = 0;
5908     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5909   %}
5910   ins_pipe( pipe_slow );
5911 %}
5912 
5913 instruct vadd2I_mem(vecD dst, vecD src, memory mem) %{
5914   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
5915   match(Set dst (AddVI src (LoadVector mem)));
5916   format %{ "vpaddd  $dst,$src,$mem\t! add packed2I" %}

5917   ins_encode %{
5918     int vector_len = 0;
5919     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5920   %}
5921   ins_pipe( pipe_slow );
5922 %}
5923 
5924 instruct vadd4I(vecX dst, vecX src) %{
5925   predicate(n->as_Vector()->length() == 4);
5926   match(Set dst (AddVI dst src));
5927   format %{ "paddd   $dst,$src\t! add packed4I" %}
5928   ins_encode %{
5929     __ paddd($dst$$XMMRegister, $src$$XMMRegister);

5930   %}
5931   ins_pipe( pipe_slow );
5932 %}
5933 
5934 instruct vadd4I_reg(vecX dst, vecX src1, vecX src2) %{
5935   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
5936   match(Set dst (AddVI src1 src2));
5937   format %{ "vpaddd  $dst,$src1,$src2\t! add packed4I" %}
5938   ins_encode %{
5939     int vector_len = 0;
5940     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5941   %}
5942   ins_pipe( pipe_slow );
5943 %}
5944 
5945 instruct vadd4I_mem(vecX dst, vecX src, memory mem) %{
5946   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
5947   match(Set dst (AddVI src (LoadVector mem)));
5948   format %{ "vpaddd  $dst,$src,$mem\t! add packed4I" %}

5949   ins_encode %{
5950     int vector_len = 0;
5951     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5952   %}
5953   ins_pipe( pipe_slow );
5954 %}
5955 
5956 instruct vadd8I_reg(vecY dst, vecY src1, vecY src2) %{
5957   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
5958   match(Set dst (AddVI src1 src2));
5959   format %{ "vpaddd  $dst,$src1,$src2\t! add packed8I" %}
5960   ins_encode %{
5961     int vector_len = 1;
5962     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5963   %}
5964   ins_pipe( pipe_slow );
5965 %}
5966 
5967 instruct vadd8I_mem(vecY dst, vecY src, memory mem) %{
5968   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
5969   match(Set dst (AddVI src (LoadVector mem)));
5970   format %{ "vpaddd  $dst,$src,$mem\t! add packed8I" %}
5971   ins_encode %{
5972     int vector_len = 1;
5973     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5974   %}
5975   ins_pipe( pipe_slow );
5976 %}
5977 
5978 instruct vadd16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
5979   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
5980   match(Set dst (AddVI src1 src2));
5981   format %{ "vpaddd  $dst,$src1,$src2\t! add packed16I" %}

5982   ins_encode %{
5983     int vector_len = 2;
5984     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5985   %}
5986   ins_pipe( pipe_slow );
5987 %}
5988 
5989 instruct vadd16I_mem(vecZ dst, vecZ src, memory mem) %{
5990   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
5991   match(Set dst (AddVI src (LoadVector mem)));
5992   format %{ "vpaddd  $dst,$src,$mem\t! add packed16I" %}
5993   ins_encode %{
5994     int vector_len = 2;
5995     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5996   %}
5997   ins_pipe( pipe_slow );
5998 %}
5999 
6000 // Longs vector add
6001 instruct vadd2L(vecX dst, vecX src) %{
6002   predicate(n->as_Vector()->length() == 2);
6003   match(Set dst (AddVL dst src));
6004   format %{ "paddq   $dst,$src\t! add packed2L" %}
6005   ins_encode %{
6006     __ paddq($dst$$XMMRegister, $src$$XMMRegister);

6007   %}
6008   ins_pipe( pipe_slow );
6009 %}
6010 
6011 instruct vadd2L_reg(vecX dst, vecX src1, vecX src2) %{
6012   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6013   match(Set dst (AddVL src1 src2));
6014   format %{ "vpaddq  $dst,$src1,$src2\t! add packed2L" %}
6015   ins_encode %{
6016     int vector_len = 0;
6017     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6018   %}
6019   ins_pipe( pipe_slow );
6020 %}
6021 
6022 instruct vadd2L_mem(vecX dst, vecX src, memory mem) %{
6023   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6024   match(Set dst (AddVL src (LoadVector mem)));
6025   format %{ "vpaddq  $dst,$src,$mem\t! add packed2L" %}

6026   ins_encode %{
6027     int vector_len = 0;
6028     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6029   %}
6030   ins_pipe( pipe_slow );
6031 %}
6032 
6033 instruct vadd4L_reg(vecY dst, vecY src1, vecY src2) %{
6034   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
6035   match(Set dst (AddVL src1 src2));
6036   format %{ "vpaddq  $dst,$src1,$src2\t! add packed4L" %}
6037   ins_encode %{
6038     int vector_len = 1;
6039     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6040   %}
6041   ins_pipe( pipe_slow );
6042 %}
6043 
6044 instruct vadd4L_mem(vecY dst, vecY src, memory mem) %{
6045   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
6046   match(Set dst (AddVL src (LoadVector mem)));
6047   format %{ "vpaddq  $dst,$src,$mem\t! add packed4L" %}
6048   ins_encode %{
6049     int vector_len = 1;
6050     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6051   %}
6052   ins_pipe( pipe_slow );
6053 %}
6054 
6055 instruct vadd8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
6056   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6057   match(Set dst (AddVL src1 src2));
6058   format %{ "vpaddq  $dst,$src1,$src2\t! add packed8L" %}

6059   ins_encode %{
6060     int vector_len = 2;
6061     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6062   %}
6063   ins_pipe( pipe_slow );
6064 %}
6065 
6066 instruct vadd8L_mem(vecZ dst, vecZ src, memory mem) %{
6067   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6068   match(Set dst (AddVL src (LoadVector mem)));
6069   format %{ "vpaddq  $dst,$src,$mem\t! add packed8L" %}
6070   ins_encode %{
6071     int vector_len = 2;
6072     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6073   %}
6074   ins_pipe( pipe_slow );
6075 %}
6076 
6077 // Floats vector add
6078 instruct vadd2F(vecD dst, vecD src) %{
6079   predicate(n->as_Vector()->length() == 2);
6080   match(Set dst (AddVF dst src));
6081   format %{ "addps   $dst,$src\t! add packed2F" %}
6082   ins_encode %{
6083     __ addps($dst$$XMMRegister, $src$$XMMRegister);

6084   %}
6085   ins_pipe( pipe_slow );
6086 %}
6087 
6088 instruct vadd2F_reg(vecD dst, vecD src1, vecD src2) %{
6089   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6090   match(Set dst (AddVF src1 src2));
6091   format %{ "vaddps  $dst,$src1,$src2\t! add packed2F" %}
6092   ins_encode %{
6093     int vector_len = 0;
6094     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6095   %}
6096   ins_pipe( pipe_slow );
6097 %}
6098 
6099 instruct vadd2F_mem(vecD dst, vecD src, memory mem) %{
6100   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6101   match(Set dst (AddVF src (LoadVector mem)));
6102   format %{ "vaddps  $dst,$src,$mem\t! add packed2F" %}

6103   ins_encode %{
6104     int vector_len = 0;
6105     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6106   %}
6107   ins_pipe( pipe_slow );
6108 %}
6109 
6110 instruct vadd4F(vecX dst, vecX src) %{
6111   predicate(n->as_Vector()->length() == 4);
6112   match(Set dst (AddVF dst src));
6113   format %{ "addps   $dst,$src\t! add packed4F" %}
6114   ins_encode %{
6115     __ addps($dst$$XMMRegister, $src$$XMMRegister);

6116   %}
6117   ins_pipe( pipe_slow );
6118 %}
6119 
6120 instruct vadd4F_reg(vecX dst, vecX src1, vecX src2) %{
6121   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6122   match(Set dst (AddVF src1 src2));
6123   format %{ "vaddps  $dst,$src1,$src2\t! add packed4F" %}
6124   ins_encode %{
6125     int vector_len = 0;
6126     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6127   %}
6128   ins_pipe( pipe_slow );
6129 %}
6130 
6131 instruct vadd4F_mem(vecX dst, vecX src, memory mem) %{
6132   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6133   match(Set dst (AddVF src (LoadVector mem)));
6134   format %{ "vaddps  $dst,$src,$mem\t! add packed4F" %}

6135   ins_encode %{
6136     int vector_len = 0;
6137     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6138   %}
6139   ins_pipe( pipe_slow );
6140 %}
6141 
6142 instruct vadd8F_reg(vecY dst, vecY src1, vecY src2) %{
6143   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6144   match(Set dst (AddVF src1 src2));
6145   format %{ "vaddps  $dst,$src1,$src2\t! add packed8F" %}
6146   ins_encode %{
6147     int vector_len = 1;
6148     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6149   %}
6150   ins_pipe( pipe_slow );
6151 %}
6152 
6153 instruct vadd8F_mem(vecY dst, vecY src, memory mem) %{
6154   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6155   match(Set dst (AddVF src (LoadVector mem)));
6156   format %{ "vaddps  $dst,$src,$mem\t! add packed8F" %}
6157   ins_encode %{
6158     int vector_len = 1;
6159     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6160   %}
6161   ins_pipe( pipe_slow );
6162 %}
6163 
6164 instruct vadd16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
6165   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6166   match(Set dst (AddVF src1 src2));
6167   format %{ "vaddps  $dst,$src1,$src2\t! add packed16F" %}
6168   ins_encode %{
6169     int vector_len = 2;
6170     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6171   %}
6172   ins_pipe( pipe_slow );
6173 %}
6174 
6175 instruct vadd16F_mem(vecZ dst, vecZ src, memory mem) %{
6176   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6177   match(Set dst (AddVF src (LoadVector mem)));
6178   format %{ "vaddps  $dst,$src,$mem\t! add packed16F" %}
6179   ins_encode %{
6180     int vector_len = 2;
6181     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6182   %}
6183   ins_pipe( pipe_slow );
6184 %}
6185 
6186 // Doubles vector add
6187 instruct vadd2D(vecX dst, vecX src) %{
6188   predicate(n->as_Vector()->length() == 2);
6189   match(Set dst (AddVD dst src));
6190   format %{ "addpd   $dst,$src\t! add packed2D" %}
6191   ins_encode %{
6192     __ addpd($dst$$XMMRegister, $src$$XMMRegister);
6193   %}
6194   ins_pipe( pipe_slow );
6195 %}
6196 
6197 instruct vadd2D_reg(vecX dst, vecX src1, vecX src2) %{
6198   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6199   match(Set dst (AddVD src1 src2));
6200   format %{ "vaddpd  $dst,$src1,$src2\t! add packed2D" %}
6201   ins_encode %{
6202     int vector_len = 0;
6203     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6204   %}
6205   ins_pipe( pipe_slow );
6206 %}
6207 
6208 instruct vadd2D_mem(vecX dst, vecX src, memory mem) %{
6209   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6210   match(Set dst (AddVD src (LoadVector mem)));
6211   format %{ "vaddpd  $dst,$src,$mem\t! add packed2D" %}
6212   ins_encode %{
6213     int vector_len = 0;
6214     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6215   %}
6216   ins_pipe( pipe_slow );
6217 %}
6218 
6219 instruct vadd4D_reg(vecY dst, vecY src1, vecY src2) %{
6220   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6221   match(Set dst (AddVD src1 src2));
6222   format %{ "vaddpd  $dst,$src1,$src2\t! add packed4D" %}
6223   ins_encode %{
6224     int vector_len = 1;
6225     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6226   %}
6227   ins_pipe( pipe_slow );
6228 %}
6229 
6230 instruct vadd4D_mem(vecY dst, vecY src, memory mem) %{
6231   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6232   match(Set dst (AddVD src (LoadVector mem)));
6233   format %{ "vaddpd  $dst,$src,$mem\t! add packed4D" %}
6234   ins_encode %{
6235     int vector_len = 1;
6236     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6237   %}
6238   ins_pipe( pipe_slow );
6239 %}
6240 
6241 instruct vadd8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
6242   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6243   match(Set dst (AddVD src1 src2));
6244   format %{ "vaddpd  $dst,$src1,$src2\t! add packed8D" %}
6245   ins_encode %{
6246     int vector_len = 2;
6247     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6248   %}
6249   ins_pipe( pipe_slow );
6250 %}
6251 
6252 instruct vadd8D_mem(vecZ dst, vecZ src, memory mem) %{
6253   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6254   match(Set dst (AddVD src (LoadVector mem)));
6255   format %{ "vaddpd  $dst,$src,$mem\t! add packed8D" %}
6256   ins_encode %{
6257     int vector_len = 2;
6258     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6259   %}
6260   ins_pipe( pipe_slow );
6261 %}
6262 
6263 // --------------------------------- SUB --------------------------------------
6264 
6265 // Bytes vector sub
6266 instruct vsub4B(vecS dst, vecS src) %{
6267   predicate(n->as_Vector()->length() == 4);
6268   match(Set dst (SubVB dst src));
6269   format %{ "psubb   $dst,$src\t! sub packed4B" %}
6270   ins_encode %{
6271     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
6272   %}
6273   ins_pipe( pipe_slow );
6274 %}
6275 
6276 instruct vsub4B_reg(vecS dst, vecS src1, vecS src2) %{
6277   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6278   match(Set dst (SubVB src1 src2));
6279   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed4B" %}
6280   ins_encode %{
6281     int vector_len = 0;
6282     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6283   %}
6284   ins_pipe( pipe_slow );
6285 %}
6286 
6287 instruct vsub4B_mem(vecS dst, vecS src, memory mem) %{
6288   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6289   match(Set dst (SubVB src (LoadVector mem)));
6290   format %{ "vpsubb  $dst,$src,$mem\t! sub packed4B" %}
6291   ins_encode %{
6292     int vector_len = 0;
6293     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6294   %}
6295   ins_pipe( pipe_slow );
6296 %}
6297 
6298 instruct vsub8B(vecD dst, vecD src) %{
6299   predicate(n->as_Vector()->length() == 8);
6300   match(Set dst (SubVB dst src));
6301   format %{ "psubb   $dst,$src\t! sub packed8B" %}
6302   ins_encode %{
6303     __ psubb($dst$$XMMRegister, $src$$XMMRegister);

6304   %}
6305   ins_pipe( pipe_slow );
6306 %}
6307 
6308 instruct vsub8B_reg(vecD dst, vecD src1, vecD src2) %{
6309   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6310   match(Set dst (SubVB src1 src2));
6311   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed8B" %}
6312   ins_encode %{
6313     int vector_len = 0;
6314     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6315   %}
6316   ins_pipe( pipe_slow );
6317 %}
6318 
6319 instruct vsub8B_mem(vecD dst, vecD src, memory mem) %{
6320   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6321   match(Set dst (SubVB src (LoadVector mem)));
6322   format %{ "vpsubb  $dst,$src,$mem\t! sub packed8B" %}
6323   ins_encode %{
6324     int vector_len = 0;
6325     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6326   %}
6327   ins_pipe( pipe_slow );
6328 %}
6329 
6330 instruct vsub16B(vecX dst, vecX src) %{
6331   predicate(n->as_Vector()->length() == 16);
6332   match(Set dst (SubVB dst src));
6333   format %{ "psubb   $dst,$src\t! sub packed16B" %}
6334   ins_encode %{
6335     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
6336   %}
6337   ins_pipe( pipe_slow );
6338 %}
6339 
6340 instruct vsub16B_reg(vecX dst, vecX src1, vecX src2) %{
6341   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
6342   match(Set dst (SubVB src1 src2));
6343   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed16B" %}
6344   ins_encode %{
6345     int vector_len = 0;
6346     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6347   %}
6348   ins_pipe( pipe_slow );
6349 %}
6350 
6351 instruct vsub16B_mem(vecX dst, vecX src, memory mem) %{
6352   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
6353   match(Set dst (SubVB src (LoadVector mem)));
6354   format %{ "vpsubb  $dst,$src,$mem\t! sub packed16B" %}
6355   ins_encode %{
6356     int vector_len = 0;
6357     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6358   %}
6359   ins_pipe( pipe_slow );
6360 %}
6361 
6362 instruct vsub32B_reg(vecY dst, vecY src1, vecY src2) %{
6363   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
6364   match(Set dst (SubVB src1 src2));
6365   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed32B" %}

6366   ins_encode %{
6367     int vector_len = 1;
6368     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6369   %}
6370   ins_pipe( pipe_slow );
6371 %}
6372 
6373 instruct vsub32B_mem(vecY dst, vecY src, memory mem) %{
6374   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
6375   match(Set dst (SubVB src (LoadVector mem)));
6376   format %{ "vpsubb  $dst,$src,$mem\t! sub packed32B" %}
6377   ins_encode %{
6378     int vector_len = 1;
6379     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6380   %}
6381   ins_pipe( pipe_slow );
6382 %}
6383 
6384 instruct vsub64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
6385   predicate(UseAVX > 2 && n->as_Vector()->length() == 64);
6386   match(Set dst (SubVB src1 src2));
6387   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed64B" %}
6388   ins_encode %{
6389     int vector_len = 2;
6390     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6391   %}
6392   ins_pipe( pipe_slow );
6393 %}
6394 
6395 instruct vsub64B_mem(vecZ dst, vecZ src, memory mem) %{
6396   predicate(UseAVX > 2 && n->as_Vector()->length() == 64);
6397   match(Set dst (SubVB src (LoadVector mem)));
6398   format %{ "vpsubb  $dst,$src,$mem\t! sub packed64B" %}

6399   ins_encode %{
6400     int vector_len = 2;
6401     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6402   %}
6403   ins_pipe( pipe_slow );
6404 %}
6405 
6406 // Shorts/Chars vector sub
6407 instruct vsub2S(vecS dst, vecS src) %{
6408   predicate(n->as_Vector()->length() == 2);
6409   match(Set dst (SubVS dst src));
6410   format %{ "psubw   $dst,$src\t! sub packed2S" %}
6411   ins_encode %{
6412     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
6413   %}
6414   ins_pipe( pipe_slow );
6415 %}
6416 
6417 instruct vsub2S_reg(vecS dst, vecS src1, vecS src2) %{
6418   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6419   match(Set dst (SubVS src1 src2));
6420   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed2S" %}
6421   ins_encode %{
6422     int vector_len = 0;
6423     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6424   %}
6425   ins_pipe( pipe_slow );
6426 %}
6427 
6428 instruct vsub2S_mem(vecS dst, vecS src, memory mem) %{
6429   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6430   match(Set dst (SubVS src (LoadVector mem)));
6431   format %{ "vpsubw  $dst,$src,$mem\t! sub packed2S" %}
6432   ins_encode %{
6433     int vector_len = 0;
6434     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6435   %}
6436   ins_pipe( pipe_slow );
6437 %}
6438 
6439 instruct vsub4S(vecD dst, vecD src) %{
6440   predicate(n->as_Vector()->length() == 4);
6441   match(Set dst (SubVS dst src));
6442   format %{ "psubw   $dst,$src\t! sub packed4S" %}

6443   ins_encode %{
6444     __ psubw($dst$$XMMRegister, $src$$XMMRegister);

6445   %}
6446   ins_pipe( pipe_slow );
6447 %}
6448 
6449 instruct vsub4S_reg(vecD dst, vecD src1, vecD src2) %{
6450   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6451   match(Set dst (SubVS src1 src2));
6452   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed4S" %}
6453   ins_encode %{
6454     int vector_len = 0;
6455     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6456   %}
6457   ins_pipe( pipe_slow );
6458 %}
6459 
6460 instruct vsub4S_mem(vecD dst, vecD src, memory mem) %{
6461   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6462   match(Set dst (SubVS src (LoadVector mem)));
6463   format %{ "vpsubw  $dst,$src,$mem\t! sub packed4S" %}
6464   ins_encode %{
6465     int vector_len = 0;
6466     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6467   %}
6468   ins_pipe( pipe_slow );
6469 %}
6470 
6471 instruct vsub8S(vecX dst, vecX src) %{
6472   predicate(n->as_Vector()->length() == 8);
6473   match(Set dst (SubVS dst src));
6474   format %{ "psubw   $dst,$src\t! sub packed8S" %}

6475   ins_encode %{
6476     __ psubw($dst$$XMMRegister, $src$$XMMRegister);

6477   %}
6478   ins_pipe( pipe_slow );
6479 %}
6480 
6481 instruct vsub8S_reg(vecX dst, vecX src1, vecX src2) %{
6482   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6483   match(Set dst (SubVS src1 src2));
6484   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed8S" %}
6485   ins_encode %{
6486     int vector_len = 0;
6487     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6488   %}
6489   ins_pipe( pipe_slow );
6490 %}
6491 
6492 instruct vsub8S_mem(vecX dst, vecX src, memory mem) %{
6493   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6494   match(Set dst (SubVS src (LoadVector mem)));
6495   format %{ "vpsubw  $dst,$src,$mem\t! sub packed8S" %}
6496   ins_encode %{
6497     int vector_len = 0;
6498     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6499   %}
6500   ins_pipe( pipe_slow );
6501 %}
6502 
6503 instruct vsub16S_reg(vecY dst, vecY src1, vecY src2) %{
6504   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
6505   match(Set dst (SubVS src1 src2));

6506   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed16S" %}
6507   ins_encode %{
6508     int vector_len = 1;
6509     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6510   %}
6511   ins_pipe( pipe_slow );
6512 %}
6513 
6514 instruct vsub16S_mem(vecY dst, vecY src, memory mem) %{
6515   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
6516   match(Set dst (SubVS src (LoadVector mem)));
6517   format %{ "vpsubw  $dst,$src,$mem\t! sub packed16S" %}
6518   ins_encode %{
6519     int vector_len = 1;
6520     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6521   %}
6522   ins_pipe( pipe_slow );
6523 %}
6524 
6525 instruct vsub32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
6526   predicate(UseAVX > 2 && n->as_Vector()->length() == 32);
6527   match(Set dst (SubVS src1 src2));
6528   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed32S" %}
6529   ins_encode %{
6530     int vector_len = 2;
6531     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6532   %}
6533   ins_pipe( pipe_slow );
6534 %}
6535 
6536 instruct vsub32S_mem(vecZ dst, vecZ src, memory mem) %{
6537   predicate(UseAVX > 2 && n->as_Vector()->length() == 32);
6538   match(Set dst (SubVS src (LoadVector mem)));
6539   format %{ "vpsubw  $dst,$src,$mem\t! sub packed32S" %}
6540   ins_encode %{
6541     int vector_len = 2;
6542     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6543   %}
6544   ins_pipe( pipe_slow );
6545 %}
6546 
6547 // Integers vector sub
6548 instruct vsub2I(vecD dst, vecD src) %{
6549   predicate(n->as_Vector()->length() == 2);
6550   match(Set dst (SubVI dst src));
6551   format %{ "psubd   $dst,$src\t! sub packed2I" %}
6552   ins_encode %{
6553     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
6554   %}
6555   ins_pipe( pipe_slow );
6556 %}
6557 


6866   match(Set dst (SubVD src (LoadVector mem)));
6867   format %{ "vsubpd  $dst,$src,$mem\t! sub packed2D" %}
6868   ins_encode %{
6869     int vector_len = 0;
6870     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6871   %}
6872   ins_pipe( pipe_slow );
6873 %}
6874 
6875 instruct vsub4D_reg(vecY dst, vecY src1, vecY src2) %{
6876   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6877   match(Set dst (SubVD src1 src2));
6878   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed4D" %}
6879   ins_encode %{
6880     int vector_len = 1;
6881     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6882   %}
6883   ins_pipe( pipe_slow );
6884 %}
6885 
6886 instruct vsub4D_mem(vecY dst, vecY src, memory mem) %{
6887   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6888   match(Set dst (SubVD src (LoadVector mem)));
6889   format %{ "vsubpd  $dst,$src,$mem\t! sub packed4D" %}
6890   ins_encode %{
6891     int vector_len = 1;
6892     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6893   %}
6894   ins_pipe( pipe_slow );
6895 %}
6896 
6897 instruct vsub8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
6898   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6899   match(Set dst (SubVD src1 src2));
6900   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed8D" %}
6901   ins_encode %{
6902     int vector_len = 2;
6903     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6904   %}
6905   ins_pipe( pipe_slow );
6906 %}
6907 
6908 instruct vsub8D_mem(vecZ dst, vecZ src, memory mem) %{
6909   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6910   match(Set dst (SubVD src (LoadVector mem)));
6911   format %{ "vsubpd  $dst,$src,$mem\t! sub packed8D" %}

6912   ins_encode %{
6913     int vector_len = 2;
6914     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6915   %}
6916   ins_pipe( pipe_slow );
6917 %}
6918 
6919 // --------------------------------- MUL --------------------------------------
6920 
6921 // Shorts/Chars vector mul
6922 instruct vmul2S(vecS dst, vecS src) %{
6923   predicate(n->as_Vector()->length() == 2);
6924   match(Set dst (MulVS dst src));
6925   format %{ "pmullw  $dst,$src\t! mul packed2S" %}
6926   ins_encode %{
6927     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
6928   %}
6929   ins_pipe( pipe_slow );
6930 %}
6931 
6932 instruct vmul2S_reg(vecS dst, vecS src1, vecS src2) %{
6933   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6934   match(Set dst (MulVS src1 src2));
6935   format %{ "vpmullw $dst,$src1,$src2\t! mul packed2S" %}
6936   ins_encode %{
6937     int vector_len = 0;
6938     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6939   %}
6940   ins_pipe( pipe_slow );
6941 %}
6942 
6943 instruct vmul2S_mem(vecS dst, vecS src, memory mem) %{
6944   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6945   match(Set dst (MulVS src (LoadVector mem)));
6946   format %{ "vpmullw $dst,$src,$mem\t! mul packed2S" %}
6947   ins_encode %{
6948     int vector_len = 0;
6949     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6950   %}
6951   ins_pipe( pipe_slow );
6952 %}
6953 
6954 instruct vmul4S(vecD dst, vecD src) %{
6955   predicate(n->as_Vector()->length() == 4);
6956   match(Set dst (MulVS dst src));
6957   format %{ "pmullw  $dst,$src\t! mul packed4S" %}

6958   ins_encode %{
6959     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);

6960   %}
6961   ins_pipe( pipe_slow );
6962 %}
6963 
6964 instruct vmul4S_reg(vecD dst, vecD src1, vecD src2) %{
6965   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6966   match(Set dst (MulVS src1 src2));
6967   format %{ "vpmullw $dst,$src1,$src2\t! mul packed4S" %}
6968   ins_encode %{
6969     int vector_len = 0;
6970     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6971   %}
6972   ins_pipe( pipe_slow );
6973 %}
6974 
6975 instruct vmul4S_mem(vecD dst, vecD src, memory mem) %{
6976   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6977   match(Set dst (MulVS src (LoadVector mem)));
6978   format %{ "vpmullw $dst,$src,$mem\t! mul packed4S" %}
6979   ins_encode %{
6980     int vector_len = 0;
6981     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6982   %}
6983   ins_pipe( pipe_slow );
6984 %}
6985 
6986 instruct vmul8S(vecX dst, vecX src) %{
6987   predicate(n->as_Vector()->length() == 8);
6988   match(Set dst (MulVS dst src));
6989   format %{ "pmullw  $dst,$src\t! mul packed8S" %}

6990   ins_encode %{
6991     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);

6992   %}
6993   ins_pipe( pipe_slow );
6994 %}
6995 
6996 instruct vmul8S_reg(vecX dst, vecX src1, vecX src2) %{
6997   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6998   match(Set dst (MulVS src1 src2));
6999   format %{ "vpmullw $dst,$src1,$src2\t! mul packed8S" %}
7000   ins_encode %{
7001     int vector_len = 0;
7002     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7003   %}
7004   ins_pipe( pipe_slow );
7005 %}
7006 
7007 instruct vmul8S_mem(vecX dst, vecX src, memory mem) %{
7008   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7009   match(Set dst (MulVS src (LoadVector mem)));
7010   format %{ "vpmullw $dst,$src,$mem\t! mul packed8S" %}
7011   ins_encode %{
7012     int vector_len = 0;
7013     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7014   %}
7015   ins_pipe( pipe_slow );
7016 %}
7017 
7018 instruct vmul16S_reg(vecY dst, vecY src1, vecY src2) %{
7019   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
7020   match(Set dst (MulVS src1 src2));

7021   format %{ "vpmullw $dst,$src1,$src2\t! mul packed16S" %}
7022   ins_encode %{
7023     int vector_len = 1;
7024     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7025   %}
7026   ins_pipe( pipe_slow );
7027 %}
7028 
7029 instruct vmul16S_mem(vecY dst, vecY src, memory mem) %{
7030   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
7031   match(Set dst (MulVS src (LoadVector mem)));
7032   format %{ "vpmullw $dst,$src,$mem\t! mul packed16S" %}
7033   ins_encode %{
7034     int vector_len = 1;
7035     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7036   %}
7037   ins_pipe( pipe_slow );
7038 %}
7039 
7040 instruct vmul32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
7041   predicate(UseAVX > 2 && n->as_Vector()->length() == 32);
7042   match(Set dst (MulVS src1 src2));
7043   format %{ "vpmullw $dst,$src1,$src2\t! mul packed32S" %}
7044   ins_encode %{
7045     int vector_len = 2;
7046     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7047   %}
7048   ins_pipe( pipe_slow );
7049 %}
7050 
7051 instruct vmul32S_mem(vecZ dst, vecZ src, memory mem) %{
7052   predicate(UseAVX > 2 && n->as_Vector()->length() == 32);
7053   match(Set dst (MulVS src (LoadVector mem)));
7054   format %{ "vpmullw $dst,$src,$mem\t! mul packed32S" %}
7055   ins_encode %{
7056     int vector_len = 2;
7057     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7058   %}
7059   ins_pipe( pipe_slow );
7060 %}
7061 
7062 // Integers vector mul (sse4_1)
7063 instruct vmul2I(vecD dst, vecD src) %{
7064   predicate(UseSSE > 3 && n->as_Vector()->length() == 2);
7065   match(Set dst (MulVI dst src));
7066   format %{ "pmulld  $dst,$src\t! mul packed2I" %}
7067   ins_encode %{
7068     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
7069   %}
7070   ins_pipe( pipe_slow );
7071 %}
7072 


7662   %}
7663   ins_pipe( pipe_slow );
7664 %}
7665 
7666 instruct vsqrt4D_reg(vecY dst, vecY src) %{
7667   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7668   match(Set dst (SqrtVD src));
7669   format %{ "vsqrtpd  $dst,$src\t! sqrt packed4D" %}
7670   ins_encode %{
7671     int vector_len = 1;
7672     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
7673   %}
7674   ins_pipe( pipe_slow );
7675 %}
7676 
7677 instruct vsqrt4D_mem(vecY dst, memory mem) %{
7678   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7679   match(Set dst (SqrtVD (LoadVector mem)));
7680   format %{ "vsqrtpd  $dst,$mem\t! sqrt packed4D" %}
7681   ins_encode %{
7682     int vector_len = 1;
7683     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
7684   %}
7685   ins_pipe( pipe_slow );
7686 %}
7687 
7688 instruct vsqrt8D_reg(vecZ dst, vecZ src) %{
7689   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7690   match(Set dst (SqrtVD src));
7691   format %{ "vsqrtpd  $dst,$src\t! sqrt packed8D" %}
7692   ins_encode %{
7693     int vector_len = 2;
7694     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
7695   %}
7696   ins_pipe( pipe_slow );
7697 %}
7698 
7699 instruct vsqrt8D_mem(vecZ dst, memory mem) %{
7700   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7701   match(Set dst (SqrtVD (LoadVector mem)));
7702   format %{ "vsqrtpd  $dst,$mem\t! sqrt packed8D" %}

7703   ins_encode %{
7704     int vector_len = 2;
7705     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
7706   %}
7707   ins_pipe( pipe_slow );
7708 %}
7709 
7710 // ------------------------------ LeftShift -----------------------------------
7711 
7712 // Shorts/Chars vector left shift
7713 instruct vsll2S(vecS dst, vecS shift) %{
7714   predicate(n->as_Vector()->length() == 2);
7715   match(Set dst (LShiftVS dst shift));
7716   format %{ "psllw   $dst,$shift\t! left shift packed2S" %}
7717   ins_encode %{
7718     __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
7719   %}
7720   ins_pipe( pipe_slow );
7721 %}
7722 
7723 instruct vsll2S_imm(vecS dst, immI8 shift) %{
7724   predicate(n->as_Vector()->length() == 2);
7725   match(Set dst (LShiftVS dst shift));
7726   format %{ "psllw   $dst,$shift\t! left shift packed2S" %}
7727   ins_encode %{
7728     __ psllw($dst$$XMMRegister, (int)$shift$$constant);
7729   %}
7730   ins_pipe( pipe_slow );
7731 %}
7732 
7733 instruct vsll2S_reg(vecS dst, vecS src, vecS shift) %{
7734   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7735   match(Set dst (LShiftVS src shift));
7736   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
7737   ins_encode %{
7738     int vector_len = 0;
7739     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
7740   %}
7741   ins_pipe( pipe_slow );
7742 %}
7743 
7744 instruct vsll2S_reg_imm(vecS dst, vecS src, immI8 shift) %{
7745   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7746   match(Set dst (LShiftVS src shift));
7747   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
7748   ins_encode %{
7749     int vector_len = 0;
7750     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
7751   %}
7752   ins_pipe( pipe_slow );
7753 %}
7754 
7755 instruct vsll4S(vecD dst, vecS shift) %{
7756   predicate(n->as_Vector()->length() == 4);
7757   match(Set dst (LShiftVS dst shift));
7758   format %{ "psllw   $dst,$shift\t! left shift packed4S" %}

7759   ins_encode %{
7760     __ psllw($dst$$XMMRegister, $shift$$XMMRegister);

7761   %}
7762   ins_pipe( pipe_slow );
7763 %}
7764 
7765 instruct vsll4S_imm(vecD dst, immI8 shift) %{
7766   predicate(n->as_Vector()->length() == 4);
7767   match(Set dst (LShiftVS dst shift));
7768   format %{ "psllw   $dst,$shift\t! left shift packed4S" %}
7769   ins_encode %{
7770     __ psllw($dst$$XMMRegister, (int)$shift$$constant);

7771   %}
7772   ins_pipe( pipe_slow );
7773 %}
7774 
7775 instruct vsll4S_reg(vecD dst, vecD src, vecS shift) %{
7776   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7777   match(Set dst (LShiftVS src shift));
7778   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
7779   ins_encode %{
7780     int vector_len = 0;
7781     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
7782   %}
7783   ins_pipe( pipe_slow );
7784 %}
7785 
7786 instruct vsll4S_reg_imm(vecD dst, vecD src, immI8 shift) %{
7787   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7788   match(Set dst (LShiftVS src shift));
7789   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}

7790   ins_encode %{
7791     int vector_len = 0;
7792     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
7793   %}
7794   ins_pipe( pipe_slow );
7795 %}
7796 
7797 instruct vsll8S(vecX dst, vecS shift) %{
7798   predicate(n->as_Vector()->length() == 8);
7799   match(Set dst (LShiftVS dst shift));
7800   format %{ "psllw   $dst,$shift\t! left shift packed8S" %}
7801   ins_encode %{
7802     __ psllw($dst$$XMMRegister, $shift$$XMMRegister);

7803   %}
7804   ins_pipe( pipe_slow );
7805 %}
7806 
7807 instruct vsll8S_imm(vecX dst, immI8 shift) %{
7808   predicate(n->as_Vector()->length() == 8);
7809   match(Set dst (LShiftVS dst shift));
7810   format %{ "psllw   $dst,$shift\t! left shift packed8S" %}
7811   ins_encode %{
7812     __ psllw($dst$$XMMRegister, (int)$shift$$constant);

7813   %}
7814   ins_pipe( pipe_slow );
7815 %}
7816 
7817 instruct vsll8S_reg(vecX dst, vecX src, vecS shift) %{
7818   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7819   match(Set dst (LShiftVS src shift));
7820   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}

7821   ins_encode %{
7822     int vector_len = 0;
7823     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
7824   %}
7825   ins_pipe( pipe_slow );
7826 %}
7827 
7828 instruct vsll8S_reg_imm(vecX dst, vecX src, immI8 shift) %{
7829   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7830   match(Set dst (LShiftVS src shift));
7831   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
7832   ins_encode %{
7833     int vector_len = 0;
7834     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
7835   %}
7836   ins_pipe( pipe_slow );
7837 %}
7838 
7839 instruct vsll16S_reg(vecY dst, vecY src, vecS shift) %{
7840   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
7841   match(Set dst (LShiftVS src shift));
7842   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
7843   ins_encode %{
7844     int vector_len = 1;
7845     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
7846   %}
7847   ins_pipe( pipe_slow );
7848 %}
7849 
7850 instruct vsll16S_reg_imm(vecY dst, vecY src, immI8 shift) %{
7851   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
7852   match(Set dst (LShiftVS src shift));

7853   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
7854   ins_encode %{
7855     int vector_len = 1;
7856     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
7857   %}
7858   ins_pipe( pipe_slow );
7859 %}
7860 
7861 instruct vsll32S_reg(vecZ dst, vecZ src, vecS shift) %{
7862   predicate(UseAVX > 2 && n->as_Vector()->length() == 32);
7863   match(Set dst (LShiftVS src shift));
7864   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed32S" %}
7865   ins_encode %{
7866     int vector_len = 2;
7867     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
7868   %}
7869   ins_pipe( pipe_slow );
7870 %}
7871 
7872 instruct vsll32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
7873   predicate(UseAVX > 2 && n->as_Vector()->length() == 32);
7874   match(Set dst (LShiftVS src shift));
7875   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed32S" %}
7876   ins_encode %{
7877     int vector_len = 2;
7878     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
7879   %}
7880   ins_pipe( pipe_slow );
7881 %}
7882 
7883 // Integers vector left shift
7884 instruct vsll2I(vecD dst, vecS shift) %{
7885   predicate(n->as_Vector()->length() == 2);
7886   match(Set dst (LShiftVI dst shift));
7887   format %{ "pslld   $dst,$shift\t! left shift packed2I" %}
7888   ins_encode %{
7889     __ pslld($dst$$XMMRegister, $shift$$XMMRegister);
7890   %}
7891   ins_pipe( pipe_slow );
7892 %}
7893 


8062   %}
8063   ins_pipe( pipe_slow );
8064 %}
8065 
8066 instruct vsll4L_reg_imm(vecY dst, vecY src, immI8 shift) %{
8067   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
8068   match(Set dst (LShiftVL src shift));
8069   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed4L" %}
8070   ins_encode %{
8071     int vector_len = 1;
8072     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8073   %}
8074   ins_pipe( pipe_slow );
8075 %}
8076 
8077 instruct vsll8L_reg(vecZ dst, vecZ src, vecS shift) %{
8078   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8079   match(Set dst (LShiftVL src shift));
8080   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed8L" %}
8081   ins_encode %{
8082     int vector_len = 2;
8083     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8084   %}
8085   ins_pipe( pipe_slow );
8086 %}
8087 
8088 instruct vsll8L_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
8089   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8090   match(Set dst (LShiftVL src shift));
8091   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed8L" %}

8092   ins_encode %{
8093     int vector_len = 2;
8094     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8095   %}
8096   ins_pipe( pipe_slow );
8097 %}
8098 
8099 // ----------------------- LogicalRightShift -----------------------------------
8100 
8101 // Shorts vector logical right shift produces an incorrect Java result
8102 // for negative data because Java code converts the short value into an int with
8103 // sign extension before the shift. But char vectors are fine since chars are
8104 // unsigned values.
8105 
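The effect described in the note above can be made concrete with a small stand-alone Java snippet. This is an editorial illustration only (it is not part of x86.ad, and the class name ShortUShiftDemo is made up for the example): for a short, Java sign-extends to int before the shift, so the truncated result differs from the 16-bit logical shift that psrlw/vpsrlw perform, while for char the promotion is a zero-extension and the two results agree.

    // Illustrative only: contrasts Java's short/char >>> semantics with a 16-bit psrlw.
    public class ShortUShiftDemo {
        public static void main(String[] args) {
            short s = -4;                             // bit pattern 0xFFFC
            short javaShort = (short) (s >>> 1);      // sign-extend to int, shift, truncate -> 0xFFFE (-2)
            int   psrlw     = (s & 0xFFFF) >>> 1;     // true 16-bit logical shift           -> 0x7FFE (32766)
            char  c = 0xFFFC;
            int   javaChar  = c >>> 1;                // zero-extend to int, shift           -> 0x7FFE (32766)
            System.out.printf("short=%d psrlw=%d char=%d%n", javaShort, psrlw, javaChar);
        }
    }

Because javaShort and psrlw disagree for negative shorts, only the char (unsigned) form of the shift can be vectorized with these instructions.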
8106 instruct vsrl2S(vecS dst, vecS shift) %{
8107   predicate(n->as_Vector()->length() == 2);
8108   match(Set dst (URShiftVS dst shift));
8109   format %{ "psrlw   $dst,$shift\t! logical right shift packed2S" %}
8110   ins_encode %{
8111     __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
8112   %}
8113   ins_pipe( pipe_slow );
8114 %}
8115 
8116 instruct vsrl2S_imm(vecS dst, immI8 shift) %{
8117   predicate(n->as_Vector()->length() == 2);
8118   match(Set dst (URShiftVS dst shift));
8119   format %{ "psrlw   $dst,$shift\t! logical right shift packed2S" %}
8120   ins_encode %{
8121     __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
8122   %}
8123   ins_pipe( pipe_slow );
8124 %}
8125 
8126 instruct vsrl2S_reg(vecS dst, vecS src, vecS shift) %{
8127   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8128   match(Set dst (URShiftVS src shift));
8129   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
8130   ins_encode %{
8131     int vector_len = 0;
8132     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8133   %}
8134   ins_pipe( pipe_slow );
8135 %}
8136 
8137 instruct vsrl2S_reg_imm(vecS dst, vecS src, immI8 shift) %{
8138   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8139   match(Set dst (URShiftVS src shift));
8140   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
8141   ins_encode %{
8142     int vector_len = 0;
8143     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8144   %}
8145   ins_pipe( pipe_slow );
8146 %}
8147 
8148 instruct vsrl4S(vecD dst, vecS shift) %{
8149   predicate(n->as_Vector()->length() == 4);
8150   match(Set dst (URShiftVS dst shift));
8151   format %{ "psrlw   $dst,$shift\t! logical right shift packed4S" %}

8152   ins_encode %{
8153     __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);

8154   %}
8155   ins_pipe( pipe_slow );
8156 %}
8157 
8158 instruct vsrl4S_imm(vecD dst, immI8 shift) %{
8159   predicate(n->as_Vector()->length() == 4);
8160   match(Set dst (URShiftVS dst shift));
8161   format %{ "psrlw   $dst,$shift\t! logical right shift packed4S" %}
8162   ins_encode %{
8163     __ psrlw($dst$$XMMRegister, (int)$shift$$constant);

8164   %}
8165   ins_pipe( pipe_slow );
8166 %}
8167 
8168 instruct vsrl4S_reg(vecD dst, vecD src, vecS shift) %{
8169   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8170   match(Set dst (URShiftVS src shift));
8171   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
8172   ins_encode %{
8173     int vector_len = 0;
8174     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8175   %}
8176   ins_pipe( pipe_slow );
8177 %}
8178 
8179 instruct vsrl4S_reg_imm(vecD dst, vecD src, immI8 shift) %{
8180   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8181   match(Set dst (URShiftVS src shift));
8182   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}

8183   ins_encode %{
8184     int vector_len = 0;
8185     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8186   %}
8187   ins_pipe( pipe_slow );
8188 %}
8189 
8190 instruct vsrl8S(vecX dst, vecS shift) %{
8191   predicate(n->as_Vector()->length() == 8);
8192   match(Set dst (URShiftVS dst shift));
8193   format %{ "psrlw   $dst,$shift\t! logical right shift packed8S" %}
8194   ins_encode %{
8195     __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);

8196   %}
8197   ins_pipe( pipe_slow );
8198 %}
8199 
8200 instruct vsrl8S_imm(vecX dst, immI8 shift) %{
8201   predicate(n->as_Vector()->length() == 8);
8202   match(Set dst (URShiftVS dst shift));
8203   format %{ "psrlw   $dst,$shift\t! logical right shift packed8S" %}
8204   ins_encode %{
8205     __ psrlw($dst$$XMMRegister, (int)$shift$$constant);

8206   %}
8207   ins_pipe( pipe_slow );
8208 %}
8209 
8210 instruct vsrl8S_reg(vecX dst, vecX src, vecS shift) %{
8211   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8212   match(Set dst (URShiftVS src shift));
8213   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}

8214   ins_encode %{
8215     int vector_len = 0;
8216     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8217   %}
8218   ins_pipe( pipe_slow );
8219 %}
8220 
8221 instruct vsrl8S_reg_imm(vecX dst, vecX src, immI8 shift) %{
8222   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8223   match(Set dst (URShiftVS src shift));
8224   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
8225   ins_encode %{
8226     int vector_len = 0;
8227     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8228   %}
8229   ins_pipe( pipe_slow );
8230 %}
8231 
8232 instruct vsrl16S_reg(vecY dst, vecY src, vecS shift) %{
8233   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
8234   match(Set dst (URShiftVS src shift));
8235   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
8236   ins_encode %{
8237     int vector_len = 1;
8238     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8239   %}
8240   ins_pipe( pipe_slow );
8241 %}
8242 
8243 instruct vsrl16S_reg_imm(vecY dst, vecY src, immI8 shift) %{
8244   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
8245   match(Set dst (URShiftVS src shift));

8246   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
8247   ins_encode %{
8248     int vector_len = 1;
8249     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8250   %}
8251   ins_pipe( pipe_slow );
8252 %}
8253 
8254 instruct vsrl32S_reg(vecZ dst, vecZ src, vecS shift) %{
8255   predicate(UseAVX > 2 && n->as_Vector()->length() == 32);
8256   match(Set dst (URShiftVS src shift));
8257   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed32S" %}
8258   ins_encode %{
8259     int vector_len = 2;
8260     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8261   %}
8262   ins_pipe( pipe_slow );
8263 %}
8264 
8265 instruct vsrl32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
8266   predicate(UseAVX > 2 && n->as_Vector()->length() == 32);
8267   match(Set dst (URShiftVS src shift));
8268   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed32S" %}
8269   ins_encode %{
8270     int vector_len = 2;
8271     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8272   %}
8273   ins_pipe( pipe_slow );
8274 %}
8275 
8276 // Integers vector logical right shift
8277 instruct vsrl2I(vecD dst, vecS shift) %{
8278   predicate(n->as_Vector()->length() == 2);
8279   match(Set dst (URShiftVI dst shift));
8280   format %{ "psrld   $dst,$shift\t! logical right shift packed2I" %}
8281   ins_encode %{
8282     __ psrld($dst$$XMMRegister, $shift$$XMMRegister);
8283   %}
8284   ins_pipe( pipe_slow );
8285 %}
8286 


8476     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8477   %}
8478   ins_pipe( pipe_slow );
8479 %}
8480 
8481 instruct vsrl8L_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
8482   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8483   match(Set dst (URShiftVL src shift));
8484   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed8L" %}
8485   ins_encode %{
8486     int vector_len = 2;
8487     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8488   %}
8489   ins_pipe( pipe_slow );
8490 %}
8491 
8492 // ------------------- ArithmeticRightShift -----------------------------------
8493 
8494 // Shorts/Chars vector arithmetic right shift
8495 instruct vsra2S(vecS dst, vecS shift) %{
8496   predicate(n->as_Vector()->length() == 2);
8497   match(Set dst (RShiftVS dst shift));
8498   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed2S" %}
8499   ins_encode %{
8500     __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
8501   %}
8502   ins_pipe( pipe_slow );
8503 %}
8504 
8505 instruct vsra2S_imm(vecS dst, immI8 shift) %{
8506   predicate(n->as_Vector()->length() == 2);
8507   match(Set dst (RShiftVS dst shift));
8508   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed2S" %}
8509   ins_encode %{
8510     __ psraw($dst$$XMMRegister, (int)$shift$$constant);
8511   %}
8512   ins_pipe( pipe_slow );
8513 %}
8514 
8515 instruct vsra2S_reg(vecS dst, vecS src, vecS shift) %{
8516   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8517   match(Set dst (RShiftVS src shift));
8518   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
8519   ins_encode %{
8520     int vector_len = 0;
8521     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8522   %}
8523   ins_pipe( pipe_slow );
8524 %}
8525 
8526 instruct vsra2S_reg_imm(vecS dst, vecS src, immI8 shift) %{
8527   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);


































8528   match(Set dst (RShiftVS src shift));
8529   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
8530   ins_encode %{
8531     int vector_len = 0;
8532     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8533   %}
8534   ins_pipe( pipe_slow );
8535 %}
8536 












8537 instruct vsra4S(vecD dst, vecS shift) %{
8538   predicate(n->as_Vector()->length() == 4);
8539   match(Set dst (RShiftVS dst shift));
8540   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed4S" %}
8541   ins_encode %{
8542     __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
8543   %}
8544   ins_pipe( pipe_slow );
8545 %}
8546 
8547 instruct vsra4S_imm(vecD dst, immI8 shift) %{
8548   predicate(n->as_Vector()->length() == 4);
8549   match(Set dst (RShiftVS dst shift));
8550   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed4S" %}
8551   ins_encode %{
8552     __ psraw($dst$$XMMRegister, (int)$shift$$constant);
8553   %}
8554   ins_pipe( pipe_slow );
8555 %}
8556 
8557 instruct vsra4S_reg(vecD dst, vecD src, vecS shift) %{
8558   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8559   match(Set dst (RShiftVS src shift));
8560   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
8561   ins_encode %{
8562     int vector_len = 0;
8563     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8564   %}
8565   ins_pipe( pipe_slow );
8566 %}
8567 
8568 instruct vsra4S_reg_imm(vecD dst, vecD src, immI8 shift) %{
8569   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);


































8570   match(Set dst (RShiftVS src shift));
8571   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
8572   ins_encode %{
8573     int vector_len = 0;
8574     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8575   %}
8576   ins_pipe( pipe_slow );
8577 %}
8578 












8579 instruct vsra8S(vecX dst, vecS shift) %{
8580   predicate(n->as_Vector()->length() == 8);
8581   match(Set dst (RShiftVS dst shift));
8582   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed8S" %}
8583   ins_encode %{
8584     __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
8585   %}
8586   ins_pipe( pipe_slow );
8587 %}
8588 
8589 instruct vsra8S_imm(vecX dst, immI8 shift) %{
8590   predicate(n->as_Vector()->length() == 8);
8591   match(Set dst (RShiftVS dst shift));
8592   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed8S" %}
8593   ins_encode %{
8594     __ psraw($dst$$XMMRegister, (int)$shift$$constant);
8595   %}
8596   ins_pipe( pipe_slow );
8597 %}
8598 
8599 instruct vsra8S_reg(vecX dst, vecX src, vecS shift) %{
8600   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8601   match(Set dst (RShiftVS src shift));
8602   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
8603   ins_encode %{
8604     int vector_len = 0;
8605     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8606   %}
8607   ins_pipe( pipe_slow );
8608 %}
8609 
8610 instruct vsra8S_reg_imm(vecX dst, vecX src, immI8 shift) %{
8611   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);


































8612   match(Set dst (RShiftVS src shift));
8613   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
8614   ins_encode %{
8615     int vector_len = 0;
8616     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8617   %}
8618   ins_pipe( pipe_slow );
8619 %}
8620 
8621 instruct vsra16S_reg(vecY dst, vecY src, vecS shift) %{
8622   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);























8623   match(Set dst (RShiftVS src shift));
8624   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
8625   ins_encode %{
8626     int vector_len = 1;
8627     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8628   %}
8629   ins_pipe( pipe_slow );
8630 %}
8631 
8632 instruct vsra16S_reg_imm(vecY dst, vecY src, immI8 shift) %{
8633   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);























8634   match(Set dst (RShiftVS src shift));
8635   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
8636   ins_encode %{
8637     int vector_len = 1;
8638     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8639   %}
8640   ins_pipe( pipe_slow );
8641 %}
8642 












8643 instruct vsra32S_reg(vecZ dst, vecZ src, vecS shift) %{
8644   predicate(UseAVX > 2 && n->as_Vector()->length() == 32);
8645   match(Set dst (RShiftVS src shift));
8646   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed32S" %}
8647   ins_encode %{
8648     int vector_len = 2;
8649     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8650   %}
8651   ins_pipe( pipe_slow );
8652 %}
8653 
8654 instruct vsra32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
8655   predicate(UseAVX > 2 && n->as_Vector()->length() == 32);
8656   match(Set dst (RShiftVS src shift));
8657   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed32S" %}
8658   ins_encode %{
8659     int vector_len = 2;
8660     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8661   %}
8662   ins_pipe( pipe_slow );
8663 %}
8664 
8665 // Integers vector arithmetic right shift
8666 instruct vsra2I(vecD dst, vecS shift) %{
8667   predicate(n->as_Vector()->length() == 2);
8668   match(Set dst (RShiftVI dst shift));
8669   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed2I" %}
8670   ins_encode %{
8671     __ psrad($dst$$XMMRegister, $shift$$XMMRegister);
8672   %}
8673   ins_pipe( pipe_slow );
8674 %}
8675 




1699     case Op_SqrtVD:
1700       if (UseAVX < 1) // enabled for AVX only
1701         ret_value = false;
1702       break;
1703     case Op_CompareAndSwapL:
1704 #ifdef _LP64
1705     case Op_CompareAndSwapP:
1706 #endif
1707       if (!VM_Version::supports_cx8())
1708         ret_value = false;
1709       break;
1710     case Op_CMoveVD:
1711       if (UseAVX < 1 || UseAVX > 2)
1712         ret_value = false;
1713       break;
1714   }
1715 
1716   return ret_value;  // Per default match rules are supported.
1717 }
1718 
1719 const bool Matcher::match_rule_supported_vector(int opcode, int vlen) {
1720   if (!has_match_rule(opcode))
1721     return false;
1722 
1723   // Identify extra cases that we might want to provide match rules for,
1724   // e.g. Op_ vector nodes and other intrinsics, while guarding with vlen.
1725   bool ret_value = match_rule_supported(opcode);
1726   if (ret_value) {
1727     switch (opcode) {
1728       case Op_AddVB:
1729       case Op_SubVB:
1730         if ((vlen == 64) && (VM_Version::supports_avx512bw() == false))
1731           ret_value = false;
1732         break;
1733       case Op_URShiftVS:
1734       case Op_RShiftVS:
1735       case Op_LShiftVS:
1736       case Op_MulVS:
1737       case Op_AddVS:
1738       case Op_SubVS:
1739         if ((vlen == 32) && (VM_Version::supports_avx512bw() == false))
1740           ret_value = false;
1741         break;
1742       case Op_CMoveVD:
1743         if (vlen != 4)
1744           ret_value = false;
1745         break;
1746     }
1747   }
1748 
1749   return ret_value;  // Per default match rules are supported.
1750 }
1751 
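// Illustrative reading of the checks above: on an AVX-512 CPU without
// AVX512BW (e.g. Knights Landing) there is no 512-bit encoding for byte
// add/sub (vlen == 64) or for short add/sub/mul/shift (vlen == 32), so
// those rules are reported as unsupported and the vectorizer must fall
// back to smaller vectors for those operations.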
1752 const int Matcher::float_pressure(int default_pressure_threshold) {
1753   int float_pressure_threshold = default_pressure_threshold;
1754 #ifdef _LP64
1755   if (UseAVX > 2) {
1756     // Increase pressure threshold on machines with AVX3 which have
1757     // 2x more XMM registers.
1758     float_pressure_threshold = default_pressure_threshold * 2;
1759   }
1760 #endif
1761   return float_pressure_threshold;
1762 }
1763 
1764 // Max vector size in bytes. 0 if not supported.
1765 const int Matcher::vector_width_in_bytes(BasicType bt) {
1766   assert(is_java_primitive(bt), "only primitive type vectors");
1767   if (UseSSE < 2) return 0;
1768   // SSE2 supports 128bit vectors for all types.
1769   // AVX2 supports 256bit vectors for all types.
1770   // AVX512/EVEX supports 512bit vectors for all types.
1771   int size = (UseAVX > 1) ? (1 << UseAVX) * 8 : 16;


1775   // Use flag to limit vector size.
1776   size = MIN2(size,(int)MaxVectorSize);
1777   // Minimum 2 values in vector (or 4 for bytes).
1778   switch (bt) {
1779   case T_DOUBLE:
1780   case T_LONG:
1781     if (size < 16) return 0;
1782     break;
1783   case T_FLOAT:
1784   case T_INT:
1785     if (size < 8) return 0;
1786     break;
1787   case T_BOOLEAN:
1788     if (size < 4) return 0;
1789     break;
1790   case T_CHAR:
1791     if (size < 4) return 0;
1792     break;
1793   case T_BYTE:
1794     if (size < 4) return 0;

1795     break;
1796   case T_SHORT:
1797     if (size < 4) return 0;

1798     break;
1799   default:
1800     ShouldNotReachHere();
1801   }
1802   return size;
1803 }
1804 
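// Worked example (illustrative) for the width computation above:
// UseAVX == 2 gives (1 << 2) * 8 = 32 bytes (YMM), UseAVX == 3 gives
// (1 << 3) * 8 = 64 bytes (ZMM), anything below AVX2 stays at the 16-byte
// SSE2 width, and the result is clamped to MaxVectorSize before the
// per-type minimums in the switch above are applied.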
1805 // Limits on vector size (number of elements) loaded into vector.
1806 const int Matcher::max_vector_size(const BasicType bt) {
1807   return vector_width_in_bytes(bt)/type2aelembytes(bt);
1808 }
1809 const int Matcher::min_vector_size(const BasicType bt) {
1810   int max_size = max_vector_size(bt);
1811   // Min size which can be loaded into vector is 4 bytes.
1812   int size = (type2aelembytes(bt) == 1) ? 4 : 2;
1813   return MIN2(size,max_size);
1814 }
1815 
1816 // Vector ideal reg corresponding to specified size in bytes
1817 const int Matcher::vector_ideal_reg(int size) {


1981         break;
1982       case Op_VecD:
1983         st->print("movq    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1984         break;
1985       case Op_VecX:
1986         st->print("movdqu  [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1987         break;
1988       case Op_VecY:
1989       case Op_VecZ:
1990         st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1991         break;
1992       default:
1993         ShouldNotReachHere();
1994       }
1995     }
1996 #endif
1997   }
1998   bool is_single_byte = false;
1999   int vec_len = 0;
2000   if ((UseAVX > 2) && (stack_offset != 0)) {
2001     int tuple_type = Assembler::EVEX_FVM;
2002     int input_size = Assembler::EVEX_32bit;
2003     switch (ireg) {
2004     case Op_VecS:
2005       tuple_type = Assembler::EVEX_T1S;
2006       break;
2007     case Op_VecD:
2008       tuple_type = Assembler::EVEX_T1S;
2009       input_size = Assembler::EVEX_64bit;
2010       break;
2011     case Op_VecX:
2012       break;
2013     case Op_VecY:
2014       vec_len = 1;
2015       break;
2016     case Op_VecZ:
2017       vec_len = 2;
2018       break;
2019     }
2020     is_single_byte = Assembler::query_compressed_disp_byte(stack_offset, true, vec_len, tuple_type, input_size, 0);
2021   }
2022   int offset_size = 0;
2023   int size = 5;
2024   if (UseAVX > 2) {
2025     if (VM_Version::supports_avx512novl() && (vec_len == 2)) {
2026       offset_size = (stack_offset == 0) ? 0 : ((is_single_byte) ? 1 : 4);
2027       size += 2; // Need an additional two bytes for EVEX encoding
2028     } else if (VM_Version::supports_avx512novl() && (vec_len < 2)) {
2029       offset_size = (stack_offset == 0) ? 0 : ((stack_offset <= 127) ? 1 : 4);
2030     } else {
2031       offset_size = (stack_offset == 0) ? 0 : ((is_single_byte) ? 1 : 4);
2032       size += 2; // Need an additional two bytes for EVEX encoding
2033     }
2034   } else {
2035     offset_size = (stack_offset == 0) ? 0 : ((stack_offset <= 127) ? 1 : 4);
2036   }
2037   // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
2038   return size+offset_size;
2039 }
2040 
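// Illustrative note on the offset sizing above: EVEX disp8*N compression
// scales a one-byte displacement by the memory access size, so a 64-byte
// (VecZ) spill at [rsp + 64] encodes 64/64 = 1 in a single byte, while an
// offset that is not a multiple of the access size (or no longer fits in a
// signed byte once scaled) needs the full 4-byte displacement.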
2041 static inline jfloat replicate4_imm(int con, int width) {
2042   // Load a constant of "width" (in bytes) and replicate it to fill 32bit.
2043   assert(width == 1 || width == 2, "only byte or short types here");
2044   int bit_width = width * 8;
2045   jint val = con;
2046   val &= (1 << bit_width) - 1;  // mask off sign bits
2047   while(bit_width < 32) {
2048     val |= (val << bit_width);
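    // Worked example (illustrative, assuming bit_width doubles each pass):
    // con = 0xAB, width = 1  ->  val = 0xAB, then 0xABAB, then 0xABABABAB.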


2732   format %{ "vdivsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2733   ins_cost(150);
2734   ins_encode %{
2735     __ vdivsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2736   %}
2737   ins_pipe(pipe_slow);
2738 %}
2739 
2740 instruct absF_reg(regF dst) %{
2741   predicate((UseSSE>=1) && (UseAVX == 0));
2742   match(Set dst (AbsF dst));
2743   ins_cost(150);
2744   format %{ "andps   $dst, [0x7fffffff]\t# abs float by sign masking" %}
2745   ins_encode %{
2746     __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
2747   %}
2748   ins_pipe(pipe_slow);
2749 %}
2750 
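// Illustrative bit-level example for the sign-masking idiom used by the
// abs/neg rules in this section: -3.5f is 0xC0600000, AND with 0x7fffffff
// clears the sign bit and yields 0x40600000 == 3.5f, while XOR with
// 0x80000000 (the neg rules further down) simply flips that bit.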
2751 instruct absF_reg_reg(regF dst, regF src) %{
2752   predicate(VM_Version::supports_avx256only());
2753   match(Set dst (AbsF src));
2754   ins_cost(150);
2755   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
2756   ins_encode %{
2757     int vector_len = 0;
2758     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
2759               ExternalAddress(float_signmask()), vector_len);
2760   %}
2761   ins_pipe(pipe_slow);
2762 %}
2763 
2764 #ifdef _LP64
2765 instruct absF_reg_reg_evex(regF dst, regF src) %{
2766   predicate(UseAVX > 2 && VM_Version::supports_avx512vl());
2767   match(Set dst (AbsF src));
2768   ins_cost(150);
2769   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
2770   ins_encode %{
2771     int vector_len = 0;
2772     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
2773               ExternalAddress(float_signmask()), vector_len);
2774   %}
2775   ins_pipe(pipe_slow);
2776 %}
2777 
2778 instruct absF_reg_reg_evex_special(regF dst, regF src1, regF src2) %{
2779   predicate(VM_Version::supports_avx512novl());
2780   match(Set dst (AbsF src1));
2781   effect(TEMP src2);
2782   ins_cost(150);
2783   format %{ "vabsss  $dst, $src1, $src2, [0x7fffffff]\t# abs float by sign masking" %}
2784   ins_encode %{
2785     int vector_len = 0;
2786     __ vabsss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister,
2787               ExternalAddress(float_signmask()), vector_len);
2788   %}
2789   ins_pipe(pipe_slow);
2790 %}
2791 #else // _LP64
2792 instruct absF_reg_reg_evex(regF dst, regF src) %{
2793   predicate(UseAVX > 2);
2794   match(Set dst (AbsF src));
2795   ins_cost(150);
2796   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
2797   ins_encode %{
2798     int vector_len = 0;
2799     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
2800               ExternalAddress(float_signmask()), vector_len);
2801   %}
2802   ins_pipe(pipe_slow);
2803 %}
2804 #endif
2805 
2806 instruct absD_reg(regD dst) %{
2807   predicate((UseSSE>=2) && (UseAVX == 0));
2808   match(Set dst (AbsD dst));
2809   ins_cost(150);
2810   format %{ "andpd   $dst, [0x7fffffffffffffff]\t"
2811             "# abs double by sign masking" %}
2812   ins_encode %{
2813     __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
2814   %}
2815   ins_pipe(pipe_slow);
2816 %}
2817 
2818 instruct absD_reg_reg(regD dst, regD src) %{
2819   predicate(VM_Version::supports_avx256only());
2820   match(Set dst (AbsD src));
2821   ins_cost(150);
2822   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
2823             "# abs double by sign masking" %}
2824   ins_encode %{
2825     int vector_len = 0;
2826     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
2827               ExternalAddress(double_signmask()), vector_len);
2828   %}
2829   ins_pipe(pipe_slow);
2830 %}
2831 
2832 #ifdef _LP64
2833 instruct absD_reg_reg_evex(regD dst, regD src) %{
2834   predicate(UseAVX > 2 && VM_Version::supports_avx512vl());
2835   match(Set dst (AbsD src));
2836   ins_cost(150);
2837   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
2838             "# abs double by sign masking" %}
2839   ins_encode %{
2840     int vector_len = 0;
2841     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
2842               ExternalAddress(double_signmask()), vector_len);
2843   %}
2844   ins_pipe(pipe_slow);
2845 %}
2846 
2847 instruct absD_reg_reg_evex_special(regD dst, regD src1, regD src2) %{
2848   predicate(VM_Version::supports_avx512novl());
2849   match(Set dst (AbsD src1));
2850   effect(TEMP src2);
2851   ins_cost(150);
2852   format %{ "vabssd  $dst, $src1, $src2, [0x7fffffffffffffff]\t# abs double by sign masking" %}
2853   ins_encode %{
2854     int vector_len = 0;
2855     __ vabssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister,
2856               ExternalAddress(double_signmask()), vector_len);
2857   %}
2858   ins_pipe(pipe_slow);
2859 %}
2860 #else // _LP64
2861 instruct absD_reg_reg_evex(regD dst, regD src) %{
2862   predicate(UseAVX > 2);
2863   match(Set dst (AbsD src));
2864   ins_cost(150);
2865   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
2866             "# abs double by sign masking" %}
2867   ins_encode %{
2868     int vector_len = 0;
2869     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
2870               ExternalAddress(double_signmask()), vector_len);
2871   %}
2872   ins_pipe(pipe_slow);
2873 %}
2874 #endif
2875 
2876 instruct negF_reg(regF dst) %{
2877   predicate((UseSSE>=1) && (UseAVX == 0));
2878   match(Set dst (NegF dst));
2879   ins_cost(150);
2880   format %{ "xorps   $dst, [0x80000000]\t# neg float by sign flipping" %}
2881   ins_encode %{
2882     __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
2883   %}
2884   ins_pipe(pipe_slow);
2885 %}
2886 
2887 instruct negF_reg_reg(regF dst, regF src) %{
2888   predicate(UseAVX > 0);
2889   match(Set dst (NegF src));
2890   ins_cost(150);
2891   format %{ "vnegatess  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
2892   ins_encode %{
2893     __ vnegatess($dst$$XMMRegister, $src$$XMMRegister,
2894                  ExternalAddress(float_signflip()));


4661 instruct rsadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
4662   predicate(UseSSE > 2 && UseAVX == 0);
4663   match(Set dst (AddReductionVI src1 src2));
4664   effect(TEMP tmp2, TEMP tmp);
4665   format %{ "movdqu  $tmp2,$src2\n\t"
4666             "phaddd  $tmp2,$tmp2\n\t"
4667             "movd    $tmp,$src1\n\t"
4668             "paddd   $tmp,$tmp2\n\t"
4669             "movd    $dst,$tmp\t! add reduction2I" %}
4670   ins_encode %{
4671     __ movdqu($tmp2$$XMMRegister, $src2$$XMMRegister);
4672     __ phaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister);
4673     __ movdl($tmp$$XMMRegister, $src1$$Register);
4674     __ paddd($tmp$$XMMRegister, $tmp2$$XMMRegister);
4675     __ movdl($dst$$Register, $tmp$$XMMRegister);
4676   %}
4677   ins_pipe( pipe_slow );
4678 %}
4679 
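// Illustrative walk-through of the SSE rule above: with src2 = {a, b, ?, ?},
// phaddd of tmp2 with itself leaves a+b in lane 0, movdl places src1 in an
// XMM lane, paddd adds the two, and the final movdl returns src1 + a + b.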
4680 instruct rvadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
4681   predicate(VM_Version::supports_avx256only());
4682   match(Set dst (AddReductionVI src1 src2));
4683   effect(TEMP tmp, TEMP tmp2);
4684   format %{ "vphaddd  $tmp,$src2,$src2\n\t"
4685             "movd     $tmp2,$src1\n\t"
4686             "vpaddd   $tmp2,$tmp2,$tmp\n\t"
4687             "movd     $dst,$tmp2\t! add reduction2I" %}
4688   ins_encode %{
4689     int vector_len = 0;
4690     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
4691     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4692     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
4693     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4694   %}
4695   ins_pipe( pipe_slow );
4696 %}
4697 
4698 instruct rvadd2I_reduction_reg_evex(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
4699   predicate(UseAVX > 2);
4700   match(Set dst (AddReductionVI src1 src2));
4701   effect(TEMP tmp, TEMP tmp2);


4720   match(Set dst (AddReductionVI src1 src2));
4721   effect(TEMP tmp2, TEMP tmp);
4722   format %{ "movdqu  $tmp2,$src2\n\t"
4723             "phaddd  $tmp2,$tmp2\n\t"
4724             "phaddd  $tmp2,$tmp2\n\t"
4725             "movd    $tmp,$src1\n\t"
4726             "paddd   $tmp,$tmp2\n\t"
4727             "movd    $dst,$tmp\t! add reduction4I" %}
4728   ins_encode %{
4729     __ movdqu($tmp2$$XMMRegister, $src2$$XMMRegister);
4730     __ phaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister);
4731     __ phaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister);
4732     __ movdl($tmp$$XMMRegister, $src1$$Register);
4733     __ paddd($tmp$$XMMRegister, $tmp2$$XMMRegister);
4734     __ movdl($dst$$Register, $tmp$$XMMRegister);
4735   %}
4736   ins_pipe( pipe_slow );
4737 %}
4738 
4739 instruct rvadd4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
4740   predicate(VM_Version::supports_avx256only());
4741   match(Set dst (AddReductionVI src1 src2));
4742   effect(TEMP tmp, TEMP tmp2);
4743   format %{ "vphaddd  $tmp,$src2,$src2\n\t"
4744             "vphaddd  $tmp,$tmp,$tmp2\n\t"
4745             "movd     $tmp2,$src1\n\t"
4746             "vpaddd   $tmp2,$tmp2,$tmp\n\t"
4747             "movd     $dst,$tmp2\t! add reduction4I" %}
4748   ins_encode %{
4749     int vector_len = 0;
4750     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
4751     __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4752     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4753     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
4754     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4755   %}
4756   ins_pipe( pipe_slow );
4757 %}
4758 
4759 instruct rvadd4I_reduction_reg_evex(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
4760   predicate(UseAVX > 2);


4764             "vpaddd  $tmp,$src2,$tmp2\n\t"
4765             "pshufd  $tmp2,$tmp,0x1\n\t"
4766             "vpaddd  $tmp,$tmp,$tmp2\n\t"
4767             "movd    $tmp2,$src1\n\t"
4768             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
4769             "movd    $dst,$tmp2\t! add reduction4I" %}
4770   ins_encode %{
4771     int vector_len = 0;
4772     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
4773     __ vpaddd($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4774     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
4775     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4776     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4777     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4778     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4779   %}
4780   ins_pipe( pipe_slow );
4781 %}
4782 
4783 instruct rvadd8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
4784   predicate(VM_Version::supports_avx256only());
4785   match(Set dst (AddReductionVI src1 src2));
4786   effect(TEMP tmp, TEMP tmp2);
4787   format %{ "vphaddd  $tmp,$src2,$src2\n\t"
4788             "vphaddd  $tmp,$tmp,$tmp2\n\t"
4789             "vextracti128  $tmp2,$tmp\n\t"
4790             "vpaddd   $tmp,$tmp,$tmp2\n\t"
4791             "movd     $tmp2,$src1\n\t"
4792             "vpaddd   $tmp2,$tmp2,$tmp\n\t"
4793             "movd     $dst,$tmp2\t! add reduction8I" %}
4794   ins_encode %{
4795     int vector_len = 1;
4796     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
4797     __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4798     __ vextracti128h($tmp2$$XMMRegister, $tmp$$XMMRegister);
4799     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
4800     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4801     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4802     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4803   %}
4804   ins_pipe( pipe_slow );


4819             "movd    $dst,$tmp2\t! add reduction8I" %}
4820   ins_encode %{
4821     int vector_len = 0;
4822     __ vextracti128h($tmp$$XMMRegister, $src2$$XMMRegister);
4823     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
4824     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
4825     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4826     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
4827     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4828     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4829     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4830     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4831   %}
4832   ins_pipe( pipe_slow );
4833 %}
4834 
4835 instruct rvadd16I_reduction_reg_evex(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, regF tmp3) %{
4836   predicate(UseAVX > 2);
4837   match(Set dst (AddReductionVI src1 src2));
4838   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
4839   format %{ "vextracti64x4  $tmp3,$src2,0x1\n\t"
4840             "vpaddd  $tmp3,$tmp3,$src2\n\t"
4841             "vextracti128   $tmp,$tmp3\n\t"
4842             "vpaddd  $tmp,$tmp,$tmp3\n\t"
4843             "pshufd  $tmp2,$tmp,0xE\n\t"
4844             "vpaddd  $tmp,$tmp,$tmp2\n\t"
4845             "pshufd  $tmp2,$tmp,0x1\n\t"
4846             "vpaddd  $tmp,$tmp,$tmp2\n\t"
4847             "movd    $tmp2,$src1\n\t"
4848             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
4849             "movd    $dst,$tmp2\t! add reduction16I" %}
4850   ins_encode %{
4851     __ vextracti64x4h($tmp3$$XMMRegister, $src2$$XMMRegister, 1);
4852     __ vpaddd($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
4853     __ vextracti128h($tmp$$XMMRegister, $tmp3$$XMMRegister);
4854     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
4855     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
4856     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
4857     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
4858     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
4859     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4860     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
4861     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4862   %}
4863   ins_pipe( pipe_slow );
4864 %}
4865 
4866 #ifdef _LP64
4867 instruct rvadd2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, regF tmp, regF tmp2) %{
4868   predicate(UseAVX > 2);
4869   match(Set dst (AddReductionVL src1 src2));
4870   effect(TEMP tmp, TEMP tmp2);
4871   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
4872             "vpaddq  $tmp,$src2,$tmp2\n\t"
4873             "movdq   $tmp2,$src1\n\t"
4874             "vpaddq  $tmp2,$tmp,$tmp2\n\t"
4875             "movdq   $dst,$tmp2\t! add reduction2L" %}
4876   ins_encode %{
4877     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
4878     __ vpaddq($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, 0);
4879     __ movdq($tmp2$$XMMRegister, $src1$$Register);
4880     __ vpaddq($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
4881     __ movdq($dst$$Register, $tmp2$$XMMRegister);
4882   %}
4883   ins_pipe( pipe_slow );
4884 %}
4885 
4886 instruct rvadd4L_reduction_reg(rRegL dst, rRegL src1, vecY src2, regF tmp, regF tmp2) %{
4887   predicate(UseAVX > 2);
4888   match(Set dst (AddReductionVL src1 src2));
4889   effect(TEMP tmp, TEMP tmp2);
4890   format %{ "vextracti128  $tmp,$src2\n\t"
4891             "vpaddq  $tmp2,$tmp,$src2\n\t"
4892             "pshufd  $tmp,$tmp2,0xE\n\t"
4893             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
4894             "movdq   $tmp,$src1\n\t"
4895             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
4896             "movdq   $dst,$tmp2\t! add reduction4L" %}
4897   ins_encode %{
4898     __ vextracti128h($tmp$$XMMRegister, $src2$$XMMRegister);
4899     __ vpaddq($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
4900     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
4901     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4902     __ movdq($tmp$$XMMRegister, $src1$$Register);
4903     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4904     __ movdq($dst$$Register, $tmp2$$XMMRegister);
4905   %}
4906   ins_pipe( pipe_slow );
4907 %}
4908 
4909 instruct rvadd8L_reduction_reg(rRegL dst, rRegL src1, vecZ src2, regF tmp, regF tmp2) %{
4910   predicate(UseAVX > 2);
4911   match(Set dst (AddReductionVL src1 src2));
4912   effect(TEMP tmp, TEMP tmp2);
4913   format %{ "vextracti64x4  $tmp2,$src2,0x1\n\t"
4914             "vpaddq  $tmp2,$tmp2,$src2\n\t"
4915             "vextracti128   $tmp,$tmp2\n\t"
4916             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
4917             "pshufd  $tmp,$tmp2,0xE\n\t"
4918             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
4919             "movdq   $tmp,$src1\n\t"
4920             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
4921             "movdq   $dst,$tmp2\t! add reduction8L" %}
4922   ins_encode %{
4923     __ vextracti64x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 1);
4924     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
4925     __ vextracti128h($tmp$$XMMRegister, $tmp2$$XMMRegister);
4926     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4927     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
4928     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4929     __ movdq($tmp$$XMMRegister, $src1$$Register);
4930     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4931     __ movdq($dst$$Register, $tmp2$$XMMRegister);
4932   %}
4933   ins_pipe( pipe_slow );
4934 %}
4935 #endif
4936 
4937 instruct rsadd2F_reduction_reg(regF dst, vecD src2, regF tmp) %{
4938   predicate(UseSSE >= 1 && UseAVX == 0);
4939   match(Set dst (AddReductionVF dst src2));
4940   effect(TEMP dst, TEMP tmp);
4941   format %{ "addss   $dst,$src2\n\t"
4942             "pshufd  $tmp,$src2,0x01\n\t"
4943             "addss   $dst,$tmp\t! add reduction2F" %}


4944   ins_encode %{
4945     __ addss($dst$$XMMRegister, $src2$$XMMRegister);
4946     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
4947     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);


4948   %}
4949   ins_pipe( pipe_slow );
4950 %}
4951 
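// Illustrative note on the pshufd immediates in these float reductions:
// bits [1:0] of the immediate select which source lane is copied into
// lane 0, so 0x01, 0x02 and 0x03 rotate elements 1, 2 and 3 of src2 into
// the scalar position for the following addss/vaddss accumulation.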
4952 instruct rvadd2F_reduction_reg(regF dst, vecD src2, regF tmp) %{
4953   predicate(UseAVX > 0);
4954   match(Set dst (AddReductionVF dst src2));
4955   effect(TEMP dst, TEMP tmp);
4956   format %{ "vaddss  $dst,$dst,$src2\n\t"
4957             "pshufd  $tmp,$src2,0x01\n\t"
4958             "vaddss  $dst,$dst,$tmp\t! add reduction2F" %}
4959   ins_encode %{
4960     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
4961     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
4962     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4963   %}
4964   ins_pipe( pipe_slow );
4965 %}
4966 
4967 instruct rsadd4F_reduction_reg(regF dst, vecX src2, regF tmp) %{
4968   predicate(UseSSE >= 1 && UseAVX == 0);
4969   match(Set dst (AddReductionVF dst src2));
4970   effect(TEMP dst, TEMP tmp);
4971   format %{ "addss   $dst,$src2\n\t"
4972             "pshufd  $tmp,$src2,0x01\n\t"
4973             "addss   $dst,$tmp\n\t"
4974             "pshufd  $tmp,$src2,0x02\n\t"
4975             "addss   $dst,$tmp\n\t"
4976             "pshufd  $tmp,$src2,0x03\n\t"
4977             "addss   $dst,$tmp\t! add reduction4F" %}
4978   ins_encode %{
4979     __ addss($dst$$XMMRegister, $src2$$XMMRegister);
4980     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
4981     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
4982     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
4983     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
4984     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
4985     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);




4986   %}
4987   ins_pipe( pipe_slow );
4988 %}
4989 
4990 instruct rvadd4F_reduction_reg(regF dst, vecX src2, regF tmp) %{
4991   predicate(UseAVX > 0);
4992   match(Set dst (AddReductionVF dst src2));
4993   effect(TEMP tmp, TEMP dst);
4994   format %{ "vaddss  $dst,$dst,$src2\n\t"
4995             "pshufd  $tmp,$src2,0x01\n\t"
4996             "vaddss  $dst,$dst,$tmp\n\t"
4997             "pshufd  $tmp,$src2,0x02\n\t"
4998             "vaddss  $dst,$dst,$tmp\n\t"
4999             "pshufd  $tmp,$src2,0x03\n\t"
5000             "vaddss  $dst,$dst,$tmp\t! add reduction4F" %}
5001   ins_encode %{
5002     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5003     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5004     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5005     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5006     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5007     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5008     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5009   %}
5010   ins_pipe( pipe_slow );
5011 %}
5012 
5013 instruct radd8F_reduction_reg(regF dst, vecY src2, regF tmp, regF tmp2) %{
5014   predicate(UseAVX > 0);
5015   match(Set dst (AddReductionVF dst src2));
5016   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5017   format %{ "vaddss  $dst,$dst,$src2\n\t"
5018             "pshufd  $tmp,$src2,0x01\n\t"
5019             "vaddss  $dst,$dst,$tmp\n\t"
5020             "pshufd  $tmp,$src2,0x02\n\t"
5021             "vaddss  $dst,$dst,$tmp\n\t"
5022             "pshufd  $tmp,$src2,0x03\n\t"
5023             "vaddss  $dst,$dst,$tmp\n\t"
5024             "vextractf128  $tmp2,$src2\n\t"
5025             "vaddss  $dst,$dst,$tmp2\n\t"
5026             "pshufd  $tmp,$tmp2,0x01\n\t"
5027             "vaddss  $dst,$dst,$tmp\n\t"
5028             "pshufd  $tmp,$tmp2,0x02\n\t"
5029             "vaddss  $dst,$dst,$tmp\n\t"
5030             "pshufd  $tmp,$tmp2,0x03\n\t"
5031             "vaddss  $dst,$dst,$tmp\t! add reduction8F" %}
5032   ins_encode %{
5033     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5034     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5035     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5036     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5037     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5038     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5039     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5040     __ vextractf128h($tmp2$$XMMRegister, $src2$$XMMRegister);
5041     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5042     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5043     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5044     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5045     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5046     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5047     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5048   %}
5049   ins_pipe( pipe_slow );
5050 %}
5051 
5052 instruct radd16F_reduction_reg(regF dst, vecZ src2, regF tmp, regF tmp2) %{
5053   predicate(UseAVX > 2);
5054   match(Set dst (AddReductionVF dst src2));
5055   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5056   format %{ "vaddss  $dst,$dst,$src2\n\t"
5057             "pshufd  $tmp,$src2,0x01\n\t"
5058             "vaddss  $dst,$dst,$tmp\n\t"
5059             "pshufd  $tmp,$src2,0x02\n\t"
5060             "vaddss  $dst,$dst,$tmp\n\t"
5061             "pshufd  $tmp,$src2,0x03\n\t"
5062             "vaddss  $dst,$dst,$tmp\n\t"
5063             "vextractf32x4  $tmp2,$src2, 0x1\n\t"
5064             "vaddss  $dst,$dst,$tmp2\n\t"
5065             "pshufd  $tmp,$tmp2,0x01\n\t"
5066             "vaddss  $dst,$dst,$tmp\n\t"
5067             "pshufd  $tmp,$tmp2,0x02\n\t"
5068             "vaddss  $dst,$dst,$tmp\n\t"
5069             "pshufd  $tmp,$tmp2,0x03\n\t"
5070             "vaddss  $dst,$dst,$tmp\n\t"
5071             "vextractf32x4  $tmp2,$src2, 0x2\n\t"
5072             "vaddss  $dst,$dst,$tmp2\n\t"
5073             "pshufd  $tmp,$tmp2,0x01\n\t"
5074             "vaddss  $dst,$dst,$tmp\n\t"
5075             "pshufd  $tmp,$tmp2,0x02\n\t"
5076             "vaddss  $dst,$dst,$tmp\n\t"
5077             "pshufd  $tmp,$tmp2,0x03\n\t"
5078             "vaddss  $dst,$dst,$tmp\n\t"
5079             "vextractf32x4  $tmp2,$src2, 0x3\n\t"
5080             "vaddss  $dst,$dst,$tmp2\n\t"
5081             "pshufd  $tmp,$tmp2,0x01\n\t"
5082             "vaddss  $dst,$dst,$tmp\n\t"
5083             "pshufd  $tmp,$tmp2,0x02\n\t"
5084             "vaddss  $dst,$dst,$tmp\n\t"
5085             "pshufd  $tmp,$tmp2,0x03\n\t"
5086             "vaddss  $dst,$dst,$tmp\t! add reduction16F" %}
5087   ins_encode %{
5088     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5089     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5090     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5091     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5092     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5093     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5094     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5095     __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5096     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5097     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5098     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5099     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5100     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5101     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5102     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5103     __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5104     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5105     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5106     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5107     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5108     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5109     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5110     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5111     __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5112     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5113     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5114     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5115     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5116     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5117     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5118     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5119   %}
5120   ins_pipe( pipe_slow );
5121 %}
5122 
5123 instruct rsadd2D_reduction_reg(regD dst, vecX src2, regD tmp) %{
5124   predicate(UseSSE >= 1 && UseAVX == 0);
5125   match(Set dst (AddReductionVD dst src2));
5126   effect(TEMP tmp, TEMP dst);
5127   format %{ "addsd   $dst,$src2\n\t"
5128             "pshufd  $tmp,$src2,0xE\n\t"

5129             "addsd   $dst,$tmp\t! add reduction2D" %}
5130   ins_encode %{
5131     __ addsd($dst$$XMMRegister, $src2$$XMMRegister);
5132     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);

5133     __ addsd($dst$$XMMRegister, $tmp$$XMMRegister);
5134   %}
5135   ins_pipe( pipe_slow );
5136 %}
5137 
5138 instruct rvadd2D_reduction_reg(regD dst, vecX src2, regD tmp) %{
5139   predicate(UseAVX > 0);
5140   match(Set dst (AddReductionVD dst src2));
5141   effect(TEMP tmp, TEMP dst);
5142   format %{ "vaddsd  $dst,$dst,$src2\n\t"
5143             "pshufd  $tmp,$src2,0xE\n\t"
5144             "vaddsd  $dst,$dst,$tmp\t! add reduction2D" %}
5145   ins_encode %{
5146     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5147     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5148     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5149   %}
5150   ins_pipe( pipe_slow );
5151 %}
5152 
5153 instruct rvadd4D_reduction_reg(regD dst, vecY src2, regD tmp, regD tmp2) %{
5154   predicate(UseAVX > 0);
5155   match(Set dst (AddReductionVD dst src2));
5156   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5157   format %{ "vaddsd  $dst,$dst,$src2\n\t"
5158             "pshufd  $tmp,$src2,0xE\n\t"
5159             "vaddsd  $dst,$dst,$tmp\n\t"
5160             "vextractf32x4  $tmp2,$src2, 0x1\n\t"
5161             "vaddsd  $dst,$dst,$tmp2\n\t"
5162             "pshufd  $tmp,$tmp2,0xE\n\t"
5163             "vaddsd  $dst,$dst,$tmp\t! add reduction4D" %}
5164   ins_encode %{
5165     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5166     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5167     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5168     __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5169     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5170     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5171     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5172   %}
5173   ins_pipe( pipe_slow );
5174 %}
5175 
5176 instruct rvadd8D_reduction_reg(regD dst, vecZ src2, regD tmp, regD tmp2) %{
5177   predicate(UseAVX > 2);
5178   match(Set dst (AddReductionVD dst src2));
5179   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5180   format %{ "vaddsd  $dst,$dst,$src2\n\t"
5181             "pshufd  $tmp,$src2,0xE\n\t"
5182             "vaddsd  $dst,$dst,$tmp\n\t"
5183             "vextractf32x4  $tmp2,$src2, 0x1\n\t"
5184             "vaddsd  $dst,$dst,$tmp2\n\t"
5185             "pshufd  $tmp,$tmp2,0xE\n\t"
5186             "vaddsd  $dst,$dst,$tmp\n\t"
5187             "vextractf32x4  $tmp2,$src2, 0x2\n\t"
5188             "vaddsd  $dst,$dst,$tmp2\n\t"
5189             "pshufd  $tmp,$tmp2,0xE\n\t"
5190             "vaddsd  $dst,$dst,$tmp\n\t"
5191             "vextractf32x4  $tmp2,$src2, 0x3\n\t"
5192             "vaddsd  $dst,$dst,$tmp2\n\t"
5193             "pshufd  $tmp,$tmp2,0xE\n\t"
5194             "vaddsd  $dst,$dst,$tmp\t! add reduction8D" %}
5195   ins_encode %{
5196     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5197     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5198     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5199     __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5200     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5201     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5202     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5203     __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5204     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5205     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5206     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5207     __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5208     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5209     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5210     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5211   %}
5212   ins_pipe( pipe_slow );
5213 %}
5214 
5215 instruct rsmul2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
5216   predicate(UseSSE > 3 && UseAVX == 0);
5217   match(Set dst (MulReductionVI src1 src2));
5218   effect(TEMP tmp, TEMP tmp2);
5219   format %{ "pshufd  $tmp2,$src2,0x1\n\t"
5220             "pmulld  $tmp2,$src2\n\t"
5221             "movd    $tmp,$src1\n\t"
5222             "pmulld  $tmp2,$tmp\n\t"
5223             "movd    $dst,$tmp2\t! mul reduction2I" %}
5224   ins_encode %{
5225     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5226     __ pmulld($tmp2$$XMMRegister, $src2$$XMMRegister);
5227     __ movdl($tmp$$XMMRegister, $src1$$Register);
5228     __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
5229     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5230   %}


5313             "movd     $dst,$tmp2\t! mul reduction8I" %}
5314   ins_encode %{
5315     int vector_len = 0;
5316     __ vextracti128h($tmp$$XMMRegister, $src2$$XMMRegister);
5317     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
5318     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
5319     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5320     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5321     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5322     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5323     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5324     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5325   %}
5326   ins_pipe( pipe_slow );
5327 %}
5328 
5329 instruct rvmul16I_reduction_reg(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, regF tmp3) %{
5330   predicate(UseAVX > 2);
5331   match(Set dst (MulReductionVI src1 src2));
5332   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
5333   format %{ "vextracti64x4  $tmp3,$src2,0x1\n\t"
5334             "vpmulld  $tmp3,$tmp3,$src2\n\t"
5335             "vextracti128   $tmp,$tmp3\n\t"
5336             "vpmulld  $tmp,$tmp,$tmp3\n\t"
5337             "pshufd   $tmp2,$tmp,0xE\n\t"
5338             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5339             "pshufd   $tmp2,$tmp,0x1\n\t"
5340             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5341             "movd     $tmp2,$src1\n\t"
5342             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
5343             "movd     $dst,$tmp2\t! mul reduction16I" %}
5344   ins_encode %{
5345     __ vextracti64x4h($tmp3$$XMMRegister, $src2$$XMMRegister, 1);
5346     __ vpmulld($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
5347     __ vextracti128h($tmp$$XMMRegister, $tmp3$$XMMRegister);
5348     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
5349     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
5350     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5351     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5352     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5353     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5354     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5355     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5356   %}
5357   ins_pipe( pipe_slow );
5358 %}
5359 
5360 #ifdef _LP64
5361 instruct rvmul2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, regF tmp, regF tmp2) %{
5362   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
5363   match(Set dst (MulReductionVL src1 src2));
5364   effect(TEMP tmp, TEMP tmp2);
5365   format %{ "pshufd   $tmp2,$src2,0xE\n\t"
5366             "vpmullq  $tmp,$src2,$tmp2\n\t"
5367             "movdq    $tmp2,$src1\n\t"
5368             "vpmullq  $tmp2,$tmp,$tmp2\n\t"
5369             "movdq    $dst,$tmp2\t! mul reduction2L" %}
5370   ins_encode %{
5371     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5372     __ vpmullq($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, 0);
5373     __ movdq($tmp2$$XMMRegister, $src1$$Register);
5374     __ vpmullq($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5375     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5376   %}
5377   ins_pipe( pipe_slow );
5378 %}
5379 
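// Note (illustrative): vpmullq is the packed 64x64->64-bit low multiply
// added with AVX-512DQ, which is why these long multiply reductions are
// additionally guarded by VM_Version::supports_avx512dq().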
5380 instruct rvmul4L_reduction_reg(rRegL dst, rRegL src1, vecY src2, regF tmp, regF tmp2) %{
5381   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
5382   match(Set dst (MulReductionVL src1 src2));
5383   effect(TEMP tmp, TEMP tmp2);
5384   format %{ "vextracti128  $tmp,$src2\n\t"
5385             "vpmullq  $tmp2,$tmp,$src2\n\t"
5386             "pshufd   $tmp,$tmp2,0xE\n\t"
5387             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5388             "movdq    $tmp,$src1\n\t"
5389             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5390             "movdq    $dst,$tmp2\t! mul reduction4L" %}
5391   ins_encode %{
5392     __ vextracti128h($tmp$$XMMRegister, $src2$$XMMRegister);
5393     __ vpmullq($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
5394     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5395     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5396     __ movdq($tmp$$XMMRegister, $src1$$Register);
5397     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5398     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5399   %}
5400   ins_pipe( pipe_slow );
5401 %}
5402 
5403 instruct rvmul8L_reduction_reg(rRegL dst, rRegL src1, vecZ src2, regF tmp, regF tmp2) %{
5404   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
5405   match(Set dst (MulReductionVL src1 src2));
5406   effect(TEMP tmp, TEMP tmp2);
5407   format %{ "vextracti64x4  $tmp2,$src2,0x1\n\t"
5408             "vpmullq  $tmp2,$tmp2,$src2\n\t"
5409             "vextracti128   $tmp,$tmp2\n\t"
5410             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5411             "pshufd   $tmp,$tmp2,0xE\n\t"
5412             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5413             "movdq    $tmp,$src1\n\t"
5414             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5415             "movdq    $dst,$tmp2\t! mul reduction8L" %}
5416   ins_encode %{
5417     __ vextracti64x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 1);
5418     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
5419     __ vextracti128h($tmp$$XMMRegister, $tmp2$$XMMRegister);
5420     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5421     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5422     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5423     __ movdq($tmp$$XMMRegister, $src1$$Register);
5424     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5425     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5426   %}
5427   ins_pipe( pipe_slow );
5428 %}
5429 #endif
5430 
5431 instruct rsmul2F_reduction(regF dst, vecD src2, regF tmp) %{
5432   predicate(UseSSE >= 1 && UseAVX == 0);
5433   match(Set dst (MulReductionVF dst src2));
5434   effect(TEMP dst, TEMP tmp);
5435   format %{ "mulss   $dst,$src2\n\t"
5436             "pshufd  $tmp,$src2,0x01\n\t"
5437             "mulss   $dst,$tmp\t! mul reduction2F" %}
5438   ins_encode %{
5439     __ mulss($dst$$XMMRegister, $src2$$XMMRegister);
5440     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5441     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
5442   %}
5443   ins_pipe( pipe_slow );
5444 %}
5445 
5446 instruct rvmul2F_reduction_reg(regF dst, vecD src2, regF tmp) %{
5447   predicate(UseAVX > 0);
5448   match(Set dst (MulReductionVF dst src2));
5449   effect(TEMP tmp, TEMP dst);
5450   format %{ "vmulss  $dst,$dst,$src2\n\t"
5451             "pshufd  $tmp,$src2,0x01\n\t"
5452             "vmulss  $dst,$dst,$tmp\t! mul reduction2F" %}
5453   ins_encode %{
5454     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5455     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5456     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5457   %}
5458   ins_pipe( pipe_slow );
5459 %}
5460 
5461 instruct rsmul4F_reduction_reg(regF dst, vecX src2, regF tmp) %{
5462   predicate(UseSSE >= 1 && UseAVX == 0);
5463   match(Set dst (MulReductionVF dst src2));
5464   effect(TEMP dst, TEMP tmp);
5465   format %{ "mulss   $dst,$src2\n\t"
5466             "pshufd  $tmp,$src2,0x01\n\t"
5467             "mulss   $dst,$tmp\n\t"
5468             "pshufd  $tmp,$src2,0x02\n\t"
5469             "mulss   $dst,$tmp\n\t"
5470             "pshufd  $tmp,$src2,0x03\n\t"
5471             "mulss   $dst,$tmp\t! mul reduction4F" %}
5472   ins_encode %{
5473     __ mulss($dst$$XMMRegister, $src2$$XMMRegister);
5474     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5475     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
5476     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5477     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
5478     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5479     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
5480   %}
5481   ins_pipe( pipe_slow );
5482 %}
5483 
5484 instruct rvmul4F_reduction_reg(regF dst, vecX src2, regF tmp) %{
5485   predicate(UseAVX > 0);
5486   match(Set dst (MulReductionVF dst src2));
5487   effect(TEMP tmp, TEMP dst);
5488   format %{ "vmulss  $dst,$dst,$src2\n\t"
5489             "pshufd  $tmp,$src2,0x01\n\t"
5490             "vmulss  $dst,$dst,$tmp\n\t"
5491             "pshufd  $tmp,$src2,0x02\n\t"
5492             "vmulss  $dst,$dst,$tmp\n\t"
5493             "pshufd  $tmp,$src2,0x03\n\t"
5494             "vmulss  $dst,$dst,$tmp\t! mul reduction4F" %}
5495   ins_encode %{
5496     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5497     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5498     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5499     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5500     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5501     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5502     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5503   %}
5504   ins_pipe( pipe_slow );
5505 %}
5506 
5507 instruct rvmul8F_reduction_reg(regF dst, vecY src2, regF tmp, regF tmp2) %{
5508   predicate(UseAVX > 0);
5509   match(Set dst (MulReductionVF dst src2));
5510   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5511   format %{ "vmulss  $dst,$dst,$src2\n\t"
5512             "pshufd  $tmp,$src2,0x01\n\t"
5513             "vmulss  $dst,$dst,$tmp\n\t"
5514             "pshufd  $tmp,$src2,0x02\n\t"
5515             "vmulss  $dst,$dst,$tmp\n\t"
5516             "pshufd  $tmp,$src2,0x03\n\t"
5517             "vmulss  $dst,$dst,$tmp\n\t"
5518             "vextractf128  $tmp2,$src2\n\t"
5519             "vmulss  $dst,$dst,$tmp2\n\t"
5520             "pshufd  $tmp,$tmp2,0x01\n\t"
5521             "vmulss  $dst,$dst,$tmp\n\t"
5522             "pshufd  $tmp,$tmp2,0x02\n\t"
5523             "vmulss  $dst,$dst,$tmp\n\t"
5524             "pshufd  $tmp,$tmp2,0x03\n\t"
5525             "vmulss  $dst,$dst,$tmp\t! mul reduction8F" %}
5526   ins_encode %{
5527     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5528     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5529     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5530     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5531     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5532     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5533     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5534     __ vextractf128h($tmp2$$XMMRegister, $src2$$XMMRegister);
5535     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5536     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5537     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5538     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5539     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5540     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5541     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5542   %}
5543   ins_pipe( pipe_slow );
5544 %}
5545 
5546 instruct rvmul16F_reduction_reg(regF dst, vecZ src2, regF tmp, regF tmp2) %{
5547   predicate(UseAVX > 2);
5548   match(Set dst (MulReductionVF dst src2));
5549   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5550   format %{ "vmulss  $dst,$dst,$src2\n\t"
5551             "pshufd  $tmp,$src2,0x01\n\t"
5552             "vmulss  $dst,$dst,$tmp\n\t"
5553             "pshufd  $tmp,$src2,0x02\n\t"
5554             "vmulss  $dst,$dst,$tmp\n\t"
5555             "pshufd  $tmp,$src2,0x03\n\t"
5556             "vmulss  $dst,$dst,$tmp\n\t"
5557             "vextractf32x4  $tmp2,$src2, 0x1\n\t"
5558             "vmulss  $dst,$dst,$tmp2\n\t"
5559             "pshufd  $tmp,$tmp2,0x01\n\t"
5560             "vmulss  $dst,$dst,$tmp\n\t"
5561             "pshufd  $tmp,$tmp2,0x02\n\t"
5562             "vmulss  $dst,$dst,$tmp\n\t"
5563             "pshufd  $tmp,$tmp2,0x03\n\t"
5564             "vmulss  $dst,$dst,$tmp\n\t"
5565             "vextractf32x4  $tmp2,$src2, 0x2\n\t"
5566             "vmulss  $dst,$dst,$tmp2\n\t"
5567             "pshufd  $tmp,$tmp2,0x01\n\t"
5568             "vmulss  $dst,$dst,$tmp\n\t"
5569             "pshufd  $tmp,$tmp2,0x02\n\t"
5570             "vmulss  $dst,$dst,$tmp\n\t"
5571             "pshufd  $tmp,$tmp2,0x03\n\t"
5572             "vmulss  $dst,$dst,$tmp\n\t"
5573             "vextractf32x4  $tmp2,$src2, 0x3\n\t"
5574             "vmulss  $dst,$dst,$tmp2\n\t"
5575             "pshufd  $tmp,$tmp2,0x01\n\t"
5576             "vmulss  $dst,$dst,$tmp\n\t"
5577             "pshufd  $tmp,$tmp2,0x02\n\t"
5578             "vmulss  $dst,$dst,$tmp\n\t"
5579             "pshufd  $tmp,$tmp2,0x03\n\t"
5580             "vmulss  $dst,$dst,$tmp\t! mul reduction16F" %}
5581   ins_encode %{
5582     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5583     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5584     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5585     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5586     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5587     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5588     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5589     __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5590     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5591     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5592     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5593     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5594     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5595     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5596     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5597     __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5598     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5599     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5600     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5601     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5602     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5603     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5604     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5605     __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5606     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5607     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5608     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5609     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5610     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5611     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5612     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5613   %}
5614   ins_pipe( pipe_slow );
5615 %}
5616 
5617 instruct rsmul2D_reduction_reg(regD dst, vecX src2, regD tmp) %{
5618   predicate(UseSSE >= 1 && UseAVX == 0);
5619   match(Set dst (MulReductionVD dst src2));
5620   effect(TEMP dst, TEMP tmp);
5621   format %{ "mulsd   $dst,$src2\n\t"
5622             "pshufd  $tmp,$src2,0xE\n\t"
5623             "mulsd   $dst,$tmp\t! mul reduction2D" %}
5624   ins_encode %{
5625     __ mulsd($dst$$XMMRegister, $src2$$XMMRegister);
5626     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5627     __ mulsd($dst$$XMMRegister, $tmp$$XMMRegister);
5628   %}
5629   ins_pipe( pipe_slow );
5630 %}
5631 
5632 instruct rvmul2D_reduction_reg(regD dst, vecX src2, regD tmp) %{
5633   predicate(UseAVX > 0);
5634   match(Set dst (MulReductionVD dst src2));
5635   effect(TEMP tmp, TEMP dst);
5636   format %{ "vmulsd  $dst,$dst,$src2\n\t"
5637             "pshufd  $tmp,$src2,0xE\n\t"
5638             "vmulsd  $dst,$dst,$tmp\t! mul reduction2D" %}
5639   ins_encode %{
5640     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5641     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5642     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5643   %}
5644   ins_pipe( pipe_slow );
5645 %}
5646 
5647 instruct rvmul4D_reduction_reg(regD dst, vecY src2, regD tmp, regD tmp2) %{
5648   predicate(UseAVX > 0);
5649   match(Set dst (MulReductionVD dst src2));
5650   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5651   format %{ "vmulsd  $dst,$dst,$src2\n\t"
5652             "pshufd  $tmp,$src2,0xE\n\t"
5653             "vmulsd  $dst,$dst,$tmp\n\t"
5654             "vextractf128  $tmp2,$src2\n\t"
5655             "vmulsd  $dst,$dst,$tmp2\n\t"
5656             "pshufd  $tmp,$tmp2,0xE\n\t"
5657             "vmulsd  $dst,$dst,$tmp\t! mul reduction4D" %}
5658   ins_encode %{
5659     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5660     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5661     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5662     __ vextractf128h($tmp2$$XMMRegister, $src2$$XMMRegister);
5663     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5664     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5665     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5666   %}
5667   ins_pipe( pipe_slow );
5668 %}
5669 
5670 instruct rvmul8D_reduction_reg(regD dst, vecZ src2, regD tmp, regD tmp2) %{
5671   predicate(UseAVX > 2);
5672   match(Set dst (MulReductionVD dst src2));
5673   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5674   format %{ "vmulsd  $dst,$dst,$src2\n\t"
5675             "pshufd  $tmp,$src2,0xE\n\t"
5676             "vmulsd  $dst,$dst,$tmp\n\t"
5677             "vextractf32x4  $tmp2,$src2, 0x1\n\t"
5678             "vmulsd  $dst,$dst,$tmp2\n\t"
5679             "pshufd  $tmp,$tmp2,0xE\n\t"
5680             "vmulsd  $dst,$dst,$tmp\n\t"
5681             "vextractf32x4  $tmp2,$src2, 0x2\n\t"
5682             "vmulsd  $dst,$dst,$tmp2\n\t"
5683             "pshufd  $tmp,$tmp2,0xE\n\t"
5684             "vmulsd  $dst,$dst,$tmp\n\t"
5685             "vextractf32x4  $tmp2,$src2, 0x3\n\t"
5686             "vmulsd  $dst,$dst,$tmp2\n\t"
5687             "pshufd  $tmp,$tmp2,0xE\n\t"
5688             "vmulsd  $dst,$dst,$tmp\t! mul reduction8D" %}
5689   ins_encode %{
5690     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5691     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5692     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5693     __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5694     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5695     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5696     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5697     __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5698     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5699     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5700     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5701     __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5702     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5703     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5704     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5705   %}
5706   ins_pipe( pipe_slow );
5707 %}
5708 
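// The MulReductionV{L,F,D} rules above back product reductions that the
// superword optimizer forms from loops of the (hypothetical) shape:
//
//   float prod = 1.0f;
//   for (int i = 0; i < a.length; i++) {
//     prod *= a[i];
//   }
//
// The running product is the scalar input (src1, or the tied dst) and src2
// is one vector of lanes.  The long variants additionally require AVX-512DQ
// because they are built on vpmullq.
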
5709 // ====================VECTOR ARITHMETIC=======================================
5710 
5711 // --------------------------------- ADD --------------------------------------
5712 
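// The byte and short (char) add/sub rules come in three flavors, selected by
// their predicates:
//   *_avx          - AVX/AVX2 only (VM_Version::supports_avx256only()).
//   *_evex         - AVX-512 with AVX512BW, which provides EVEX encodings
//                    for byte/word vector instructions.
//   *_evex_special - AVX-512 without AVX512BW (supports_avx512nobw()); the
//                    rule is written in the destructive, dst-tied form.
// The vector_len argument passed to the assembler selects the operand width:
// 0 = 128-bit, 1 = 256-bit, 2 = 512-bit.
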
5713 // Bytes vector add
5714 instruct vadd4B(vecS dst, vecS src) %{
5715   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
5716   match(Set dst (AddVB dst src));
5717   format %{ "paddb   $dst,$src\t! add packed4B" %}
5718   ins_encode %{
5719     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
5720   %}
5721   ins_pipe( pipe_slow );
5722 %}
5723 
5724 instruct vadd4B_reg_avx(vecS dst, vecS src1, vecS src2) %{
5725   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 4);
5726   match(Set dst (AddVB src1 src2));
5727   format %{ "vpaddb  $dst,$src1,$src2\t! add packed4B" %}
5728   ins_encode %{
5729     int vector_len = 0;
5730     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5731   %}
5732   ins_pipe( pipe_slow );
5733 %}
5734 
5735 instruct vadd4B_reg_evex(vecS dst, vecS src1, vecS src2) %{
5736   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
5737   match(Set dst (AddVB src1 src2));
5738   format %{ "vpaddb  $dst,$src1,$src2\t! add packed4B" %}
5739   ins_encode %{
5740     int vector_len = 0;
5741     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5742   %}
5743   ins_pipe( pipe_slow );
5744 %}
5745 
5746 instruct vadd4B_reg_evex_special(vecS dst, vecS src1, vecS src2) %{
5747   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
5748   match(Set dst (AddVB dst src2));
5749   effect(TEMP src1);
5750   format %{ "vpaddb  $dst,$dst,$src2\t! add packed4B" %}
5751   ins_encode %{
5752     int vector_len = 0;
5753     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5754   %}
5755   ins_pipe( pipe_slow );
5756 %}
5757 
5758 instruct vadd4B_mem_avx(vecS dst, vecS src, memory mem) %{
5759   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 4);
5760   match(Set dst (AddVB src (LoadVector mem)));
5761   format %{ "vpaddb  $dst,$src,$mem\t! add packed4B" %}
5762   ins_encode %{
5763     int vector_len = 0;
5764     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5765   %}
5766   ins_pipe( pipe_slow );
5767 %}
5768 
5769 instruct vadd4B_mem_evex(vecS dst, vecS src, memory mem) %{
5770   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
5771   match(Set dst (AddVB src (LoadVector mem)));
5772   format %{ "vpaddb  $dst,$src,$mem\t! add packed4B" %}
5773   ins_encode %{
5774     int vector_len = 0;
5775     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5776   %}
5777   ins_pipe( pipe_slow );
5778 %}
5779 
5780 instruct vadd4B_mem_evex_special(vecS dst, vecS src, memory mem) %{
5781   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
5782   match(Set dst (AddVB dst (LoadVector mem)));
5783   effect(TEMP src);
5784   format %{ "vpaddb  $dst,$src,$mem\t! add packed4B" %}
5785   ins_encode %{
5786     int vector_len = 0;
5787     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5788   %}
5789   ins_pipe( pipe_slow );
5790 %}
5791 
5792 instruct vadd8B(vecD dst, vecD src) %{
5793   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
5794   match(Set dst (AddVB dst src));
5795   format %{ "paddb   $dst,$src\t! add packed8B" %}
5796   ins_encode %{
5797     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
5798   %}
5799   ins_pipe( pipe_slow );
5800 %}
5801 
5802 instruct vadd8B_reg_avx(vecD dst, vecD src1, vecD src2) %{
5803   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 8);
5804   match(Set dst (AddVB src1 src2));
5805   format %{ "vpaddb  $dst,$src1,$src2\t! add packed8B" %}
5806   ins_encode %{
5807     int vector_len = 0;
5808     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5809   %}
5810   ins_pipe( pipe_slow );
5811 %}
5812 
5813 instruct vadd8B_reg_evex(vecD dst, vecD src1, vecD src2) %{
5814   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
5815   match(Set dst (AddVB src1 src2));
5816   format %{ "vpaddb  $dst,$src1,$src2\t! add packed8B" %}
5817   ins_encode %{
5818     int vector_len = 0;
5819     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5820   %}
5821   ins_pipe( pipe_slow );
5822 %}
5823 
5824 instruct vadd8B_reg_evex_special(vecD dst, vecD src1, vecD src2) %{
5825   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
5826   match(Set dst (AddVB dst src2));
5827   effect(TEMP src1);
5828   format %{ "vpaddb  $dst,$dst,$src2\t! add packed8B" %}
5829   ins_encode %{
5830     int vector_len = 0;
5831     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5832   %}
5833   ins_pipe( pipe_slow );
5834 %}
5835 
5836 instruct vadd8B_mem_avx(vecD dst, vecD src, memory mem) %{
5837   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 8);
5838   match(Set dst (AddVB src (LoadVector mem)));
5839   format %{ "vpaddb  $dst,$src,$mem\t! add packed8B" %}
5840   ins_encode %{
5841     int vector_len = 0;
5842     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5843   %}
5844   ins_pipe( pipe_slow );
5845 %}
5846 
5847 instruct vadd8B_mem_evex(vecD dst, vecD src, memory mem) %{
5848   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
5849   match(Set dst (AddVB src (LoadVector mem)));
5850   format %{ "vpaddb  $dst,$src,$mem\t! add packed8B" %}
5851   ins_encode %{
5852     int vector_len = 0;
5853     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5854   %}
5855   ins_pipe( pipe_slow );
5856 %}
5857 
5858 instruct vadd8B_mem_evex_special(vecD dst, vecD src, memory mem) %{
5859   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
5860   match(Set dst (AddVB dst (LoadVector mem)));
5861   effect(TEMP src);
5862   format %{ "vpaddb  $dst,$src,$mem\t! add packed8B" %}
5863   ins_encode %{
5864     int vector_len = 0;
5865     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5866   %}
5867   ins_pipe( pipe_slow );
5868 %}
5869 
5870 instruct vadd16B(vecX dst, vecX src) %{
5871   predicate(UseAVX == 0 && n->as_Vector()->length() == 16);
5872   match(Set dst (AddVB dst src));
5873   format %{ "paddb   $dst,$src\t! add packed16B" %}
5874   ins_encode %{
5875     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
5876   %}
5877   ins_pipe( pipe_slow );
5878 %}
5879 
5880 instruct vadd16B_reg_avx(vecX dst, vecX src1, vecX src2) %{
5881   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
5882   match(Set dst (AddVB src1 src2));
5883   format %{ "vpaddb  $dst,$src1,$src2\t! add packed16B" %}
5884   ins_encode %{
5885     int vector_len = 0;
5886     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5887   %}
5888   ins_pipe( pipe_slow );
5889 %}
5890 
5891 instruct vadd16B_reg_evex(vecX dst, vecX src1, vecX src2) %{
5892   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
5893   match(Set dst (AddVB src1 src2));
5894   format %{ "vpaddb  $dst,$src1,$src2\t! add packed16B" %}
5895   ins_encode %{
5896     int vector_len = 0;
5897     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5898   %}
5899   ins_pipe( pipe_slow );
5900 %}
5901 
5902 instruct vadd16B_reg_evex_special(vecX dst, vecX src1, vecX src2) %{
5903   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
5904   match(Set dst (AddVB dst src2));
5905   effect(TEMP src1);
5906   format %{ "vpaddb  $dst,$dst,$src2\t! add packed16B" %}
5907   ins_encode %{
5908     int vector_len = 0;
5909     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5910   %}
5911   ins_pipe( pipe_slow );
5912 %}
5913 
5914 instruct vadd16B_mem_avx(vecX dst, vecX src, memory mem) %{
5915   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
5916   match(Set dst (AddVB src (LoadVector mem)));
5917   format %{ "vpaddb  $dst,$src,$mem\t! add packed16B" %}
5918   ins_encode %{
5919     int vector_len = 0;
5920     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5921   %}
5922   ins_pipe( pipe_slow );
5923 %}
5924 
5925 instruct vadd16B_mem_evex(vecX dst, vecX src, memory mem) %{
5926   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
5927   match(Set dst (AddVB src (LoadVector mem)));
5928   format %{ "vpaddb  $dst,$src,$mem\t! add packed16B" %}
5929   ins_encode %{
5930     int vector_len = 0;
5931     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5932   %}
5933   ins_pipe( pipe_slow );
5934 %}
5935 
5936 instruct vadd16B_mem_evex_special(vecX dst, vecX src, memory mem) %{
5937   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
5938   match(Set dst (AddVB dst (LoadVector mem)));
5939   effect(TEMP src);
5940   format %{ "vpaddb  $dst,$src,$mem\t! add packed16B" %}
5941   ins_encode %{
5942     int vector_len = 0;
5943     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5944   %}
5945   ins_pipe( pipe_slow );
5946 %}
5947 
5948 instruct vadd32B_reg_avx(vecY dst, vecY src1, vecY src2) %{
5949   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 32);
5950   match(Set dst (AddVB src1 src2));
5951   format %{ "vpaddb  $dst,$src1,$src2\t! add packed32B" %}
5952   ins_encode %{
5953     int vector_len = 1;
5954     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5955   %}
5956   ins_pipe( pipe_slow );
5957 %}
5958 
5959 instruct vadd32B_reg_evex(vecY dst, vecY src1, vecY src2) %{
5960   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
5961   match(Set dst (AddVB src1 src2));
5962   format %{ "vpaddb  $dst,$src1,$src2\t! add packed32B" %}
5963   ins_encode %{
5964     int vector_len = 1;
5965     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5966   %}
5967   ins_pipe( pipe_slow );
5968 %}
5969 
5970 instruct vadd32B_reg_evex_special(vecY dst, vecY src1, vecY src2) %{
5971   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 32);
5972   match(Set dst (AddVB dst src2));
5973   effect(TEMP src1);
5974   format %{ "vpaddb  $dst,$dst,$src2\t! add packed32B" %}
5975   ins_encode %{
5976     int vector_len = 1;
5977     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5978   %}
5979   ins_pipe( pipe_slow );
5980 %}
5981 
5982 instruct vadd32B_mem_avx(vecY dst, vecY src, memory mem) %{
5983   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 32);
5984   match(Set dst (AddVB src (LoadVector mem)));
5985   format %{ "vpaddb  $dst,$src,$mem\t! add packed32B" %}
5986   ins_encode %{
5987     int vector_len = 1;
5988     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5989   %}
5990   ins_pipe( pipe_slow );
5991 %}
5992 
5993 instruct vadd32B_mem_evex(vecY dst, vecY src, memory mem) %{
5994   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
5995   match(Set dst (AddVB src (LoadVector mem)));
5996   format %{ "vpaddb  $dst,$src,$mem\t! add packed32B" %}
5997   ins_encode %{
5998     int vector_len = 1;
5999     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6000   %}
6001   ins_pipe( pipe_slow );
6002 %}
6003 
6004 instruct vadd32B_mem_evex_special(vecY dst, vecY src, memory mem) %{
6005   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 32);
6006   match(Set dst (AddVB dst (LoadVector mem)));
6007   effect(TEMP src);
6008   format %{ "vpaddb  $dst,$src,$mem\t! add packed32B" %}
6009   ins_encode %{
6010     int vector_len = 1;
6011     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6012   %}
6013   ins_pipe( pipe_slow );
6014 %}
6015 
6016 instruct vadd64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
6017   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
6018   match(Set dst (AddVB src1 src2));
6019   format %{ "vpaddb  $dst,$src1,$src2\t! add packed64B" %}
6020   ins_encode %{
6021     int vector_len = 2;
6022     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6023   %}
6024   ins_pipe( pipe_slow );
6025 %}
6026 
6027 instruct vadd64B_mem(vecZ dst, vecZ src, memory mem) %{
6028   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
6029   match(Set dst (AddVB src (LoadVector mem)));
6030   format %{ "vpaddb  $dst,$src,$mem\t! add packed64B" %}
6031   ins_encode %{
6032     int vector_len = 2;
6033     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6034   %}
6035   ins_pipe( pipe_slow );
6036 %}
6037 
6038 // Shorts/Chars vector add
6039 instruct vadd2S(vecS dst, vecS src) %{
6040   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
6041   match(Set dst (AddVS dst src));
6042   format %{ "paddw   $dst,$src\t! add packed2S" %}
6043   ins_encode %{
6044     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
6045   %}
6046   ins_pipe( pipe_slow );
6047 %}
6048 
6049 instruct vadd2S_reg_avx(vecS dst, vecS src1, vecS src2) %{
6050   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 2);
6051   match(Set dst (AddVS src1 src2));
6052   format %{ "vpaddw  $dst,$src1,$src2\t! add packed2S" %}
6053   ins_encode %{
6054     int vector_len = 0;
6055     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6056   %}
6057   ins_pipe( pipe_slow );
6058 %}
6059 
6060 instruct vadd2S_reg_evex(vecS dst, vecS src1, vecS src2) %{
6061   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
6062   match(Set dst (AddVS src1 src2));
6063   format %{ "vpaddw  $dst,$src1,$src2\t! add packed2S" %}
6064   ins_encode %{
6065     int vector_len = 0;
6066     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6067   %}
6068   ins_pipe( pipe_slow );
6069 %}
6070 
6071 instruct vadd2S_reg_evex_special(vecS dst, vecS src1, vecS src2) %{
6072   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
6073   match(Set dst (AddVS dst src2));
6074   effect(TEMP src1);
6075   format %{ "vpaddw  $dst,$dst,$src2\t! add packed2S" %}
6076   ins_encode %{
6077     int vector_len = 0;
6078     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6079   %}
6080   ins_pipe( pipe_slow );
6081 %}
6082 
6083 instruct vadd2S_mem_avx(vecS dst, vecS src, memory mem) %{
6084   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 2);
6085   match(Set dst (AddVS src (LoadVector mem)));
6086   format %{ "vpaddw  $dst,$src,$mem\t! add packed2S" %}
6087   ins_encode %{
6088     int vector_len = 0;
6089     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6090   %}
6091   ins_pipe( pipe_slow );
6092 %}
6093 
6094 instruct vadd2S_mem_evex(vecS dst, vecS src, memory mem) %{
6095   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
6096   match(Set dst (AddVS src (LoadVector mem)));
6097   format %{ "vpaddw  $dst,$src,$mem\t! add packed2S" %}
6098   ins_encode %{
6099     int vector_len = 0;
6100     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6101   %}
6102   ins_pipe( pipe_slow );
6103 %}
6104 
6105 instruct vadd2S_mem_evex_special(vecS dst, vecS src, memory mem) %{
6106   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
6107   match(Set dst (AddVS dst (LoadVector mem)));
6108   effect(TEMP src);
6109   format %{ "vpaddw  $dst,$src,$mem\t! add packed2S" %}
6110   ins_encode %{
6111     int vector_len = 0;
6112     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6113   %}
6114   ins_pipe( pipe_slow );
6115 %}
6116 
6117 instruct vadd4S(vecD dst, vecD src) %{
6118   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
6119   match(Set dst (AddVS dst src));
6120   format %{ "paddw   $dst,$src\t! add packed4S" %}
6121   ins_encode %{
6122     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
6123   %}
6124   ins_pipe( pipe_slow );
6125 %}
6126 
6127 instruct vadd4S_reg_avx(vecD dst, vecD src1, vecD src2) %{
6128   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 4);
6129   match(Set dst (AddVS src1 src2));
6130   format %{ "vpaddw  $dst,$src1,$src2\t! add packed4S" %}
6131   ins_encode %{
6132     int vector_len = 0;
6133     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6134   %}
6135   ins_pipe( pipe_slow );
6136 %}
6137 
6138 instruct vadd4S_reg_evex(vecD dst, vecD src1, vecD src2) %{
6139   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
6140   match(Set dst (AddVS src1 src2));
6141   format %{ "vpaddw  $dst,$src1,$src2\t! add packed4S" %}
6142   ins_encode %{
6143     int vector_len = 0;
6144     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6145   %}
6146   ins_pipe( pipe_slow );
6147 %}
6148 
6149 instruct vadd4S_reg_evex_special(vecD dst, vecD src1, vecD src2) %{
6150   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
6151   match(Set dst (AddVS dst src2));
6152   effect(TEMP src1);
6153   format %{ "vpaddw  $dst,$dst,$src2\t! add packed4S" %}
6154   ins_encode %{
6155     int vector_len = 0;
6156     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6157   %}
6158   ins_pipe( pipe_slow );
6159 %}
6160 
6161 instruct vadd4S_mem_avx(vecD dst, vecD src, memory mem) %{
6162   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 4);
6163   match(Set dst (AddVS src (LoadVector mem)));
6164   format %{ "vpaddw  $dst,$src,$mem\t! add packed4S" %}
6165   ins_encode %{
6166     int vector_len = 0;
6167     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6168   %}
6169   ins_pipe( pipe_slow );
6170 %}
6171 
6172 instruct vadd4S_mem_evex(vecD dst, vecD src, memory mem) %{
6173   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
6174   match(Set dst (AddVS src (LoadVector mem)));
6175   format %{ "vpaddw  $dst,$src,$mem\t! add packed4S" %}
6176   ins_encode %{
6177     int vector_len = 0;
6178     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6179   %}
6180   ins_pipe( pipe_slow );
6181 %}
6182 
6183 instruct vadd4S_mem_evex_special(vecD dst, vecD src, memory mem) %{
6184   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
6185   match(Set dst (AddVS dst (LoadVector mem)));
6186   effect(TEMP src);
6187   format %{ "vpaddw  $dst,$src,$mem\t! add packed4S" %}
6188   ins_encode %{
6189     int vector_len = 0;
6190     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6191   %}
6192   ins_pipe( pipe_slow );
6193 %}
6194 
6195 instruct vadd8S(vecX dst, vecX src) %{
6196   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
6197   match(Set dst (AddVS dst src));
6198   format %{ "paddw   $dst,$src\t! add packed8S" %}
6199   ins_encode %{
6200     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
6201   %}
6202   ins_pipe( pipe_slow );
6203 %}
6204 
6205 instruct vadd8S_reg_avx(vecX dst, vecX src1, vecX src2) %{
6206   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 8);
6207   match(Set dst (AddVS src1 src2));
6208   format %{ "vpaddw  $dst,$src1,$src2\t! add packed8S" %}
6209   ins_encode %{
6210     int vector_len = 0;
6211     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6212   %}
6213   ins_pipe( pipe_slow );
6214 %}
6215 
6216 instruct vadd8S_reg_evex(vecX dst, vecX src1, vecX src2) %{
6217   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
6218   match(Set dst (AddVS src1 src2));
6219   format %{ "vpaddw  $dst,$src1,$src2\t! add packed8S" %}
6220   ins_encode %{
6221     int vector_len = 0;
6222     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6223   %}
6224   ins_pipe( pipe_slow );
6225 %}
6226 
6227 instruct vadd8S_reg_evex_special(vecX dst, vecX src1, vecX src2) %{
6228   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
6229   match(Set dst (AddVS dst src2));
6230   effect(TEMP src1);
6231   format %{ "vpaddw  $dst,$dst,$src2\t! add packed8S" %}
6232   ins_encode %{
6233     int vector_len = 0;
6234     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6235   %}
6236   ins_pipe( pipe_slow );
6237 %}
6238 
6239 instruct vadd8S_mem_avx(vecX dst, vecX src, memory mem) %{
6240   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 8);
6241   match(Set dst (AddVS src (LoadVector mem)));
6242   format %{ "vpaddw  $dst,$src,$mem\t! add packed8S" %}
6243   ins_encode %{
6244     int vector_len = 0;
6245     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6246   %}
6247   ins_pipe( pipe_slow );
6248 %}
6249 
6250 instruct vadd8S_mem_evex(vecX dst, vecX src, memory mem) %{
6251   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
6252   match(Set dst (AddVS src (LoadVector mem)));
6253   format %{ "vpaddw  $dst,$src,$mem\t! add packed8S" %}
6254   ins_encode %{
6255     int vector_len = 0;
6256     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6257   %}
6258   ins_pipe( pipe_slow );
6259 %}
6260 
6261 instruct vadd8S_mem_evex_special(vecX dst, vecX src, memory mem) %{
6262   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
6263   match(Set dst (AddVS dst (LoadVector mem)));
6264   effect(TEMP src);
6265   format %{ "vpaddw  $dst,$src,$mem\t! add packed8S" %}
6266   ins_encode %{
6267     int vector_len = 0;
6268     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6269   %}
6270   ins_pipe( pipe_slow );
6271 %}
6272 
6273 instruct vadd16S_reg_avx(vecY dst, vecY src1, vecY src2) %{
6274   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
6275   match(Set dst (AddVS src1 src2));
6276   format %{ "vpaddw  $dst,$src1,$src2\t! add packed16S" %}
6277   ins_encode %{
6278     int vector_len = 1;
6279     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6280   %}
6281   ins_pipe( pipe_slow );
6282 %}
6283 
6284 instruct vadd16S_reg_evex(vecY dst, vecY src1, vecY src2) %{
6285   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
6286   match(Set dst (AddVS src1 src2));
6287   format %{ "vpaddw  $dst,$src1,$src2\t! add packed16S" %}
6288   ins_encode %{
6289     int vector_len = 1;
6290     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6291   %}
6292   ins_pipe( pipe_slow );
6293 %}
6294 
6295 instruct vadd16S_reg_evex_special(vecY dst, vecY src1, vecY src2) %{
6296   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
6297   match(Set dst (AddVS dst src2));
6298   effect(TEMP src1);
6299   format %{ "vpaddw  $dst,$dst,$src2\t! add packed16S" %}
6300   ins_encode %{
6301     int vector_len = 1;
6302     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6303   %}
6304   ins_pipe( pipe_slow );
6305 %}
6306 
6307 instruct vadd16S_mem_avx(vecY dst, vecY src, memory mem) %{
6308   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
6309   match(Set dst (AddVS src (LoadVector mem)));
6310   format %{ "vpaddw  $dst,$src,$mem\t! add packed16S" %}
6311   ins_encode %{
6312     int vector_len = 1;
6313     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6314   %}
6315   ins_pipe( pipe_slow );
6316 %}
6317 
6318 instruct vadd16S_mem_evex(vecY dst, vecY src, memory mem) %{
6319   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
6320   match(Set dst (AddVS src (LoadVector mem)));
6321   format %{ "vpaddw  $dst,$src,$mem\t! add packed16S" %}
6322   ins_encode %{
6323     int vector_len = 1;
6324     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6325   %}
6326   ins_pipe( pipe_slow );
6327 %}
6328 
6329 instruct vadd16S_mem_evex_special(vecY dst, vecY src, memory mem) %{
6330   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
6331   match(Set dst (AddVS dst (LoadVector mem)));
6332   effect(TEMP src);
6333   format %{ "vpaddw  $dst,$src,$mem\t! add packed16S" %}
6334   ins_encode %{
6335     int vector_len = 1;
6336     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6337   %}
6338   ins_pipe( pipe_slow );
6339 %}
6340 
6341 instruct vadd32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
6342   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
6343   match(Set dst (AddVS src1 src2));
6344   format %{ "vpaddw  $dst,$src1,$src2\t! add packed32S" %}
6345   ins_encode %{
6346     int vector_len = 2;
6347     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6348   %}
6349   ins_pipe( pipe_slow );
6350 %}
6351 
6352 instruct vadd32S_mem(vecZ dst, vecZ src, memory mem) %{
6353   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
6354   match(Set dst (AddVS src (LoadVector mem)));
6355   format %{ "vpaddw  $dst,$src,$mem\t! add packed32S" %}
6356   ins_encode %{
6357     int vector_len = 2;
6358     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6359   %}
6360   ins_pipe( pipe_slow );
6361 %}
6362 
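// Int, long, float and double adds need no BW-specific variants: their
// packed instructions (vpaddd, vpaddq, vaddps, vaddpd) are covered by base
// AVX-512F, so one EVEX-capable rule per vector size is enough.
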
6363 // Integers vector add
6364 instruct vadd2I(vecD dst, vecD src) %{
6365   predicate(n->as_Vector()->length() == 2);
6366   match(Set dst (AddVI dst src));
6367   format %{ "paddd   $dst,$src\t! add packed2I" %}
6368   ins_encode %{
6369     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
6370   %}
6371   ins_pipe( pipe_slow );
6372 %}
6373 
6374 instruct vadd2I_reg(vecD dst, vecD src1, vecD src2) %{
6375   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6376   match(Set dst (AddVI src1 src2));
6377   format %{ "vpaddd  $dst,$src1,$src2\t! add packed2I" %}
6378   ins_encode %{
6379     int vector_len = 0;
6380     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6381   %}
6382   ins_pipe( pipe_slow );
6383 %}
6384 
6385 instruct vadd2I_mem(vecD dst, vecD src, memory mem) %{
6386   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6387   match(Set dst (AddVI src (LoadVector mem)));
6388   format %{ "vpaddd  $dst,$src,$mem\t! add packed2I" %}
6389   ins_encode %{
6390     int vector_len = 0;
6391     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6392   %}
6393   ins_pipe( pipe_slow );
6394 %}
6395 
6396 instruct vadd4I(vecX dst, vecX src) %{
6397   predicate(n->as_Vector()->length() == 4);
6398   match(Set dst (AddVI dst src));
6399   format %{ "paddd   $dst,$src\t! add packed4I" %}
6400   ins_encode %{
6401     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
6402   %}
6403   ins_pipe( pipe_slow );
6404 %}
6405 
6406 instruct vadd4I_reg(vecX dst, vecX src1, vecX src2) %{
6407   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6408   match(Set dst (AddVI src1 src2));
6409   format %{ "vpaddd  $dst,$src1,$src2\t! add packed4I" %}
6410   ins_encode %{
6411     int vector_len = 0;
6412     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6413   %}
6414   ins_pipe( pipe_slow );
6415 %}
6416 
6417 instruct vadd4I_mem(vecX dst, vecX src, memory mem) %{
6418   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6419   match(Set dst (AddVI src (LoadVector mem)));
6420   format %{ "vpaddd  $dst,$src,$mem\t! add packed4I" %}
6421   ins_encode %{
6422     int vector_len = 0;
6423     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6424   %}
6425   ins_pipe( pipe_slow );
6426 %}
6427 
6428 instruct vadd8I_reg(vecY dst, vecY src1, vecY src2) %{
6429   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
6430   match(Set dst (AddVI src1 src2));
6431   format %{ "vpaddd  $dst,$src1,$src2\t! add packed8I" %}
6432   ins_encode %{
6433     int vector_len = 1;
6434     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6435   %}
6436   ins_pipe( pipe_slow );
6437 %}
6438 
6439 instruct vadd8I_mem(vecY dst, vecY src, memory mem) %{
6440   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
6441   match(Set dst (AddVI src (LoadVector mem)));
6442   format %{ "vpaddd  $dst,$src,$mem\t! add packed8I" %}
6443   ins_encode %{
6444     int vector_len = 1;
6445     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6446   %}
6447   ins_pipe( pipe_slow );
6448 %}
6449 
6450 instruct vadd16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
6451   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6452   match(Set dst (AddVI src1 src2));
6453   format %{ "vpaddd  $dst,$src1,$src2\t! add packed16I" %}
6454   ins_encode %{
6455     int vector_len = 2;
6456     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6457   %}
6458   ins_pipe( pipe_slow );
6459 %}
6460 
6461 instruct vadd16I_mem(vecZ dst, vecZ src, memory mem) %{
6462   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6463   match(Set dst (AddVI src (LoadVector mem)));
6464   format %{ "vpaddd  $dst,$src,$mem\t! add packed16I" %}
6465   ins_encode %{
6466     int vector_len = 2;
6467     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6468   %}
6469   ins_pipe( pipe_slow );
6470 %}
6471 
6472 // Longs vector add
6473 instruct vadd2L(vecX dst, vecX src) %{
6474   predicate(n->as_Vector()->length() == 2);
6475   match(Set dst (AddVL dst src));
6476   format %{ "paddq   $dst,$src\t! add packed2L" %}
6477   ins_encode %{
6478     __ paddq($dst$$XMMRegister, $src$$XMMRegister);
6479   %}
6480   ins_pipe( pipe_slow );
6481 %}
6482 
6483 instruct vadd2L_reg(vecX dst, vecX src1, vecX src2) %{
6484   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6485   match(Set dst (AddVL src1 src2));
6486   format %{ "vpaddq  $dst,$src1,$src2\t! add packed2L" %}
6487   ins_encode %{
6488     int vector_len = 0;
6489     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6490   %}
6491   ins_pipe( pipe_slow );
6492 %}
6493 
6494 instruct vadd2L_mem(vecX dst, vecX src, memory mem) %{
6495   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6496   match(Set dst (AddVL src (LoadVector mem)));
6497   format %{ "vpaddq  $dst,$src,$mem\t! add packed2L" %}
6498   ins_encode %{
6499     int vector_len = 0;
6500     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6501   %}
6502   ins_pipe( pipe_slow );
6503 %}
6504 
6505 instruct vadd4L_reg(vecY dst, vecY src1, vecY src2) %{
6506   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
6507   match(Set dst (AddVL src1 src2));
6508   format %{ "vpaddq  $dst,$src1,$src2\t! add packed4L" %}
6509   ins_encode %{
6510     int vector_len = 1;
6511     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6512   %}
6513   ins_pipe( pipe_slow );
6514 %}
6515 
6516 instruct vadd4L_mem(vecY dst, vecY src, memory mem) %{
6517   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
6518   match(Set dst (AddVL src (LoadVector mem)));
6519   format %{ "vpaddq  $dst,$src,$mem\t! add packed4L" %}
6520   ins_encode %{
6521     int vector_len = 1;
6522     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6523   %}
6524   ins_pipe( pipe_slow );
6525 %}
6526 
6527 instruct vadd8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
6528   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6529   match(Set dst (AddVL src1 src2));
6530   format %{ "vpaddq  $dst,$src1,$src2\t! add packed8L" %}
6531   ins_encode %{
6532     int vector_len = 2;
6533     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6534   %}
6535   ins_pipe( pipe_slow );
6536 %}
6537 
6538 instruct vadd8L_mem(vecZ dst, vecZ src, memory mem) %{
6539   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6540   match(Set dst (AddVL src (LoadVector mem)));
6541   format %{ "vpaddq  $dst,$src,$mem\t! add packed8L" %}
6542   ins_encode %{
6543     int vector_len = 2;
6544     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6545   %}
6546   ins_pipe( pipe_slow );
6547 %}
6548 
6549 // Floats vector add
6550 instruct vadd2F(vecD dst, vecD src) %{
6551   predicate(n->as_Vector()->length() == 2);
6552   match(Set dst (AddVF dst src));
6553   format %{ "addps   $dst,$src\t! add packed2F" %}
6554   ins_encode %{
6555     __ addps($dst$$XMMRegister, $src$$XMMRegister);
6556   %}
6557   ins_pipe( pipe_slow );
6558 %}
6559 
6560 instruct vadd2F_reg(vecD dst, vecD src1, vecD src2) %{
6561   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6562   match(Set dst (AddVF src1 src2));
6563   format %{ "vaddps  $dst,$src1,$src2\t! add packed2F" %}
6564   ins_encode %{
6565     int vector_len = 0;
6566     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6567   %}
6568   ins_pipe( pipe_slow );
6569 %}
6570 
6571 instruct vadd2F_mem(vecD dst, vecD src, memory mem) %{
6572   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6573   match(Set dst (AddVF src (LoadVector mem)));
6574   format %{ "vaddps  $dst,$src,$mem\t! add packed2F" %}
6575   ins_encode %{
6576     int vector_len = 0;
6577     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6578   %}
6579   ins_pipe( pipe_slow );
6580 %}
6581 
6582 instruct vadd4F(vecX dst, vecX src) %{
6583   predicate(n->as_Vector()->length() == 4);
6584   match(Set dst (AddVF dst src));
6585   format %{ "addps   $dst,$src\t! add packed4F" %}
6586   ins_encode %{
6587     __ addps($dst$$XMMRegister, $src$$XMMRegister);
6588   %}
6589   ins_pipe( pipe_slow );
6590 %}
6591 
6592 instruct vadd4F_reg(vecX dst, vecX src1, vecX src2) %{
6593   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6594   match(Set dst (AddVF src1 src2));
6595   format %{ "vaddps  $dst,$src1,$src2\t! add packed4F" %}
6596   ins_encode %{
6597     int vector_len = 0;
6598     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6599   %}
6600   ins_pipe( pipe_slow );
6601 %}
6602 
6603 instruct vadd4F_mem(vecX dst, vecX src, memory mem) %{
6604   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6605   match(Set dst (AddVF src (LoadVector mem)));
6606   format %{ "vaddps  $dst,$src,$mem\t! add packed4F" %}
6607   ins_encode %{
6608     int vector_len = 0;
6609     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6610   %}
6611   ins_pipe( pipe_slow );
6612 %}
6613 
6614 instruct vadd8F_reg(vecY dst, vecY src1, vecY src2) %{
6615   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6616   match(Set dst (AddVF src1 src2));
6617   format %{ "vaddps  $dst,$src1,$src2\t! add packed8F" %}
6618   ins_encode %{
6619     int vector_len = 1;
6620     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6621   %}
6622   ins_pipe( pipe_slow );
6623 %}
6624 
6625 instruct vadd8F_mem(vecY dst, vecY src, memory mem) %{
6626   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6627   match(Set dst (AddVF src (LoadVector mem)));
6628   format %{ "vaddps  $dst,$src,$mem\t! add packed8F" %}
6629   ins_encode %{
6630     int vector_len = 1;
6631     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6632   %}
6633   ins_pipe( pipe_slow );
6634 %}
6635 
6636 instruct vadd16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
6637   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6638   match(Set dst (AddVF src1 src2));
6639   format %{ "vaddps  $dst,$src1,$src2\t! add packed16F" %}
6640   ins_encode %{
6641     int vector_len = 2;
6642     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6643   %}
6644   ins_pipe( pipe_slow );
6645 %}
6646 
6647 instruct vadd16F_mem(vecZ dst, vecZ src, memory mem) %{
6648   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6649   match(Set dst (AddVF src (LoadVector mem)));
6650   format %{ "vaddps  $dst,$src,$mem\t! add packed16F" %}
6651   ins_encode %{
6652     int vector_len = 2;
6653     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6654   %}
6655   ins_pipe( pipe_slow );
6656 %}
6657 
6658 // Doubles vector add
6659 instruct vadd2D(vecX dst, vecX src) %{
6660   predicate(n->as_Vector()->length() == 2);
6661   match(Set dst (AddVD dst src));
6662   format %{ "addpd   $dst,$src\t! add packed2D" %}
6663   ins_encode %{
6664     __ addpd($dst$$XMMRegister, $src$$XMMRegister);
6665   %}
6666   ins_pipe( pipe_slow );
6667 %}
6668 
6669 instruct vadd2D_reg(vecX dst, vecX src1, vecX src2) %{
6670   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6671   match(Set dst (AddVD src1 src2));
6672   format %{ "vaddpd  $dst,$src1,$src2\t! add packed2D" %}
6673   ins_encode %{
6674     int vector_len = 0;
6675     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6676   %}
6677   ins_pipe( pipe_slow );
6678 %}
6679 
6680 instruct vadd2D_mem(vecX dst, vecX src, memory mem) %{
6681   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6682   match(Set dst (AddVD src (LoadVector mem)));
6683   format %{ "vaddpd  $dst,$src,$mem\t! add packed2D" %}
6684   ins_encode %{
6685     int vector_len = 0;
6686     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6687   %}
6688   ins_pipe( pipe_slow );
6689 %}
6690 
6691 instruct vadd4D_reg(vecY dst, vecY src1, vecY src2) %{
6692   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6693   match(Set dst (AddVD src1 src2));
6694   format %{ "vaddpd  $dst,$src1,$src2\t! add packed4D" %}
6695   ins_encode %{
6696     int vector_len = 1;
6697     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6698   %}
6699   ins_pipe( pipe_slow );
6700 %}
6701 
6702 instruct vadd4D_mem(vecY dst, vecY src, memory mem) %{
6703   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6704   match(Set dst (AddVD src (LoadVector mem)));
6705   format %{ "vaddpd  $dst,$src,$mem\t! add packed4D" %}
6706   ins_encode %{
6707     int vector_len = 1;
6708     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6709   %}
6710   ins_pipe( pipe_slow );
6711 %}
6712 
6713 instruct vadd8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
6714   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6715   match(Set dst (AddVD src1 src2));
6716   format %{ "vaddpd  $dst,$src1,$src2\t! add packed8D" %}
6717   ins_encode %{
6718     int vector_len = 2;
6719     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6720   %}
6721   ins_pipe( pipe_slow );
6722 %}
6723 
6724 instruct vadd8D_mem(vecZ dst, vecZ src, memory mem) %{
6725   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6726   match(Set dst (AddVD src (LoadVector mem)));
6727   format %{ "vaddpd  $dst,$src,$mem\t! add packed8D" %}
6728   ins_encode %{
6729     int vector_len = 2;
6730     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6731   %}
6732   ins_pipe( pipe_slow );
6733 %}
6734 
6735 // --------------------------------- SUB --------------------------------------
6736 
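// The vector subtract rules mirror the add rules above: the same
// avx / evex / evex_special split applies to the sub-word element types.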
6737 // Bytes vector sub
6738 instruct vsub4B(vecS dst, vecS src) %{
6739   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
6740   match(Set dst (SubVB dst src));
6741   format %{ "psubb   $dst,$src\t! sub packed4B" %}
6742   ins_encode %{
6743     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
6744   %}
6745   ins_pipe( pipe_slow );
6746 %}
6747 
6748 instruct vsub4B_reg_avx(vecS dst, vecS src1, vecS src2) %{
6749   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 4);
6750   match(Set dst (SubVB src1 src2));
6751   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed4B" %}
6752   ins_encode %{
6753     int vector_len = 0;
6754     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6755   %}
6756   ins_pipe( pipe_slow );
6757 %}
6758 
6759 instruct vsub4B_reg_evex(vecS dst, vecS src1, vecS src2) %{
6760   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
6761   match(Set dst (SubVB src1 src2));
6762   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed4B" %}
6763   ins_encode %{
6764     int vector_len = 0;
6765     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6766   %}
6767   ins_pipe( pipe_slow );
6768 %}
6769 
6770 instruct vsub4B_reg_evex_special(vecS dst, vecS src1, vecS src2) %{
6771   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
6772   match(Set dst (SubVB dst src2));
6773   effect(TEMP src1);
6774   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed4B" %}
6775   ins_encode %{
6776     int vector_len = 0;
6777     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6778   %}
6779   ins_pipe( pipe_slow );
6780 %}
6781 
6782 instruct vsub4B_mem_avx(vecS dst, vecS src, memory mem) %{
6783   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 4);
6784   match(Set dst (SubVB src (LoadVector mem)));
6785   format %{ "vpsubb  $dst,$src,$mem\t! sub packed4B" %}
6786   ins_encode %{
6787     int vector_len = 0;
6788     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6789   %}
6790   ins_pipe( pipe_slow );
6791 %}
6792 
6793 instruct vsub4B_mem_evex(vecS dst, vecS src, memory mem) %{
6794   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
6795   match(Set dst (SubVB src (LoadVector mem)));
6796   format %{ "vpsubb  $dst,$src,$mem\t! sub packed4B" %}
6797   ins_encode %{
6798     int vector_len = 0;
6799     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6800   %}
6801   ins_pipe( pipe_slow );
6802 %}
6803 
6804 instruct vsub4B_mem_evex_special(vecS dst, vecS src, memory mem) %{
6805   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
6806   match(Set dst (SubVB dst (LoadVector mem)));
6807   effect(TEMP src);
6808   format %{ "vpsubb  $dst,$src,$mem\t! sub packed4B" %}
6809   ins_encode %{
6810     int vector_len = 0;
6811     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6812   %}
6813   ins_pipe( pipe_slow );
6814 %}
6815 
6816 instruct vsub8B(vecD dst, vecD src) %{
6817   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
6818   match(Set dst (SubVB dst src));
6819   format %{ "psubb   $dst,$src\t! sub packed8B" %}
6820   ins_encode %{
6821     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
6822   %}
6823   ins_pipe( pipe_slow );
6824 %}
6825 
6826 instruct vsub8B_reg_avx(vecD dst, vecD src1, vecD src2) %{
6827   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 8);
6828   match(Set dst (SubVB src1 src2));
6829   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed8B" %}
6830   ins_encode %{
6831     int vector_len = 0;
6832     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6833   %}
6834   ins_pipe( pipe_slow );
6835 %}
6836 
6837 instruct vsub8B_reg_evex(vecD dst, vecD src1, vecD src2) %{
6838   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
6839   match(Set dst (SubVB src1 src2));
6840   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed8B" %}
6841   ins_encode %{
6842     int vector_len = 0;
6843     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6844   %}
6845   ins_pipe( pipe_slow );
6846 %}
6847 
6848 instruct vsub8B_reg_evex_special(vecD dst, vecD src1, vecD src2) %{
6849   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
6850   match(Set dst (SubVB dst src2));
6851   effect(TEMP src1);
6852   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed8B" %}
6853   ins_encode %{
6854     int vector_len = 0;
6855     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6856   %}
6857   ins_pipe( pipe_slow );
6858 %}
6859 
6860 instruct vsub8B_mem_avx(vecD dst, vecD src, memory mem) %{
6861   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 8);
6862   match(Set dst (SubVB src (LoadVector mem)));
6863   format %{ "vpsubb  $dst,$src,$mem\t! sub packed8B" %}
6864   ins_encode %{
6865     int vector_len = 0;
6866     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6867   %}
6868   ins_pipe( pipe_slow );
6869 %}
6870 
6871 instruct vsub8B_mem_evex(vecD dst, vecD src, memory mem) %{
6872   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
6873   match(Set dst (SubVB src (LoadVector mem)));
6874   format %{ "vpsubb  $dst,$src,$mem\t! sub packed8B" %}
6875   ins_encode %{
6876     int vector_len = 0;
6877     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6878   %}
6879   ins_pipe( pipe_slow );
6880 %}
6881 
6882 instruct vsub8B_mem_evex_special(vecD dst, vecD src, memory mem) %{
6883   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
6884   match(Set dst (SubVB dst (LoadVector mem)));
6885   effect(TEMP src);
6886   format %{ "vpsubb  $dst,$src,$mem\t! sub packed8B" %}
6887   ins_encode %{
6888     int vector_len = 0;
6889     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6890   %}
6891   ins_pipe( pipe_slow );
6892 %}
6893 
6894 instruct vsub16B(vecX dst, vecX src) %{
6895   predicate(UseAVX == 0 && n->as_Vector()->length() == 16);
6896   match(Set dst (SubVB dst src));
6897   format %{ "psubb   $dst,$src\t! sub packed16B" %}
6898   ins_encode %{
6899     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
6900   %}
6901   ins_pipe( pipe_slow );
6902 %}
6903 
6904 instruct vsub16B_reg_avx(vecX dst, vecX src1, vecX src2) %{
6905   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
6906   match(Set dst (SubVB src1 src2));
6907   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed16B" %}
6908   ins_encode %{
6909     int vector_len = 0;
6910     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6911   %}
6912   ins_pipe( pipe_slow );
6913 %}
6914 
6915 instruct vsub16B_reg_evex(vecX dst, vecX src1, vecX src2) %{
6916   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
6917   match(Set dst (SubVB src1 src2));
6918   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed16B" %}
6919   ins_encode %{
6920     int vector_len = 0;
6921     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6922   %}
6923   ins_pipe( pipe_slow );
6924 %}
6925 
6926 instruct vsub16B_reg_evex_special(vecX dst, vecX src1, vecX src2) %{
6927   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
6928   match(Set dst (SubVB dst src2));
6929   effect(TEMP src1);
6930   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed16B" %}
6931   ins_encode %{
6932     int vector_len = 0;
6933     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6934   %}
6935   ins_pipe( pipe_slow );
6936 %}
6937 
6938 instruct vsub16B_mem_avx(vecX dst, vecX src, memory mem) %{
6939   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
6940   match(Set dst (SubVB src (LoadVector mem)));
6941   format %{ "vpsubb  $dst,$src,$mem\t! sub packed16B" %}
6942   ins_encode %{
6943     int vector_len = 0;
6944     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6945   %}
6946   ins_pipe( pipe_slow );
6947 %}
6948 
6949 instruct vsub16B_mem_evex(vecX dst, vecX src, memory mem) %{
6950   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
6951   match(Set dst (SubVB src (LoadVector mem)));
6952   format %{ "vpsubb  $dst,$src,$mem\t! sub packed16B" %}
6953   ins_encode %{
6954     int vector_len = 0;
6955     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6956   %}
6957   ins_pipe( pipe_slow );
6958 %}
6959 
6960 instruct vsub16B_mem_evex_special(vecX dst, vecX src, memory mem) %{
6961   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
6962   match(Set dst (SubVB dst (LoadVector mem)));
6963   effect(TEMP src);
6964   format %{ "vpsubb  $dst,$src,$mem\t! sub packed16B" %}
6965   ins_encode %{
6966     int vector_len = 0;
6967     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6968   %}
6969   ins_pipe( pipe_slow );
6970 %}
6971 
6972 instruct vsub32B_reg_avx(vecY dst, vecY src1, vecY src2) %{
6973   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 32);
6974   match(Set dst (SubVB src1 src2));
6975   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed32B" %}
6976   ins_encode %{
6977     int vector_len = 1;
6978     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6979   %}
6980   ins_pipe( pipe_slow );
6981 %}
6982 
6983 instruct vsub32B_reg_evex(vecY dst, vecY src1, vecY src2) %{
6984   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
6985   match(Set dst (SubVB src1 src2));
6986   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed32B" %}
6987   ins_encode %{
6988     int vector_len = 1;
6989     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6990   %}
6991   ins_pipe( pipe_slow );
6992 %}
6993 
6994 instruct vsub32B_reg_evex_special(vecY dst, vecY src1, vecY src2) %{
6995   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 32);
6996   match(Set dst (SubVB dst src2));
6997   effect(TEMP src1);
6998   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed32B" %}
6999   ins_encode %{
7000     int vector_len = 1;
7001     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7002   %}
7003   ins_pipe( pipe_slow );
7004 %}
7005 
7006 instruct vsub32B_mem_avx(vecY dst, vecY src, memory mem) %{
7007   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 32);
7008   match(Set dst (SubVB src (LoadVector mem)));
7009   format %{ "vpsubb  $dst,$src,$mem\t! sub packed32B" %}
7010   ins_encode %{
7011     int vector_len = 1;
7012     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7013   %}
7014   ins_pipe( pipe_slow );
7015 %}
7016 
7017 instruct vsub32B_mem_evex(vecY dst, vecY src, memory mem) %{
7018   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
7019   match(Set dst (SubVB src (LoadVector mem)));
7020   format %{ "vpsubb  $dst,$src,$mem\t! sub packed32B" %}
7021   ins_encode %{
7022     int vector_len = 1;
7023     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7024   %}
7025   ins_pipe( pipe_slow );
7026 %}
7027 
7028 instruct vsub32B_mem_evex_special(vecY dst, vecY src, memory mem) %{
7029   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 32);
7030   match(Set dst (SubVB dst (LoadVector mem)));
7031   effect(TEMP src);
7032   format %{ "vpsubb  $dst,$src,$mem\t! sub packed32B" %}
7033   ins_encode %{
7034     int vector_len = 1;
7035     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7036   %}
7037   ins_pipe( pipe_slow );
7038 %}
7039 
7040 instruct vsub64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
7041   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
7042   match(Set dst (SubVB src1 src2));
7043   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed64B" %}
7044   ins_encode %{
7045     int vector_len = 2;
7046     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7047   %}
7048   ins_pipe( pipe_slow );
7049 %}
7050 
7051 instruct vsub64B_mem(vecZ dst, vecZ src, memory mem) %{
7052   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
7053   match(Set dst (SubVB src (LoadVector mem)));
7054   format %{ "vpsubb  $dst,$src,$mem\t! sub packed64B" %}
7055   ins_encode %{
7056     int vector_len = 2;
7057     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7058   %}
7059   ins_pipe( pipe_slow );
7060 %}
7061 
7062 // Shorts/Chars vector sub
7063 instruct vsub2S(vecS dst, vecS src) %{
7064   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
7065   match(Set dst (SubVS dst src));
7066   format %{ "psubw   $dst,$src\t! sub packed2S" %}
7067   ins_encode %{
7068     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
7069   %}
7070   ins_pipe( pipe_slow );
7071 %}
7072 
7073 instruct vsub2S_reg_avx(vecS dst, vecS src1, vecS src2) %{
7074   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 2);
7075   match(Set dst (SubVS src1 src2));
7076   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed2S" %}
7077   ins_encode %{
7078     int vector_len = 0;
7079     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7080   %}
7081   ins_pipe( pipe_slow );
7082 %}
7083 
7084 instruct vsub2S_reg_evex(vecS dst, vecS src1, vecS src2) %{
7085   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
7086   match(Set dst (SubVS src1 src2));
7087   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed2S" %}
7088   ins_encode %{
7089     int vector_len = 0;
7090     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7091   %}
7092   ins_pipe( pipe_slow );
7093 %}
7094 
7095 instruct vsub2S_reg_evex_special(vecS dst, vecS src1, vecS src2) %{
7096   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
7097   match(Set dst (SubVS dst src2));
7098   effect(TEMP src1);
7099   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed2S" %}
7100   ins_encode %{
7101     int vector_len = 0;
7102     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7103   %}
7104   ins_pipe( pipe_slow );
7105 %}
7106 
7107 instruct vsub2S_mem_avx(vecS dst, vecS src, memory mem) %{
7108   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 2);
7109   match(Set dst (SubVS src (LoadVector mem)));
7110   format %{ "vpsubw  $dst,$src,$mem\t! sub packed2S" %}
7111   ins_encode %{
7112     int vector_len = 0;
7113     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7114   %}
7115   ins_pipe( pipe_slow );
7116 %}
7117 
7118 instruct vsub2S_mem_evex(vecS dst, vecS src, memory mem) %{
7119   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
7120   match(Set dst (SubVS src (LoadVector mem)));
7121   format %{ "vpsubw  $dst,$src,$mem\t! sub packed2S" %}
7122   ins_encode %{
7123     int vector_len = 0;
7124     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7125   %}
7126   ins_pipe( pipe_slow );
7127 %}
7128 
7129 instruct vsub2S_mem_evex_special(vecS dst, vecS src, memory mem) %{
7130   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
7131   match(Set dst (SubVS dst (LoadVector mem)));
7132   effect(TEMP src);
7133   format %{ "vpsubw  $dst,$src,$mem\t! sub packed2S" %}
7134   ins_encode %{
7135     int vector_len = 0;
7136     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7137   %}
7138   ins_pipe( pipe_slow );
7139 %}
7140 
7141 instruct vsub4S(vecD dst, vecD src) %{
7142   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
7143   match(Set dst (SubVS dst src));
7144   format %{ "psubw   $dst,$src\t! sub packed4S" %}
7145   ins_encode %{
7146     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
7147   %}
7148   ins_pipe( pipe_slow );
7149 %}
7150 
7151 instruct vsub4S_reg_avx(vecD dst, vecD src1, vecD src2) %{
7152   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 4);
7153   match(Set dst (SubVS src1 src2));
7154   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed4S" %}
7155   ins_encode %{
7156     int vector_len = 0;
7157     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7158   %}
7159   ins_pipe( pipe_slow );
7160 %}
7161 
7162 instruct vsub4S_reg_evex(vecD dst, vecD src1, vecD src2) %{
7163   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
7164   match(Set dst (SubVS src1 src2));
7165   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed4S" %}
7166   ins_encode %{
7167     int vector_len = 0;
7168     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7169   %}
7170   ins_pipe( pipe_slow );
7171 %}
7172 
7173 instruct vsub4S_reg_evex_special(vecD dst, vecD src1, vecD src2) %{
7174   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
7175   match(Set dst (SubVS dst src2));
7176   effect(TEMP src1);
7177   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed4S" %}
7178   ins_encode %{
7179     int vector_len = 0;
7180     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7181   %}
7182   ins_pipe( pipe_slow );
7183 %}
7184 
7185 instruct vsub4S_mem_avx(vecD dst, vecD src, memory mem) %{
7186   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 4);
7187   match(Set dst (SubVS src (LoadVector mem)));
7188   format %{ "vpsubw  $dst,$src,$mem\t! sub packed4S" %}
7189   ins_encode %{
7190     int vector_len = 0;
7191     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7192   %}
7193   ins_pipe( pipe_slow );
7194 %}
7195 
7196 instruct vsub4S_mem_evex(vecD dst, vecD src, memory mem) %{
7197   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
7198   match(Set dst (SubVS src (LoadVector mem)));
7199   format %{ "vpsubw  $dst,$src,$mem\t! sub packed4S" %}
7200   ins_encode %{
7201     int vector_len = 0;
7202     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7203   %}
7204   ins_pipe( pipe_slow );
7205 %}
7206 
7207 instruct vsub4S_mem_evex_special(vecD dst, vecD src, memory mem) %{
7208   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
7209   match(Set dst (SubVS dst (LoadVector mem)));
7210   effect(TEMP src);
7211   format %{ "vpsubw  $dst,$src,$mem\t! sub packed4S" %}
7212   ins_encode %{
7213     int vector_len = 0;
7214     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7215   %}
7216   ins_pipe( pipe_slow );
7217 %}
7218 
7219 instruct vsub8S(vecX dst, vecX src) %{
7220   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);

7221   match(Set dst (SubVS dst src));
7222   format %{ "psubw   $dst,$src\t! sub packed8S" %}
7223   ins_encode %{
7224     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
7225   %}
7226   ins_pipe( pipe_slow );
7227 %}
7228 
7229 instruct vsub8S_reg_avx(vecX dst, vecX src1, vecX src2) %{
7230   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 8);
7231   match(Set dst (SubVS src1 src2));
7232   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed8S" %}
7233   ins_encode %{
7234     int vector_len = 0;
7235     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7236   %}
7237   ins_pipe( pipe_slow );
7238 %}
7239 
7240 instruct vsub8S_reg_evex(vecX dst, vecX src1, vecX src2) %{
7241   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
7242   match(Set dst (SubVS src1 src2));
7243   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed8S" %}
7244   ins_encode %{
7245     int vector_len = 0;
7246     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7247   %}
7248   ins_pipe( pipe_slow );
7249 %}
7250 
7251 instruct vsub8S_reg_evex_special(vecX dst, vecX src1, vecX src2) %{
7252   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
7253   match(Set dst (SubVS dst src2));
7254   effect(TEMP src1);
7255   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed8S" %}
7256   ins_encode %{
7257     int vector_len = 0;
7258     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7259   %}
7260   ins_pipe( pipe_slow );
7261 %}
7262 
7263 instruct vsub8S_mem_avx(vecX dst, vecX src, memory mem) %{
7264   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 8);
7265   match(Set dst (SubVS src (LoadVector mem)));
7266   format %{ "vpsubw  $dst,$src,$mem\t! sub packed8S" %}
7267   ins_encode %{
7268     int vector_len = 0;
7269     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7270   %}
7271   ins_pipe( pipe_slow );
7272 %}
7273 
7274 instruct vsub8S_mem_evex(vecX dst, vecX src, memory mem) %{
7275   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
7276   match(Set dst (SubVS src (LoadVector mem)));
7277   format %{ "vpsubw  $dst,$src,$mem\t! sub packed8S" %}
7278   ins_encode %{
7279     int vector_len = 0;
7280     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7281   %}
7282   ins_pipe( pipe_slow );
7283 %}
7284 
7285 instruct vsub8S_mem_evex_special(vecX dst, vecX src, memory mem) %{
7286   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
7287   match(Set dst (SubVS dst (LoadVector mem)));
7288   effect(TEMP src);
7289   format %{ "vpsubw  $dst,$src,$mem\t! sub packed8S" %}
7290   ins_encode %{
7291     int vector_len = 0;
7292     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7293   %}
7294   ins_pipe( pipe_slow );
7295 %}
7296 
7297 instruct vsub16S_reg_avx(vecY dst, vecY src1, vecY src2) %{
7298   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
7299   match(Set dst (SubVS src1 src2));
7300   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed16S" %}
7301   ins_encode %{
7302     int vector_len = 1;
7303     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7304   %}
7305   ins_pipe( pipe_slow );
7306 %}
7307 
7308 instruct vsub16S_reg_evex(vecY dst, vecY src1, vecY src2) %{
7309   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
7310   match(Set dst (SubVS src1 src2));
7311   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed16S" %}
7312   ins_encode %{
7313     int vector_len = 1;
7314     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7315   %}
7316   ins_pipe( pipe_slow );
7317 %}
7318 
7319 instruct vsub16S_reg_evex_special(vecY dst, vecY src1, vecY src2) %{
7320   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
7321   match(Set dst (SubVS dst src2));
7322   effect(TEMP src1);
7323   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed16S" %}
7324   ins_encode %{
7325     int vector_len = 1;
7326     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7327   %}
7328   ins_pipe( pipe_slow );
7329 %}
7330 
7331 instruct vsub16S_mem_avx(vecY dst, vecY src, memory mem) %{
7332   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
7333   match(Set dst (SubVS src (LoadVector mem)));
7334   format %{ "vpsubw  $dst,$src,$mem\t! sub packed16S" %}
7335   ins_encode %{
7336     int vector_len = 1;
7337     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7338   %}
7339   ins_pipe( pipe_slow );
7340 %}
7341 
7342 instruct vsub16S_mem_evex(vecY dst, vecY src, memory mem) %{
7343   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
7344   match(Set dst (SubVS src (LoadVector mem)));
7345   format %{ "vpsubw  $dst,$src,$mem\t! sub packed16S" %}
7346   ins_encode %{
7347     int vector_len = 1;
7348     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7349   %}
7350   ins_pipe( pipe_slow );
7351 %}
7352 
7353 instruct vsub16S_mem_evex_special(vecY dst, vecY src, memory mem) %{
7354   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
7355   match(Set dst (SubVS dst (LoadVector mem)));
7356   effect(TEMP src);
7357   format %{ "vpsubw  $dst,$src,$mem\t! sub packed16S" %}
7358   ins_encode %{
7359     int vector_len = 1;
7360     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7361   %}
7362   ins_pipe( pipe_slow );
7363 %}
7364 
7365 instruct vsub32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
7366   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
7367   match(Set dst (SubVS src1 src2));
7368   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed32S" %}
7369   ins_encode %{
7370     int vector_len = 2;
7371     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7372   %}
7373   ins_pipe( pipe_slow );
7374 %}
7375 
7376 instruct vsub32S_mem(vecZ dst, vecZ src, memory mem) %{
7377   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
7378   match(Set dst (SubVS src (LoadVector mem)));
7379   format %{ "vpsubw  $dst,$src,$mem\t! sub packed32S" %}
7380   ins_encode %{
7381     int vector_len = 2;
7382     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7383   %}
7384   ins_pipe( pipe_slow );
7385 %}
7386 
7387 // Integers vector sub
7388 instruct vsub2I(vecD dst, vecD src) %{
7389   predicate(n->as_Vector()->length() == 2);
7390   match(Set dst (SubVI dst src));
7391   format %{ "psubd   $dst,$src\t! sub packed2I" %}
7392   ins_encode %{
7393     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
7394   %}
7395   ins_pipe( pipe_slow );
7396 %}
7397 


7706   match(Set dst (SubVD src (LoadVector mem)));
7707   format %{ "vsubpd  $dst,$src,$mem\t! sub packed2D" %}
7708   ins_encode %{
7709     int vector_len = 0;
7710     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7711   %}
7712   ins_pipe( pipe_slow );
7713 %}
7714 
7715 instruct vsub4D_reg(vecY dst, vecY src1, vecY src2) %{
7716   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7717   match(Set dst (SubVD src1 src2));
7718   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed4D" %}
7719   ins_encode %{
7720     int vector_len = 1;
7721     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7722   %}
7723   ins_pipe( pipe_slow );
7724 %}
7725 
7726 instruct vsub4D_mem(vecY dst, vecY src, memory mem) %{
7727   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7728   match(Set dst (SubVD src (LoadVector mem)));
7729   format %{ "vsubpd  $dst,$src,$mem\t! sub packed4D" %}
7730   ins_encode %{
7731     int vector_len = 1;
7732     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7733   %}
7734   ins_pipe( pipe_slow );
7735 %}
7736 
7737 instruct vsub8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
7738   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7739   match(Set dst (SubVD src1 src2));
7740   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed8D" %}
7741   ins_encode %{
7742     int vector_len = 2;
7743     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7744   %}
7745   ins_pipe( pipe_slow );
7746 %}
7747 
7748 instruct vsub8D_mem(vecZ dst, vecZ src, memory mem) %{
7749   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7750   match(Set dst (SubVD src (LoadVector mem)));
7751   format %{ "vsubpd  $dst,$src,$mem\t! sub packed8D" %}
7752   ins_encode %{
7753     int vector_len = 2;
7754     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7755   %}
7756   ins_pipe( pipe_slow );
7757 %}
7758 
7759 // --------------------------------- MUL --------------------------------------
7760 
7761 // Shorts/Chars vector mul
7762 instruct vmul2S(vecS dst, vecS src) %{
7763   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
7764   match(Set dst (MulVS dst src));
7765   format %{ "pmullw $dst,$src\t! mul packed2S" %}
7766   ins_encode %{
7767     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
7768   %}
7769   ins_pipe( pipe_slow );
7770 %}
7771 
7772 instruct vmul2S_reg_avx(vecS dst, vecS src1, vecS src2) %{
7773   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 2);
7774   match(Set dst (MulVS src1 src2));
7775   format %{ "vpmullw $dst,$src1,$src2\t! mul packed2S" %}
7776   ins_encode %{
7777     int vector_len = 0;
7778     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7779   %}
7780   ins_pipe( pipe_slow );
7781 %}
7782 
7783 instruct vmul2S_reg_evex(vecS dst, vecS src1, vecS src2) %{
7784   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
7785   match(Set dst (MulVS src1 src2));
7786   format %{ "vpmullw $dst,$src1,$src2\t! mul packed2S" %}
7787   ins_encode %{
7788     int vector_len = 0;
7789     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7790   %}
7791   ins_pipe( pipe_slow );
7792 %}
7793 
7794 instruct vmul2S_reg_evex_special(vecS dst, vecS src1, vecS src2) %{
7795   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
7796   match(Set dst (MulVS dst src2));
7797   effect(TEMP src1);
7798   format %{ "vpmullw $dst,$src1,$src2\t! mul packed2S" %}
7799   ins_encode %{
7800     int vector_len = 0;
7801     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7802   %}
7803   ins_pipe( pipe_slow );
7804 %}
7805 
7806 instruct vmul2S_mem_avx(vecS dst, vecS src, memory mem) %{
7807   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 2);
7808   match(Set dst (MulVS src (LoadVector mem)));
7809   format %{ "vpmullw $dst,$src,$mem\t! mul packed2S" %}
7810   ins_encode %{
7811     int vector_len = 0;
7812     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7813   %}
7814   ins_pipe( pipe_slow );
7815 %}
7816 
7817 instruct vmul2S_mem_evex(vecS dst, vecS src, memory mem) %{
7818   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
7819   match(Set dst (MulVS src (LoadVector mem)));
7820   format %{ "vpmullw $dst,$src,$mem\t! mul packed2S" %}
7821   ins_encode %{
7822     int vector_len = 0;
7823     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7824   %}
7825   ins_pipe( pipe_slow );
7826 %}
7827 
7828 instruct vmul2S_mem_evex_special(vecS dst, vecS src, memory mem) %{
7829   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
7830   match(Set dst (MulVS dst (LoadVector mem)));
7831   effect(TEMP src);
7832   format %{ "vpmullw $dst,$src,$mem\t! mul packed2S" %}
7833   ins_encode %{
7834     int vector_len = 0;
7835     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7836   %}
7837   ins_pipe( pipe_slow );
7838 %}
7839 
7840 instruct vmul4S(vecD dst, vecD src) %{
7841   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
7842   match(Set dst (MulVS dst src));
7843   format %{ "pmullw  $dst,$src\t! mul packed4S" %}
7844   ins_encode %{
7845     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
7846   %}
7847   ins_pipe( pipe_slow );
7848 %}
7849 
7850 instruct vmul4S_reg_avx(vecD dst, vecD src1, vecD src2) %{
7851   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 4);
7852   match(Set dst (MulVS src1 src2));
7853   format %{ "vpmullw $dst,$src1,$src2\t! mul packed4S" %}
7854   ins_encode %{
7855     int vector_len = 0;
7856     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7857   %}
7858   ins_pipe( pipe_slow );
7859 %}
7860 
7861 instruct vmul4S_reg_evex(vecD dst, vecD src1, vecD src2) %{
7862   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
7863   match(Set dst (MulVS src1 src2));
7864   format %{ "vpmullw $dst,$src1,$src2\t! mul packed4S" %}
7865   ins_encode %{
7866     int vector_len = 0;
7867     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7868   %}
7869   ins_pipe( pipe_slow );
7870 %}
7871 
7872 instruct vmul4S_reg_evex_special(vecD dst, vecD src1, vecD src2) %{
7873   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
7874   match(Set dst (MulVS dst src2));
7875   effect(TEMP src1);
7876   format %{ "vpmullw $dst,$src1,$src2\t! mul packed4S" %}
7877   ins_encode %{
7878     int vector_len = 0;
7879     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7880   %}
7881   ins_pipe( pipe_slow );
7882 %}
7883 
7884 instruct vmul4S_mem_avx(vecD dst, vecD src, memory mem) %{
7885   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 4);
7886   match(Set dst (MulVS src (LoadVector mem)));
7887   format %{ "vpmullw $dst,$src,$mem\t! mul packed4S" %}
7888   ins_encode %{
7889     int vector_len = 0;
7890     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7891   %}
7892   ins_pipe( pipe_slow );
7893 %}
7894 
7895 instruct vmul4S_mem_evex(vecD dst, vecD src, memory mem) %{
7896   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
7897   match(Set dst (MulVS src (LoadVector mem)));
7898   format %{ "vpmullw $dst,$src,$mem\t! mul packed4S" %}
7899   ins_encode %{
7900     int vector_len = 0;
7901     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7902   %}
7903   ins_pipe( pipe_slow );
7904 %}
7905 
7906 instruct vmul4S_mem_evex_special(vecD dst, vecD src, memory mem) %{
7907   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
7908   match(Set dst (MulVS dst (LoadVector mem)));
7909   effect(TEMP src);
7910   format %{ "vpmullw $dst,$src,$mem\t! mul packed4S" %}
7911   ins_encode %{
7912     int vector_len = 0;
7913     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7914   %}
7915   ins_pipe( pipe_slow );
7916 %}
7917 
7918 instruct vmul8S(vecX dst, vecX src) %{
7919   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);



7920   match(Set dst (MulVS dst src));
7921   format %{ "pmullw  $dst,$src\t! mul packed8S" %}
7922   ins_encode %{
7923     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
7924   %}
7925   ins_pipe( pipe_slow );
7926 %}
7927 
7928 instruct vmul8S_reg_avx(vecX dst, vecX src1, vecX src2) %{
7929   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 8);
7930   match(Set dst (MulVS src1 src2));
7931   format %{ "vpmullw $dst,$src1,$src2\t! mul packed8S" %}
7932   ins_encode %{
7933     int vector_len = 0;
7934     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7935   %}
7936   ins_pipe( pipe_slow );
7937 %}
7938 
7939 instruct vmul8S_reg_evex(vecX dst, vecX src1, vecX src2) %{
7940   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
7941   match(Set dst (MulVS src1 src2));
7942   format %{ "vpmullw $dst,$src1,$src2\t! mul packed8S" %}
7943   ins_encode %{
7944     int vector_len = 0;
7945     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7946   %}
7947   ins_pipe( pipe_slow );
7948 %}
7949 
7950 instruct vmul8S_reg_evex_special(vecX dst, vecX src1, vecX src2) %{
7951   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
7952   match(Set dst (MulVS dst src2));
7953   effect(TEMP src1);
7954   format %{ "vpmullw $dst,$src1,$src2\t! mul packed8S" %}
7955   ins_encode %{
7956     int vector_len = 0;
7957     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7958   %}
7959   ins_pipe( pipe_slow );
7960 %}
7961 
7962 instruct vmul8S_mem_avx(vecX dst, vecX src, memory mem) %{
7963   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 8);
7964   match(Set dst (MulVS src (LoadVector mem)));
7965   format %{ "vpmullw $dst,$src,$mem\t! mul packed8S" %}
7966   ins_encode %{
7967     int vector_len = 0;
7968     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7969   %}
7970   ins_pipe( pipe_slow );
7971 %}
7972 
7973 instruct vmul8S_mem_evex(vecX dst, vecX src, memory mem) %{
7974   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
7975   match(Set dst (MulVS src (LoadVector mem)));
7976   format %{ "vpmullw $dst,$src,$mem\t! mul packed8S" %}
7977   ins_encode %{
7978     int vector_len = 0;
7979     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7980   %}
7981   ins_pipe( pipe_slow );
7982 %}
7983 
7984 instruct vmul8S_mem_evex_special(vecX dst, vecX src, memory mem) %{
7985   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
7986   match(Set dst (MulVS dst (LoadVector mem)));
7987   effect(TEMP src);
7988   format %{ "vpmullw $dst,$src,$mem\t! mul packed8S" %}
7989   ins_encode %{
7990     int vector_len = 0;
7991     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7992   %}
7993   ins_pipe( pipe_slow );
7994 %}
7995 
7996 instruct vmul16S_reg_avx(vecY dst, vecY src1, vecY src2) %{
7997   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
7998   match(Set dst (MulVS src1 src2));
7999   format %{ "vpmullw $dst,$src1,$src2\t! mul packed16S" %}
8000   ins_encode %{
8001     int vector_len = 1;
8002     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8003   %}
8004   ins_pipe( pipe_slow );
8005 %}
8006 
8007 instruct vmul16S_reg_evex(vecY dst, vecY src1, vecY src2) %{
8008   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
8009   match(Set dst (MulVS src1 src2));
8010   format %{ "vpmullw $dst,$src1,$src2\t! mul packed16S" %}
8011   ins_encode %{
8012     int vector_len = 1;
8013     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8014   %}
8015   ins_pipe( pipe_slow );
8016 %}
8017 
8018 instruct vmul16S_reg_evex_special(vecY dst, vecY src1, vecY src2) %{
8019   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
8020   match(Set dst (MulVS dst src2));
8021   effect(TEMP src1);
8022   format %{ "vpmullw $dst,$src1,$src2\t! mul packed16S" %}
8023   ins_encode %{
8024     int vector_len = 1;
8025     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8026   %}
8027   ins_pipe( pipe_slow );
8028 %}
8029 
8030 instruct vmul16S_mem_avx(vecY dst, vecY src, memory mem) %{
8031   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
8032   match(Set dst (MulVS src (LoadVector mem)));
8033   format %{ "vpmullw $dst,$src,$mem\t! mul packed16S" %}
8034   ins_encode %{
8035     int vector_len = 1;
8036     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8037   %}
8038   ins_pipe( pipe_slow );
8039 %}
8040 
8041 instruct vmul16S_mem_evex(vecY dst, vecY src, memory mem) %{
8042   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
8043   match(Set dst (MulVS src (LoadVector mem)));
8044   format %{ "vpmullw $dst,$src,$mem\t! mul packed16S" %}
8045   ins_encode %{
8046     int vector_len = 1;
8047     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8048   %}
8049   ins_pipe( pipe_slow );
8050 %}
8051 
8052 instruct vmul16S_mem_evex_special(vecY dst, vecY src, memory mem) %{
8053   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
8054   match(Set dst (MulVS dst (LoadVector mem)));
8055   effect(TEMP src);
8056   format %{ "vpmullw $dst,$src,$mem\t! mul packed16S" %}
8057   ins_encode %{
8058     int vector_len = 1;
8059     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8060   %}
8061   ins_pipe( pipe_slow );
8062 %}
8063 
8064 instruct vmul32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
8065   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
8066   match(Set dst (MulVS src1 src2));
8067   format %{ "vpmullw $dst,$src1,$src2\t! mul packed32S" %}
8068   ins_encode %{
8069     int vector_len = 2;
8070     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8071   %}
8072   ins_pipe( pipe_slow );
8073 %}
8074 
8075 instruct vmul32S_mem(vecZ dst, vecZ src, memory mem) %{
8076   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
8077   match(Set dst (MulVS src (LoadVector mem)));
8078   format %{ "vpmullw $dst,$src,$mem\t! mul packed32S" %}
8079   ins_encode %{
8080     int vector_len = 2;
8081     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8082   %}
8083   ins_pipe( pipe_slow );
8084 %}
8085 
8086 // Integers vector mul (sse4_1)
8087 instruct vmul2I(vecD dst, vecD src) %{
8088   predicate(UseSSE > 3 && n->as_Vector()->length() == 2);
8089   match(Set dst (MulVI dst src));
8090   format %{ "pmulld  $dst,$src\t! mul packed2I" %}
8091   ins_encode %{
8092     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
8093   %}
8094   ins_pipe( pipe_slow );
8095 %}
8096 


8686   %}
8687   ins_pipe( pipe_slow );
8688 %}
8689 
8690 instruct vsqrt4D_reg(vecY dst, vecY src) %{
8691   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8692   match(Set dst (SqrtVD src));
8693   format %{ "vsqrtpd  $dst,$src\t! sqrt packed4D" %}
8694   ins_encode %{
8695     int vector_len = 1;
8696     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8697   %}
8698   ins_pipe( pipe_slow );
8699 %}
8700 
8701 instruct vsqrt4D_mem(vecY dst, memory mem) %{
8702   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8703   match(Set dst (SqrtVD (LoadVector mem)));
8704   format %{ "vsqrtpd  $dst,$mem\t! sqrt packed4D" %}
8705   ins_encode %{
8706     int vector_len = 1;
8707     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
8708   %}
8709   ins_pipe( pipe_slow );
8710 %}
8711 
8712 instruct vsqrt8D_reg(vecZ dst, vecZ src) %{
8713   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8714   match(Set dst (SqrtVD src));
8715   format %{ "vsqrtpd  $dst,$src\t! sqrt packed8D" %}
8716   ins_encode %{
8717     int vector_len = 2;
8718     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8719   %}
8720   ins_pipe( pipe_slow );
8721 %}
8722 
8723 instruct vsqrt8D_mem(vecZ dst, memory mem) %{
8724   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8725   match(Set dst (SqrtVD (LoadVector mem)));
8726   format %{ "vsqrtpd  $dst,$mem\t! sqrt packed8D" %}
8727   ins_encode %{
8728     int vector_len = 2;
8729     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
8730   %}
8731   ins_pipe( pipe_slow );
8732 %}
8733 
8734 // ------------------------------ LeftShift -----------------------------------
8735 
8736 // Shorts/Chars vector left shift
8737 instruct vsll2S(vecS dst, vecS shift) %{
8738   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8739   match(Set dst (LShiftVS dst shift));
8740   format %{ "psllw   $dst,$shift\t! left shift packed2S" %}
8741   ins_encode %{
8742     __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
8743   %}
8744   ins_pipe( pipe_slow );
8745 %}
8746 
8747 instruct vsll2S_imm(vecS dst, immI8 shift) %{
8748   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8749   match(Set dst (LShiftVS dst shift));
8750   format %{ "psllw   $dst,$shift\t! left shift packed2S" %}
8751   ins_encode %{
8752     __ psllw($dst$$XMMRegister, (int)$shift$$constant);
8753   %}
8754   ins_pipe( pipe_slow );
8755 %}
8756 
8757 instruct vsll2S_reg_avx(vecS dst, vecS src, vecS shift) %{
8758   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 2);
8759   match(Set dst (LShiftVS src shift));
8760   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
8761   ins_encode %{
8762     int vector_len = 0;
8763     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8764   %}
8765   ins_pipe( pipe_slow );
8766 %}
8767 
8768 instruct vsll2S_reg_evex(vecS dst, vecS src, vecS shift) %{
8769   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
8770   match(Set dst (LShiftVS src shift));
8771   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
8772   ins_encode %{
8773     int vector_len = 0;
8774     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8775   %}
8776   ins_pipe( pipe_slow );
8777 %}
8778 
8779 instruct vsll2S_reg_evex_special(vecS dst, vecS src, vecS shift) %{
8780   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
8781   match(Set dst (LShiftVS dst shift));
8782   effect(TEMP src);
8783   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
8784   ins_encode %{
8785     int vector_len = 0;
8786     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8787   %}
8788   ins_pipe( pipe_slow );
8789 %}
8790 
8791 instruct vsll2S_reg_imm_avx(vecS dst, vecS src, immI8 shift) %{
8792   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 2);
8793   match(Set dst (LShiftVS src shift));
8794   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
8795   ins_encode %{
8796     int vector_len = 0;
8797     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8798   %}
8799   ins_pipe( pipe_slow );
8800 %}
8801 
8802 instruct vsll2S_reg_imm_evex(vecS dst, vecS src, immI8 shift) %{
8803   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
8804   match(Set dst (LShiftVS src shift));
8805   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
8806   ins_encode %{
8807     int vector_len = 0;
8808     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8809   %}
8810   ins_pipe( pipe_slow );
8811 %}
8812 
8813 instruct vsll2S_reg_imm_evex_special(vecS dst, vecS src, immI8 shift) %{
8814   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
8815   match(Set dst (LShiftVS dst shift));
8816   effect(TEMP src);
8817   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
8818   ins_encode %{
8819     int vector_len = 0;
8820     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8821   %}
8822   ins_pipe( pipe_slow );
8823 %}
8824 
8825 instruct vsll4S(vecD dst, vecS shift) %{
8826   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
8827   match(Set dst (LShiftVS dst shift));
8828   format %{ "psllw   $dst,$shift\t! left shift packed4S" %}
8829   ins_encode %{
8830     __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
8831   %}
8832   ins_pipe( pipe_slow );
8833 %}
8834 
8835 instruct vsll4S_imm(vecD dst, immI8 shift) %{
8836   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
8837   match(Set dst (LShiftVS dst shift));
8838   format %{ "psllw   $dst,$shift\t! left shift packed4S" %}
8839   ins_encode %{
8840     __ psllw($dst$$XMMRegister, (int)$shift$$constant);
8841   %}
8842   ins_pipe( pipe_slow );
8843 %}
8844 
8845 instruct vsll4S_reg_avx(vecD dst, vecD src, vecS shift) %{
8846   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 4);
8847   match(Set dst (LShiftVS src shift));
8848   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
8849   ins_encode %{
8850     int vector_len = 0;
8851     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8852   %}
8853   ins_pipe( pipe_slow );
8854 %}
8855 
8856 instruct vsll4S_reg_evex(vecD dst, vecD src, vecS shift) %{
8857   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
8858   match(Set dst (LShiftVS src shift));
8859   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
8860   ins_encode %{
8861     int vector_len = 0;
8862     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8863   %}
8864   ins_pipe( pipe_slow );
8865 %}
8866 
8867 instruct vsll4S_reg_evex_special(vecD dst, vecD src, vecS shift) %{
8868   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
8869   match(Set dst (LShiftVS dst shift));
8870   effect(TEMP src);
8871   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
8872   ins_encode %{
8873     int vector_len = 0;
8874     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8875   %}
8876   ins_pipe( pipe_slow );
8877 %}
8878 
8879 instruct vsll4S_reg_imm_avx(vecD dst, vecD src, immI8 shift) %{
8880   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 4);
8881   match(Set dst (LShiftVS src shift));
8882   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
8883   ins_encode %{
8884     int vector_len = 0;
8885     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8886   %}
8887   ins_pipe( pipe_slow );
8888 %}
8889 
8890 instruct vsll4S_reg_imm_evex(vecD dst, vecD src, immI8 shift) %{
8891   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
8892   match(Set dst (LShiftVS src shift));
8893   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
8894   ins_encode %{
8895     int vector_len = 0;
8896     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8897   %}
8898   ins_pipe( pipe_slow );
8899 %}
8900 
8901 instruct vsll4S_reg_imm_evex_special(vecD dst, vecD src, immI8 shift) %{
8902   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
8903   match(Set dst (LShiftVS dst shift));
8904   effect(TEMP src);
8905   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
8906   ins_encode %{
8907     int vector_len = 0;
8908     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8909   %}
8910   ins_pipe( pipe_slow );
8911 %}
8912 
8913 instruct vsll8S(vecX dst, vecS shift) %{
8914   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);



8915   match(Set dst (LShiftVS dst shift));
8916   format %{ "psllw   $dst,$shift\t! left shift packed8S" %}
8917   ins_encode %{
8918     __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
8919   %}
8920   ins_pipe( pipe_slow );
8921 %}
8922 
8923 instruct vsll8S_imm(vecX dst, immI8 shift) %{
8924   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
8925   match(Set dst (LShiftVS dst shift));
8926   format %{ "psllw   $dst,$shift\t! left shift packed8S" %}
8927   ins_encode %{
8928     __ psllw($dst$$XMMRegister, (int)$shift$$constant);
8929   %}
8930   ins_pipe( pipe_slow );
8931 %}
8932 
8933 instruct vsll8S_reg_avx(vecX dst, vecX src, vecS shift) %{
8934   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 8);
8935   match(Set dst (LShiftVS src shift));
8936   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
8937   ins_encode %{
8938     int vector_len = 0;
8939     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8940   %}
8941   ins_pipe( pipe_slow );
8942 %}
8943 
8944 instruct vsll8S_reg_evex(vecX dst, vecX src, vecS shift) %{
8945   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
8946   match(Set dst (LShiftVS src shift));
8947   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
8948   ins_encode %{
8949     int vector_len = 0;
8950     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8951   %}
8952   ins_pipe( pipe_slow );
8953 %}
8954 
8955 instruct vsll8S_reg_evex_special(vecX dst, vecX src, vecS shift) %{
8956   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
8957   match(Set dst (LShiftVS dst shift));
8958   effect(TEMP src);
8959   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
8960   ins_encode %{
8961     int vector_len = 0;
8962     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8963   %}
8964   ins_pipe( pipe_slow );
8965 %}
8966 
8967 instruct vsll8S_reg_imm_avx(vecX dst, vecX src, immI8 shift) %{
8968   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 8);
8969   match(Set dst (LShiftVS src shift));
8970   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
8971   ins_encode %{
8972     int vector_len = 0;
8973     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8974   %}
8975   ins_pipe( pipe_slow );
8976 %}
8977 
8978 instruct vsll8S_reg_imm_evex(vecX dst, vecX src, immI8 shift) %{
8979   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
8980   match(Set dst (LShiftVS src shift));
8981   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
8982   ins_encode %{
8983     int vector_len = 0;
8984     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8985   %}
8986   ins_pipe( pipe_slow );
8987 %}
8988 
8989 instruct vsll8S_reg_imm_evex_special(vecX dst, vecX src, immI8 shift) %{
8990   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
8991   match(Set dst (LShiftVS dst shift));
8992   effect(TEMP src);
8993   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
8994   ins_encode %{
8995     int vector_len = 0;
8996     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8997   %}
8998   ins_pipe( pipe_slow );
8999 %}
9000 
9001 instruct vsll16S_reg_avx(vecY dst, vecY src, vecS shift) %{
9002   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
9003   match(Set dst (LShiftVS src shift));
9004   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
9005   ins_encode %{
9006     int vector_len = 1;
9007     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9008   %}
9009   ins_pipe( pipe_slow );
9010 %}
9011 
9012 instruct vsll16S_reg_evex(vecY dst, vecY src, vecS shift) %{
9013   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
9014   match(Set dst (LShiftVS src shift));
9015   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
9016   ins_encode %{
9017     int vector_len = 1;
9018     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9019   %}
9020   ins_pipe( pipe_slow );
9021 %}
9022 
9023 instruct vsll16S_reg_evex_special(vecY dst, vecY src, vecS shift) %{
9024   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
9025   match(Set dst (LShiftVS dst shift));
9026   effect(TEMP src);
9027   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
9028   ins_encode %{
9029     int vector_len = 1;
9030     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9031   %}
9032   ins_pipe( pipe_slow );
9033 %}
9034 
9035 instruct vsll16S_reg_imm_avx(vecY dst, vecY src, immI8 shift) %{
9036   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
9037   match(Set dst (LShiftVS src shift));
9038   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
9039   ins_encode %{
9040     int vector_len = 1;
9041     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9042   %}
9043   ins_pipe( pipe_slow );
9044 %}
9045 
9046 instruct vsll16S_reg_imm_evex(vecY dst, vecY src, immI8 shift) %{
9047   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
9048   match(Set dst (LShiftVS src shift));
9049   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
9050   ins_encode %{
9051     int vector_len = 1;
9052     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9053   %}
9054   ins_pipe( pipe_slow );
9055 %}
9056 
9057 instruct vsll16S_reg_imm_evex_special(vecY dst, vecY src, immI8 shift) %{
9058   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
9059   match(Set dst (LShiftVS dst shift));
9060   effect(TEMP src);
9061   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
9062   ins_encode %{
9063     int vector_len = 1;
9064     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9065   %}
9066   ins_pipe( pipe_slow );
9067 %}
9068 
9069 instruct vsll32S_reg(vecZ dst, vecZ src, vecS shift) %{
9070   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
9071   match(Set dst (LShiftVS src shift));
9072   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed32S" %}
9073   ins_encode %{
9074     int vector_len = 2;
9075     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9076   %}
9077   ins_pipe( pipe_slow );
9078 %}
9079 
9080 instruct vsll32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
9081   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
9082   match(Set dst (LShiftVS src shift));
9083   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed32S" %}
9084   ins_encode %{
9085     int vector_len = 2;
9086     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9087   %}
9088   ins_pipe( pipe_slow );
9089 %}
9090 
9091 // Integers vector left shift
9092 instruct vsll2I(vecD dst, vecS shift) %{
9093   predicate(n->as_Vector()->length() == 2);
9094   match(Set dst (LShiftVI dst shift));
9095   format %{ "pslld   $dst,$shift\t! left shift packed2I" %}
9096   ins_encode %{
9097     __ pslld($dst$$XMMRegister, $shift$$XMMRegister);
9098   %}
9099   ins_pipe( pipe_slow );
9100 %}
9101 


9270   %}
9271   ins_pipe( pipe_slow );
9272 %}
9273 
9274 instruct vsll4L_reg_imm(vecY dst, vecY src, immI8 shift) %{
9275   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
9276   match(Set dst (LShiftVL src shift));
9277   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed4L" %}
9278   ins_encode %{
9279     int vector_len = 1;
9280     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9281   %}
9282   ins_pipe( pipe_slow );
9283 %}
9284 
9285 instruct vsll8L_reg(vecZ dst, vecZ src, vecS shift) %{
9286   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
9287   match(Set dst (LShiftVL src shift));
9288   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed8L" %}
9289   ins_encode %{
9290     int vector_len = 2;
9291     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9292   %}
9293   ins_pipe( pipe_slow );
9294 %}
9295 
9296 instruct vsll8L_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
9297   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
9298   match(Set dst (LShiftVL src shift));
9299   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed8L" %}
9300   ins_encode %{
9301     int vector_len = 2;
9302     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9303   %}
9304   ins_pipe( pipe_slow );
9305 %}
9306 
9307 // ----------------------- LogicalRightShift -----------------------------------
9308 
9309 // Shorts vector logical right shift produces an incorrect Java result
9310 // for negative data because Java code converts a short value into an int with
9311 // sign extension before a shift. But char vectors are fine since chars are
9312 // unsigned values.
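// A minimal Java sketch (not part of this file) of the difference described above:
//
//   short s = -1;                   // bit pattern 0xFFFF
//   short r = (short)(s >>> 2);     // Java: s widens to int 0xFFFFFFFF, the shift
//                                   // gives 0x3FFFFFFF, and narrowing back yields
//                                   // 0xFFFF (-1)
//   // A packed 16-bit psrlw would instead compute 0xFFFF >>> 2 = 0x3FFF.
//
//   char c = 0xFFFF;
//   char d = (char)(c >>> 2);       // Java: c zero-extends, the result is 0x3FFF,
//                                   // which agrees with the 16-bit psrlw result.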
9313 
9314 instruct vsrl2S(vecS dst, vecS shift) %{
9315   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
9316   match(Set dst (URShiftVS dst shift));
9317   format %{ "psrlw   $dst,$shift\t! logical right shift packed2S" %}
9318   ins_encode %{
9319     __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
9320   %}
9321   ins_pipe( pipe_slow );
9322 %}
9323 
9324 instruct vsrl2S_imm(vecS dst, immI8 shift) %{
9325   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
9326   match(Set dst (URShiftVS dst shift));
9327   format %{ "psrlw   $dst,$shift\t! logical right shift packed2S" %}
9328   ins_encode %{
9329     __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
9330   %}
9331   ins_pipe( pipe_slow );
9332 %}
9333 
9334 instruct vsrl2S_reg_avx(vecS dst, vecS src, vecS shift) %{
9335   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 2);
9336   match(Set dst (URShiftVS src shift));
9337   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
9338   ins_encode %{
9339     int vector_len = 0;
9340     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9341   %}
9342   ins_pipe( pipe_slow );
9343 %}
9344 
9345 instruct vsrl2S_reg_evex(vecS dst, vecS src, vecS shift) %{
9346   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
9347   match(Set dst (URShiftVS src shift));
9348   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
9349   ins_encode %{
9350     int vector_len = 0;
9351     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9352   %}
9353   ins_pipe( pipe_slow );
9354 %}
9355 
9356 instruct vsrl2S_reg_evex_special(vecS dst, vecS src, vecS shift) %{
9357   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
9358   match(Set dst (URShiftVS dst shift));
9359   effect(TEMP src);
9360   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
9361   ins_encode %{
9362     int vector_len = 0;
9363     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9364   %}
9365   ins_pipe( pipe_slow );
9366 %}
9367 
9368 instruct vsrl2S_reg_imm_avx(vecS dst, vecS src, immI8 shift) %{
9369   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 2);
9370   match(Set dst (URShiftVS src shift));
9371   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
9372   ins_encode %{
9373     int vector_len = 0;
9374     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9375   %}
9376   ins_pipe( pipe_slow );
9377 %}
9378 
9379 instruct vsrl2S_reg_imm_evex(vecS dst, vecS src, immI8 shift) %{
9380   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
9381   match(Set dst (URShiftVS src shift));
9382   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
9383   ins_encode %{
9384     int vector_len = 0;
9385     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9386   %}
9387   ins_pipe( pipe_slow );
9388 %}
9389 
9390 instruct vsrl2S_reg_imm_evex_special(vecS dst, vecS src, immI8 shift) %{
9391   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
9392   match(Set dst (URShiftVS dst shift));
9393   effect(TEMP src);
9394   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
9395   ins_encode %{
9396     int vector_len = 0;
9397     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9398   %}
9399   ins_pipe( pipe_slow );
9400 %}
9401 
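// The same predicate split recurs for each vector size below: a plain SSE rule
// guarded by UseAVX == 0 where a 128-bit form exists, an _avx rule for
// supports_avx256only(), an _evex rule for supports_avx512bw(), and an
// _evex_special rule for supports_avx512nobw(), each in register-shift and
// immediate-shift (_imm) flavors; the 32S forms exist only for AVX512BW.
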
9402 instruct vsrl4S(vecD dst, vecS shift) %{
9403   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
9404   match(Set dst (URShiftVS dst shift));
9405   format %{ "psrlw   $dst,$shift\t! logical right shift packed4S" %}
9406   ins_encode %{
9407     __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
9408   %}
9409   ins_pipe( pipe_slow );
9410 %}
9411 
9412 instruct vsrl4S_imm(vecD dst, immI8 shift) %{
9413   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
9414   match(Set dst (URShiftVS dst shift));
9415   format %{ "psrlw   $dst,$shift\t! logical right shift packed4S" %}
9416   ins_encode %{
9417     __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
9418   %}
9419   ins_pipe( pipe_slow );
9420 %}
9421 
9422 instruct vsrl4S_reg_avx(vecD dst, vecD src, vecS shift) %{
9423   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 4);
9424   match(Set dst (URShiftVS src shift));
9425   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
9426   ins_encode %{
9427     int vector_len = 0;
9428     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9429   %}
9430   ins_pipe( pipe_slow );
9431 %}
9432 
9433 instruct vsrl4S_reg_evex(vecD dst, vecD src, vecS shift) %{
9434   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
9435   match(Set dst (URShiftVS src shift));
9436   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
9437   ins_encode %{
9438     int vector_len = 0;
9439     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9440   %}
9441   ins_pipe( pipe_slow );
9442 %}
9443 
9444 instruct vsrl4S_reg_evex_special(vecD dst, vecD src, vecS shift) %{
9445   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
9446   match(Set dst (URShiftVS dst shift));
9447   effect(TEMP src);
9448   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
9449   ins_encode %{
9450     int vector_len = 0;
9451     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9452   %}
9453   ins_pipe( pipe_slow );
9454 %}
9455 
9456 instruct vsrl4S_reg_imm_avx(vecD dst, vecD src, immI8 shift) %{
9457   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 4);
9458   match(Set dst (URShiftVS src shift));
9459   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
9460   ins_encode %{
9461     int vector_len = 0;
9462     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9463   %}
9464   ins_pipe( pipe_slow );
9465 %}
9466 
9467 instruct vsrl4S_reg_imm_evex(vecD dst, vecD src, immI8 shift) %{
9468   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
9469   match(Set dst (URShiftVS src shift));
9470   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
9471   ins_encode %{
9472     int vector_len = 0;
9473     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9474   %}
9475   ins_pipe( pipe_slow );
9476 %}
9477 
9478 instruct vsrl4S_reg_imm_evex_special(vecD dst, vecD src, immI8 shift) %{
9479   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
9480   match(Set dst (URShiftVS dst shift));
9481   effect(TEMP src);
9482   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
9483   ins_encode %{
9484     int vector_len = 0;
9485     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9486   %}
9487   ins_pipe( pipe_slow );
9488 %}
9489 
9490 instruct vsrl8S(vecX dst, vecS shift) %{
9491   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
9492   match(Set dst (URShiftVS dst shift));
9493   format %{ "psrlw   $dst,$shift\t! logical right shift packed8S" %}
9494   ins_encode %{
9495     __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
9496   %}
9497   ins_pipe( pipe_slow );
9498 %}
9499 
9500 instruct vsrl8S_imm(vecX dst, immI8 shift) %{
9501   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
9502   match(Set dst (URShiftVS dst shift));
9503   format %{ "psrlw   $dst,$shift\t! logical right shift packed8S" %}
9504   ins_encode %{
9505     __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
9506   %}
9507   ins_pipe( pipe_slow );
9508 %}
9509 
9510 instruct vsrl8S_reg_avx(vecX dst, vecX src, vecS shift) %{
9511   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 8);
9512   match(Set dst (URShiftVS src shift));
9513   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
9514   ins_encode %{
9515     int vector_len = 0;
9516     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9517   %}
9518   ins_pipe( pipe_slow );
9519 %}
9520 
9521 instruct vsrl8S_reg_evex(vecX dst, vecX src, vecS shift) %{
9522   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
9523   match(Set dst (URShiftVS src shift));
9524   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
9525   ins_encode %{
9526     int vector_len = 0;
9527     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9528   %}
9529   ins_pipe( pipe_slow );
9530 %}
9531 
9532 instruct vsrl8S_reg_evex_special(vecX dst, vecX src, vecS shift) %{
9533   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
9534   match(Set dst (URShiftVS dst shift));
9535   effect(TEMP src);
9536   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
9537   ins_encode %{
9538     int vector_len = 0;
9539     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9540   %}
9541   ins_pipe( pipe_slow );
9542 %}
9543 
9544 instruct vsrl8S_reg_imm_avx(vecX dst, vecX src, immI8 shift) %{
9545   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 8);
9546   match(Set dst (URShiftVS src shift));
9547   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
9548   ins_encode %{
9549     int vector_len = 0;
9550     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9551   %}
9552   ins_pipe( pipe_slow );
9553 %}
9554 
9555 instruct vsrl8S_reg_imm_evex(vecX dst, vecX src, immI8 shift) %{
9556   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
9557   match(Set dst (URShiftVS src shift));
9558   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
9559   ins_encode %{
9560     int vector_len = 0;
9561     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9562   %}
9563   ins_pipe( pipe_slow );
9564 %}
9565 
9566 instruct vsrl8S_reg_imm_evex_special(vecX dst, vecX src, immI8 shift) %{
9567   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
9568   match(Set dst (URShiftVS dst shift));
9569   effect(TEMP src);
9570   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
9571   ins_encode %{
9572     int vector_len = 0;
9573     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9574   %}
9575   ins_pipe( pipe_slow );
9576 %}
9577 
9578 instruct vsrl16S_reg_avx(vecY dst, vecY src, vecS shift) %{
9579   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
9580   match(Set dst (URShiftVS src shift));
9581   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
9582   ins_encode %{
9583     int vector_len = 1;
9584     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9585   %}
9586   ins_pipe( pipe_slow );
9587 %}
9588 
9589 instruct vsrl16S_reg_evex(vecY dst, vecY src, vecS shift) %{
9590   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
9591   match(Set dst (URShiftVS src shift));
9592   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
9593   ins_encode %{
9594     int vector_len = 1;
9595     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9596   %}
9597   ins_pipe( pipe_slow );
9598 %}
9599 
9600 instruct vsrl16S_reg_evex_special(vecY dst, vecY src, vecS shift) %{
9601   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
9602   match(Set dst (URShiftVS dst shift));
9603   effect(TEMP src);
9604   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
9605   ins_encode %{
9606     int vector_len = 1;
9607     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9608   %}
9609   ins_pipe( pipe_slow );
9610 %}
9611 
9612 instruct vsrl16S_reg_imm_avx(vecY dst, vecY src, immI8 shift) %{
9613   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
9614   match(Set dst (URShiftVS src shift));
9615   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
9616   ins_encode %{
9617     int vector_len = 1;
9618     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9619   %}
9620   ins_pipe( pipe_slow );
9621 %}
9622 
9623 instruct vsrl16S_reg_imm_evex(vecY dst, vecY src, immI8 shift) %{
9624   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
9625   match(Set dst (URShiftVS src shift));
9626   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
9627   ins_encode %{
9628     int vector_len = 1;
9629     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9630   %}
9631   ins_pipe( pipe_slow );
9632 %}
9633 
9634 instruct vsrl16S_reg_imm_evex_special(vecY dst, vecY src, immI8 shift) %{
9635   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
9636   match(Set dst (URShiftVS dst shift));
9637   effect(TEMP src);
9638   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
9639   ins_encode %{
9640     int vector_len = 1;
9641     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9642   %}
9643   ins_pipe( pipe_slow );
9644 %}
9645 
9646 instruct vsrl32S_reg(vecZ dst, vecZ src, vecS shift) %{
9647   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
9648   match(Set dst (URShiftVS src shift));
9649   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed32S" %}
9650   ins_encode %{
9651     int vector_len = 2;
9652     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9653   %}
9654   ins_pipe( pipe_slow );
9655 %}
9656 
9657 instruct vsrl32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
9658   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
9659   match(Set dst (URShiftVS src shift));
9660   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed32S" %}
9661   ins_encode %{
9662     int vector_len = 2;
9663     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9664   %}
9665   ins_pipe( pipe_slow );
9666 %}
9667 
9668 // Integers vector logical right shift
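// For int lanes no such caveat is needed: Java's >>> already operates on the
// full 32-bit value, so psrld/vpsrld match the Java semantics directly.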
9669 instruct vsrl2I(vecD dst, vecS shift) %{
9670   predicate(n->as_Vector()->length() == 2);
9671   match(Set dst (URShiftVI dst shift));
9672   format %{ "psrld   $dst,$shift\t! logical right shift packed2I" %}
9673   ins_encode %{
9674     __ psrld($dst$$XMMRegister, $shift$$XMMRegister);
9675   %}
9676   ins_pipe( pipe_slow );
9677 %}
9678 


9868     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9869   %}
9870   ins_pipe( pipe_slow );
9871 %}
9872 
9873 instruct vsrl8L_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
9874   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
9875   match(Set dst (URShiftVL src shift));
9876   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed8L" %}
9877   ins_encode %{
9878     int vector_len = 2;
9879     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9880   %}
9881   ins_pipe( pipe_slow );
9882 %}
9883 
9884 // ------------------- ArithmeticRightShift -----------------------------------
9885 
9886 // Shorts/Chars vector arithmetic right shift
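// Arithmetic right shift fills with the sign bit, so for shorts the packed
// 16-bit psraw result agrees with Java's (short)(s >> k): sign-extending to
// int first and narrowing back does not change the outcome.  That is why no
// correctness caveat is needed here, unlike the logical right shift above.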
9887 instruct vsra2S(vecS dst, vecS shift) %{
9888   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
9889   match(Set dst (RShiftVS dst shift));
9890   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed2S" %}
9891   ins_encode %{
9892     __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
9893   %}
9894   ins_pipe( pipe_slow );
9895 %}
9896 
9897 instruct vsra2S_imm(vecS dst, immI8 shift) %{
9898   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
9899   match(Set dst (RShiftVS dst shift));
9900   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed2S" %}
9901   ins_encode %{
9902     __ psraw($dst$$XMMRegister, (int)$shift$$constant);
9903   %}
9904   ins_pipe( pipe_slow );
9905 %}
9906 
9907 instruct vsra2S_reg_avx(vecS dst, vecS src, vecS shift) %{
9908   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 2);
9909   match(Set dst (RShiftVS src shift));
9910   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
9911   ins_encode %{
9912     int vector_len = 0;
9913     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9914   %}
9915   ins_pipe( pipe_slow );
9916 %}
9917 
9918 instruct vsra2S_reg_evex(vecS dst, vecS src, vecS shift) %{
9919   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
9920   match(Set dst (RShiftVS src shift));
9921   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
9922   ins_encode %{
9923     int vector_len = 0;
9924     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9925   %}
9926   ins_pipe( pipe_slow );
9927 %}
9928 
9929 instruct vsra2S_reg_evex_special(vecS dst, vecS src, vecS shift) %{
9930   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
9931   match(Set dst (RShiftVS dst shift));
9932   effect(TEMP src);
9933   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
9934   ins_encode %{
9935     int vector_len = 0;
9936     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9937   %}
9938   ins_pipe( pipe_slow );
9939 %}
9940 
9941 instruct vsra2S_reg_imm_avx(vecS dst, vecS src, immI8 shift) %{
9942   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 2);
9943   match(Set dst (RShiftVS src shift));
9944   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
9945   ins_encode %{
9946     int vector_len = 0;
9947     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9948   %}
9949   ins_pipe( pipe_slow );
9950 %}
9951 
9952 instruct vsra2S_reg_imm_evex(vecS dst, vecS src, immI8 shift) %{
9953   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
9954   match(Set dst (RShiftVS src shift));
9955   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
9956   ins_encode %{
9957     int vector_len = 0;
9958     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9959   %}
9960   ins_pipe( pipe_slow );
9961 %}
9962 
9963 instruct vsra2S_reg_imm_evex_special(vecS dst, vecS src, immI8 shift) %{
9964   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
9965   match(Set dst (RShiftVS dst shift));
9966   effect(TEMP src);
9967   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
9968   ins_encode %{
9969     int vector_len = 0;
9970     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9971   %}
9972   ins_pipe( pipe_slow );
9973 %}
9974 
9975 instruct vsra4S(vecD dst, vecS shift) %{
9976   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
9977   match(Set dst (RShiftVS dst shift));
9978   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed4S" %}
9979   ins_encode %{
9980     __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
9981   %}
9982   ins_pipe( pipe_slow );
9983 %}
9984 
9985 instruct vsra4S_imm(vecD dst, immI8 shift) %{
9986   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
9987   match(Set dst (RShiftVS dst shift));
9988   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed4S" %}
9989   ins_encode %{
9990     __ psraw($dst$$XMMRegister, (int)$shift$$constant);
9991   %}
9992   ins_pipe( pipe_slow );
9993 %}
9994 
9995 instruct vsra4S_reg_avx(vecD dst, vecD src, vecS shift) %{
9996   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 4);
9997   match(Set dst (RShiftVS src shift));
9998   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
9999   ins_encode %{
10000     int vector_len = 0;
10001     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10002   %}
10003   ins_pipe( pipe_slow );
10004 %}
10005 
10006 instruct vsra4S_reg_evex(vecD dst, vecD src, vecS shift) %{
10007   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
10008   match(Set dst (RShiftVS src shift));
10009   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
10010   ins_encode %{
10011     int vector_len = 0;
10012     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10013   %}
10014   ins_pipe( pipe_slow );
10015 %}
10016 
10017 instruct vsra4S_reg_evex_special(vecD dst, vecD src, vecS shift) %{
10018   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
10019   match(Set dst (RShiftVS dst shift));
10020   effect(TEMP src);
10021   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
10022   ins_encode %{
10023     int vector_len = 0;
10024     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10025   %}
10026   ins_pipe( pipe_slow );
10027 %}
10028 
10029 instruct vsra4S_reg_imm_avx(vecD dst, vecD src, immI8 shift) %{
10030   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 4);
10031   match(Set dst (RShiftVS src shift));
10032   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
10033   ins_encode %{
10034     int vector_len = 0;
10035     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10036   %}
10037   ins_pipe( pipe_slow );
10038 %}
10039 
10040 instruct vsra4S_reg_imm_evex(vecD dst, vecD src, immI8 shift) %{
10041   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
10042   match(Set dst (RShiftVS src shift));
10043   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
10044   ins_encode %{
10045     int vector_len = 0;
10046     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10047   %}
10048   ins_pipe( pipe_slow );
10049 %}
10050 
10051 instruct vsra4S_reg_imm_evex_special(vecD dst, vecD src, immI8 shift) %{
10052   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
10053   match(Set dst (RShiftVS dst shift));
10054   effect(TEMP src);
10055   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
10056   ins_encode %{
10057     int vector_len = 0;
10058     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10059   %}
10060   ins_pipe( pipe_slow );
10061 %}
10062 
10063 instruct vsra8S(vecX dst, vecS shift) %{
10064   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
10065   match(Set dst (RShiftVS dst shift));
10066   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed8S" %}
10067   ins_encode %{
10068     __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
10069   %}
10070   ins_pipe( pipe_slow );
10071 %}
10072 
10073 instruct vsra8S_imm(vecX dst, immI8 shift) %{
10074   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
10075   match(Set dst (RShiftVS dst shift));
10076   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed8S" %}
10077   ins_encode %{
10078     __ psraw($dst$$XMMRegister, (int)$shift$$constant);
10079   %}
10080   ins_pipe( pipe_slow );
10081 %}
10082 
10083 instruct vsra8S_reg_avx(vecX dst, vecX src, vecS shift) %{
10084   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 8);
10085   match(Set dst (RShiftVS src shift));
10086   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
10087   ins_encode %{
10088     int vector_len = 0;
10089     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10090   %}
10091   ins_pipe( pipe_slow );
10092 %}
10093 
10094 instruct vsra8S_reg_evex(vecX dst, vecX src, vecS shift) %{
10095   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
10096   match(Set dst (RShiftVS src shift));
10097   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
10098   ins_encode %{
10099     int vector_len = 0;
10100     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10101   %}
10102   ins_pipe( pipe_slow );
10103 %}
10104 
10105 instruct vsra8S_reg_evex_special(vecX dst, vecX src, vecS shift) %{
10106   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
10107   match(Set dst (RShiftVS dst shift));
10108   effect(TEMP src);
10109   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
10110   ins_encode %{
10111     int vector_len = 0;
10112     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10113   %}
10114   ins_pipe( pipe_slow );
10115 %}
10116 
10117 instruct vsra8S_reg_imm_avx(vecX dst, vecX src, immI8 shift) %{
10118   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 8);
10119   match(Set dst (RShiftVS src shift));
10120   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
10121   ins_encode %{
10122     int vector_len = 0;
10123     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10124   %}
10125   ins_pipe( pipe_slow );
10126 %}
10127 
10128 instruct vsra8S_reg_imm_evex(vecX dst, vecX src, immI8 shift) %{
10129   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
10130   match(Set dst (RShiftVS src shift));
10131   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
10132   ins_encode %{
10133     int vector_len = 0;
10134     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10135   %}
10136   ins_pipe( pipe_slow );
10137 %}
10138 
10139 instruct vsra8S_reg_imm_evex_special(vecX dst, vecX src, immI8 shift) %{
10140   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
10141   match(Set dst (RShiftVS dst shift));
10142   effect(TEMP src);
10143   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
10144   ins_encode %{
10145     int vector_len = 0;
10146     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10147   %}
10148   ins_pipe( pipe_slow );
10149 %}
10150 
10151 instruct vsra16S_reg_avx(vecY dst, vecY src, vecS shift) %{
10152   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
10153   match(Set dst (RShiftVS src shift));
10154   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
10155   ins_encode %{
10156     int vector_len = 1;
10157     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10158   %}
10159   ins_pipe( pipe_slow );
10160 %}
10161 
10162 instruct vsra16S_reg_evex(vecY dst, vecY src, vecS shift) %{
10163   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
10164   match(Set dst (RShiftVS src shift));
10165   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
10166   ins_encode %{
10167     int vector_len = 1;
10168     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10169   %}
10170   ins_pipe( pipe_slow );
10171 %}
10172 
10173 instruct vsra16S_reg_evex_special(vecY dst, vecY src, vecS shift) %{
10174   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
10175   match(Set dst (RShiftVS dst shift));
10176   effect(TEMP src);
10177   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
10178   ins_encode %{
10179     int vector_len = 1;
10180     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10181   %}
10182   ins_pipe( pipe_slow );
10183 %}
10184 
10185 instruct vsra16S_reg_imm_avx(vecY dst, vecY src, immI8 shift) %{
10186   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
10187   match(Set dst (RShiftVS src shift));
10188   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
10189   ins_encode %{
10190     int vector_len = 1;
10191     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10192   %}
10193   ins_pipe( pipe_slow );
10194 %}
10195 
10196 instruct vsra16S_reg_imm_evex(vecY dst, vecY src, immI8 shift) %{
10197   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
10198   match(Set dst (RShiftVS src shift));
10199   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
10200   ins_encode %{
10201     int vector_len = 1;
10202     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10203   %}
10204   ins_pipe( pipe_slow );
10205 %}
10206 
10207 instruct vsra16S_reg_imm_evex_special(vecY dst, vecY src, immI8 shift) %{
10208   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
10209   match(Set dst (RShiftVS dst shift));
10210   effect(TEMP src);
10211   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
10212   ins_encode %{
10213     int vector_len = 1;
10214     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10215   %}
10216   ins_pipe( pipe_slow );
10217 %}
10218 
10219 instruct vsra32S_reg(vecZ dst, vecZ src, vecS shift) %{
10220   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
10221   match(Set dst (RShiftVS src shift));
10222   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed32S" %}
10223   ins_encode %{
10224     int vector_len = 2;
10225     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10226   %}
10227   ins_pipe( pipe_slow );
10228 %}
10229 
10230 instruct vsra32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
10231   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
10232   match(Set dst (RShiftVS src shift));
10233   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed32S" %}
10234   ins_encode %{
10235     int vector_len = 2;
10236     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10237   %}
10238   ins_pipe( pipe_slow );
10239 %}
10240 
10241 // Integers vector arithmetic right shift
10242 instruct vsra2I(vecD dst, vecS shift) %{
10243   predicate(n->as_Vector()->length() == 2);
10244   match(Set dst (RShiftVI dst shift));
10245   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed2I" %}
10246   ins_encode %{
10247     __ psrad($dst$$XMMRegister, $shift$$XMMRegister);
10248   %}
10249   ins_pipe( pipe_slow );
10250 %}
10251 

