src/cpu/x86/vm/x86.ad

1699     case Op_SqrtVD:
1700       if (UseAVX < 1) // enabled for AVX only
1701         ret_value = false;
1702       break;
1703     case Op_CompareAndSwapL:
1704 #ifdef _LP64
1705     case Op_CompareAndSwapP:
1706 #endif
1707       if (!VM_Version::supports_cx8())
1708         ret_value = false;
1709       break;
1710     case Op_CMoveVD:
1711       if (UseAVX < 1 || UseAVX > 2)
1712         ret_value = false;
1713       break;
1714   }
1715 
1716   return ret_value;  // By default, match rules are supported.
1717 }
1718 
1719 const int Matcher::float_pressure(int default_pressure_threshold) {
1720   int float_pressure_threshold = default_pressure_threshold;
1721 #ifdef _LP64
1722   if (UseAVX > 2) {
1723     // Increase pressure threshold on machines with AVX3 which have
1724     // 2x more XMM registers.
1725     float_pressure_threshold = default_pressure_threshold * 2;
1726   }
1727 #endif
1728   return float_pressure_threshold;
1729 }
1730 
1731 // Max vector size in bytes. 0 if not supported.
1732 const int Matcher::vector_width_in_bytes(BasicType bt) {
1733   assert(is_java_primitive(bt), "only primitive type vectors");
1734   if (UseSSE < 2) return 0;
1735   // SSE2 supports 128bit vectors for all types.
1736   // AVX2 supports 256bit vectors for all types.
1737   // AVX512/EVEX supports 512bit vectors for all types.
1738   int size = (UseAVX > 1) ? (1 << UseAVX) * 8 : 16;
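  // e.g. UseAVX == 2 gives (1 << 2) * 8 = 32 bytes (256-bit vectors) and UseAVX == 3
  // gives 64 bytes (512-bit vectors); otherwise the SSE2 default of 16 bytes is used.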


1742   // Use flag to limit vector size.
1743   size = MIN2(size,(int)MaxVectorSize);
1744   // Minimum 2 values in vector (or 4 for bytes).
1745   switch (bt) {
1746   case T_DOUBLE:
1747   case T_LONG:
1748     if (size < 16) return 0;
1749     break;
1750   case T_FLOAT:
1751   case T_INT:
1752     if (size < 8) return 0;
1753     break;
1754   case T_BOOLEAN:
1755     if (size < 4) return 0;
1756     break;
1757   case T_CHAR:
1758     if (size < 4) return 0;
1759     break;
1760   case T_BYTE:
1761     if (size < 4) return 0;
1762     if ((size > 32) && !VM_Version::supports_avx512bw()) return 0;
1763     break;
1764   case T_SHORT:
1765     if (size < 4) return 0;
1766     if ((size > 16) && !VM_Version::supports_avx512bw()) return 0;
1767     break;
1768   default:
1769     ShouldNotReachHere();
1770   }
1771   return size;
1772 }
1773 
1774 // Limits on vector size (number of elements) loaded into vector.
1775 const int Matcher::max_vector_size(const BasicType bt) {
1776   return vector_width_in_bytes(bt)/type2aelembytes(bt);
1777 }
1778 const int Matcher::min_vector_size(const BasicType bt) {
1779   int max_size = max_vector_size(bt);
1780   // The minimum vector load is 4 bytes: 4 elements for byte types, 2 elements for larger types.
1781   int size = (type2aelembytes(bt) == 1) ? 4 : 2;
1782   return MIN2(size,max_size);
1783 }
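// Worked example: with 256-bit vectors (MaxVectorSize = 32), max_vector_size(T_INT)
// is 32 / 4 = 8 elements and min_vector_size(T_INT) is 2, while min_vector_size(T_BYTE)
// is 4 so that the smallest byte vector still fills a 4-byte load.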
1784 
1785 // Vector ideal reg corresponding to specified size in bytes
1786 const int Matcher::vector_ideal_reg(int size) {


1950         break;
1951       case Op_VecD:
1952         st->print("movq    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1953         break;
1954       case Op_VecX:
1955         st->print("movdqu  [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1956         break;
1957       case Op_VecY:
1958       case Op_VecZ:
1959         st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1960         break;
1961       default:
1962         ShouldNotReachHere();
1963       }
1964     }
1965 #endif
1966   }
1967   bool is_single_byte = false;
1968   int vec_len = 0;
1969   if ((UseAVX > 2) && (stack_offset != 0)) {
1970     switch (ireg) {
1971     case Op_VecS:
1972     case Op_VecD:
1973     case Op_VecX:
1974       break;
1975     case Op_VecY:
1976       vec_len = 1;
1977       break;
1978     case Op_VecZ:
1979       vec_len = 2;
1980       break;
1981     }
1982     is_single_byte = Assembler::query_compressed_disp_byte(stack_offset, true, vec_len, Assembler::EVEX_FVM, Assembler::EVEX_32bit, 0);
1983   }
1984   int offset_size = 0;
1985   int size = 5;
1986   if (UseAVX > 2 ) {
1987     if ((VM_Version::supports_avx512vl() == false) && (vec_len == 2)) { 
1988       offset_size = (stack_offset == 0) ? 0 : ((is_single_byte) ? 1 : 4);
1989       size += 2; // Need an additional two bytes for EVEX encoding
1990     } else if ((VM_Version::supports_avx512vl() == false) && (vec_len < 2)) { 
1991       offset_size = (stack_offset == 0) ? 0 : ((stack_offset <= 127) ? 1 : 4);
1992     } else {
1993       offset_size = (stack_offset == 0) ? 0 : ((is_single_byte) ? 1 : 4);
1994       size += 2; // Need an additional two bytes for EVEX encoding
1995     }
1996   } else {
1997     offset_size = (stack_offset == 0) ? 0 : ((stack_offset <= 127) ? 1 : 4);
1998   }
1999   // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
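  // e.g. a 512-bit (vec_len == 2) spill without AVX512VL and a displacement that does
  // not compress to disp8 takes size = 5 + 2 plus offset_size = 4, i.e. 11 bytes.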
2000   return size+offset_size;
2001 }
2002 
2003 static inline jfloat replicate4_imm(int con, int width) {
2004   // Load a constant of "width" (in bytes) and replicate it to fill 32bit.
2005   assert(width == 1 || width == 2, "only byte or short types here");
2006   int bit_width = width * 8;
2007   jint val = con;
2008   val &= (1 << bit_width) - 1;  // mask off sign bits
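  // e.g. con = 0x12, width = 1 replicates to 0x12121212; con = 0x1234, width = 2
  // replicates to 0x12341234 (the 32-bit pattern is handed back as a jfloat).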
2009   while(bit_width < 32) {
2010     val |= (val << bit_width);


2694   format %{ "vdivsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2695   ins_cost(150);
2696   ins_encode %{
2697     __ vdivsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2698   %}
2699   ins_pipe(pipe_slow);
2700 %}
2701 
2702 instruct absF_reg(regF dst) %{
2703   predicate((UseSSE>=1) && (UseAVX == 0));
2704   match(Set dst (AbsF dst));
2705   ins_cost(150);
2706   format %{ "andps   $dst, [0x7fffffff]\t# abs float by sign masking" %}
2707   ins_encode %{
2708     __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
2709   %}
2710   ins_pipe(pipe_slow);
2711 %}
2712 
2713 instruct absF_reg_reg(regF dst, regF src) %{
2714   predicate(UseAVX > 0);
2715   match(Set dst (AbsF src));
2716   ins_cost(150);
2717   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
2718   ins_encode %{
2719     int vector_len = 0;
2720     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
2721               ExternalAddress(float_signmask()), vector_len);
2722   %}
2723   ins_pipe(pipe_slow);
2724 %}
2725 
2726 instruct absD_reg(regD dst) %{
2727   predicate((UseSSE>=2) && (UseAVX == 0));
2728   match(Set dst (AbsD dst));
2729   ins_cost(150);
2730   format %{ "andpd   $dst, [0x7fffffffffffffff]\t"
2731             "# abs double by sign masking" %}
2732   ins_encode %{
2733     __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
2734   %}
2735   ins_pipe(pipe_slow);
2736 %}
2737 
2738 instruct absD_reg_reg(regD dst, regD src) %{
2739   predicate(UseAVX > 0);
2740   match(Set dst (AbsD src));
2741   ins_cost(150);
2742   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
2743             "# abs double by sign masking" %}
2744   ins_encode %{
2745     int vector_len = 0;
2746     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
2747               ExternalAddress(double_signmask()), vector_len);
2748   %}
2749   ins_pipe(pipe_slow);
2750 %}
2751 
2752 instruct negF_reg(regF dst) %{
2753   predicate((UseSSE>=1) && (UseAVX == 0));
2754   match(Set dst (NegF dst));
2755   ins_cost(150);
2756   format %{ "xorps   $dst, [0x80000000]\t# neg float by sign flipping" %}
2757   ins_encode %{
2758     __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
2759   %}
2760   ins_pipe(pipe_slow);
2761 %}
2762 
2763 instruct negF_reg_reg(regF dst, regF src) %{
2764   predicate(UseAVX > 0);
2765   match(Set dst (NegF src));
2766   ins_cost(150);
2767   format %{ "vnegatess  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
2768   ins_encode %{
2769     __ vnegatess($dst$$XMMRegister, $src$$XMMRegister,
2770                  ExternalAddress(float_signflip()));


4537 instruct rsadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
4538   predicate(UseSSE > 2 && UseAVX == 0);
4539   match(Set dst (AddReductionVI src1 src2));
4540   effect(TEMP tmp2, TEMP tmp);
4541   format %{ "movdqu  $tmp2,$src2\n\t"
4542             "phaddd  $tmp2,$tmp2\n\t"
4543             "movd    $tmp,$src1\n\t"
4544             "paddd   $tmp,$tmp2\n\t"
4545             "movd    $dst,$tmp\t! add reduction2I" %}
4546   ins_encode %{
4547     __ movdqu($tmp2$$XMMRegister, $src2$$XMMRegister);
4548     __ phaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister);
4549     __ movdl($tmp$$XMMRegister, $src1$$Register);
4550     __ paddd($tmp$$XMMRegister, $tmp2$$XMMRegister);
4551     __ movdl($dst$$Register, $tmp$$XMMRegister);
4552   %}
4553   ins_pipe( pipe_slow );
4554 %}
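// The AddReductionV* patterns above and below compute a scalar result
// dst = src1 + src2[0] + src2[1] + ..., i.e. the running accumulator plus the sum of
// every vector lane; the variants differ only in how the lanes are combined
// (phaddd/vphaddd horizontal adds for SSE/AVX, shuffle-and-add pairs for EVEX).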
4555 
4556 instruct rvadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
4557   predicate(UseAVX > 0 && UseAVX < 3);
4558   match(Set dst (AddReductionVI src1 src2));
4559   effect(TEMP tmp, TEMP tmp2);
4560   format %{ "vphaddd  $tmp,$src2,$src2\n\t"
4561             "movd     $tmp2,$src1\n\t"
4562             "vpaddd   $tmp2,$tmp2,$tmp\n\t"
4563             "movd     $dst,$tmp2\t! add reduction2I" %}
4564   ins_encode %{
4565     int vector_len = 0;
4566     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
4567     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4568     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
4569     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4570   %}
4571   ins_pipe( pipe_slow );
4572 %}
4573 
4574 instruct rvadd2I_reduction_reg_evex(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
4575   predicate(UseAVX > 2);
4576   match(Set dst (AddReductionVI src1 src2));
4577   effect(TEMP tmp, TEMP tmp2);
4578   format %{ "pshufd  $tmp2,$src2,0x1\n\t"
4579             "vpaddd  $tmp,$src2,$tmp2\n\t"
4580             "movd    $tmp2,$src1\n\t"
4581             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
4582             "movd    $dst,$tmp2\t! add reduction2I" %}
4583   ins_encode %{
4584     int vector_len = 0;
4585     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
4586     __ vpaddd($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4587     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4588     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4589     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4590   %}
4591   ins_pipe( pipe_slow );
4592 %}
4593 
4594 instruct rsadd4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
4595   predicate(UseSSE > 2 && UseAVX == 0);
4596   match(Set dst (AddReductionVI src1 src2));
4597   effect(TEMP tmp2, TEMP tmp);
4598   format %{ "movdqu  $tmp2,$src2\n\t"
4599             "phaddd  $tmp2,$tmp2\n\t"
4600             "phaddd  $tmp2,$tmp2\n\t"
4601             "movd    $tmp,$src1\n\t"
4602             "paddd   $tmp,$tmp2\n\t"
4603             "movd    $dst,$tmp\t! add reduction4I" %}
4604   ins_encode %{
4605     __ movdqu($tmp2$$XMMRegister, $src2$$XMMRegister);
4606     __ phaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister);
4607     __ phaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister);
4608     __ movdl($tmp$$XMMRegister, $src1$$Register);
4609     __ paddd($tmp$$XMMRegister, $tmp2$$XMMRegister);
4610     __ movdl($dst$$Register, $tmp$$XMMRegister);
4611   %}
4612   ins_pipe( pipe_slow );
4613 %}
4614 
4615 instruct rvadd4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
4616   predicate(UseAVX > 0 && UseAVX < 3);
4617   match(Set dst (AddReductionVI src1 src2));
4618   effect(TEMP tmp, TEMP tmp2);
4619   format %{ "vphaddd  $tmp,$src2,$src2\n\t"
4620             "vphaddd  $tmp,$tmp,$tmp2\n\t"
4621             "movd     $tmp2,$src1\n\t"
4622             "vpaddd   $tmp2,$tmp2,$tmp\n\t"
4623             "movd     $dst,$tmp2\t! add reduction4I" %}
4624   ins_encode %{
4625     int vector_len = 0;
4626     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
4627     __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4628     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4629     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
4630     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4631   %}
4632   ins_pipe( pipe_slow );
4633 %}
4634 
4635 instruct rvadd4I_reduction_reg_evex(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
4636   predicate(UseAVX > 2);
4637   match(Set dst (AddReductionVI src1 src2));
4638   effect(TEMP tmp, TEMP tmp2);
4639   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
4640             "vpaddd  $tmp,$src2,$tmp2\n\t"
4641             "pshufd  $tmp2,$tmp,0x1\n\t"
4642             "vpaddd  $tmp,$tmp,$tmp2\n\t"
4643             "movd    $tmp2,$src1\n\t"
4644             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
4645             "movd    $dst,$tmp2\t! add reduction4I" %}
4646   ins_encode %{
4647     int vector_len = 0;
4648     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
4649     __ vpaddd($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4650     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
4651     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4652     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4653     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4654     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4655   %}
4656   ins_pipe( pipe_slow );
4657 %}
4658 
4659 instruct rvadd8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
4660   predicate(UseAVX > 0 && UseAVX < 3);
4661   match(Set dst (AddReductionVI src1 src2));
4662   effect(TEMP tmp, TEMP tmp2);
4663   format %{ "vphaddd  $tmp,$src2,$src2\n\t"
4664             "vphaddd  $tmp,$tmp,$tmp2\n\t"
4665             "vextracti128  $tmp2,$tmp\n\t"
4666             "vpaddd   $tmp,$tmp,$tmp2\n\t"
4667             "movd     $tmp2,$src1\n\t"
4668             "vpaddd   $tmp2,$tmp2,$tmp\n\t"
4669             "movd     $dst,$tmp2\t! add reduction8I" %}
4670   ins_encode %{
4671     int vector_len = 1;
4672     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
4673     __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4674     __ vextracti128h($tmp2$$XMMRegister, $tmp$$XMMRegister);
4675     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
4676     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4677     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4678     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4679   %}
4680   ins_pipe( pipe_slow );


4695             "movd    $dst,$tmp2\t! add reduction8I" %}
4696   ins_encode %{
4697     int vector_len = 0;
4698     __ vextracti128h($tmp$$XMMRegister, $src2$$XMMRegister);
4699     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
4700     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
4701     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4702     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
4703     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4704     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4705     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4706     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4707   %}
4708   ins_pipe( pipe_slow );
4709 %}
4710 
4711 instruct rvadd16I_reduction_reg_evex(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, regF tmp3) %{
4712   predicate(UseAVX > 2);
4713   match(Set dst (AddReductionVI src1 src2));
4714   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
4715   format %{ "vextracti64x4  $tmp3,$src2\n\t"
4716             "vpaddd  $tmp3,$tmp3,$src2\n\t"
4717             "vextracti128   $tmp,$tmp3\n\t"
4718             "vpaddd  $tmp,$tmp,$tmp3\n\t"
4719             "pshufd  $tmp2,$tmp,0xE\n\t"
4720             "vpaddd  $tmp,$tmp,$tmp2\n\t"
4721             "pshufd  $tmp2,$tmp,0x1\n\t"
4722             "vpaddd  $tmp,$tmp,$tmp2\n\t"
4723             "movd    $tmp2,$src1\n\t"
4724             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
4725             "movd    $dst,$tmp2\t! add reduction16I" %}
4726   ins_encode %{
4727     __ vextracti64x4h($tmp3$$XMMRegister, $src2$$XMMRegister);
4728     __ vpaddd($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
4729     __ vextracti128h($tmp$$XMMRegister, $tmp3$$XMMRegister);
4730     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
4731     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
4732     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
4733     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
4734     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
4735     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4736     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
4737     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4738   %}
4739   ins_pipe( pipe_slow );
4740 %}
4741 
4742 #ifdef _LP64
4743 instruct rvadd2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, regF tmp, regF tmp2) %{
4744   predicate(UseAVX > 2);
4745   match(Set dst (AddReductionVL src1 src2));
4746   effect(TEMP tmp, TEMP tmp2);
4747   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
4748             "vpaddq  $tmp,$src2,$tmp2\n\t"
4749             "movdq   $tmp2,$src1\n\t"
4750             "vpaddq  $tmp2,$tmp,$tmp2\n\t"
4751             "movdq   $dst,$tmp2\t! add reduction2L" %}
4752   ins_encode %{
4753     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
4754     __ vpaddq($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, 0);
4755     __ movdq($tmp2$$XMMRegister, $src1$$Register);
4756     __ vpaddq($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
4757     __ movdq($dst$$Register, $tmp2$$XMMRegister);
4758   %}
4759   ins_pipe( pipe_slow );
4760 %}
4761 
4762 instruct rvadd4L_reduction_reg(rRegL dst, rRegL src1, vecY src2, regF tmp, regF tmp2) %{
4763   predicate(UseAVX > 2);
4764   match(Set dst (AddReductionVL src1 src2));
4765   effect(TEMP tmp, TEMP tmp2);
4766   format %{ "vextracti64x2  $tmp,$src2, 0x1\n\t"
4767             "vpaddq  $tmp2,$tmp,$src2\n\t"
4768             "pshufd  $tmp,$tmp2,0xE\n\t"
4769             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
4770             "movdq   $tmp,$src1\n\t"
4771             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
4772             "movdq   $dst,$tmp2\t! add reduction4L" %}
4773   ins_encode %{
4774     __ vextracti64x2h($tmp$$XMMRegister, $src2$$XMMRegister, 0x1);
4775     __ vpaddq($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
4776     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
4777     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4778     __ movdq($tmp$$XMMRegister, $src1$$Register);
4779     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4780     __ movdq($dst$$Register, $tmp2$$XMMRegister);
4781   %}
4782   ins_pipe( pipe_slow );
4783 %}
4784 
4785 instruct rvadd8L_reduction_reg(rRegL dst, rRegL src1, vecZ src2, regF tmp, regF tmp2) %{
4786   predicate(UseAVX > 2);
4787   match(Set dst (AddReductionVL src1 src2));
4788   effect(TEMP tmp, TEMP tmp2);
4789   format %{ "vextracti64x4  $tmp2,$src2\n\t"
4790             "vpaddq  $tmp2,$tmp2,$src2\n\t"
4791             "vextracti128   $tmp,$tmp2\n\t"
4792             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
4793             "pshufd  $tmp,$tmp2,0xE\n\t"
4794             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
4795             "movdq   $tmp,$src1\n\t"
4796             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
4797             "movdq   $dst,$tmp2\t! add reduction8L" %}
4798   ins_encode %{
4799     __ vextracti64x4h($tmp2$$XMMRegister, $src2$$XMMRegister);
4800     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
4801     __ vextracti128h($tmp$$XMMRegister, $tmp2$$XMMRegister);
4802     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4803     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
4804     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4805     __ movdq($tmp$$XMMRegister, $src1$$Register);
4806     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4807     __ movdq($dst$$Register, $tmp2$$XMMRegister);
4808   %}
4809   ins_pipe( pipe_slow );
4810 %}
4811 #endif
4812 
4813 instruct rsadd2F_reduction_reg(regF dst, regF src1, vecD src2, regF tmp, regF tmp2) %{
4814   predicate(UseSSE >= 1 && UseAVX == 0);
4815   match(Set dst (AddReductionVF src1 src2));
4816   effect(TEMP tmp, TEMP tmp2);
4817   format %{ "movdqu  $tmp,$src1\n\t"
4818             "addss   $tmp,$src2\n\t"
4819             "pshufd  $tmp2,$src2,0x01\n\t"
4820             "addss   $tmp,$tmp2\n\t"
4821             "movdqu  $dst,$tmp\t! add reduction2F" %}
4822   ins_encode %{
4823     __ movdqu($tmp$$XMMRegister, $src1$$XMMRegister);
4824     __ addss($tmp$$XMMRegister, $src2$$XMMRegister);
4825     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x01);
4826     __ addss($tmp$$XMMRegister, $tmp2$$XMMRegister);
4827     __ movdqu($dst$$XMMRegister, $tmp$$XMMRegister);
4828   %}
4829   ins_pipe( pipe_slow );
4830 %}
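// The float and double reductions accumulate one element at a time with scalar
// addss/addsd (vaddss/vaddsd), preserving the same left-to-right evaluation order as a
// scalar loop, since floating-point addition is not associative.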
4831 
4832 instruct rvadd2F_reduction_reg(regF dst, regF src1, vecD src2, regF tmp, regF tmp2) %{
4833   predicate(UseAVX > 0);
4834   match(Set dst (AddReductionVF src1 src2));
4835   effect(TEMP tmp2, TEMP tmp);
4836   format %{ "vaddss  $tmp2,$src1,$src2\n\t"
4837             "pshufd  $tmp,$src2,0x01\n\t"
4838             "vaddss  $dst,$tmp2,$tmp\t! add reduction2F" %}
4839   ins_encode %{
4840     __ vaddss($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
4841     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
4842     __ vaddss($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
4843   %}
4844   ins_pipe( pipe_slow );
4845 %}
4846 
4847 instruct rsadd4F_reduction_reg(regF dst, regF src1, vecX src2, regF tmp, regF tmp2) %{
4848   predicate(UseSSE >= 1 && UseAVX == 0);
4849   match(Set dst (AddReductionVF src1 src2));
4850   effect(TEMP tmp, TEMP tmp2);
4851   format %{ "movdqu  $tmp,$src1\n\t"
4852             "addss   $tmp,$src2\n\t"
4853             "pshufd  $tmp2,$src2,0x01\n\t"
4854             "addss   $tmp,$tmp2\n\t"
4855             "pshufd  $tmp2,$src2,0x02\n\t"
4856             "addss   $tmp,$tmp2\n\t"
4857             "pshufd  $tmp2,$src2,0x03\n\t"
4858             "addss   $tmp,$tmp2\n\t"
4859             "movdqu  $dst,$tmp\t! add reduction4F" %}
4860   ins_encode %{
4861     __ movdqu($tmp$$XMMRegister, $src1$$XMMRegister);
4862     __ addss($tmp$$XMMRegister, $src2$$XMMRegister);
4863     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x01);
4864     __ addss($tmp$$XMMRegister, $tmp2$$XMMRegister);
4865     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x02);
4866     __ addss($tmp$$XMMRegister, $tmp2$$XMMRegister);
4867     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x03);
4868     __ addss($tmp$$XMMRegister, $tmp2$$XMMRegister);
4869     __ movdqu($dst$$XMMRegister, $tmp$$XMMRegister);
4870   %}
4871   ins_pipe( pipe_slow );
4872 %}
4873 
4874 instruct rvadd4F_reduction_reg(regF dst, regF src1, vecX src2, regF tmp, regF tmp2) %{
4875   predicate(UseAVX > 0);
4876   match(Set dst (AddReductionVF src1 src2));
4877   effect(TEMP tmp, TEMP tmp2);
4878   format %{ "vaddss  $tmp2,$src1,$src2\n\t"
4879             "pshufd  $tmp,$src2,0x01\n\t"
4880             "vaddss  $tmp2,$tmp2,$tmp\n\t"
4881             "pshufd  $tmp,$src2,0x02\n\t"
4882             "vaddss  $tmp2,$tmp2,$tmp\n\t"
4883             "pshufd  $tmp,$src2,0x03\n\t"
4884             "vaddss  $dst,$tmp2,$tmp\t! add reduction4F" %}
4885   ins_encode %{
4886     __ vaddss($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
4887     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
4888     __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
4889     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
4890     __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
4891     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
4892     __ vaddss($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
4893   %}
4894   ins_pipe( pipe_slow );
4895 %}
4896 
4897 instruct radd8F_reduction_reg(regF dst, regF src1, vecY src2, regF tmp, regF tmp2, regF tmp3) %{
4898   predicate(UseAVX > 0);
4899   match(Set dst (AddReductionVF src1 src2));
4900   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
4901   format %{ "vaddss  $tmp2,$src1,$src2\n\t"
4902             "pshufd  $tmp,$src2,0x01\n\t"
4903             "vaddss  $tmp2,$tmp2,$tmp\n\t"
4904             "pshufd  $tmp,$src2,0x02\n\t"
4905             "vaddss  $tmp2,$tmp2,$tmp\n\t"
4906             "pshufd  $tmp,$src2,0x03\n\t"
4907             "vaddss  $tmp2,$tmp2,$tmp\n\t"
4908             "vextractf128  $tmp3,$src2\n\t"
4909             "vaddss  $tmp2,$tmp2,$tmp3\n\t"
4910             "pshufd  $tmp,$tmp3,0x01\n\t"
4911             "vaddss  $tmp2,$tmp2,$tmp\n\t"
4912             "pshufd  $tmp,$tmp3,0x02\n\t"
4913             "vaddss  $tmp2,$tmp2,$tmp\n\t"
4914             "pshufd  $tmp,$tmp3,0x03\n\t"
4915             "vaddss  $dst,$tmp2,$tmp\t! add reduction8F" %}
4916   ins_encode %{
4917     __ vaddss($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
4918     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
4919     __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
4920     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
4921     __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
4922     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
4923     __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
4924     __ vextractf128h($tmp3$$XMMRegister, $src2$$XMMRegister);
4925     __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister);
4926     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x01);
4927     __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
4928     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x02);
4929     __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
4930     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x03);
4931     __ vaddss($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
4932   %}
4933   ins_pipe( pipe_slow );
4934 %}
4935 
4936 instruct radd16F_reduction_reg(regF dst, regF src1, vecZ src2, regF tmp, regF tmp2, regF tmp3) %{
4937   predicate(UseAVX > 2);
4938   match(Set dst (AddReductionVF src1 src2));
4939   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
4940   format %{ "vaddss  $tmp2,$src1,$src2\n\t"
4941             "pshufd  $tmp,$src2,0x01\n\t"
4942             "vaddss  $tmp2,$tmp2,$tmp\n\t"
4943             "pshufd  $tmp,$src2,0x02\n\t"
4944             "vaddss  $tmp2,$tmp2,$tmp\n\t"
4945             "pshufd  $tmp,$src2,0x03\n\t"
4946             "vaddss  $tmp2,$tmp2,$tmp\n\t"
4947             "vextractf32x4  $tmp3,$src2, 0x1\n\t"
4948             "vaddss  $tmp2,$tmp2,$tmp3\n\t"
4949             "pshufd  $tmp,$tmp3,0x01\n\t"
4950             "vaddss  $tmp2,$tmp2,$tmp\n\t"
4951             "pshufd  $tmp,$tmp3,0x02\n\t"
4952             "vaddss  $tmp2,$tmp2,$tmp\n\t"
4953             "pshufd  $tmp,$tmp3,0x03\n\t"
4954             "vaddss  $tmp2,$tmp2,$tmp\n\t"
4955             "vextractf32x4  $tmp3,$src2, 0x2\n\t"
4956             "vaddss  $tmp2,$tmp2,$tmp3\n\t"
4957             "pshufd  $tmp,$tmp3,0x01\n\t"
4958             "vaddss  $tmp2,$tmp2,$tmp\n\t"
4959             "pshufd  $tmp,$tmp3,0x02\n\t"
4960             "vaddss  $tmp2,$tmp2,$tmp\n\t"
4961             "pshufd  $tmp,$tmp3,0x03\n\t"
4962             "vaddss  $tmp2,$tmp2,$tmp\n\t"
4963             "vextractf32x4  $tmp3,$src2, 0x3\n\t"
4964             "vaddss  $tmp2,$tmp2,$tmp3\n\t"
4965             "pshufd  $tmp,$tmp3,0x01\n\t"
4966             "vaddss  $tmp2,$tmp2,$tmp\n\t"
4967             "pshufd  $tmp,$tmp3,0x02\n\t"
4968             "vaddss  $tmp2,$tmp2,$tmp\n\t"
4969             "pshufd  $tmp,$tmp3,0x03\n\t"
4970             "vaddss  $dst,$tmp2,$tmp\t! add reduction16F" %}
4971   ins_encode %{
4972     __ vaddss($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
4973     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
4974     __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
4975     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
4976     __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
4977     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
4978     __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
4979     __ vextractf32x4h($tmp3$$XMMRegister, $src2$$XMMRegister, 0x1);
4980     __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister);
4981     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x01);
4982     __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
4983     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x02);
4984     __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
4985     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x03);
4986     __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
4987     __ vextractf32x4h($tmp3$$XMMRegister, $src2$$XMMRegister, 0x2);
4988     __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister);
4989     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x01);
4990     __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
4991     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x02);
4992     __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
4993     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x03);
4994     __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
4995     __ vextractf32x4h($tmp3$$XMMRegister, $src2$$XMMRegister, 0x3);
4996     __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister);
4997     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x01);
4998     __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
4999     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x02);
5000     __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5001     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x03);
5002     __ vaddss($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5003   %}
5004   ins_pipe( pipe_slow );
5005 %}
5006 
5007 instruct rsadd2D_reduction_reg(regD dst, regD src1, vecX src2, regD tmp) %{
5008   predicate(UseSSE >= 1 && UseAVX == 0);
5009   match(Set dst (AddReductionVD src1 src2));
5010   effect(TEMP tmp, TEMP dst);
5011   format %{ "movdqu  $tmp,$src1\n\t"
5012             "addsd   $tmp,$src2\n\t"
5013             "pshufd  $dst,$src2,0xE\n\t"
5014             "addsd   $dst,$tmp\t! add reduction2D" %}
5015   ins_encode %{
5016     __ movdqu($tmp$$XMMRegister, $src1$$XMMRegister);
5017     __ addsd($tmp$$XMMRegister, $src2$$XMMRegister);
5018     __ pshufd($dst$$XMMRegister, $src2$$XMMRegister, 0xE);
5019     __ addsd($dst$$XMMRegister, $tmp$$XMMRegister);
5020   %}
5021   ins_pipe( pipe_slow );
5022 %}
5023 
5024 instruct rvadd2D_reduction_reg(regD dst, regD src1, vecX src2, regD tmp, regD tmp2) %{
5025   predicate(UseAVX > 0);
5026   match(Set dst (AddReductionVD src1 src2));
5027   effect(TEMP tmp, TEMP tmp2);
5028   format %{ "vaddsd  $tmp2,$src1,$src2\n\t"
5029             "pshufd  $tmp,$src2,0xE\n\t"
5030             "vaddsd  $dst,$tmp2,$tmp\t! add reduction2D" %}
5031   ins_encode %{
5032     __ vaddsd($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
5033     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5034     __ vaddsd($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5035   %}
5036   ins_pipe( pipe_slow );
5037 %}
5038 
5039 instruct rvadd4D_reduction_reg(regD dst, regD src1, vecY src2, regD tmp, regD tmp2, regD tmp3) %{
5040   predicate(UseAVX > 0);
5041   match(Set dst (AddReductionVD src1 src2));
5042   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
5043   format %{ "vaddsd  $tmp2,$src1,$src2\n\t"
5044             "pshufd  $tmp,$src2,0xE\n\t"
5045             "vaddsd  $tmp2,$tmp2,$tmp\n\t"
5046             "vextractf128  $tmp3,$src2\n\t"
5047             "vaddsd  $tmp2,$tmp2,$tmp3\n\t"
5048             "pshufd  $tmp,$tmp3,0xE\n\t"
5049             "vaddsd  $dst,$tmp2,$tmp\t! add reduction4D" %}
5050   ins_encode %{
5051     __ vaddsd($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
5052     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5053     __ vaddsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5054     __ vextractf128h($tmp3$$XMMRegister, $src2$$XMMRegister);
5055     __ vaddsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister);
5056     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0xE);
5057     __ vaddsd($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5058   %}
5059   ins_pipe( pipe_slow );
5060 %}
5061 
5062 instruct rvadd8D_reduction_reg(regD dst, regD src1, vecZ src2, regD tmp, regD tmp2, regD tmp3) %{
5063   predicate(UseAVX > 2);
5064   match(Set dst (AddReductionVD src1 src2));
5065   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
5066   format %{ "vaddsd  $tmp2,$src1,$src2\n\t"
5067             "pshufd  $tmp,$src2,0xE\n\t"
5068             "vaddsd  $tmp2,$tmp2,$tmp\n\t"
5069             "vextractf64x2  $tmp3,$src2, 0x1\n\t"
5070             "vaddsd  $tmp2,$tmp2,$tmp3\n\t"
5071             "pshufd  $tmp,$tmp3,0xE\n\t"
5072             "vaddsd  $tmp2,$tmp2,$tmp\n\t"
5073             "vextractf64x2  $tmp3,$src2, 0x2\n\t"
5074             "vaddsd  $tmp2,$tmp2,$tmp3\n\t"
5075             "pshufd  $tmp,$tmp3,0xE\n\t"
5076             "vaddsd  $tmp2,$tmp2,$tmp\n\t"
5077             "vextractf64x2  $tmp3,$src2, 0x3\n\t"
5078             "vaddsd  $tmp2,$tmp2,$tmp3\n\t"
5079             "pshufd  $tmp,$tmp3,0xE\n\t"
5080             "vaddsd  $dst,$tmp2,$tmp\t! add reduction8D" %}
5081   ins_encode %{
5082     __ vaddsd($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
5083     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5084     __ vaddsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5085     __ vextractf64x2h($tmp3$$XMMRegister, $src2$$XMMRegister, 0x1);
5086     __ vaddsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister);
5087     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0xE);
5088     __ vaddsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5089     __ vextractf64x2h($tmp3$$XMMRegister, $src2$$XMMRegister, 0x2);
5090     __ vaddsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister);
5091     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0xE);
5092     __ vaddsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5093     __ vextractf64x2h($tmp3$$XMMRegister, $src2$$XMMRegister, 0x3);
5094     __ vaddsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister);
5095     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0xE);
5096     __ vaddsd($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5097   %}
5098   ins_pipe( pipe_slow );
5099 %}
5100 
5101 instruct rsmul2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
5102   predicate(UseSSE > 3 && UseAVX == 0);
5103   match(Set dst (MulReductionVI src1 src2));
5104   effect(TEMP tmp, TEMP tmp2);
5105   format %{ "pshufd  $tmp2,$src2,0x1\n\t"
5106             "pmulld  $tmp2,$src2\n\t"
5107             "movd    $tmp,$src1\n\t"
5108             "pmulld  $tmp2,$tmp\n\t"
5109             "movd    $dst,$tmp2\t! mul reduction2I" %}
5110   ins_encode %{
5111     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5112     __ pmulld($tmp2$$XMMRegister, $src2$$XMMRegister);
5113     __ movdl($tmp$$XMMRegister, $src1$$Register);
5114     __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
5115     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5116   %}


5199             "movd     $dst,$tmp2\t! mul reduction8I" %}
5200   ins_encode %{
5201     int vector_len = 0;
5202     __ vextracti128h($tmp$$XMMRegister, $src2$$XMMRegister);
5203     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
5204     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
5205     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5206     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5207     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5208     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5209     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5210     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5211   %}
5212   ins_pipe( pipe_slow );
5213 %}
5214 
5215 instruct rvmul16I_reduction_reg(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, regF tmp3) %{
5216   predicate(UseAVX > 2);
5217   match(Set dst (MulReductionVI src1 src2));
5218   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
5219   format %{ "vextracti64x4  $tmp3,$src2\n\t"
5220             "vpmulld  $tmp3,$tmp3,$src2\n\t"
5221             "vextracti128   $tmp,$tmp3\n\t"
5222             "vpmulld  $tmp,$tmp,$tmp3\n\t"
5223             "pshufd   $tmp2,$tmp,0xE\n\t"
5224             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5225             "pshufd   $tmp2,$tmp,0x1\n\t"
5226             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5227             "movd     $tmp2,$src1\n\t"
5228             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
5229             "movd     $dst,$tmp2\t! mul reduction16I" %}
5230   ins_encode %{
5231     __ vextracti64x4h($tmp3$$XMMRegister, $src2$$XMMRegister);
5232     __ vpmulld($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
5233     __ vextracti128h($tmp$$XMMRegister, $tmp3$$XMMRegister);
5234     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
5235     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
5236     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5237     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5238     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5239     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5240     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5241     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5242   %}
5243   ins_pipe( pipe_slow );
5244 %}
5245 
5246 #ifdef _LP64
5247 instruct rvmul2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, regF tmp, regF tmp2) %{
5248   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
5249   match(Set dst (MulReductionVL src1 src2));
5250   effect(TEMP tmp, TEMP tmp2);
5251   format %{ "pshufd   $tmp2,$src2,0xE\n\t"
5252             "vpmullq  $tmp,$src2,$tmp2\n\t"
5253             "movdq    $tmp2,$src1\n\t"
5254             "vpmullq  $tmp2,$tmp,$tmp2\n\t"
5255             "movdq    $dst,$tmp2\t! mul reduction2L" %}
5256   ins_encode %{
5257     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5258     __ vpmullq($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, 0);
5259     __ movdq($tmp2$$XMMRegister, $src1$$Register);
5260     __ vpmullq($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5261     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5262   %}
5263   ins_pipe( pipe_slow );
5264 %}
5265 
5266 instruct rvmul4L_reduction_reg(rRegL dst, rRegL src1, vecY src2, regF tmp, regF tmp2) %{
5267   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
5268   match(Set dst (MulReductionVL src1 src2));
5269   effect(TEMP tmp, TEMP tmp2);
5270   format %{ "vextracti64x2  $tmp,$src2, 0x1\n\t"
5271             "vpmullq  $tmp2,$tmp,$src2\n\t"
5272             "pshufd   $tmp,$tmp2,0xE\n\t"
5273             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5274             "movdq    $tmp,$src1\n\t"
5275             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5276             "movdq    $dst,$tmp2\t! mul reduction4L" %}
5277   ins_encode %{
5278     __ vextracti64x2h($tmp$$XMMRegister, $src2$$XMMRegister, 0x1);
5279     __ vpmullq($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
5280     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5281     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5282     __ movdq($tmp$$XMMRegister, $src1$$Register);
5283     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5284     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5285   %}
5286   ins_pipe( pipe_slow );
5287 %}
5288 
5289 instruct rvmul8L_reduction_reg(rRegL dst, rRegL src1, vecZ src2, regF tmp, regF tmp2) %{
5290   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
5291   match(Set dst (MulReductionVL src1 src2));
5292   effect(TEMP tmp, TEMP tmp2);
5293   format %{ "vextracti64x4  $tmp2,$src2\n\t"
5294             "vpmullq  $tmp2,$tmp2,$src2\n\t"
5295             "vextracti128   $tmp,$tmp2\n\t"
5296             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5297             "pshufd   $tmp,$tmp2,0xE\n\t"
5298             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5299             "movdq    $tmp,$src1\n\t"
5300             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5301             "movdq    $dst,$tmp2\t! mul reduction8L" %}
5302   ins_encode %{
5303     __ vextracti64x4h($tmp2$$XMMRegister, $src2$$XMMRegister);
5304     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
5305     __ vextracti128h($tmp$$XMMRegister, $tmp2$$XMMRegister);
5306     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5307     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5308     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5309     __ movdq($tmp$$XMMRegister, $src1$$Register);
5310     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5311     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5312   %}
5313   ins_pipe( pipe_slow );
5314 %}
5315 #endif
5316 
5317 instruct rsmul2F_reduction(regF dst, regF src1, vecD src2, regF tmp, regF tmp2) %{
5318   predicate(UseSSE >= 1 && UseAVX == 0);
5319   match(Set dst (MulReductionVF src1 src2));
5320   effect(TEMP tmp, TEMP tmp2);
5321   format %{ "movdqu  $tmp,$src1\n\t"
5322             "mulss   $tmp,$src2\n\t"
5323             "pshufd  $tmp2,$src2,0x01\n\t"
5324             "mulss   $tmp,$tmp2\n\t"
5325             "movdqu  $dst,$tmp\t! mul reduction2F" %}
5326   ins_encode %{
5327     __ movdqu($tmp$$XMMRegister, $src1$$XMMRegister);
5328     __ mulss($tmp$$XMMRegister, $src2$$XMMRegister);
5329     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x01);
5330     __ mulss($tmp$$XMMRegister, $tmp2$$XMMRegister);
5331     __ movdqu($dst$$XMMRegister, $tmp$$XMMRegister);
5332   %}
5333   ins_pipe( pipe_slow );
5334 %}
5335 
5336 instruct rvmul2F_reduction_reg(regF dst, regF src1, vecD src2, regF tmp, regF tmp2) %{
5337   predicate(UseAVX > 0);
5338   match(Set dst (MulReductionVF src1 src2));
5339   effect(TEMP tmp, TEMP tmp2);
5340   format %{ "vmulss  $tmp2,$src1,$src2\n\t"
5341             "pshufd  $tmp,$src2,0x01\n\t"
5342             "vmulss  $dst,$tmp2,$tmp\t! mul reduction2F" %}
5343   ins_encode %{
5344     __ vmulss($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
5345     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5346     __ vmulss($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5347   %}
5348   ins_pipe( pipe_slow );
5349 %}
5350 
5351 instruct rsmul4F_reduction_reg(regF dst, regF src1, vecX src2, regF tmp, regF tmp2) %{
5352   predicate(UseSSE >= 1 && UseAVX == 0);
5353   match(Set dst (MulReductionVF src1 src2));
5354   effect(TEMP tmp, TEMP tmp2);
5355   format %{ "movdqu  $tmp,$src1\n\t"
5356             "mulss   $tmp,$src2\n\t"
5357             "pshufd  $tmp2,$src2,0x01\n\t"
5358             "mulss   $tmp,$tmp2\n\t"
5359             "pshufd  $tmp2,$src2,0x02\n\t"
5360             "mulss   $tmp,$tmp2\n\t"
5361             "pshufd  $tmp2,$src2,0x03\n\t"
5362             "mulss   $tmp,$tmp2\n\t"
5363             "movdqu  $dst,$tmp\t! mul reduction4F" %}
5364   ins_encode %{
5365     __ movdqu($tmp$$XMMRegister, $src1$$XMMRegister);
5366     __ mulss($tmp$$XMMRegister, $src2$$XMMRegister);
5367     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x01);
5368     __ mulss($tmp$$XMMRegister, $tmp2$$XMMRegister);
5369     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x02);
5370     __ mulss($tmp$$XMMRegister, $tmp2$$XMMRegister);
5371     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x03);
5372     __ mulss($tmp$$XMMRegister, $tmp2$$XMMRegister);
5373     __ movdqu($dst$$XMMRegister, $tmp$$XMMRegister);
5374   %}
5375   ins_pipe( pipe_slow );
5376 %}
5377 
5378 instruct rvmul4F_reduction_reg(regF dst, regF src1, vecX src2, regF tmp, regF tmp2) %{
5379   predicate(UseAVX > 0);
5380   match(Set dst (MulReductionVF src1 src2));
5381   effect(TEMP tmp, TEMP tmp2);
5382   format %{ "vmulss  $tmp2,$src1,$src2\n\t"
5383             "pshufd  $tmp,$src2,0x01\n\t"
5384             "vmulss  $tmp2,$tmp2,$tmp\n\t"
5385             "pshufd  $tmp,$src2,0x02\n\t"
5386             "vmulss  $tmp2,$tmp2,$tmp\n\t"
5387             "pshufd  $tmp,$src2,0x03\n\t"
5388             "vmulss  $dst,$tmp2,$tmp\t! mul reduction4F" %}
5389   ins_encode %{
5390     __ vmulss($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
5391     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5392     __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5393     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5394     __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5395     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5396     __ vmulss($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5397   %}
5398   ins_pipe( pipe_slow );
5399 %}
5400 
5401 instruct rvmul8F_reduction_reg(regF dst, regF src1, vecY src2, regF tmp, regF tmp2, regF tmp3) %{
5402   predicate(UseAVX > 0);
5403   match(Set dst (MulReductionVF src1 src2));
5404   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
5405   format %{ "vmulss  $tmp2,$src1,$src2\n\t"
5406             "pshufd  $tmp,$src2,0x01\n\t"
5407             "vmulss  $tmp2,$tmp2,$tmp\n\t"
5408             "pshufd  $tmp,$src2,0x02\n\t"
5409             "vmulss  $tmp2,$tmp2,$tmp\n\t"
5410             "pshufd  $tmp,$src2,0x03\n\t"
5411             "vmulss  $tmp2,$tmp2,$tmp\n\t"
5412             "vextractf128  $tmp3,$src2\n\t"
5413             "vmulss  $tmp2,$tmp2,$tmp3\n\t"
5414             "pshufd  $tmp,$tmp3,0x01\n\t"
5415             "vmulss  $tmp2,$tmp2,$tmp\n\t"
5416             "pshufd  $tmp,$tmp3,0x02\n\t"
5417             "vmulss  $tmp2,$tmp2,$tmp\n\t"
5418             "pshufd  $tmp,$tmp3,0x03\n\t"
5419             "vmulss  $dst,$tmp2,$tmp\t! mul reduction8F" %}
5420   ins_encode %{
5421     __ vmulss($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
5422     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5423     __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5424     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5425     __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5426     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5427     __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5428     __ vextractf128h($tmp3$$XMMRegister, $src2$$XMMRegister);
5429     __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister);
5430     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x01);
5431     __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5432     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x02);
5433     __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5434     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x03);
5435     __ vmulss($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5436   %}
5437   ins_pipe( pipe_slow );
5438 %}
5439 
5440 instruct rvmul16F_reduction_reg(regF dst, regF src1, vecZ src2, regF tmp, regF tmp2, regF tmp3) %{
5441   predicate(UseAVX > 2);
5442   match(Set dst (MulReductionVF src1 src2));
5443   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
5444   format %{ "vmulss  $tmp2,$src1,$src2\n\t"
5445             "pshufd  $tmp,$src2,0x01\n\t"
5446             "vmulss  $tmp2,$tmp2,$tmp\n\t"
5447             "pshufd  $tmp,$src2,0x02\n\t"
5448             "vmulss  $tmp2,$tmp2,$tmp\n\t"
5449             "pshufd  $tmp,$src2,0x03\n\t"
5450             "vmulss  $tmp2,$tmp2,$tmp\n\t"
5451             "vextractf32x4  $tmp3,$src2, 0x1\n\t"
5452             "vmulss  $tmp2,$tmp2,$tmp3\n\t"
5453             "pshufd  $tmp,$tmp3,0x01\n\t"
5454             "vmulss  $tmp2,$tmp2,$tmp\n\t"
5455             "pshufd  $tmp,$tmp3,0x02\n\t"
5456             "vmulss  $tmp2,$tmp2,$tmp\n\t"
5457             "pshufd  $tmp,$tmp3,0x03\n\t"
5458             "vmulss  $tmp2,$tmp2,$tmp\n\t"
5459             "vextractf32x4  $tmp3,$src2, 0x2\n\t"
5460             "vmulss  $tmp2,$tmp2,$tmp3\n\t"
5461             "pshufd  $tmp,$tmp3,0x01\n\t"
5462             "vmulss  $tmp2,$tmp2,$tmp\n\t"
5463             "pshufd  $tmp,$tmp3,0x02\n\t"
5464             "vmulss  $tmp2,$tmp2,$tmp\n\t"
5465             "pshufd  $tmp,$tmp3,0x03\n\t"
5466             "vmulss  $tmp2,$tmp2,$tmp\n\t"
5467             "vextractf32x4  $tmp3,$src2, 0x3\n\t"
5468             "vmulss  $tmp2,$tmp2,$tmp3\n\t"
5469             "pshufd  $tmp,$tmp3,0x01\n\t"
5470             "vmulss  $tmp2,$tmp2,$tmp\n\t"
5471             "pshufd  $tmp,$tmp3,0x02\n\t"
5472             "vmulss  $tmp2,$tmp2,$tmp\n\t"
5473             "pshufd  $tmp,$tmp3,0x03\n\t"
5474             "vmulss  $dst,$tmp2,$tmp\t! mul reduction16F" %}
5475   ins_encode %{
5476     __ vmulss($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
5477     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5478     __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5479     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5480     __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5481     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5482     __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5483     __ vextractf32x4h($tmp3$$XMMRegister, $src2$$XMMRegister, 0x1);
5484     __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister);
5485     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x01);
5486     __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5487     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x02);
5488     __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5489     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x03);
5490     __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5491     __ vextractf32x4h($tmp3$$XMMRegister, $src2$$XMMRegister, 0x2);
5492     __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister);
5493     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x01);
5494     __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5495     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x02);
5496     __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5497     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x03);
5498     __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5499     __ vextractf32x4h($tmp3$$XMMRegister, $src2$$XMMRegister, 0x3);
5500     __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister);
5501     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x01);
5502     __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5503     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x02);
5504     __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5505     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x03);
5506     __ vmulss($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5507   %}
5508   ins_pipe( pipe_slow );
5509 %}
5510 
5511 instruct rsmul2D_reduction_reg(regD dst, regD src1, vecX src2, regD tmp) %{
5512   predicate(UseSSE >= 1 && UseAVX == 0);
5513   match(Set dst (MulReductionVD src1 src2));
5514   effect(TEMP tmp, TEMP dst);
5515   format %{ "movdqu  $tmp,$src1\n\t"
5516             "mulsd   $tmp,$src2\n\t"
5517             "pshufd  $dst,$src2,0xE\n\t"
5518             "mulsd   $dst,$tmp\t! mul reduction2D" %}
5519   ins_encode %{
5520     __ movdqu($tmp$$XMMRegister, $src1$$XMMRegister);
5521     __ mulsd($tmp$$XMMRegister, $src2$$XMMRegister);
5522     __ pshufd($dst$$XMMRegister, $src2$$XMMRegister, 0xE);
5523     __ mulsd($dst$$XMMRegister, $tmp$$XMMRegister);
5524   %}
5525   ins_pipe( pipe_slow );
5526 %}
5527 
5528 instruct rvmul2D_reduction_reg(regD dst, regD src1, vecX src2, regD tmp, regD tmp2) %{
5529   predicate(UseAVX > 0);
5530   match(Set dst (MulReductionVD src1 src2));
5531   effect(TEMP tmp, TEMP tmp2);
5532   format %{ "vmulsd  $tmp2,$src1,$src2\n\t"
5533             "pshufd  $tmp,$src2,0xE\n\t"
5534             "vmulsd  $dst,$tmp2,$tmp\t! mul reduction2D" %}
5535   ins_encode %{
5536     __ vmulsd($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
5537     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5538     __ vmulsd($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5539   %}
5540   ins_pipe( pipe_slow );
5541 %}
5542 
5543 instruct rvmul4D_reduction_reg(regD dst, regD src1, vecY src2, regD tmp, regD tmp2, regD tmp3) %{
5544   predicate(UseAVX > 0);
5545   match(Set dst (MulReductionVD src1 src2));
5546   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
5547   format %{ "vmulsd  $tmp2,$src1,$src2\n\t"
5548             "pshufd  $tmp,$src2,0xE\n\t"
5549             "vmulsd  $tmp2,$tmp2,$tmp\n\t"
5550             "vextractf128  $tmp3,$src2\n\t"
5551             "vmulsd  $tmp2,$tmp2,$tmp3\n\t"
5552             "pshufd  $tmp,$tmp3,0xE\n\t"
5553             "vmulsd  $dst,$tmp2,$tmp\t! mul reduction4D" %}
5554   ins_encode %{
5555     __ vmulsd($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
5556     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5557     __ vmulsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5558     __ vextractf128h($tmp3$$XMMRegister, $src2$$XMMRegister);
5559     __ vmulsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister);
5560     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0xE);
5561     __ vmulsd($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5562   %}
5563   ins_pipe( pipe_slow );
5564 %}
5565 
5566 instruct rvmul8D_reduction_reg(regD dst, regD src1, vecZ src2, regD tmp, regD tmp2, regD tmp3) %{
5567   predicate(UseAVX > 2);
5568   match(Set dst (MulReductionVD src1 src2));
5569   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
5570   format %{ "vmulsd  $tmp2,$src1,$src2\n\t"
5571             "pshufd  $tmp,$src2,0xE\n\t"
5572             "vmulsd  $tmp2,$tmp2,$tmp\n\t"
5573             "vextractf64x2  $tmp3,$src2, 0x1\n\t"
5574             "vmulsd  $tmp2,$tmp2,$tmp3\n\t"
5575             "pshufd  $tmp,$tmp3,0xE\n\t"
5576             "vmulsd  $tmp2,$tmp2,$tmp\n\t"
5577             "vextractf64x2  $tmp3,$src2, 0x2\n\t"
5578             "vmulsd  $tmp2,$tmp2,$tmp3\n\t"
5579             "pshufd  $tmp,$tmp3,0xE\n\t"
5580             "vmulsd  $tmp2,$tmp2,$tmp\n\t"
5581             "vextractf64x2  $tmp3,$src2, 0x3\n\t"
5582             "vmulsd  $tmp2,$tmp2,$tmp3\n\t"
5583             "pshufd  $tmp,$tmp3,0xE\n\t"
5584             "vmulsd  $dst,$tmp2,$tmp\t! mul reduction8D" %}
5585   ins_encode %{
5586     __ vmulsd($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
5587     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5588     __ vmulsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5589     __ vextractf64x2h($tmp3$$XMMRegister, $src2$$XMMRegister, 0x1);
5590     __ vmulsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister);
5591     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0xE);
5592     __ vmulsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5593     __ vextractf64x2h($tmp3$$XMMRegister, $src2$$XMMRegister, 0x2);
5594     __ vmulsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister);
5595     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0xE);
5596     __ vmulsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5597     __ vextractf64x2h($tmp3$$XMMRegister, $src2$$XMMRegister, 0x3);
5598     __ vmulsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister);
5599     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0xE);
5600     __ vmulsd($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
5601   %}
5602   ins_pipe( pipe_slow );
5603 %}
5604 
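// The MulReductionVD rules above fold a packed-double multiply reduction into
// a scalar result: $src1 carries the running product in its low lane, and the
// lanes of $src2 are multiplied in one vmulsd at a time, using pshufd/vextract
// shuffles to bring each upper lane down. A minimal Java sketch of the kind of
// loop the SuperWord pass can vectorize into this node (an illustration, not
// code from this file):
//
//   static double product(double[] a, double[] b) {
//     double p = 1.0;
//     for (int i = 0; i < a.length; i++) {
//       p *= a[i] * b[i];
//     }
//     return p;
//   }
//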
5605 // ====================VECTOR ARITHMETIC=======================================
5606 
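// In the encodings below, vector_len selects the encoded operand width of the
// AVX/EVEX instruction. A sketch of the assumed mapping, following the
// assembler's 0/1/2 convention (illustrative helper, not defined in this file):
//
//   static int vectorLenForBytes(int bytes) {
//     if (bytes <= 16) return 0;   // 128-bit xmm operands
//     if (bytes <= 32) return 1;   // 256-bit ymm operands
//     return 2;                    // 512-bit zmm operands
//   }
//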
5607 // --------------------------------- ADD --------------------------------------
5608 
5609 // Bytes vector add
5610 instruct vadd4B(vecS dst, vecS src) %{
5611   predicate(n->as_Vector()->length() == 4);
5612   match(Set dst (AddVB dst src));
5613   format %{ "paddb   $dst,$src\t! add packed4B" %}
5614   ins_encode %{
5615     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
5616   %}
5617   ins_pipe( pipe_slow );
5618 %}
5619 
5620 instruct vadd4B_reg(vecS dst, vecS src1, vecS src2) %{
5621   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
5622   match(Set dst (AddVB src1 src2));
5623   format %{ "vpaddb  $dst,$src1,$src2\t! add packed4B" %}
5624   ins_encode %{
5625     int vector_len = 0;
5626     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5627   %}
5628   ins_pipe( pipe_slow );
5629 %}
5630 
5631 instruct vadd4B_mem(vecS dst, vecS src, memory mem) %{
5632   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
5633   match(Set dst (AddVB src (LoadVector mem)));
5634   format %{ "vpaddb  $dst,$src,$mem\t! add packed4B" %}
5635   ins_encode %{
5636     int vector_len = 0;
5637     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5638   %}
5639   ins_pipe( pipe_slow );
5640 %}
5641 
5642 instruct vadd8B(vecD dst, vecD src) %{
5643   predicate(n->as_Vector()->length() == 8);
5644   match(Set dst (AddVB dst src));
5645   format %{ "paddb   $dst,$src\t! add packed8B" %}
5646   ins_encode %{
5647     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
5648   %}
5649   ins_pipe( pipe_slow );
5650 %}
5651 
5652 instruct vadd8B_reg(vecD dst, vecD src1, vecD src2) %{
5653   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
5654   match(Set dst (AddVB src1 src2));
5655   format %{ "vpaddb  $dst,$src1,$src2\t! add packed8B" %}
5656   ins_encode %{
5657     int vector_len = 0;
5658     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5659   %}
5660   ins_pipe( pipe_slow );
5661 %}
5662 
5663 instruct vadd8B_mem(vecD dst, vecD src, memory mem) %{
5664   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
5665   match(Set dst (AddVB src (LoadVector mem)));
5666   format %{ "vpaddb  $dst,$src,$mem\t! add packed8B" %}
5667   ins_encode %{
5668     int vector_len = 0;
5669     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5670   %}
5671   ins_pipe( pipe_slow );
5672 %}
5673 
5674 instruct vadd16B(vecX dst, vecX src) %{
5675   predicate(n->as_Vector()->length() == 16);
5676   match(Set dst (AddVB dst src));
5677   format %{ "paddb   $dst,$src\t! add packed16B" %}
5678   ins_encode %{
5679     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
5680   %}
5681   ins_pipe( pipe_slow );
5682 %}
5683 
5684 instruct vadd16B_reg(vecX dst, vecX src1, vecX src2) %{
5685   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
5686   match(Set dst (AddVB src1 src2));
5687   format %{ "vpaddb  $dst,$src1,$src2\t! add packed16B" %}
5688   ins_encode %{
5689     int vector_len = 0;
5690     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5691   %}
5692   ins_pipe( pipe_slow );
5693 %}
5694 
5695 instruct vadd16B_mem(vecX dst, vecX src, memory mem) %{
5696   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
5697   match(Set dst (AddVB src (LoadVector mem)));
5698   format %{ "vpaddb  $dst,$src,$mem\t! add packed16B" %}
5699   ins_encode %{
5700     int vector_len = 0;
5701     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5702   %}
5703   ins_pipe( pipe_slow );
5704 %}
5705 
5706 instruct vadd32B_reg(vecY dst, vecY src1, vecY src2) %{
5707   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
5708   match(Set dst (AddVB src1 src2));
5709   format %{ "vpaddb  $dst,$src1,$src2\t! add packed32B" %}
5710   ins_encode %{
5711     int vector_len = 1;
5712     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5713   %}
5714   ins_pipe( pipe_slow );
5715 %}
5716 
5717 instruct vadd32B_mem(vecY dst, vecY src, memory mem) %{
5718   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
5719   match(Set dst (AddVB src (LoadVector mem)));
5720   format %{ "vpaddb  $dst,$src,$mem\t! add packed32B" %}
5721   ins_encode %{
5722     int vector_len = 1;
5723     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5724   %}
5725   ins_pipe( pipe_slow );
5726 %}
5727 
5728 instruct vadd64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
5729   predicate(UseAVX > 2 && n->as_Vector()->length() == 64);
5730   match(Set dst (AddVB src1 src2));
5731   format %{ "vpaddb  $dst,$src1,$src2\t! add packed64B" %}
5732   ins_encode %{
5733     int vector_len = 2;
5734     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5735   %}
5736   ins_pipe( pipe_slow );
5737 %}
5738 
5739 instruct vadd64B_mem(vecZ dst, vecZ src, memory mem) %{
5740   predicate(UseAVX > 2 && n->as_Vector()->length() == 64);
5741   match(Set dst (AddVB src (LoadVector mem)));
5742   format %{ "vpaddb  $dst,$src,$mem\t! add packed64B" %}
5743   ins_encode %{
5744     int vector_len = 2;
5745     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5746   %}
5747   ins_pipe( pipe_slow );
5748 %}
5749 
5750 // Shorts/Chars vector add
5751 instruct vadd2S(vecS dst, vecS src) %{
5752   predicate(n->as_Vector()->length() == 2);
5753   match(Set dst (AddVS dst src));
5754   format %{ "paddw   $dst,$src\t! add packed2S" %}
5755   ins_encode %{
5756     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
5757   %}
5758   ins_pipe( pipe_slow );
5759 %}
5760 
5761 instruct vadd2S_reg(vecS dst, vecS src1, vecS src2) %{
5762   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
5763   match(Set dst (AddVS src1 src2));
5764   format %{ "vpaddw  $dst,$src1,$src2\t! add packed2S" %}
5765   ins_encode %{
5766     int vector_len = 0;
5767     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5768   %}
5769   ins_pipe( pipe_slow );
5770 %}
5771 
5772 instruct vadd2S_mem(vecS dst, vecS src, memory mem) %{
5773   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
5774   match(Set dst (AddVS src (LoadVector mem)));
5775   format %{ "vpaddw  $dst,$src,$mem\t! add packed2S" %}
5776   ins_encode %{
5777     int vector_len = 0;
5778     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5779   %}
5780   ins_pipe( pipe_slow );
5781 %}
5782 
5783 instruct vadd4S(vecD dst, vecD src) %{
5784   predicate(n->as_Vector()->length() == 4);
5785   match(Set dst (AddVS dst src));
5786   format %{ "paddw   $dst,$src\t! add packed4S" %}
5787   ins_encode %{
5788     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
5789   %}
5790   ins_pipe( pipe_slow );
5791 %}
5792 
5793 instruct vadd4S_reg(vecD dst, vecD src1, vecD src2) %{
5794   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
5795   match(Set dst (AddVS src1 src2));
5796   format %{ "vpaddw  $dst,$src1,$src2\t! add packed4S" %}
5797   ins_encode %{
5798     int vector_len = 0;
5799     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5800   %}
5801   ins_pipe( pipe_slow );
5802 %}
5803 
5804 instruct vadd4S_mem(vecD dst, vecD src, memory mem) %{
5805   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
5806   match(Set dst (AddVS src (LoadVector mem)));
5807   format %{ "vpaddw  $dst,$src,$mem\t! add packed4S" %}
5808   ins_encode %{
5809     int vector_len = 0;
5810     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5811   %}
5812   ins_pipe( pipe_slow );
5813 %}
5814 
5815 instruct vadd8S(vecX dst, vecX src) %{
5816   predicate(n->as_Vector()->length() == 8);
5817   match(Set dst (AddVS dst src));
5818   format %{ "paddw   $dst,$src\t! add packed8S" %}
5819   ins_encode %{
5820     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
5821   %}
5822   ins_pipe( pipe_slow );
5823 %}
5824 
5825 instruct vadd8S_reg(vecX dst, vecX src1, vecX src2) %{
5826   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
5827   match(Set dst (AddVS src1 src2));
5828   format %{ "vpaddw  $dst,$src1,$src2\t! add packed8S" %}
5829   ins_encode %{
5830     int vector_len = 0;
5831     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5832   %}
5833   ins_pipe( pipe_slow );
5834 %}
5835 
5836 instruct vadd8S_mem(vecX dst, vecX src, memory mem) %{
5837   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
5838   match(Set dst (AddVS src (LoadVector mem)));
5839   format %{ "vpaddw  $dst,$src,$mem\t! add packed8S" %}
5840   ins_encode %{
5841     int vector_len = 0;
5842     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5843   %}
5844   ins_pipe( pipe_slow );
5845 %}
5846 
5847 instruct vadd16S_reg(vecY dst, vecY src1, vecY src2) %{
5848   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
5849   match(Set dst (AddVS src1 src2));
5850   format %{ "vpaddw  $dst,$src1,$src2\t! add packed16S" %}
5851   ins_encode %{
5852     int vector_len = 1;
5853     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5854   %}
5855   ins_pipe( pipe_slow );
5856 %}
5857 
5858 instruct vadd16S_mem(vecY dst, vecY src, memory mem) %{
5859   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
5860   match(Set dst (AddVS src (LoadVector mem)));
5861   format %{ "vpaddw  $dst,$src,$mem\t! add packed16S" %}
5862   ins_encode %{
5863     int vector_len = 1;
5864     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5865   %}
5866   ins_pipe( pipe_slow );
5867 %}
5868 
5869 instruct vadd32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
5870   predicate(UseAVX > 2 && n->as_Vector()->length() == 32);
5871   match(Set dst (AddVS src1 src2));
5872   format %{ "vpaddw  $dst,$src1,$src2\t! add packed32S" %}
5873   ins_encode %{
5874     int vector_len = 2;
5875     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5876   %}
5877   ins_pipe( pipe_slow );
5878 %}
5879 
5880 instruct vadd32S_mem(vecZ dst, vecZ src, memory mem) %{
5881   predicate(UseAVX > 2 && n->as_Vector()->length() == 32);
5882   match(Set dst (AddVS src (LoadVector mem)));
5883   format %{ "vpaddw  $dst,$src,$mem\t! add packed32S" %}
5884   ins_encode %{
5885     int vector_len = 2;
5886     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5887   %}
5888   ins_pipe( pipe_slow );
5889 %}
5890 
5891 // Integers vector add
5892 instruct vadd2I(vecD dst, vecD src) %{
5893   predicate(n->as_Vector()->length() == 2);
5894   match(Set dst (AddVI dst src));
5895   format %{ "paddd   $dst,$src\t! add packed2I" %}
5896   ins_encode %{
5897     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
5898   %}
5899   ins_pipe( pipe_slow );
5900 %}
5901 
5902 instruct vadd2I_reg(vecD dst, vecD src1, vecD src2) %{
5903   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
5904   match(Set dst (AddVI src1 src2));
5905   format %{ "vpaddd  $dst,$src1,$src2\t! add packed2I" %}
5906   ins_encode %{
5907     int vector_len = 0;
5908     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5909   %}
5910   ins_pipe( pipe_slow );
5911 %}
5912 
5913 instruct vadd2I_mem(vecD dst, vecD src, memory mem) %{
5914   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
5915   match(Set dst (AddVI src (LoadVector mem)));
5916   format %{ "vpaddd  $dst,$src,$mem\t! add packed2I" %}
5917   ins_encode %{
5918     int vector_len = 0;
5919     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5920   %}
5921   ins_pipe( pipe_slow );
5922 %}
5923 
5924 instruct vadd4I(vecX dst, vecX src) %{
5925   predicate(n->as_Vector()->length() == 4);
5926   match(Set dst (AddVI dst src));
5927   format %{ "paddd   $dst,$src\t! add packed4I" %}
5928   ins_encode %{
5929     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
5930   %}
5931   ins_pipe( pipe_slow );
5932 %}
5933 
5934 instruct vadd4I_reg(vecX dst, vecX src1, vecX src2) %{
5935   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
5936   match(Set dst (AddVI src1 src2));
5937   format %{ "vpaddd  $dst,$src1,$src2\t! add packed4I" %}
5938   ins_encode %{
5939     int vector_len = 0;
5940     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5941   %}
5942   ins_pipe( pipe_slow );
5943 %}
5944 
5945 instruct vadd4I_mem(vecX dst, vecX src, memory mem) %{
5946   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
5947   match(Set dst (AddVI src (LoadVector mem)));
5948   format %{ "vpaddd  $dst,$src,$mem\t! add packed4I" %}
5949   ins_encode %{
5950     int vector_len = 0;
5951     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5952   %}
5953   ins_pipe( pipe_slow );
5954 %}
5955 
5956 instruct vadd8I_reg(vecY dst, vecY src1, vecY src2) %{
5957   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
5958   match(Set dst (AddVI src1 src2));
5959   format %{ "vpaddd  $dst,$src1,$src2\t! add packed8I" %}
5960   ins_encode %{
5961     int vector_len = 1;
5962     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5963   %}
5964   ins_pipe( pipe_slow );
5965 %}
5966 
5967 instruct vadd8I_mem(vecY dst, vecY src, memory mem) %{
5968   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
5969   match(Set dst (AddVI src (LoadVector mem)));
5970   format %{ "vpaddd  $dst,$src,$mem\t! add packed8I" %}
5971   ins_encode %{
5972     int vector_len = 1;
5973     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5974   %}
5975   ins_pipe( pipe_slow );
5976 %}
5977 
5978 instruct vadd16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
5979   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
5980   match(Set dst (AddVI src1 src2));
5981   format %{ "vpaddd  $dst,$src1,$src2\t! add packed16I" %}
5982   ins_encode %{
5983     int vector_len = 2;
5984     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5985   %}
5986   ins_pipe( pipe_slow );
5987 %}
5988 
5989 instruct vadd16I_mem(vecZ dst, vecZ src, memory mem) %{
5990   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
5991   match(Set dst (AddVI src (LoadVector mem)));
5992   format %{ "vpaddd  $dst,$src,$mem\t! add packed16I" %}
5993   ins_encode %{
5994     int vector_len = 2;
5995     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5996   %}
5997   ins_pipe( pipe_slow );
5998 %}
5999 
6000 // Longs vector add
6001 instruct vadd2L(vecX dst, vecX src) %{
6002   predicate(n->as_Vector()->length() == 2);
6003   match(Set dst (AddVL dst src));
6004   format %{ "paddq   $dst,$src\t! add packed2L" %}
6005   ins_encode %{
6006     __ paddq($dst$$XMMRegister, $src$$XMMRegister);
6007   %}
6008   ins_pipe( pipe_slow );
6009 %}
6010 
6011 instruct vadd2L_reg(vecX dst, vecX src1, vecX src2) %{
6012   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6013   match(Set dst (AddVL src1 src2));
6014   format %{ "vpaddq  $dst,$src1,$src2\t! add packed2L" %}
6015   ins_encode %{
6016     int vector_len = 0;
6017     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6018   %}
6019   ins_pipe( pipe_slow );
6020 %}
6021 
6022 instruct vadd2L_mem(vecX dst, vecX src, memory mem) %{
6023   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6024   match(Set dst (AddVL src (LoadVector mem)));
6025   format %{ "vpaddq  $dst,$src,$mem\t! add packed2L" %}
6026   ins_encode %{
6027     int vector_len = 0;
6028     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6029   %}
6030   ins_pipe( pipe_slow );
6031 %}
6032 
6033 instruct vadd4L_reg(vecY dst, vecY src1, vecY src2) %{
6034   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
6035   match(Set dst (AddVL src1 src2));
6036   format %{ "vpaddq  $dst,$src1,$src2\t! add packed4L" %}
6037   ins_encode %{
6038     int vector_len = 1;
6039     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6040   %}
6041   ins_pipe( pipe_slow );
6042 %}
6043 
6044 instruct vadd4L_mem(vecY dst, vecY src, memory mem) %{
6045   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
6046   match(Set dst (AddVL src (LoadVector mem)));
6047   format %{ "vpaddq  $dst,$src,$mem\t! add packed4L" %}
6048   ins_encode %{
6049     int vector_len = 1;
6050     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6051   %}
6052   ins_pipe( pipe_slow );
6053 %}
6054 
6055 instruct vadd8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
6056   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6057   match(Set dst (AddVL src1 src2));
6058   format %{ "vpaddq  $dst,$src1,$src2\t! add packed8L" %}
6059   ins_encode %{
6060     int vector_len = 2;
6061     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6062   %}
6063   ins_pipe( pipe_slow );
6064 %}
6065 
6066 instruct vadd8L_mem(vecZ dst, vecZ src, memory mem) %{
6067   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6068   match(Set dst (AddVL src (LoadVector mem)));
6069   format %{ "vpaddq  $dst,$src,$mem\t! add packed8L" %}
6070   ins_encode %{
6071     int vector_len = 2;
6072     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6073   %}
6074   ins_pipe( pipe_slow );
6075 %}
6076 
6077 // Floats vector add
6078 instruct vadd2F(vecD dst, vecD src) %{
6079   predicate(n->as_Vector()->length() == 2);
6080   match(Set dst (AddVF dst src));
6081   format %{ "addps   $dst,$src\t! add packed2F" %}
6082   ins_encode %{
6083     __ addps($dst$$XMMRegister, $src$$XMMRegister);
6084   %}
6085   ins_pipe( pipe_slow );
6086 %}
6087 
6088 instruct vadd2F_reg(vecD dst, vecD src1, vecD src2) %{
6089   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6090   match(Set dst (AddVF src1 src2));
6091   format %{ "vaddps  $dst,$src1,$src2\t! add packed2F" %}
6092   ins_encode %{
6093     int vector_len = 0;
6094     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6095   %}
6096   ins_pipe( pipe_slow );
6097 %}
6098 
6099 instruct vadd2F_mem(vecD dst, vecD src, memory mem) %{
6100   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6101   match(Set dst (AddVF src (LoadVector mem)));
6102   format %{ "vaddps  $dst,$src,$mem\t! add packed2F" %}
6103   ins_encode %{
6104     int vector_len = 0;
6105     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6106   %}
6107   ins_pipe( pipe_slow );
6108 %}
6109 
6110 instruct vadd4F(vecX dst, vecX src) %{
6111   predicate(n->as_Vector()->length() == 4);
6112   match(Set dst (AddVF dst src));
6113   format %{ "addps   $dst,$src\t! add packed4F" %}
6114   ins_encode %{
6115     __ addps($dst$$XMMRegister, $src$$XMMRegister);
6116   %}
6117   ins_pipe( pipe_slow );
6118 %}
6119 
6120 instruct vadd4F_reg(vecX dst, vecX src1, vecX src2) %{
6121   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6122   match(Set dst (AddVF src1 src2));
6123   format %{ "vaddps  $dst,$src1,$src2\t! add packed4F" %}
6124   ins_encode %{
6125     int vector_len = 0;
6126     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6127   %}
6128   ins_pipe( pipe_slow );
6129 %}
6130 
6131 instruct vadd4F_mem(vecX dst, vecX src, memory mem) %{
6132   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6133   match(Set dst (AddVF src (LoadVector mem)));
6134   format %{ "vaddps  $dst,$src,$mem\t! add packed4F" %}
6135   ins_encode %{
6136     int vector_len = 0;
6137     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6138   %}
6139   ins_pipe( pipe_slow );
6140 %}
6141 
6142 instruct vadd8F_reg(vecY dst, vecY src1, vecY src2) %{
6143   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6144   match(Set dst (AddVF src1 src2));
6145   format %{ "vaddps  $dst,$src1,$src2\t! add packed8F" %}
6146   ins_encode %{
6147     int vector_len = 1;
6148     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6149   %}
6150   ins_pipe( pipe_slow );
6151 %}
6152 
6153 instruct vadd8F_mem(vecY dst, vecY src, memory mem) %{
6154   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6155   match(Set dst (AddVF src (LoadVector mem)));
6156   format %{ "vaddps  $dst,$src,$mem\t! add packed8F" %}
6157   ins_encode %{
6158     int vector_len = 1;
6159     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6160   %}
6161   ins_pipe( pipe_slow );
6162 %}
6163 
6164 instruct vadd16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
6165   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6166   match(Set dst (AddVF src1 src2));
6167   format %{ "vaddps  $dst,$src1,$src2\t! add packed16F" %}
6168   ins_encode %{
6169     int vector_len = 2;
6170     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6171   %}
6172   ins_pipe( pipe_slow );
6173 %}
6174 
6175 instruct vadd16F_mem(vecZ dst, vecZ src, memory mem) %{
6176   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6177   match(Set dst (AddVF src (LoadVector mem)));
6178   format %{ "vaddps  $dst,$src,$mem\t! add packed16F" %}
6179   ins_encode %{
6180     int vector_len = 2;
6181     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6182   %}
6183   ins_pipe( pipe_slow );
6184 %}
6185 
6186 // Doubles vector add
6187 instruct vadd2D(vecX dst, vecX src) %{
6188   predicate(n->as_Vector()->length() == 2);
6189   match(Set dst (AddVD dst src));
6190   format %{ "addpd   $dst,$src\t! add packed2D" %}
6191   ins_encode %{
6192     __ addpd($dst$$XMMRegister, $src$$XMMRegister);
6193   %}
6194   ins_pipe( pipe_slow );
6195 %}
6196 
6197 instruct vadd2D_reg(vecX dst, vecX src1, vecX src2) %{
6198   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6199   match(Set dst (AddVD src1 src2));
6200   format %{ "vaddpd  $dst,$src1,$src2\t! add packed2D" %}
6201   ins_encode %{
6202     int vector_len = 0;
6203     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6204   %}
6205   ins_pipe( pipe_slow );
6206 %}
6207 
6208 instruct vadd2D_mem(vecX dst, vecX src, memory mem) %{
6209   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6210   match(Set dst (AddVD src (LoadVector mem)));
6211   format %{ "vaddpd  $dst,$src,$mem\t! add packed2D" %}
6212   ins_encode %{
6213     int vector_len = 0;
6214     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6215   %}
6216   ins_pipe( pipe_slow );
6217 %}
6218 
6219 instruct vadd4D_reg(vecY dst, vecY src1, vecY src2) %{
6220   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6221   match(Set dst (AddVD src1 src2));
6222   format %{ "vaddpd  $dst,$src1,$src2\t! add packed4D" %}
6223   ins_encode %{
6224     int vector_len = 1;
6225     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6226   %}
6227   ins_pipe( pipe_slow );
6228 %}
6229 
6230 instruct vadd4D_mem(vecY dst, vecY src, memory mem) %{
6231   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6232   match(Set dst (AddVD src (LoadVector mem)));
6233   format %{ "vaddpd  $dst,$src,$mem\t! add packed4D" %}
6234   ins_encode %{
6235     int vector_len = 1;
6236     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6237   %}
6238   ins_pipe( pipe_slow );
6239 %}
6240 
6241 instruct vadd8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
6242   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6243   match(Set dst (AddVD src1 src2));
6244   format %{ "vaddpd  $dst,$src1,$src2\t! add packed8D" %}
6245   ins_encode %{
6246     int vector_len = 2;
6247     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6248   %}
6249   ins_pipe( pipe_slow );
6250 %}
6251 
6252 instruct vadd8D_mem(vecZ dst, vecZ src, memory mem) %{
6253   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6254   match(Set dst (AddVD src (LoadVector mem)));
6255   format %{ "vaddpd  $dst,$src,$mem\t! add packed8D" %}
6256   ins_encode %{
6257     int vector_len = 2;
6258     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6259   %}
6260   ins_pipe( pipe_slow );
6261 %}
6262 
6263 // --------------------------------- SUB --------------------------------------
6264 
6265 // Bytes vector sub
6266 instruct vsub4B(vecS dst, vecS src) %{
6267   predicate(n->as_Vector()->length() == 4);
6268   match(Set dst (SubVB dst src));
6269   format %{ "psubb   $dst,$src\t! sub packed4B" %}
6270   ins_encode %{
6271     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
6272   %}
6273   ins_pipe( pipe_slow );
6274 %}
6275 
6276 instruct vsub4B_reg(vecS dst, vecS src1, vecS src2) %{
6277   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6278   match(Set dst (SubVB src1 src2));
6279   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed4B" %}
6280   ins_encode %{
6281     int vector_len = 0;
6282     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6283   %}
6284   ins_pipe( pipe_slow );
6285 %}
6286 
6287 instruct vsub4B_mem(vecS dst, vecS src, memory mem) %{
6288   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6289   match(Set dst (SubVB src (LoadVector mem)));
6290   format %{ "vpsubb  $dst,$src,$mem\t! sub packed4B" %}
6291   ins_encode %{
6292     int vector_len = 0;
6293     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6294   %}
6295   ins_pipe( pipe_slow );
6296 %}
6297 
6298 instruct vsub8B(vecD dst, vecD src) %{
6299   predicate(n->as_Vector()->length() == 8);
6300   match(Set dst (SubVB dst src));
6301   format %{ "psubb   $dst,$src\t! sub packed8B" %}
6302   ins_encode %{
6303     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
6304   %}
6305   ins_pipe( pipe_slow );
6306 %}
6307 
6308 instruct vsub8B_reg(vecD dst, vecD src1, vecD src2) %{
6309   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6310   match(Set dst (SubVB src1 src2));
6311   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed8B" %}
6312   ins_encode %{
6313     int vector_len = 0;
6314     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6315   %}
6316   ins_pipe( pipe_slow );
6317 %}
6318 
6319 instruct vsub8B_mem(vecD dst, vecD src, memory mem) %{
6320   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6321   match(Set dst (SubVB src (LoadVector mem)));
6322   format %{ "vpsubb  $dst,$src,$mem\t! sub packed8B" %}
6323   ins_encode %{
6324     int vector_len = 0;
6325     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6326   %}
6327   ins_pipe( pipe_slow );
6328 %}
6329 
6330 instruct vsub16B(vecX dst, vecX src) %{
6331   predicate(n->as_Vector()->length() == 16);
6332   match(Set dst (SubVB dst src));
6333   format %{ "psubb   $dst,$src\t! sub packed16B" %}
6334   ins_encode %{
6335     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
6336   %}
6337   ins_pipe( pipe_slow );
6338 %}
6339 
6340 instruct vsub16B_reg(vecX dst, vecX src1, vecX src2) %{
6341   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
6342   match(Set dst (SubVB src1 src2));
6343   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed16B" %}
6344   ins_encode %{
6345     int vector_len = 0;
6346     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6347   %}
6348   ins_pipe( pipe_slow );
6349 %}
6350 
6351 instruct vsub16B_mem(vecX dst, vecX src, memory mem) %{
6352   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
6353   match(Set dst (SubVB src (LoadVector mem)));
6354   format %{ "vpsubb  $dst,$src,$mem\t! sub packed16B" %}
6355   ins_encode %{
6356     int vector_len = 0;
6357     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6358   %}
6359   ins_pipe( pipe_slow );
6360 %}
6361 
6362 instruct vsub32B_reg(vecY dst, vecY src1, vecY src2) %{
6363   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
6364   match(Set dst (SubVB src1 src2));
6365   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed32B" %}
6366   ins_encode %{
6367     int vector_len = 1;
6368     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6369   %}
6370   ins_pipe( pipe_slow );
6371 %}
6372 
6373 instruct vsub32B_mem(vecY dst, vecY src, memory mem) %{
6374   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
6375   match(Set dst (SubVB src (LoadVector mem)));
6376   format %{ "vpsubb  $dst,$src,$mem\t! sub packed32B" %}
6377   ins_encode %{
6378     int vector_len = 1;
6379     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6380   %}
6381   ins_pipe( pipe_slow );
6382 %}
6383 
6384 instruct vsub64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
6385   predicate(UseAVX > 2 && n->as_Vector()->length() == 64);
6386   match(Set dst (SubVB src1 src2));
6387   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed64B" %}
6388   ins_encode %{
6389     int vector_len = 2;
6390     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6391   %}
6392   ins_pipe( pipe_slow );
6393 %}
6394 
6395 instruct vsub64B_mem(vecZ dst, vecZ src, memory mem) %{
6396   predicate(UseAVX > 2 && n->as_Vector()->length() == 64);
6397   match(Set dst (SubVB src (LoadVector mem)));
6398   format %{ "vpsubb  $dst,$src,$mem\t! sub packed64B" %}
6399   ins_encode %{
6400     int vector_len = 2;
6401     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6402   %}
6403   ins_pipe( pipe_slow );
6404 %}
6405 
6406 // Shorts/Chars vector sub
6407 instruct vsub2S(vecS dst, vecS src) %{
6408   predicate(n->as_Vector()->length() == 2);
6409   match(Set dst (SubVS dst src));
6410   format %{ "psubw   $dst,$src\t! sub packed2S" %}
6411   ins_encode %{
6412     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
6413   %}
6414   ins_pipe( pipe_slow );
6415 %}
6416 
6417 instruct vsub2S_reg(vecS dst, vecS src1, vecS src2) %{
6418   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6419   match(Set dst (SubVS src1 src2));
6420   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed2S" %}
6421   ins_encode %{
6422     int vector_len = 0;
6423     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6424   %}
6425   ins_pipe( pipe_slow );
6426 %}
6427 
6428 instruct vsub2S_mem(vecS dst, vecS src, memory mem) %{
6429   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6430   match(Set dst (SubVS src (LoadVector mem)));
6431   format %{ "vpsubw  $dst,$src,$mem\t! sub packed2S" %}
6432   ins_encode %{
6433     int vector_len = 0;
6434     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6435   %}
6436   ins_pipe( pipe_slow );
6437 %}
6438 
6439 instruct vsub4S(vecD dst, vecD src) %{
6440   predicate(n->as_Vector()->length() == 4);
6441   match(Set dst (SubVS dst src));
6442   format %{ "psubw   $dst,$src\t! sub packed4S" %}
6443   ins_encode %{
6444     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
6445   %}
6446   ins_pipe( pipe_slow );
6447 %}
6448 
6449 instruct vsub4S_reg(vecD dst, vecD src1, vecD src2) %{
6450   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6451   match(Set dst (SubVS src1 src2));
6452   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed4S" %}
6453   ins_encode %{
6454     int vector_len = 0;
6455     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6456   %}
6457   ins_pipe( pipe_slow );
6458 %}
6459 
6460 instruct vsub4S_mem(vecD dst, vecD src, memory mem) %{
6461   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6462   match(Set dst (SubVS src (LoadVector mem)));
6463   format %{ "vpsubw  $dst,$src,$mem\t! sub packed4S" %}
6464   ins_encode %{
6465     int vector_len = 0;
6466     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6467   %}
6468   ins_pipe( pipe_slow );
6469 %}
6470 
6471 instruct vsub8S(vecX dst, vecX src) %{
6472   predicate(n->as_Vector()->length() == 8);
6473   match(Set dst (SubVS dst src));
6474   format %{ "psubw   $dst,$src\t! sub packed8S" %}
6475   ins_encode %{
6476     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
6477   %}
6478   ins_pipe( pipe_slow );
6479 %}
6480 
6481 instruct vsub8S_reg(vecX dst, vecX src1, vecX src2) %{
6482   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6483   match(Set dst (SubVS src1 src2));
6484   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed8S" %}
6485   ins_encode %{
6486     int vector_len = 0;
6487     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6488   %}
6489   ins_pipe( pipe_slow );
6490 %}
6491 
6492 instruct vsub8S_mem(vecX dst, vecX src, memory mem) %{
6493   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6494   match(Set dst (SubVS src (LoadVector mem)));
6495   format %{ "vpsubw  $dst,$src,$mem\t! sub packed8S" %}
6496   ins_encode %{
6497     int vector_len = 0;
6498     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6499   %}
6500   ins_pipe( pipe_slow );
6501 %}
6502 
6503 instruct vsub16S_reg(vecY dst, vecY src1, vecY src2) %{
6504   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
6505   match(Set dst (SubVS src1 src2));
6506   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed16S" %}
6507   ins_encode %{
6508     int vector_len = 1;
6509     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6510   %}
6511   ins_pipe( pipe_slow );
6512 %}
6513 
6514 instruct vsub16S_mem(vecY dst, vecY src, memory mem) %{
6515   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
6516   match(Set dst (SubVS src (LoadVector mem)));
6517   format %{ "vpsubw  $dst,$src,$mem\t! sub packed16S" %}
6518   ins_encode %{
6519     int vector_len = 1;
6520     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6521   %}
6522   ins_pipe( pipe_slow );
6523 %}
6524 
6525 instruct vsub32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
6526   predicate(UseAVX > 2 && n->as_Vector()->length() == 32);
6527   match(Set dst (SubVS src1 src2));
6528   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed32S" %}
6529   ins_encode %{
6530     int vector_len = 2;
6531     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6532   %}
6533   ins_pipe( pipe_slow );
6534 %}
6535 
6536 instruct vsub32S_mem(vecZ dst, vecZ src, memory mem) %{
6537   predicate(UseAVX > 2 && n->as_Vector()->length() == 32);
6538   match(Set dst (SubVS src (LoadVector mem)));
6539   format %{ "vpsubw  $dst,$src,$mem\t! sub packed32S" %}
6540   ins_encode %{
6541     int vector_len = 2;
6542     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6543   %}
6544   ins_pipe( pipe_slow );
6545 %}
6546 
6547 // Integers vector sub
6548 instruct vsub2I(vecD dst, vecD src) %{
6549   predicate(n->as_Vector()->length() == 2);
6550   match(Set dst (SubVI dst src));
6551   format %{ "psubd   $dst,$src\t! sub packed2I" %}
6552   ins_encode %{
6553     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
6554   %}
6555   ins_pipe( pipe_slow );
6556 %}
6557 


6866   match(Set dst (SubVD src (LoadVector mem)));
6867   format %{ "vsubpd  $dst,$src,$mem\t! sub packed2D" %}
6868   ins_encode %{
6869     int vector_len = 0;
6870     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6871   %}
6872   ins_pipe( pipe_slow );
6873 %}
6874 
6875 instruct vsub4D_reg(vecY dst, vecY src1, vecY src2) %{
6876   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6877   match(Set dst (SubVD src1 src2));
6878   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed4D" %}
6879   ins_encode %{
6880     int vector_len = 1;
6881     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6882   %}
6883   ins_pipe( pipe_slow );
6884 %}
6885 
6886 instruct vsub4D_mem(vecY dst, vecY src, memory mem) %{
6887   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6888   match(Set dst (SubVD src (LoadVector mem)));
6889   format %{ "vsubpd  $dst,$src,$mem\t! sub packed4D" %}
6890   ins_encode %{
6891     int vector_len = 1;
6892     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6893   %}
6894   ins_pipe( pipe_slow );
6895 %}
6896 
6897 instruct vsub8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
6898   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6899   match(Set dst (SubVD src1 src2));
6900   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed8D" %}
6901   ins_encode %{
6902     int vector_len = 2;
6903     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6904   %}
6905   ins_pipe( pipe_slow );
6906 %}
6907 
6908 instruct vsub8D_mem(vecZ dst, vecZ src, memory mem) %{
6909   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6910   match(Set dst (SubVD src (LoadVector mem)));
6911   format %{ "vsubpd  $dst,$src,$mem\t! sub packed8D" %}
6912   ins_encode %{
6913     int vector_len = 2;
6914     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6915   %}
6916   ins_pipe( pipe_slow );
6917 %}
6918 
6919 // --------------------------------- MUL --------------------------------------
6920 
6921 // Shorts/Chars vector mul
6922 instruct vmul2S(vecS dst, vecS src) %{
6923   predicate(n->as_Vector()->length() == 2);
6924   match(Set dst (MulVS dst src));
6925   format %{ "pmullw $dst,$src\t! mul packed2S" %}
6926   ins_encode %{
6927     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
6928   %}
6929   ins_pipe( pipe_slow );
6930 %}
6931 
6932 instruct vmul2S_reg(vecS dst, vecS src1, vecS src2) %{
6933   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6934   match(Set dst (MulVS src1 src2));
6935   format %{ "vpmullw $dst,$src1,$src2\t! mul packed2S" %}
6936   ins_encode %{
6937     int vector_len = 0;
6938     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6939   %}
6940   ins_pipe( pipe_slow );
6941 %}
6942 
6943 instruct vmul2S_mem(vecS dst, vecS src, memory mem) %{
6944   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6945   match(Set dst (MulVS src (LoadVector mem)));
6946   format %{ "vpmullw $dst,$src,$mem\t! mul packed2S" %}
6947   ins_encode %{
6948     int vector_len = 0;
6949     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6950   %}
6951   ins_pipe( pipe_slow );
6952 %}
6953 
6954 instruct vmul4S(vecD dst, vecD src) %{
6955   predicate(n->as_Vector()->length() == 4);
6956   match(Set dst (MulVS dst src));
6957   format %{ "pmullw  $dst,$src\t! mul packed4S" %}
6958   ins_encode %{
6959     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
6960   %}
6961   ins_pipe( pipe_slow );
6962 %}
6963 
6964 instruct vmul4S_reg(vecD dst, vecD src1, vecD src2) %{
6965   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6966   match(Set dst (MulVS src1 src2));
6967   format %{ "vpmullw $dst,$src1,$src2\t! mul packed4S" %}
6968   ins_encode %{
6969     int vector_len = 0;
6970     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6971   %}
6972   ins_pipe( pipe_slow );
6973 %}
6974 
6975 instruct vmul4S_mem(vecD dst, vecD src, memory mem) %{
6976   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6977   match(Set dst (MulVS src (LoadVector mem)));
6978   format %{ "vpmullw $dst,$src,$mem\t! mul packed4S" %}
6979   ins_encode %{
6980     int vector_len = 0;
6981     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6982   %}
6983   ins_pipe( pipe_slow );
6984 %}
6985 
6986 instruct vmul8S(vecX dst, vecX src) %{
6987   predicate(n->as_Vector()->length() == 8);
6988   match(Set dst (MulVS dst src));
6989   format %{ "pmullw  $dst,$src\t! mul packed8S" %}
6990   ins_encode %{
6991     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
6992   %}
6993   ins_pipe( pipe_slow );
6994 %}
6995 
6996 instruct vmul8S_reg(vecX dst, vecX src1, vecX src2) %{
6997   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6998   match(Set dst (MulVS src1 src2));
6999   format %{ "vpmullw $dst,$src1,$src2\t! mul packed8S" %}
7000   ins_encode %{
7001     int vector_len = 0;
7002     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7003   %}
7004   ins_pipe( pipe_slow );
7005 %}
7006 
7007 instruct vmul8S_mem(vecX dst, vecX src, memory mem) %{
7008   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7009   match(Set dst (MulVS src (LoadVector mem)));
7010   format %{ "vpmullw $dst,$src,$mem\t! mul packed8S" %}
7011   ins_encode %{
7012     int vector_len = 0;
7013     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7014   %}
7015   ins_pipe( pipe_slow );
7016 %}
7017 
7018 instruct vmul16S_reg(vecY dst, vecY src1, vecY src2) %{
7019   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
7020   match(Set dst (MulVS src1 src2));
7021   format %{ "vpmullw $dst,$src1,$src2\t! mul packed16S" %}
7022   ins_encode %{
7023     int vector_len = 1;
7024     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7025   %}
7026   ins_pipe( pipe_slow );
7027 %}
7028 
7029 instruct vmul16S_mem(vecY dst, vecY src, memory mem) %{
7030   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
7031   match(Set dst (MulVS src (LoadVector mem)));
7032   format %{ "vpmullw $dst,$src,$mem\t! mul packed16S" %}
7033   ins_encode %{
7034     int vector_len = 1;
7035     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7036   %}
7037   ins_pipe( pipe_slow );
7038 %}
7039 
7040 instruct vmul32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
7041   predicate(UseAVX > 2 && n->as_Vector()->length() == 32);
7042   match(Set dst (MulVS src1 src2));
7043   format %{ "vpmullw $dst,$src1,$src2\t! mul packed32S" %}
7044   ins_encode %{
7045     int vector_len = 2;
7046     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7047   %}
7048   ins_pipe( pipe_slow );
7049 %}
7050 
7051 instruct vmul32S_mem(vecZ dst, vecZ src, memory mem) %{
7052   predicate(UseAVX > 2 && n->as_Vector()->length() == 32);
7053   match(Set dst (MulVS src (LoadVector mem)));
7054   format %{ "vpmullw $dst,$src,$mem\t! mul packed32S" %}
7055   ins_encode %{
7056     int vector_len = 2;
7057     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7058   %}
7059   ins_pipe( pipe_slow );
7060 %}
7061 
7062 // Integers vector mul (sse4_1)
7063 instruct vmul2I(vecD dst, vecD src) %{
7064   predicate(UseSSE > 3 && n->as_Vector()->length() == 2);
7065   match(Set dst (MulVI dst src));
7066   format %{ "pmulld  $dst,$src\t! mul packed2I" %}
7067   ins_encode %{
7068     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
7069   %}
7070   ins_pipe( pipe_slow );
7071 %}
7072 


7662   %}
7663   ins_pipe( pipe_slow );
7664 %}
7665 
7666 instruct vsqrt4D_reg(vecY dst, vecY src) %{
7667   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7668   match(Set dst (SqrtVD src));
7669   format %{ "vsqrtpd  $dst,$src\t! sqrt packed4D" %}
7670   ins_encode %{
7671     int vector_len = 1;
7672     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
7673   %}
7674   ins_pipe( pipe_slow );
7675 %}
7676 
7677 instruct vsqrt4D_mem(vecY dst, memory mem) %{
7678   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7679   match(Set dst (SqrtVD (LoadVector mem)));
7680   format %{ "vsqrtpd  $dst,$mem\t! sqrt packed4D" %}
7681   ins_encode %{
7682     int vector_len = 1;
7683     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
7684   %}
7685   ins_pipe( pipe_slow );
7686 %}
7687 
7688 instruct vsqrt8D_reg(vecZ dst, vecZ src) %{
7689   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7690   match(Set dst (SqrtVD src));
7691   format %{ "vsqrtpd  $dst,$src\t! sqrt packed8D" %}
7692   ins_encode %{
7693     int vector_len = 2;
7694     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
7695   %}
7696   ins_pipe( pipe_slow );
7697 %}
7698 
7699 instruct vsqrt8D_mem(vecZ dst, memory mem) %{
7700   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7701   match(Set dst (SqrtVD (LoadVector mem)));
7702   format %{ "vsqrtpd  $dst,$mem\t! sqrt packed8D" %}
7703   ins_encode %{
7704     int vector_len = 2;
7705     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
7706   %}
7707   ins_pipe( pipe_slow );
7708 %}
7709 
7710 // ------------------------------ LeftShift -----------------------------------
7711 
7712 // Shorts/Chars vector left shift
7713 instruct vsll2S(vecS dst, vecS shift) %{
7714   predicate(n->as_Vector()->length() == 2);
7715   match(Set dst (LShiftVS dst shift));
7716   format %{ "psllw   $dst,$shift\t! left shift packed2S" %}
7717   ins_encode %{
7718     __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
7719   %}
7720   ins_pipe( pipe_slow );
7721 %}
7722 
7723 instruct vsll2S_imm(vecS dst, immI8 shift) %{
7724   predicate(n->as_Vector()->length() == 2);
7725   match(Set dst (LShiftVS dst shift));
7726   format %{ "psllw   $dst,$shift\t! left shift packed2S" %}
7727   ins_encode %{
7728     __ psllw($dst$$XMMRegister, (int)$shift$$constant);
7729   %}
7730   ins_pipe( pipe_slow );
7731 %}
7732 
7733 instruct vsll2S_reg(vecS dst, vecS src, vecS shift) %{
7734   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7735   match(Set dst (LShiftVS src shift));
7736   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
7737   ins_encode %{
7738     int vector_len = 0;
7739     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
7740   %}
7741   ins_pipe( pipe_slow );
7742 %}
7743 
7744 instruct vsll2S_reg_imm(vecS dst, vecS src, immI8 shift) %{
7745   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7746   match(Set dst (LShiftVS src shift));
7747   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
7748   ins_encode %{
7749     int vector_len = 0;
7750     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
7751   %}
7752   ins_pipe( pipe_slow );
7753 %}
7754 
7755 instruct vsll4S(vecD dst, vecS shift) %{
7756   predicate(n->as_Vector()->length() == 4);
7757   match(Set dst (LShiftVS dst shift));
7758   format %{ "psllw   $dst,$shift\t! left shift packed4S" %}
7759   ins_encode %{
7760     __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
7761   %}
7762   ins_pipe( pipe_slow );
7763 %}
7764 
7765 instruct vsll4S_imm(vecD dst, immI8 shift) %{
7766   predicate(n->as_Vector()->length() == 4);
7767   match(Set dst (LShiftVS dst shift));
7768   format %{ "psllw   $dst,$shift\t! left shift packed4S" %}
7769   ins_encode %{
7770     __ psllw($dst$$XMMRegister, (int)$shift$$constant);
7771   %}
7772   ins_pipe( pipe_slow );
7773 %}
7774 
7775 instruct vsll4S_reg(vecD dst, vecD src, vecS shift) %{
7776   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7777   match(Set dst (LShiftVS src shift));
7778   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
7779   ins_encode %{
7780     int vector_len = 0;
7781     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
7782   %}
7783   ins_pipe( pipe_slow );
7784 %}
7785 
7786 instruct vsll4S_reg_imm(vecD dst, vecD src, immI8 shift) %{
7787   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7788   match(Set dst (LShiftVS src shift));
7789   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
7790   ins_encode %{
7791     int vector_len = 0;
7792     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
7793   %}
7794   ins_pipe( pipe_slow );
7795 %}
7796 
7797 instruct vsll8S(vecX dst, vecS shift) %{
7798   predicate(n->as_Vector()->length() == 8);
7799   match(Set dst (LShiftVS dst shift));
7800   format %{ "psllw   $dst,$shift\t! left shift packed8S" %}
7801   ins_encode %{
7802     __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
7803   %}
7804   ins_pipe( pipe_slow );
7805 %}
7806 
7807 instruct vsll8S_imm(vecX dst, immI8 shift) %{
7808   predicate(n->as_Vector()->length() == 8);
7809   match(Set dst (LShiftVS dst shift));
7810   format %{ "psllw   $dst,$shift\t! left shift packed8S" %}
7811   ins_encode %{
7812     __ psllw($dst$$XMMRegister, (int)$shift$$constant);
7813   %}
7814   ins_pipe( pipe_slow );
7815 %}
7816 
7817 instruct vsll8S_reg(vecX dst, vecX src, vecS shift) %{
7818   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7819   match(Set dst (LShiftVS src shift));
7820   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
7821   ins_encode %{
7822     int vector_len = 0;
7823     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
7824   %}
7825   ins_pipe( pipe_slow );
7826 %}
7827 
7828 instruct vsll8S_reg_imm(vecX dst, vecX src, immI8 shift) %{
7829   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7830   match(Set dst (LShiftVS src shift));
7831   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
7832   ins_encode %{
7833     int vector_len = 0;
7834     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
7835   %}
7836   ins_pipe( pipe_slow );
7837 %}
7838 
7839 instruct vsll16S_reg(vecY dst, vecY src, vecS shift) %{
7840   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
7841   match(Set dst (LShiftVS src shift));
7842   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
7843   ins_encode %{
7844     int vector_len = 1;
7845     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
7846   %}
7847   ins_pipe( pipe_slow );
7848 %}
7849 
7850 instruct vsll16S_reg_imm(vecY dst, vecY src, immI8 shift) %{
7851   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
7852   match(Set dst (LShiftVS src shift));
7853   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
7854   ins_encode %{
7855     int vector_len = 1;
7856     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
7857   %}
7858   ins_pipe( pipe_slow );
7859 %}
7860 
7861 instruct vsll32S_reg(vecZ dst, vecZ src, vecS shift) %{
7862   predicate(UseAVX > 2 && n->as_Vector()->length() == 32);
7863   match(Set dst (LShiftVS src shift));
7864   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed32S" %}
7865   ins_encode %{
7866     int vector_len = 2;
7867     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
7868   %}
7869   ins_pipe( pipe_slow );
7870 %}
7871 
7872 instruct vsll32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
7873   predicate(UseAVX > 2 && n->as_Vector()->length() == 32);
7874   match(Set dst (LShiftVS src shift));
7875   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed32S" %}
7876   ins_encode %{
7877     int vector_len = 2;
7878     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
7879   %}
7880   ins_pipe( pipe_slow );
7881 %}
7882 
7883 // Integers vector left shift
7884 instruct vsll2I(vecD dst, vecS shift) %{
7885   predicate(n->as_Vector()->length() == 2);
7886   match(Set dst (LShiftVI dst shift));
7887   format %{ "pslld   $dst,$shift\t! left shift packed2I" %}
7888   ins_encode %{
7889     __ pslld($dst$$XMMRegister, $shift$$XMMRegister);
7890   %}
7891   ins_pipe( pipe_slow );
7892 %}
7893 


8062   %}
8063   ins_pipe( pipe_slow );
8064 %}
8065 
8066 instruct vsll4L_reg_imm(vecY dst, vecY src, immI8 shift) %{
8067   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
8068   match(Set dst (LShiftVL src shift));
8069   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed4L" %}
8070   ins_encode %{
8071     int vector_len = 1;
8072     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8073   %}
8074   ins_pipe( pipe_slow );
8075 %}
8076 
8077 instruct vsll8L_reg(vecZ dst, vecZ src, vecS shift) %{
8078   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8079   match(Set dst (LShiftVL src shift));
8080   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed8L" %}
8081   ins_encode %{
8082     int vector_len = 2;
8083     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8084   %}
8085   ins_pipe( pipe_slow );
8086 %}
8087 
8088 instruct vsll8L_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
8089   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8090   match(Set dst (LShiftVL src shift));
8091   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed8L" %}
8092   ins_encode %{
8093     int vector_len = 2;
8094     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8095   %}
8096   ins_pipe( pipe_slow );
8097 %}
8098 
8099 // ----------------------- LogicalRightShift -----------------------------------
8100 
8101 // Shorts vector logical right shift produces an incorrect Java result
8102 // for negative data, because Java code converts a short value into an int
8103 // with sign extension before the shift. But char vectors are fine, since
8104 // chars are unsigned values.
8105 
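// For example, shifting a short lane holding -4 (bit pattern 0xFFFC) right by
// one behaves differently in Java than in a packed 16-bit shift (illustrative
// sketch, not code from this file):
//
//   short s = -4;                 // 0xFFFC
//   short r = (short)(s >>> 1);   // widened to 0xFFFFFFFC, shifted to
//                                 // 0x7FFFFFFE, narrowed to 0xFFFE (-2)
//
// whereas psrlw on the 16-bit lane would produce 0x7FFE (32766), so these
// rules match Java semantics only for char data, which is zero-extended.
//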
8106 instruct vsrl2S(vecS dst, vecS shift) %{
8107   predicate(n->as_Vector()->length() == 2);
8108   match(Set dst (URShiftVS dst shift));
8109   format %{ "psrlw   $dst,$shift\t! logical right shift packed2S" %}
8110   ins_encode %{
8111     __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
8112   %}
8113   ins_pipe( pipe_slow );
8114 %}
8115 
8116 instruct vsrl2S_imm(vecS dst, immI8 shift) %{
8117   predicate(n->as_Vector()->length() == 2);
8118   match(Set dst (URShiftVS dst shift));
8119   format %{ "psrlw   $dst,$shift\t! logical right shift packed2S" %}
8120   ins_encode %{
8121     __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
8122   %}
8123   ins_pipe( pipe_slow );
8124 %}
8125 
8126 instruct vsrl2S_reg(vecS dst, vecS src, vecS shift) %{
8127   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8128   match(Set dst (URShiftVS src shift));
8129   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
8130   ins_encode %{
8131     int vector_len = 0;
8132     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8133   %}
8134   ins_pipe( pipe_slow );
8135 %}
8136 
8137 instruct vsrl2S_reg_imm(vecS dst, vecS src, immI8 shift) %{
8138   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8139   match(Set dst (URShiftVS src shift));
8140   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
8141   ins_encode %{
8142     int vector_len = 0;
8143     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8144   %}
8145   ins_pipe( pipe_slow );
8146 %}
8147 
8148 instruct vsrl4S(vecD dst, vecS shift) %{
8149   predicate(n->as_Vector()->length() == 4);
8150   match(Set dst (URShiftVS dst shift));
8151   format %{ "psrlw   $dst,$shift\t! logical right shift packed4S" %}
8152   ins_encode %{
8153     __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
8154   %}
8155   ins_pipe( pipe_slow );
8156 %}
8157 
8158 instruct vsrl4S_imm(vecD dst, immI8 shift) %{
8159   predicate(n->as_Vector()->length() == 4);
8160   match(Set dst (URShiftVS dst shift));
8161   format %{ "psrlw   $dst,$shift\t! logical right shift packed4S" %}
8162   ins_encode %{
8163     __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
8164   %}
8165   ins_pipe( pipe_slow );
8166 %}
8167 
8168 instruct vsrl4S_reg(vecD dst, vecD src, vecS shift) %{
8169   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8170   match(Set dst (URShiftVS src shift));
8171   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
8172   ins_encode %{
8173     int vector_len = 0;
8174     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8175   %}
8176   ins_pipe( pipe_slow );
8177 %}
8178 
8179 instruct vsrl4S_reg_imm(vecD dst, vecD src, immI8 shift) %{
8180   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8181   match(Set dst (URShiftVS src shift));
8182   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
8183   ins_encode %{
8184     int vector_len = 0;
8185     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8186   %}
8187   ins_pipe( pipe_slow );
8188 %}
8189 
8190 instruct vsrl8S(vecX dst, vecS shift) %{
8191   predicate(n->as_Vector()->length() == 8);
8192   match(Set dst (URShiftVS dst shift));
8193   format %{ "psrlw   $dst,$shift\t! logical right shift packed8S" %}
8194   ins_encode %{
8195     __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
8196   %}
8197   ins_pipe( pipe_slow );
8198 %}
8199 
8200 instruct vsrl8S_imm(vecX dst, immI8 shift) %{
8201   predicate(n->as_Vector()->length() == 8);
8202   match(Set dst (URShiftVS dst shift));
8203   format %{ "psrlw   $dst,$shift\t! logical right shift packed8S" %}
8204   ins_encode %{
8205     __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
8206   %}
8207   ins_pipe( pipe_slow );
8208 %}
8209 
8210 instruct vsrl8S_reg(vecX dst, vecX src, vecS shift) %{
8211   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8212   match(Set dst (URShiftVS src shift));
8213   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
8214   ins_encode %{
8215     int vector_len = 0;
8216     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8217   %}
8218   ins_pipe( pipe_slow );
8219 %}
8220 
8221 instruct vsrl8S_reg_imm(vecX dst, vecX src, immI8 shift) %{
8222   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8223   match(Set dst (URShiftVS src shift));
8224   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
8225   ins_encode %{
8226     int vector_len = 0;
8227     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8228   %}
8229   ins_pipe( pipe_slow );
8230 %}
8231 
8232 instruct vsrl16S_reg(vecY dst, vecY src, vecS shift) %{
8233   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
8234   match(Set dst (URShiftVS src shift));
8235   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
8236   ins_encode %{
8237     int vector_len = 1;
8238     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8239   %}
8240   ins_pipe( pipe_slow );
8241 %}
8242 
8243 instruct vsrl16S_reg_imm(vecY dst, vecY src, immI8 shift) %{
8244   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
8245   match(Set dst (URShiftVS src shift));
8246   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
8247   ins_encode %{
8248     int vector_len = 1;
8249     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8250   %}
8251   ins_pipe( pipe_slow );
8252 %}
8253 
8254 instruct vsrl32S_reg(vecZ dst, vecZ src, vecS shift) %{
8255   predicate(UseAVX > 2 && n->as_Vector()->length() == 32);
8256   match(Set dst (URShiftVS src shift));
8257   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed32S" %}
8258   ins_encode %{
8259     int vector_len = 2;
8260     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8261   %}
8262   ins_pipe( pipe_slow );
8263 %}
8264 
8265 instruct vsrl32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
8266   predicate(UseAVX > 2 && n->as_Vector()->length() == 32);
8267   match(Set dst (URShiftVS src shift));
8268   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed32S" %}
8269   ins_encode %{
8270     int vector_len = 2;
8271     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8272   %}
8273   ins_pipe( pipe_slow );
8274 %}
8275 
8276 // Integers vector logical right shift
8277 instruct vsrl2I(vecD dst, vecS shift) %{
8278   predicate(n->as_Vector()->length() == 2);
8279   match(Set dst (URShiftVI dst shift));
8280   format %{ "psrld   $dst,$shift\t! logical right shift packed2I" %}
8281   ins_encode %{
8282     __ psrld($dst$$XMMRegister, $shift$$XMMRegister);
8283   %}
8284   ins_pipe( pipe_slow );
8285 %}
8286 


8476     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8477   %}
8478   ins_pipe( pipe_slow );
8479 %}
8480 
8481 instruct vsrl8L_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
8482   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8483   match(Set dst (URShiftVL src shift));
8484   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed8L" %}
8485   ins_encode %{
8486     int vector_len = 2;
8487     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8488   %}
8489   ins_pipe( pipe_slow );
8490 %}
8491 
8492 // ------------------- ArithmeticRightShift -----------------------------------
8493 
8494 // Shorts/Chars vector arithmetic right shift
8495 instruct vsra2S(vecS dst, vecS shift) %{
8496   predicate(n->as_Vector()->length() == 2);
8497   match(Set dst (RShiftVS dst shift));
8498   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed2S" %}
8499   ins_encode %{
8500     __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
8501   %}
8502   ins_pipe( pipe_slow );
8503 %}
8504 
8505 instruct vsra2S_imm(vecS dst, immI8 shift) %{
8506   predicate(n->as_Vector()->length() == 2);
8507   match(Set dst (RShiftVS dst shift));
8508   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed2S" %}
8509   ins_encode %{
8510     __ psraw($dst$$XMMRegister, (int)$shift$$constant);
8511   %}
8512   ins_pipe( pipe_slow );
8513 %}
8514 
8515 instruct vsra2S_reg(vecS dst, vecS src, vecS shift) %{
8516   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8517   match(Set dst (RShiftVS src shift));
8518   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
8519   ins_encode %{
8520     int vector_len = 0;
8521     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8522   %}
8523   ins_pipe( pipe_slow );
8524 %}
8525 
8526 instruct vsra2S_reg_imm(vecS dst, vecS src, immI8 shift) %{
8527   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8528   match(Set dst (RShiftVS src shift));
8529   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
8530   ins_encode %{
8531     int vector_len = 0;
8532     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8533   %}
8534   ins_pipe( pipe_slow );
8535 %}
8536 
8537 instruct vsra4S(vecD dst, vecS shift) %{
8538   predicate(n->as_Vector()->length() == 4);
8539   match(Set dst (RShiftVS dst shift));
8540   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed4S" %}
8541   ins_encode %{
8542     __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
8543   %}
8544   ins_pipe( pipe_slow );
8545 %}
8546 
8547 instruct vsra4S_imm(vecD dst, immI8 shift) %{
8548   predicate(n->as_Vector()->length() == 4);
8549   match(Set dst (RShiftVS dst shift));
8550   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed4S" %}
8551   ins_encode %{
8552     __ psraw($dst$$XMMRegister, (int)$shift$$constant);
8553   %}
8554   ins_pipe( pipe_slow );
8555 %}
8556 
8557 instruct vsra4S_reg(vecD dst, vecD src, vecS shift) %{
8558   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8559   match(Set dst (RShiftVS src shift));
8560   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
8561   ins_encode %{
8562     int vector_len = 0;
8563     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8564   %}
8565   ins_pipe( pipe_slow );
8566 %}
8567 
8568 instruct vsra4S_reg_imm(vecD dst, vecD src, immI8 shift) %{
8569   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8570   match(Set dst (RShiftVS src shift));
8571   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
8572   ins_encode %{
8573     int vector_len = 0;
8574     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8575   %}
8576   ins_pipe( pipe_slow );
8577 %}
8578 
8579 instruct vsra8S(vecX dst, vecS shift) %{
8580   predicate(n->as_Vector()->length() == 8);
8581   match(Set dst (RShiftVS dst shift));
8582   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed8S" %}
8583   ins_encode %{
8584     __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
8585   %}
8586   ins_pipe( pipe_slow );
8587 %}
8588 
8589 instruct vsra8S_imm(vecX dst, immI8 shift) %{
8590   predicate(n->as_Vector()->length() == 8);
8591   match(Set dst (RShiftVS dst shift));
8592   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed8S" %}
8593   ins_encode %{
8594     __ psraw($dst$$XMMRegister, (int)$shift$$constant);
8595   %}
8596   ins_pipe( pipe_slow );
8597 %}
8598 
8599 instruct vsra8S_reg(vecX dst, vecX src, vecS shift) %{
8600   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8601   match(Set dst (RShiftVS src shift));
8602   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
8603   ins_encode %{
8604     int vector_len = 0;
8605     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8606   %}
8607   ins_pipe( pipe_slow );
8608 %}
8609 
8610 instruct vsra8S_reg_imm(vecX dst, vecX src, immI8 shift) %{
8611   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8612   match(Set dst (RShiftVS src shift));
8613   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
8614   ins_encode %{
8615     int vector_len = 0;
8616     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8617   %}
8618   ins_pipe( pipe_slow );
8619 %}
8620 
8621 instruct vsra16S_reg(vecY dst, vecY src, vecS shift) %{
8622   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
8623   match(Set dst (RShiftVS src shift));
8624   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
8625   ins_encode %{
8626     int vector_len = 1;
8627     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8628   %}
8629   ins_pipe( pipe_slow );
8630 %}
8631 
8632 instruct vsra16S_reg_imm(vecY dst, vecY src, immI8 shift) %{
8633   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
8634   match(Set dst (RShiftVS src shift));
8635   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
8636   ins_encode %{
8637     int vector_len = 1;
8638     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8639   %}
8640   ins_pipe( pipe_slow );
8641 %}
8642 
8643 instruct vsra32S_reg(vecZ dst, vecZ src, vecS shift) %{
8644   predicate(UseAVX > 2 && n->as_Vector()->length() == 32);
8645   match(Set dst (RShiftVS src shift));
8646   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed32S" %}
8647   ins_encode %{
8648     int vector_len = 2;
8649     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8650   %}
8651   ins_pipe( pipe_slow );
8652 %}
8653 
8654 instruct vsra32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
8655   predicate(UseAVX > 2 && n->as_Vector()->length() == 32);
8656   match(Set dst (RShiftVS src shift));
8657   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed32S" %}
8658   ins_encode %{
8659     int vector_len = 2;
8660     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8661   %}
8662   ins_pipe( pipe_slow );
8663 %}
8664 
8665 // Integers vector arithmetic right shift
8666 instruct vsra2I(vecD dst, vecS shift) %{
8667   predicate(n->as_Vector()->length() == 2);
8668   match(Set dst (RShiftVI dst shift));
8669   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed2I" %}
8670   ins_encode %{
8671     __ psrad($dst$$XMMRegister, $shift$$XMMRegister);
8672   %}
8673   ins_pipe( pipe_slow );
8674 %}
8675 




1699     case Op_SqrtVD:
1700       if (UseAVX < 1) // enabled for AVX only
1701         ret_value = false;
1702       break;
1703     case Op_CompareAndSwapL:
1704 #ifdef _LP64
1705     case Op_CompareAndSwapP:
1706 #endif
1707       if (!VM_Version::supports_cx8())
1708         ret_value = false;
1709       break;
1710     case Op_CMoveVD:
1711       if (UseAVX < 1 || UseAVX > 2)
1712         ret_value = false;
1713       break;
1714   }
1715 
1716   return ret_value;  // By default, match rules are supported.
1717 }
1718 
1719 const bool Matcher::match_rule_supported_vector(int opcode, int vlen) {
1720   // identify extra cases that we might want to provide match rules for
1721   // e.g. Op_ vector nodes and other intrinsics while guarding with vlen
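       // Editorial note: the vlen checks below reflect that 512-bit byte/word
       // operations require AVX512BW; without it, 64-byte and 32-short vectors
       // cannot be used, so those cases are rejected here.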
1722   bool ret_value = match_rule_supported(opcode);
1723   if (ret_value) {
1724     switch (opcode) {
1725       case Op_AddVB:
1726       case Op_SubVB:
1727         if ((vlen == 64) && (VM_Version::supports_avx512bw() == false))
1728           ret_value = false;
1729         break;
1730       case Op_URShiftVS:
1731       case Op_RShiftVS:
1732       case Op_LShiftVS:
1733       case Op_MulVS:
1734       case Op_AddVS:
1735       case Op_SubVS:
1736         if ((vlen == 32) && (VM_Version::supports_avx512bw() == false))
1737           ret_value = false;
1738         break;
1739       case Op_CMoveVD:
1740         if (vlen != 4)
1741           ret_value  = false;
1742         break;
1743     }
1744   }
1745 
1746   return ret_value;  // By default, match rules are supported.
1747 }
1748 
1749 const int Matcher::float_pressure(int default_pressure_threshold) {
1750   int float_pressure_threshold = default_pressure_threshold;
1751 #ifdef _LP64
1752   if (UseAVX > 2) {
1753     // Increase pressure threshold on machines with AVX3 which have
1754     // 2x more XMM registers.
1755     float_pressure_threshold = default_pressure_threshold * 2;
1756   }
1757 #endif
1758   return float_pressure_threshold;
1759 }
1760 
1761 // Max vector size in bytes. 0 if not supported.
1762 const int Matcher::vector_width_in_bytes(BasicType bt) {
1763   assert(is_java_primitive(bt), "only primitive type vectors");
1764   if (UseSSE < 2) return 0;
1765   // SSE2 supports 128bit vectors for all types.
1766   // AVX2 supports 256bit vectors for all types.
1767   // AVX512/EVEX supports 512bit vectors for all types.
1768   int size = (UseAVX > 1) ? (1 << UseAVX) * 8 : 16;


1772   // Use flag to limit vector size.
1773   size = MIN2(size,(int)MaxVectorSize);
1774   // Minimum 2 values in vector (or 4 for bytes).
1775   switch (bt) {
1776   case T_DOUBLE:
1777   case T_LONG:
1778     if (size < 16) return 0;
1779     break;
1780   case T_FLOAT:
1781   case T_INT:
1782     if (size < 8) return 0;
1783     break;
1784   case T_BOOLEAN:
1785     if (size < 4) return 0;
1786     break;
1787   case T_CHAR:
1788     if (size < 4) return 0;
1789     break;
1790   case T_BYTE:
1791     if (size < 4) return 0;
1792     break;
1793   case T_SHORT:
1794     if (size < 4) return 0;
1795     break;
1796   default:
1797     ShouldNotReachHere();
1798   }
1799   return size;
1800 }
1801 
1802 // Limits on vector size (number of elements) loaded into vector.
1803 const int Matcher::max_vector_size(const BasicType bt) {
1804   return vector_width_in_bytes(bt)/type2aelembytes(bt);
1805 }
1806 const int Matcher::min_vector_size(const BasicType bt) {
1807   int max_size = max_vector_size(bt);
1808   // Min size which can be loaded into vector is 4 bytes.
1809   int size = (type2aelembytes(bt) == 1) ? 4 : 2;
1810   return MIN2(size,max_size);
1811 }
1812 
1813 // Vector ideal reg corresponding to specified size in bytes
1814 const int Matcher::vector_ideal_reg(int size) {


1978         break;
1979       case Op_VecD:
1980         st->print("movq    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1981         break;
1982        case Op_VecX:
1983         st->print("movdqu  [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1984         break;
1985       case Op_VecY:
1986       case Op_VecZ:
1987         st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1988         break;
1989       default:
1990         ShouldNotReachHere();
1991       }
1992     }
1993 #endif
1994   }
1995   bool is_single_byte = false;
1996   int vec_len = 0;
1997   if ((UseAVX > 2) && (stack_offset != 0)) {
1998     int tuple_type = Assembler::EVEX_FVM;
1999     int input_size = Assembler::EVEX_32bit;
2000     switch (ireg) {
2001     case Op_VecS:
2002       tuple_type = Assembler::EVEX_T1S;
2003       break;
2004     case Op_VecD:
2005       tuple_type = Assembler::EVEX_T1S;
2006       input_size = Assembler::EVEX_64bit;
2007       break;
2008     case Op_VecX:
2009       break;
2010     case Op_VecY:
2011       vec_len = 1;
2012       break;
2013     case Op_VecZ:
2014       vec_len = 2;
2015       break;
2016     }
2017     is_single_byte = Assembler::query_compressed_disp_byte(stack_offset, true, vec_len, tuple_type, input_size, 0);
2018   }
2019   int offset_size = 0;
2020   int size = 5;
2021   if (UseAVX > 2 ) {
2022     if (VM_Version::supports_avx512novl() && (vec_len == 2)) {
2023       offset_size = (stack_offset == 0) ? 0 : ((is_single_byte) ? 1 : 4);
2024       size += 2; // Need an additional two bytes for EVEX encoding
2025     } else if (VM_Version::supports_avx512novl() && (vec_len < 2)) {
2026       offset_size = (stack_offset == 0) ? 0 : ((stack_offset <= 127) ? 1 : 4);
2027     } else {
2028       offset_size = (stack_offset == 0) ? 0 : ((is_single_byte) ? 1 : 4);
2029       size += 2; // Need an additional two bytes for EVEX encoding
2030     }
2031   } else {
2032     offset_size = (stack_offset == 0) ? 0 : ((stack_offset <= 127) ? 1 : 4);
2033   }
2034   // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
2035   return size+offset_size;
2036 }
2037 
2038 static inline jfloat replicate4_imm(int con, int width) {
2039   // Load a constant of "width" (in bytes) and replicate it to fill 32bit.
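       //   e.g. replicate4_imm(0x5A, 1) builds the pattern 0x5A5A5A5A and
       //   replicate4_imm(0x1234, 2) builds 0x12341234 (editorial example).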
2040   assert(width == 1 || width == 2, "only byte or short types here");
2041   int bit_width = width * 8;
2042   jint val = con;
2043   val &= (1 << bit_width) - 1;  // mask off sign bits
2044   while(bit_width < 32) {
2045     val |= (val << bit_width);


2729   format %{ "vdivsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2730   ins_cost(150);
2731   ins_encode %{
2732     __ vdivsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2733   %}
2734   ins_pipe(pipe_slow);
2735 %}
2736 
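     // The abs/neg rules below rely on IEEE-754 bit manipulation: ANDing with the
     // sign mask (0x7fffffff / 0x7fffffffffffffff) clears the sign bit for abs,
     // and XORing with the sign-flip mask (0x80000000 / 0x8000000000000000) negates.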
2737 instruct absF_reg(regF dst) %{
2738   predicate((UseSSE>=1) && (UseAVX == 0));
2739   match(Set dst (AbsF dst));
2740   ins_cost(150);
2741   format %{ "andps   $dst, [0x7fffffff]\t# abs float by sign masking" %}
2742   ins_encode %{
2743     __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
2744   %}
2745   ins_pipe(pipe_slow);
2746 %}
2747 
2748 instruct absF_reg_reg(regF dst, regF src) %{
2749   predicate(VM_Version::supports_avxonly());
2750   match(Set dst (AbsF src));
2751   ins_cost(150);
2752   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
2753   ins_encode %{
2754     int vector_len = 0;
2755     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
2756               ExternalAddress(float_signmask()), vector_len);
2757   %}
2758   ins_pipe(pipe_slow);
2759 %}
2760 
2761 #ifdef _LP64
2762 instruct absF_reg_reg_evex(regF dst, regF src) %{
2763   predicate(UseAVX > 2 && VM_Version::supports_avx512vl());
2764   match(Set dst (AbsF src));
2765   ins_cost(150);
2766   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
2767   ins_encode %{
2768     int vector_len = 0;
2769     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
2770               ExternalAddress(float_signmask()), vector_len);
2771   %}
2772   ins_pipe(pipe_slow);
2773 %}
2774 
2775 instruct absF_reg_reg_evex_special(regF dst, regF src1, regF src2) %{
2776   predicate(VM_Version::supports_avx512novl());
2777   match(Set dst (AbsF src1));
2778   effect(TEMP src2);
2779   ins_cost(150);
2780   format %{ "vabsss  $dst, $src1, $src2, [0x7fffffff]\t# abs float by sign masking" %}
2781   ins_encode %{
2782     int vector_len = 0;
2783     __ vabsss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister,
2784               ExternalAddress(float_signmask()), vector_len);
2785   %}
2786   ins_pipe(pipe_slow);
2787 %}
2788 #else // _LP64
2789 instruct absF_reg_reg_evex(regF dst, regF src) %{
2790   predicate(UseAVX > 2);
2791   match(Set dst (AbsF src));
2792   ins_cost(150);
2793   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
2794   ins_encode %{
2795     int vector_len = 0;
2796     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
2797               ExternalAddress(float_signmask()), vector_len);
2798   %}
2799   ins_pipe(pipe_slow);
2800 %}
2801 #endif
2802 
2803 instruct absD_reg(regD dst) %{
2804   predicate((UseSSE>=2) && (UseAVX == 0));
2805   match(Set dst (AbsD dst));
2806   ins_cost(150);
2807   format %{ "andpd   $dst, [0x7fffffffffffffff]\t"
2808             "# abs double by sign masking" %}
2809   ins_encode %{
2810     __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
2811   %}
2812   ins_pipe(pipe_slow);
2813 %}
2814 
2815 instruct absD_reg_reg(regD dst, regD src) %{
2816   predicate(VM_Version::supports_avxonly());
2817   match(Set dst (AbsD src));
2818   ins_cost(150);
2819   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
2820             "# abs double by sign masking" %}
2821   ins_encode %{
2822     int vector_len = 0;
2823     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
2824               ExternalAddress(double_signmask()), vector_len);
2825   %}
2826   ins_pipe(pipe_slow);
2827 %}
2828 
2829 #ifdef _LP64
2830 instruct absD_reg_reg_evex(regD dst, regD src) %{
2831   predicate(UseAVX > 2 && VM_Version::supports_avx512vl());
2832   match(Set dst (AbsD src));
2833   ins_cost(150);
2834   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
2835             "# abs double by sign masking" %}
2836   ins_encode %{
2837     int vector_len = 0;
2838     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
2839               ExternalAddress(double_signmask()), vector_len);
2840   %}
2841   ins_pipe(pipe_slow);
2842 %}
2843 
2844 instruct absD_reg_reg_evex_special(regD dst, regD src1, regD src2) %{
2845   predicate(VM_Version::supports_avx512novl());
2846   match(Set dst (AbsD src1));
2847   effect(TEMP src2);
2848   ins_cost(150);
2849   format %{ "vabssd  $dst, $src1, $src2, [0x7fffffffffffffff]\t# abs float by sign masking" %}
2850   ins_encode %{
2851     int vector_len = 0;
2852     __ vabssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister,
2853               ExternalAddress(double_signmask()), vector_len);
2854   %}
2855   ins_pipe(pipe_slow);
2856 %}
2857 #else // _LP64
2858 instruct absD_reg_reg_evex(regD dst, regD src) %{
2859   predicate(UseAVX > 2);
2860   match(Set dst (AbsD src));
2861   ins_cost(150);
2862   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
2863             "# abs double by sign masking" %}
2864   ins_encode %{
2865     int vector_len = 0;
2866     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
2867               ExternalAddress(double_signmask()), vector_len);
2868   %}
2869   ins_pipe(pipe_slow);
2870 %}
2871 #endif
2872 
2873 instruct negF_reg(regF dst) %{
2874   predicate((UseSSE>=1) && (UseAVX == 0));
2875   match(Set dst (NegF dst));
2876   ins_cost(150);
2877   format %{ "xorps   $dst, [0x80000000]\t# neg float by sign flipping" %}
2878   ins_encode %{
2879     __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
2880   %}
2881   ins_pipe(pipe_slow);
2882 %}
2883 
2884 instruct negF_reg_reg(regF dst, regF src) %{
2885   predicate(UseAVX > 0);
2886   match(Set dst (NegF src));
2887   ins_cost(150);
2888   format %{ "vnegatess  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
2889   ins_encode %{
2890     __ vnegatess($dst$$XMMRegister, $src$$XMMRegister,
2891                  ExternalAddress(float_signflip()));


4658 instruct rsadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
4659   predicate(UseSSE > 2 && UseAVX == 0);
4660   match(Set dst (AddReductionVI src1 src2));
4661   effect(TEMP tmp2, TEMP tmp);
4662   format %{ "movdqu  $tmp2,$src2\n\t"
4663             "phaddd  $tmp2,$tmp2\n\t"
4664             "movd    $tmp,$src1\n\t"
4665             "paddd   $tmp,$tmp2\n\t"
4666             "movd    $dst,$tmp\t! add reduction2I" %}
4667   ins_encode %{
4668     __ movdqu($tmp2$$XMMRegister, $src2$$XMMRegister);
4669     __ phaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister);
4670     __ movdl($tmp$$XMMRegister, $src1$$Register);
4671     __ paddd($tmp$$XMMRegister, $tmp2$$XMMRegister);
4672     __ movdl($dst$$Register, $tmp$$XMMRegister);
4673   %}
4674   ins_pipe( pipe_slow );
4675 %}
4676 
4677 instruct rvadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
4678   predicate(VM_Version::supports_avxonly());
4679   match(Set dst (AddReductionVI src1 src2));
4680   effect(TEMP tmp, TEMP tmp2);
4681   format %{ "vphaddd  $tmp,$src2,$src2\n\t"
4682             "movd     $tmp2,$src1\n\t"
4683             "vpaddd   $tmp2,$tmp2,$tmp\n\t"
4684             "movd     $dst,$tmp2\t! add reduction2I" %}
4685   ins_encode %{
4686     int vector_len = 0;
4687     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
4688     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4689     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
4690     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4691   %}
4692   ins_pipe( pipe_slow );
4693 %}
4694 
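     // Editorial note: the EVEX/AVX reduction variants below fold the vector in
     // halves with pshufd (immediate 0xE copies the upper 64 bits of an XMM register
     // into the low lanes; immediate 0x1 brings element 1 down to lane 0) and then
     // combine the running sum with the scalar src1 via movdl.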
4695 instruct rvadd2I_reduction_reg_evex(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
4696   predicate(UseAVX > 2);
4697   match(Set dst (AddReductionVI src1 src2));
4698   effect(TEMP tmp, TEMP tmp2);
4699   format %{ "pshufd  $tmp2,$src2,0x1\n\t"
4700             "vpaddd  $tmp,$src2,$tmp2\n\t"
4701             "movd    $tmp2,$src1\n\t"
4702             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
4703             "movd    $dst,$tmp2\t! add reduction2I" %}
4704   ins_encode %{
4705     int vector_len = 0;
4706     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
4707     __ vpaddd($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4708     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4709     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4710     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4711   %}
4712   ins_pipe( pipe_slow );
4713 %}
4714 
4715 instruct rsadd4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
4716   predicate(UseSSE > 2 && UseAVX == 0);
4717   match(Set dst (AddReductionVI src1 src2));
4718   effect(TEMP tmp, TEMP tmp2);
4719   format %{ "movdqu  $tmp,$src2\n\t"
4720             "phaddd  $tmp,$tmp\n\t"
4721             "phaddd  $tmp,$tmp\n\t"
4722             "movd    $tmp2,$src1\n\t"
4723             "paddd   $tmp2,$tmp\n\t"
4724             "movd    $dst,$tmp2\t! add reduction4I" %}
4725   ins_encode %{
4726     __ movdqu($tmp$$XMMRegister, $src2$$XMMRegister);
4727     __ phaddd($tmp$$XMMRegister, $tmp$$XMMRegister);
4728     __ phaddd($tmp$$XMMRegister, $tmp$$XMMRegister);
4729     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4730     __ paddd($tmp2$$XMMRegister, $tmp$$XMMRegister);
4731     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4732   %}
4733   ins_pipe( pipe_slow );
4734 %}
4735 
4736 instruct rvadd4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
4737   predicate(VM_Version::supports_avxonly());
4738   match(Set dst (AddReductionVI src1 src2));
4739   effect(TEMP tmp, TEMP tmp2);
4740   format %{ "vphaddd  $tmp,$src2,$src2\n\t"
4741             "vphaddd  $tmp,$tmp,$tmp\n\t"
4742             "movd     $tmp2,$src1\n\t"
4743             "vpaddd   $tmp2,$tmp2,$tmp\n\t"
4744             "movd     $dst,$tmp2\t! add reduction4I" %}
4745   ins_encode %{
4746     int vector_len = 0;
4747     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
4748     __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
4749     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4750     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
4751     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4752   %}
4753   ins_pipe( pipe_slow );
4754 %}
4755 
4756 instruct rvadd4I_reduction_reg_evex(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
4757   predicate(UseAVX > 2);
4758   match(Set dst (AddReductionVI src1 src2));
4759   effect(TEMP tmp, TEMP tmp2);
4760   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
4761             "vpaddd  $tmp,$src2,$tmp2\n\t"
4762             "pshufd  $tmp2,$tmp,0x1\n\t"
4763             "vpaddd  $tmp,$tmp,$tmp2\n\t"
4764             "movd    $tmp2,$src1\n\t"
4765             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
4766             "movd    $dst,$tmp2\t! add reduction4I" %}
4767   ins_encode %{
4768     int vector_len = 0;
4769     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
4770     __ vpaddd($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4771     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
4772     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4773     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4774     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4775     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4776   %}
4777   ins_pipe( pipe_slow );
4778 %}
4779 
4780 instruct rvadd8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
4781   predicate(VM_Version::supports_avxonly());
4782   match(Set dst (AddReductionVI src1 src2));
4783   effect(TEMP tmp, TEMP tmp2);
4784   format %{ "vphaddd  $tmp,$src2,$src2\n\t"
4785             "vphaddd  $tmp,$tmp,$tmp2\n\t"
4786             "vextracti128  $tmp2,$tmp\n\t"
4787             "vpaddd   $tmp,$tmp,$tmp2\n\t"
4788             "movd     $tmp2,$src1\n\t"
4789             "vpaddd   $tmp2,$tmp2,$tmp\n\t"
4790             "movd     $dst,$tmp2\t! add reduction8I" %}
4791   ins_encode %{
4792     int vector_len = 1;
4793     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
4794     __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4795     __ vextracti128h($tmp2$$XMMRegister, $tmp$$XMMRegister);
4796     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
4797     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4798     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4799     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4800   %}
4801   ins_pipe( pipe_slow );


4816             "movd    $dst,$tmp2\t! add reduction8I" %}
4817   ins_encode %{
4818     int vector_len = 0;
4819     __ vextracti128h($tmp$$XMMRegister, $src2$$XMMRegister);
4820     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
4821     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
4822     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4823     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
4824     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4825     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4826     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4827     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4828   %}
4829   ins_pipe( pipe_slow );
4830 %}
4831 
4832 instruct rvadd16I_reduction_reg_evex(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, regF tmp3) %{
4833   predicate(UseAVX > 2);
4834   match(Set dst (AddReductionVI src1 src2));
4835   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
4836   format %{ "vextracti64x4  $tmp3,$src2,0x1\n\t"
4837             "vpaddd  $tmp3,$tmp3,$src2\n\t"
4838             "vextracti128   $tmp,$tmp3\n\t"
4839             "vpaddd  $tmp,$tmp,$tmp3\n\t"
4840             "pshufd  $tmp2,$tmp,0xE\n\t"
4841             "vpaddd  $tmp,$tmp,$tmp2\n\t"
4842             "pshufd  $tmp2,$tmp,0x1\n\t"
4843             "vpaddd  $tmp,$tmp,$tmp2\n\t"
4844             "movd    $tmp2,$src1\n\t"
4845             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
4846             "movd    $dst,$tmp2\t! add reduction16I" %}
4847   ins_encode %{
4848     __ vextracti64x4h($tmp3$$XMMRegister, $src2$$XMMRegister, 1);
4849     __ vpaddd($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
4850     __ vextracti128h($tmp$$XMMRegister, $tmp3$$XMMRegister);
4851     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
4852     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
4853     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
4854     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
4855     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
4856     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4857     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
4858     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4859   %}
4860   ins_pipe( pipe_slow );
4861 %}
4862 
4863 #ifdef _LP64
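     // Editorial note: the long reductions use movdq to transfer 64-bit values
     // between general-purpose and XMM registers, which is only encodable on
     // x86_64, hence the LP64-only guard.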
4864 instruct rvadd2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, regF tmp, regF tmp2) %{
4865   predicate(UseAVX > 2);
4866   match(Set dst (AddReductionVL src1 src2));
4867   effect(TEMP tmp, TEMP tmp2);
4868   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
4869             "vpaddq  $tmp,$src2,$tmp2\n\t"
4870             "movdq   $tmp2,$src1\n\t"
4871             "vpaddq  $tmp2,$tmp,$tmp2\n\t"
4872             "movdq   $dst,$tmp2\t! add reduction2L" %}
4873   ins_encode %{
4874     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
4875     __ vpaddq($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, 0);
4876     __ movdq($tmp2$$XMMRegister, $src1$$Register);
4877     __ vpaddq($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
4878     __ movdq($dst$$Register, $tmp2$$XMMRegister);
4879   %}
4880   ins_pipe( pipe_slow );
4881 %}
4882 
4883 instruct rvadd4L_reduction_reg(rRegL dst, rRegL src1, vecY src2, regF tmp, regF tmp2) %{
4884   predicate(UseAVX > 2);
4885   match(Set dst (AddReductionVL src1 src2));
4886   effect(TEMP tmp, TEMP tmp2);
4887   format %{ "vextracti128  $tmp,$src2\n\t"
4888             "vpaddq  $tmp2,$tmp,$src2\n\t"
4889             "pshufd  $tmp,$tmp2,0xE\n\t"
4890             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
4891             "movdq   $tmp,$src1\n\t"
4892             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
4893             "movdq   $dst,$tmp2\t! add reduction4L" %}
4894   ins_encode %{
4895     __ vextracti128h($tmp$$XMMRegister, $src2$$XMMRegister);
4896     __ vpaddq($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
4897     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
4898     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4899     __ movdq($tmp$$XMMRegister, $src1$$Register);
4900     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4901     __ movdq($dst$$Register, $tmp2$$XMMRegister);
4902   %}
4903   ins_pipe( pipe_slow );
4904 %}
4905 
4906 instruct rvadd8L_reduction_reg(rRegL dst, rRegL src1, vecZ src2, regF tmp, regF tmp2) %{
4907   predicate(UseAVX > 2);
4908   match(Set dst (AddReductionVL src1 src2));
4909   effect(TEMP tmp, TEMP tmp2);
4910   format %{ "vextracti64x4  $tmp2,$src2,0x1\n\t"
4911             "vpaddq  $tmp2,$tmp2,$src2\n\t"
4912             "vextracti128   $tmp,$tmp2\n\t"
4913             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
4914             "pshufd  $tmp,$tmp2,0xE\n\t"
4915             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
4916             "movdq   $tmp,$src1\n\t"
4917             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
4918             "movdq   $dst,$tmp2\t! add reduction8L" %}
4919   ins_encode %{
4920     __ vextracti64x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 1);
4921     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
4922     __ vextracti128h($tmp$$XMMRegister, $tmp2$$XMMRegister);
4923     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4924     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
4925     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4926     __ movdq($tmp$$XMMRegister, $src1$$Register);
4927     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4928     __ movdq($dst$$Register, $tmp2$$XMMRegister);
4929   %}
4930   ins_pipe( pipe_slow );
4931 %}
4932 #endif
4933 
4934 instruct rsadd2F_reduction_reg(regF dst, vecD src2, regF tmp) %{
4935   predicate(UseSSE >= 1 && UseAVX == 0);
4936   match(Set dst (AddReductionVF dst src2));
4937   effect(TEMP dst, TEMP tmp);
4938   format %{ "addss   $dst,$src2\n\t"
4939             "pshufd  $tmp,$src2,0x01\n\t"
4940             "addss   $dst,$tmp\t! add reduction2F" %}
4941   ins_encode %{
4942     __ addss($dst$$XMMRegister, $src2$$XMMRegister);
4943     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
4944     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
4945   %}
4946   ins_pipe( pipe_slow );
4947 %}
4948 
4949 instruct rvadd2F_reduction_reg(regF dst, vecD src2, regF tmp) %{
4950   predicate(UseAVX > 0);
4951   match(Set dst (AddReductionVF dst src2));
4952   effect(TEMP dst, TEMP tmp);
4953   format %{ "vaddss  $dst,$dst,$src2\n\t"
4954             "pshufd  $tmp,$src2,0x01\n\t"
4955             "vaddss  $dst,$dst,$tmp\t! add reduction2F" %}
4956   ins_encode %{
4957     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
4958     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
4959     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4960   %}
4961   ins_pipe( pipe_slow );
4962 %}
4963 
4964 instruct rsadd4F_reduction_reg(regF dst, vecX src2, regF tmp) %{
4965   predicate(UseSSE >= 1 && UseAVX == 0);
4966   match(Set dst (AddReductionVF dst src2));
4967   effect(TEMP dst, TEMP tmp);
4968   format %{ "addss   $dst,$src2\n\t"
4969             "pshufd  $tmp,$src2,0x01\n\t"
4970             "addss   $dst,$tmp\n\t"
4971             "pshufd  $tmp,$src2,0x02\n\t"
4972             "addss   $dst,$tmp\n\t"
4973             "pshufd  $tmp,$src2,0x03\n\t"
4974             "addss   $dst,$tmp\t! add reduction4F" %}
4975   ins_encode %{
4976     __ addss($dst$$XMMRegister, $src2$$XMMRegister);
4977     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
4978     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
4979     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
4980     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
4981     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
4982     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
4983   %}
4984   ins_pipe( pipe_slow );
4985 %}
4986 
4987 instruct rvadd4F_reduction_reg(regF dst, vecX src2, regF tmp) %{
4988   predicate(UseAVX > 0);
4989   match(Set dst (AddReductionVF dst src2));
4990   effect(TEMP tmp, TEMP dst);
4991   format %{ "vaddss  $dst,dst,$src2\n\t"
4992             "pshufd  $tmp,$src2,0x01\n\t"
4993             "vaddss  $dst,$dst,$tmp\n\t"
4994             "pshufd  $tmp,$src2,0x02\n\t"
4995             "vaddss  $dst,$dst,$tmp\n\t"
4996             "pshufd  $tmp,$src2,0x03\n\t"
4997             "vaddss  $dst,$dst,$tmp\t! add reduction4F" %}
4998   ins_encode %{
4999     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5000     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5001     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5002     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5003     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5004     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5005     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5006   %}
5007   ins_pipe( pipe_slow );
5008 %}
5009 
5010 instruct radd8F_reduction_reg(regF dst, vecY src2, regF tmp, regF tmp2) %{
5011   predicate(UseAVX > 0);
5012   match(Set dst (AddReductionVF dst src2));
5013   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5014   format %{ "vaddss  $dst,$dst,$src2\n\t"
5015             "pshufd  $tmp,$src2,0x01\n\t"
5016             "vaddss  $dst,$dst,$tmp\n\t"
5017             "pshufd  $tmp,$src2,0x02\n\t"
5018             "vaddss  $dst,$dst,$tmp\n\t"
5019             "pshufd  $tmp,$src2,0x03\n\t"
5020             "vaddss  $dst,$dst,$tmp\n\t"
5021             "vextractf128  $tmp2,$src2\n\t"
5022             "vaddss  $dst,$dst,$tmp2\n\t"
5023             "pshufd  $tmp,$tmp2,0x01\n\t"
5024             "vaddss  $dst,$dst,$tmp\n\t"
5025             "pshufd  $tmp,$tmp2,0x02\n\t"
5026             "vaddss  $dst,$dst,$tmp\n\t"
5027             "pshufd  $tmp,$tmp2,0x03\n\t"
5028             "vaddss  $dst,$dst,$tmp\t! add reduction8F" %}
5029   ins_encode %{
5030     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5031     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5032     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5033     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5034     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5035     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5036     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5037     __ vextractf128h($tmp2$$XMMRegister, $src2$$XMMRegister);
5038     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5039     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5040     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5041     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5042     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5043     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5044     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5045   %}
5046   ins_pipe( pipe_slow );
5047 %}
5048 
5049 instruct radd16F_reduction_reg(regF dst, vecZ src2, regF tmp, regF tmp2) %{
5050   predicate(UseAVX > 2);
5051   match(Set dst (AddReductionVF dst src2));
5052   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5053   format %{ "vaddss  $dst,$dst,$src2\n\t"
5054             "pshufd  $tmp,$src2,0x01\n\t"
5055             "vaddss  $dst,$dst,$tmp\n\t"
5056             "pshufd  $tmp,$src2,0x02\n\t"
5057             "vaddss  $dst,$dst,$tmp\n\t"
5058             "pshufd  $tmp,$src2,0x03\n\t"
5059             "vaddss  $dst,$dst,$tmp\n\t"
5060             "vextractf32x4  $tmp2,$src2, 0x1\n\t"
5061             "vaddss  $dst,$dst,$tmp2\n\t"
5062             "pshufd  $tmp,$tmp2,0x01\n\t"
5063             "vaddss  $dst,$dst,$tmp\n\t"
5064             "pshufd  $tmp,$tmp2,0x02\n\t"
5065             "vaddss  $dst,$dst,$tmp\n\t"
5066             "pshufd  $tmp,$tmp2,0x03\n\t"
5067             "vaddss  $dst,$dst,$tmp\n\t"
5068             "vextractf32x4  $tmp2,$src2, 0x2\n\t"
5069             "vaddss  $dst,$dst,$tmp2\n\t"
5070             "pshufd  $tmp,$tmp2,0x01\n\t"
5071             "vaddss  $dst,$dst,$tmp\n\t"
5072             "pshufd  $tmp,$tmp2,0x02\n\t"
5073             "vaddss  $dst,$dst,$tmp\n\t"
5074             "pshufd  $tmp,$tmp2,0x03\n\t"
5075             "vaddss  $dst,$dst,$tmp\n\t"
5076             "vextractf32x4  $tmp2,$src2, 0x3\n\t"
5077             "vaddss  $dst,$dst,$tmp2\n\t"
5078             "pshufd  $tmp,$tmp2,0x01\n\t"
5079             "vaddss  $dst,$dst,$tmp\n\t"
5080             "pshufd  $tmp,$tmp2,0x02\n\t"
5081             "vaddss  $dst,$dst,$tmp\n\t"
5082             "pshufd  $tmp,$tmp2,0x03\n\t"
5083             "vaddss  $dst,$dst,$tmp\t! add reduction16F" %}
5084   ins_encode %{
5085     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5086     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5087     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5088     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5089     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5090     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5091     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5092     __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5093     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5094     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5095     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5096     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5097     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5098     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5099     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5100     __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5101     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5102     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5103     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5104     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5105     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5106     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5107     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5108     __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5109     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5110     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5111     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5112     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5113     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5114     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5115     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5116   %}
5117   ins_pipe( pipe_slow );
5118 %}
5119 
5120 instruct rsadd2D_reduction_reg(regD dst, vecX src2, regD tmp) %{
5121   predicate(UseSSE >= 1 && UseAVX == 0);
5122   match(Set dst (AddReductionVD dst src2));
5123   effect(TEMP tmp, TEMP dst);
5124   format %{ "addsd   $dst,$src2\n\t"
5125             "pshufd  $tmp,$src2,0xE\n\t"
5126             "addsd   $dst,$tmp\t! add reduction2D" %}
5127   ins_encode %{
5128     __ addsd($dst$$XMMRegister, $src2$$XMMRegister);
5129     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5130     __ addsd($dst$$XMMRegister, $tmp$$XMMRegister);
5131   %}
5132   ins_pipe( pipe_slow );
5133 %}
5134 
5135 instruct rvadd2D_reduction_reg(regD dst, vecX src2, regD tmp) %{
5136   predicate(UseAVX > 0);
5137   match(Set dst (AddReductionVD dst src2));
5138   effect(TEMP tmp, TEMP dst);
5139   format %{ "vaddsd  $dst,$dst,$src2\n\t"
5140             "pshufd  $tmp,$src2,0xE\n\t"
5141             "vaddsd  $dst,$dst,$tmp\t! add reduction2D" %}
5142   ins_encode %{
5143     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5144     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5145     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5146   %}
5147   ins_pipe( pipe_slow );
5148 %}
5149 
5150 instruct rvadd4D_reduction_reg(regD dst, vecY src2, regD tmp, regD tmp2) %{
5151   predicate(UseAVX > 0);
5152   match(Set dst (AddReductionVD dst src2));
5153   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5154   format %{ "vaddsd  $dst,$dst,$src2\n\t"
5155             "pshufd  $tmp,$src2,0xE\n\t"
5156             "vaddsd  $dst,$dst,$tmp\n\t"
5157             "vextractf32x4h  $tmp2,$src2, 0x1\n\t"
5158             "vaddsd  $dst,$dst,$tmp2\n\t"
5159             "pshufd  $tmp,$tmp2,0xE\n\t"
5160             "vaddsd  $dst,$dst,$tmp\t! add reduction4D" %}
5161   ins_encode %{
5162     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5163     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5164     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5165     __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5166     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5167     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5168     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5169   %}
5170   ins_pipe( pipe_slow );
5171 %}
5172 
5173 instruct rvadd8D_reduction_reg(regD dst, vecZ src2, regD tmp, regD tmp2) %{
5174   predicate(UseAVX > 2);
5175   match(Set dst (AddReductionVD dst src2));
5176   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5177   format %{ "vaddsd  $dst,$dst,$src2\n\t"
5178             "pshufd  $tmp,$src2,0xE\n\t"
5179             "vaddsd  $dst,$dst,$tmp\n\t"
5180             "vextractf32x4  $tmp2,$src2, 0x1\n\t"
5181             "vaddsd  $dst,$dst,$tmp2\n\t"
5182             "pshufd  $tmp,$tmp2,0xE\n\t"
5183             "vaddsd  $dst,$dst,$tmp\n\t"
5184             "vextractf32x4  $tmp2,$src2, 0x2\n\t"
5185             "vaddsd  $dst,$dst,$tmp2\n\t"
5186             "pshufd  $tmp,$tmp2,0xE\n\t"
5187             "vaddsd  $dst,$dst,$tmp\n\t"
5188             "vextractf32x4  $tmp2,$src2, 0x3\n\t"
5189             "vaddsd  $dst,$dst,$tmp2\n\t"
5190             "pshufd  $tmp,$tmp2,0xE\n\t"
5191             "vaddsd  $dst,$dst,$tmp\t! add reduction8D" %}
5192   ins_encode %{
5193     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5194     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5195     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5196     __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5197     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5198     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5199     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5200     __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5201     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5202     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5203     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5204     __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5205     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5206     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5207     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5208   %}
5209   ins_pipe( pipe_slow );
5210 %}
5211 
5212 instruct rsmul2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
5213   predicate(UseSSE > 3 && UseAVX == 0);
5214   match(Set dst (MulReductionVI src1 src2));
5215   effect(TEMP tmp, TEMP tmp2);
5216   format %{ "pshufd  $tmp2,$src2,0x1\n\t"
5217             "pmulld  $tmp2,$src2\n\t"
5218             "movd    $tmp,$src1\n\t"
5219             "pmulld  $tmp2,$tmp\n\t"
5220             "movd    $dst,$tmp2\t! mul reduction2I" %}
5221   ins_encode %{
5222     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5223     __ pmulld($tmp2$$XMMRegister, $src2$$XMMRegister);
5224     __ movdl($tmp$$XMMRegister, $src1$$Register);
5225     __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
5226     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5227   %}


5310             "movd     $dst,$tmp2\t! mul reduction8I" %}
5311   ins_encode %{
5312     int vector_len = 0;
5313     __ vextracti128h($tmp$$XMMRegister, $src2$$XMMRegister);
5314     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
5315     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
5316     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5317     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5318     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5319     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5320     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5321     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5322   %}
5323   ins_pipe( pipe_slow );
5324 %}
5325 
5326 instruct rvmul16I_reduction_reg(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, regF tmp3) %{
5327   predicate(UseAVX > 2);
5328   match(Set dst (MulReductionVI src1 src2));
5329   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
5330   format %{ "vextracti64x4  $tmp3,$src2,0x1\n\t"
5331             "vpmulld  $tmp3,$tmp3,$src2\n\t"
5332             "vextracti128   $tmp,$tmp3\n\t"
5333             "vpmulld  $tmp,$tmp,$tmp3\n\t"
5334             "pshufd   $tmp2,$tmp,0xE\n\t"
5335             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5336             "pshufd   $tmp2,$tmp,0x1\n\t"
5337             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5338             "movd     $tmp2,$src1\n\t"
5339             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
5340             "movd     $dst,$tmp2\t! mul reduction16I" %}
5341   ins_encode %{
5342     __ vextracti64x4h($tmp3$$XMMRegister, $src2$$XMMRegister, 1);
5343     __ vpmulld($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
5344     __ vextracti128h($tmp$$XMMRegister, $tmp3$$XMMRegister);
5345     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
5346     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
5347     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5348     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5349     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5350     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5351     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5352     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5353   %}
5354   ins_pipe( pipe_slow );
5355 %}
5356 
5357 #ifdef _LP64
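     // Editorial note: vpmullq (packed 64x64-to-64-bit multiply) is an AVX-512DQ
     // instruction, hence the supports_avx512dq() checks on the long multiply
     // reductions below.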
5358 instruct rvmul2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, regF tmp, regF tmp2) %{
5359   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
5360   match(Set dst (MulReductionVL src1 src2));
5361   effect(TEMP tmp, TEMP tmp2);
5362   format %{ "pshufd   $tmp2,$src2,0xE\n\t"
5363             "vpmullq  $tmp,$src2,$tmp2\n\t"
5364             "movdq    $tmp2,$src1\n\t"
5365             "vpmullq  $tmp2,$tmp,$tmp2\n\t"
5366             "movdq    $dst,$tmp2\t! mul reduction2L" %}
5367   ins_encode %{
5368     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5369     __ vpmullq($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, 0);
5370     __ movdq($tmp2$$XMMRegister, $src1$$Register);
5371     __ vpmullq($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5372     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5373   %}
5374   ins_pipe( pipe_slow );
5375 %}
5376 
5377 instruct rvmul4L_reduction_reg(rRegL dst, rRegL src1, vecY src2, regF tmp, regF tmp2) %{
5378   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
5379   match(Set dst (MulReductionVL src1 src2));
5380   effect(TEMP tmp, TEMP tmp2);
5381   format %{ "vextracti128  $tmp,$src2\n\t"
5382             "vpmullq  $tmp2,$tmp,$src2\n\t"
5383             "pshufd   $tmp,$tmp2,0xE\n\t"
5384             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5385             "movdq    $tmp,$src1\n\t"
5386             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5387             "movdq    $dst,$tmp2\t! mul reduction4L" %}
5388   ins_encode %{
5389     __ vextracti128h($tmp$$XMMRegister, $src2$$XMMRegister);
5390     __ vpmullq($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
5391     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5392     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5393     __ movdq($tmp$$XMMRegister, $src1$$Register);
5394     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5395     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5396   %}
5397   ins_pipe( pipe_slow );
5398 %}
5399 
5400 instruct rvmul8L_reduction_reg(rRegL dst, rRegL src1, vecZ src2, regF tmp, regF tmp2) %{
5401   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
5402   match(Set dst (MulReductionVL src1 src2));
5403   effect(TEMP tmp, TEMP tmp2);
5404   format %{ "vextracti64x4  $tmp2,$src2,0x1\n\t"
5405             "vpmullq  $tmp2,$tmp2,$src2\n\t"
5406             "vextracti128   $tmp,$tmp2\n\t"
5407             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5408             "pshufd   $tmp,$tmp2,0xE\n\t"
5409             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5410             "movdq    $tmp,$src1\n\t"
5411             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5412             "movdq    $dst,$tmp2\t! mul reduction8L" %}
5413   ins_encode %{
5414     __ vextracti64x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 1);
5415     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
5416     __ vextracti128h($tmp$$XMMRegister, $tmp2$$XMMRegister);
5417     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5418     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5419     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5420     __ movdq($tmp$$XMMRegister, $src1$$Register);
5421     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5422     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5423   %}
5424   ins_pipe( pipe_slow );
5425 %}
5426 #endif
5427 
5428 instruct rsmul2F_reduction(regF dst, vecD src2, regF tmp) %{
5429   predicate(UseSSE >= 1 && UseAVX == 0);
5430   match(Set dst (MulReductionVF dst src2));
5431   effect(TEMP dst, TEMP tmp);
5432   format %{ "mulss   $dst,$src2\n\t"
5433             "pshufd  $tmp,$src2,0x01\n\t"
5434             "mulss   $dst,$tmp\t! mul reduction2F" %}
5435   ins_encode %{
5436     __ mulss($dst$$XMMRegister, $src2$$XMMRegister);
5437     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5438     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
5439   %}
5440   ins_pipe( pipe_slow );
5441 %}
5442 
5443 instruct rvmul2F_reduction_reg(regF dst, vecD src2, regF tmp) %{
5444   predicate(UseAVX > 0);
5445   match(Set dst (MulReductionVF dst src2));
5446   effect(TEMP tmp, TEMP dst);
5447   format %{ "vmulss  $dst,$dst,$src2\n\t"
5448             "pshufd  $tmp,$src2,0x01\n\t"
5449             "vmulss  $dst,$dst,$tmp\t! mul reduction2F" %}
5450   ins_encode %{
5451     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5452     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5453     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5454   %}
5455   ins_pipe( pipe_slow );
5456 %}
5457 
5458 instruct rsmul4F_reduction_reg(regF dst, vecX src2, regF tmp) %{
5459   predicate(UseSSE >= 1 && UseAVX == 0);
5460   match(Set dst (MulReductionVF dst src2));
5461   effect(TEMP dst, TEMP tmp);
5462   format %{ "mulss   $dst,$src2\n\t"
5463             "pshufd  $tmp,$src2,0x01\n\t"
5464             "mulss   $dst,$tmp\n\t"
5465             "pshufd  $tmp,$src2,0x02\n\t"
5466             "mulss   $dst,$tmp\n\t"
5467             "pshufd  $tmp,$src2,0x03\n\t"
5468             "mulss   $dst,$tmp\t! mul reduction4F" %}
5469   ins_encode %{
5470     __ mulss($dst$$XMMRegister, $src2$$XMMRegister);
5471     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5472     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
5473     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5474     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
5475     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5476     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
5477   %}
5478   ins_pipe( pipe_slow );
5479 %}
5480 
5481 instruct rvmul4F_reduction_reg(regF dst, vecX src2, regF tmp) %{
5482   predicate(UseAVX > 0);
5483   match(Set dst (MulReductionVF dst src2));
5484   effect(TEMP tmp, TEMP dst);
5485   format %{ "vmulss  $dst,$dst,$src2\n\t"
5486             "pshufd  $tmp,$src2,0x01\n\t"
5487             "vmulss  $dst,$dst,$tmp\n\t"
5488             "pshufd  $tmp,$src2,0x02\n\t"
5489             "vmulss  $dst,$dst,$tmp\n\t"
5490             "pshufd  $tmp,$src2,0x03\n\t"
5491             "vmulss  $dst,$dst,$tmp\t! mul reduction4F" %}
5492   ins_encode %{
5493     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5494     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5495     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5496     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5497     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5498     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5499     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5500   %}
5501   ins_pipe( pipe_slow );
5502 %}
5503 
5504 instruct rvmul8F_reduction_reg(regF dst, vecY src2, regF tmp, regF tmp2) %{
5505   predicate(UseAVX > 0);
5506   match(Set dst (MulReductionVF dst src2));
5507   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5508   format %{ "vmulss  $dst,$dst,$src2\n\t"
5509             "pshufd  $tmp,$src2,0x01\n\t"
5510             "vmulss  $dst,$dst,$tmp\n\t"
5511             "pshufd  $tmp,$src2,0x02\n\t"
5512             "vmulss  $dst,$dst,$tmp\n\t"
5513             "pshufd  $tmp,$src2,0x03\n\t"
5514             "vmulss  $dst,$dst,$tmp\n\t"
5515             "vextractf128  $tmp2,$src2\n\t"
5516             "vmulss  $dst,$dst,$tmp2\n\t"
5517             "pshufd  $tmp,$tmp2,0x01\n\t"
5518             "vmulss  $dst,$dst,$tmp\n\t"
5519             "pshufd  $tmp,$tmp2,0x02\n\t"
5520             "vmulss  $dst,$dst,$tmp\n\t"
5521             "pshufd  $tmp,$tmp2,0x03\n\t"
5522             "vmulss  $dst,$dst,$tmp\t! mul reduction8F" %}
5523   ins_encode %{
5524     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5525     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5526     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5527     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5528     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5529     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5530     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5531     __ vextractf128h($tmp2$$XMMRegister, $src2$$XMMRegister);
5532     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5533     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5534     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5535     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5536     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5537     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5538     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5539   %}
5540   ins_pipe( pipe_slow );
5541 %}
5542 
5543 instruct rvmul16F_reduction_reg(regF dst, vecZ src2, regF tmp, regF tmp2) %{
5544   predicate(UseAVX > 2);
5545   match(Set dst (MulReductionVF dst src2));
5546   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5547   format %{ "vmulss  $dst,$dst,$src2\n\t"
5548             "pshufd  $tmp,$src2,0x01\n\t"
5549             "vmulss  $dst,$dst,$tmp\n\t"
5550             "pshufd  $tmp,$src2,0x02\n\t"
5551             "vmulss  $dst,$dst,$tmp\n\t"
5552             "pshufd  $tmp,$src2,0x03\n\t"
5553             "vmulss  $dst,$dst,$tmp\n\t"
5554             "vextractf32x4  $tmp2,$src2, 0x1\n\t"
5555             "vmulss  $dst,$dst,$tmp2\n\t"
5556             "pshufd  $tmp,$tmp2,0x01\n\t"
5557             "vmulss  $dst,$dst,$tmp\n\t"
5558             "pshufd  $tmp,$tmp2,0x02\n\t"
5559             "vmulss  $dst,$dst,$tmp\n\t"
5560             "pshufd  $tmp,$tmp2,0x03\n\t"
5561             "vmulss  $dst,$dst,$tmp\n\t"
5562             "vextractf32x4  $tmp2,$src2, 0x2\n\t"
5563             "vmulss  $dst,$dst,$tmp2\n\t"
5564             "pshufd  $tmp,$tmp2,0x01\n\t"
5565             "vmulss  $dst,$dst,$tmp\n\t"
5566             "pshufd  $tmp,$tmp2,0x02\n\t"
5567             "vmulss  $dst,$dst,$tmp\n\t"
5568             "pshufd  $tmp,$tmp2,0x03\n\t"
5569             "vmulss  $dst,$dst,$tmp\n\t"
5570             "vextractf32x4  $tmp2,$src2, 0x3\n\t"
5571             "vmulss  $dst,$dst,$tmp2\n\t"
5572             "pshufd  $tmp,$tmp2,0x01\n\t"
5573             "vmulss  $dst,$dst,$tmp\n\t"
5574             "pshufd  $tmp,$tmp2,0x02\n\t"
5575             "vmulss  $dst,$dst,$tmp\n\t"
5576             "pshufd  $tmp,$tmp2,0x03\n\t"
5577             "vmulss  $dst,$dst,$tmp\t! mul reduction16F" %}
5578   ins_encode %{
5579     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5580     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5581     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5582     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5583     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5584     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5585     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5586     __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5587     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5588     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5589     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5590     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5591     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5592     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5593     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5594     __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5595     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5596     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5597     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5598     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5599     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5600     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5601     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5602     __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5603     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5604     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5605     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5606     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5607     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5608     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5609     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5610   %}
5611   ins_pipe( pipe_slow );
5612 %}
5613 
5614 instruct rsmul2D_reduction_reg(regD dst, vecX src2, regD tmp) %{
5615   predicate(UseSSE >= 1 && UseAVX == 0);
5616   match(Set dst (MulReductionVD dst src2));
5617   effect(TEMP dst, TEMP tmp);
5618   format %{ "mulsd   $dst,$src2\n\t"
5619             "pshufd  $tmp,$src2,0xE\n\t"
5620             "mulsd   $dst,$tmp\t! mul reduction2D" %}
5621   ins_encode %{
5622     __ mulsd($dst$$XMMRegister, $src2$$XMMRegister);
5623     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5624     __ mulsd($dst$$XMMRegister, $tmp$$XMMRegister);
5625   %}
5626   ins_pipe( pipe_slow );
5627 %}
5628 
5629 instruct rvmul2D_reduction_reg(regD dst, vecX src2, regD tmp) %{
5630   predicate(UseAVX > 0);
5631   match(Set dst (MulReductionVD dst src2));
5632   effect(TEMP tmp, TEMP dst);
5633   format %{ "vmulsd  $dst,$dst,$src2\n\t"
5634             "pshufd  $tmp,$src2,0xE\n\t"
5635             "vmulsd  $dst,$dst,$tmp\t! mul reduction2D" %}
5636   ins_encode %{
5637     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5638     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5639     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5640   %}
5641   ins_pipe( pipe_slow );
5642 %}
5643 
5644 instruct rvmul4D_reduction_reg(regD dst, vecY src2, regD tmp, regD tmp2) %{
5645   predicate(UseAVX > 0);
5646   match(Set dst (MulReductionVD dst src2));
5647   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5648   format %{ "vmulsd  $dst,$dst,$src2\n\t"
5649             "pshufd  $tmp,$src2,0xE\n\t"
5650             "vmulsd  $dst,$dst,$tmp\n\t"
5651             "vextractf128  $tmp2,$src2\n\t"
5652             "vmulsd  $dst,$dst,$tmp2\n\t"
5653             "pshufd  $tmp,$tmp2,0xE\n\t"
5654             "vmulsd  $dst,$dst,$tmp\t! mul reduction4D" %}
5655   ins_encode %{
5656     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5657     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5658     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5659     __ vextractf128h($tmp2$$XMMRegister, $src2$$XMMRegister);
5660     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5661     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5662     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5663   %}
5664   ins_pipe( pipe_slow );
5665 %}
5666 
5667 instruct rvmul8D_reduction_reg(regD dst, vecZ src2, regD tmp, regD tmp2) %{
5668   predicate(UseAVX > 2);
5669   match(Set dst (MulReductionVD dst src2));
5670   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5671   format %{ "vmulsd  $dst,$dst,$src2\n\t"
5672             "pshufd  $tmp,$src2,0xE\n\t"
5673             "vmulsd  $dst,$dst,$tmp\n\t"
5674             "vextractf32x4  $tmp2,$src2, 0x1\n\t"
5675             "vmulsd  $dst,$dst,$tmp2\n\t"
5676             "pshufd  $tmp,$tmp2,0xE\n\t"
5677             "vmulsd  $dst,$dst,$tmp\n\t"
5678             "vextractf32x4  $tmp2,$src2, 0x2\n\t"
5679             "vmulsd  $dst,$dst,$tmp2\n\t"
5680             "pshufd  $tmp,$tmp2,0xE\n\t"
5681             "vmulsd  $dst,$dst,$tmp\n\t"
5682             "vextractf32x4  $tmp2,$src2, 0x3\n\t"
5683             "vmulsd  $dst,$dst,$tmp2\n\t"
5684             "pshufd  $tmp,$tmp2,0xE\n\t"
5685             "vmulsd  $dst,$dst,$tmp\t! mul reduction8D" %}
5686   ins_encode %{
5687     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5688     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5689     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5690     __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5691     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5692     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5693     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5694     __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5695     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5696     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5697     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5698     __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5699     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5700     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5701     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5702   %}
5703   ins_pipe( pipe_slow );
5704 %}
5705 
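// Note: the rules above implement the MulReductionVI/VL/VF/VD ideal nodes that
// C2's SuperWord pass can emit for product-reduction loops. A minimal Java
// sketch (illustrative only; whether it vectorizes depends on the predicates
// above, the loop shape and the reduction-related flags):
//
//   float prod = 1.0f;
//   for (int i = 0; i < a.length; i++) {
//     prod *= a[i];   // loop body becomes MulVF, the live-out value a MulReductionVF
//   }
//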
5706 // ====================VECTOR ARITHMETIC=======================================
5707 
5708 // --------------------------------- ADD --------------------------------------
5709 
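// Note: the byte and short rules below come in several flavors selected by
// predicate: a plain SSE form (UseAVX == 0, destructive two-operand encoding),
// an "_avx" form (VM_Version::supports_avxonly(), three-operand VEX encoding),
// an "_evex" form (supports_avx512bw()), and an "_evex_special" fallback for
// AVX-512 CPUs without BW (supports_avx512nobw()). A minimal Java sketch
// (illustrative only, array names are hypothetical) of a loop SuperWord can
// turn into the AddVB nodes matched by the byte rules that follow:
//
//   for (int i = 0; i < c.length; i++) {
//     c[i] = (byte)(a[i] + b[i]);
//   }
//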
5710 // Bytes vector add
5711 instruct vadd4B(vecS dst, vecS src) %{
5712   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
5713   match(Set dst (AddVB dst src));
5714   format %{ "paddb   $dst,$src\t! add packed4B" %}
5715   ins_encode %{
5716     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
5717   %}
5718   ins_pipe( pipe_slow );
5719 %}
5720 
5721 instruct vadd4B_reg_avx(vecS dst, vecS src1, vecS src2) %{
5722   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
5723   match(Set dst (AddVB src1 src2));
5724   format %{ "vpaddb  $dst,$src1,$src2\t! add packed4B" %}
5725   ins_encode %{
5726     int vector_len = 0;
5727     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5728   %}
5729   ins_pipe( pipe_slow );
5730 %}
5731 
5732 instruct vadd4B_reg_evex(vecS dst, vecS src1, vecS src2) %{
5733   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
5734   match(Set dst (AddVB src1 src2));
5735   format %{ "vpaddb  $dst,$src1,$src2\t! add packed4B" %}
5736   ins_encode %{
5737     int vector_len = 0;
5738     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5739   %}
5740   ins_pipe( pipe_slow );
5741 %}
5742 
5743 instruct vadd4B_reg_evex_special(vecS dst, vecS src1, vecS src2) %{
5744   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
5745   match(Set dst (AddVB dst src2));
5746   effect(TEMP src1);
5747   format %{ "vpaddb  $dst,$dst,$src2\t! add packed4B" %}
5748   ins_encode %{
5749     int vector_len = 0;
5750     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5751   %}
5752   ins_pipe( pipe_slow );
5753 %}
5754 
5755 instruct vadd4B_mem_avx(vecS dst, vecS src, memory mem) %{
5756   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
5757   match(Set dst (AddVB src (LoadVector mem)));
5758   format %{ "vpaddb  $dst,$src,$mem\t! add packed4B" %}
5759   ins_encode %{
5760     int vector_len = 0;
5761     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5762   %}
5763   ins_pipe( pipe_slow );
5764 %}
5765 
5766 instruct vadd4B_mem_evex(vecS dst, vecS src, memory mem) %{
5767   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
5768   match(Set dst (AddVB src (LoadVector mem)));
5769   format %{ "vpaddb  $dst,$src,$mem\t! add packed4B" %}
5770   ins_encode %{
5771     int vector_len = 0;
5772     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5773   %}
5774   ins_pipe( pipe_slow );
5775 %}
5776 
5777 instruct vadd4B_mem_evex_special(vecS dst, vecS src, memory mem) %{
5778   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
5779   match(Set dst (AddVB dst (LoadVector mem)));
5780   effect(TEMP src);
5781   format %{ "vpaddb  $dst,$src,$mem\t! add packed4B" %}
5782   ins_encode %{
5783     int vector_len = 0;
5784     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5785   %}
5786   ins_pipe( pipe_slow );
5787 %}
5788 
5789 instruct vadd8B(vecD dst, vecD src) %{
5790   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
5791   match(Set dst (AddVB dst src));
5792   format %{ "paddb   $dst,$src\t! add packed8B" %}
5793   ins_encode %{
5794     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
5795   %}
5796   ins_pipe( pipe_slow );
5797 %}
5798 
5799 instruct vadd8B_reg_avx(vecD dst, vecD src1, vecD src2) %{
5800   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
5801   match(Set dst (AddVB src1 src2));
5802   format %{ "vpaddb  $dst,$src1,$src2\t! add packed8B" %}
5803   ins_encode %{
5804     int vector_len = 0;
5805     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5806   %}
5807   ins_pipe( pipe_slow );
5808 %}
5809 
5810 instruct vadd8B_reg_evex(vecD dst, vecD src1, vecD src2) %{
5811   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
5812   match(Set dst (AddVB src1 src2));
5813   format %{ "vpaddb  $dst,$src1,$src2\t! add packed8B" %}
5814   ins_encode %{
5815     int vector_len = 0;
5816     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5817   %}
5818   ins_pipe( pipe_slow );
5819 %}
5820 
5821 instruct vadd8B_reg_evex_special(vecD dst, vecD src1, vecD src2) %{
5822   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
5823   match(Set dst (AddVB dst src2));
5824   effect(TEMP src1);
5825   format %{ "vpaddb  $dst,$dst,$src2\t! add packed8B" %}
5826   ins_encode %{
5827     int vector_len = 0;
5828     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5829   %}
5830   ins_pipe( pipe_slow );
5831 %}
5832 
5833 instruct vadd8B_mem_avx(vecD dst, vecD src, memory mem) %{
5834   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
5835   match(Set dst (AddVB src (LoadVector mem)));
5836   format %{ "vpaddb  $dst,$src,$mem\t! add packed8B" %}
5837   ins_encode %{
5838     int vector_len = 0;
5839     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5840   %}
5841   ins_pipe( pipe_slow );
5842 %}
5843 
5844 instruct vadd8B_mem_evex(vecD dst, vecD src, memory mem) %{
5845   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
5846   match(Set dst (AddVB src (LoadVector mem)));
5847   format %{ "vpaddb  $dst,$src,$mem\t! add packed8B" %}
5848   ins_encode %{
5849     int vector_len = 0;
5850     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5851   %}
5852   ins_pipe( pipe_slow );
5853 %}
5854 
5855 instruct vadd8B_mem_evex_special(vecD dst, vecD src, memory mem) %{
5856   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
5857   match(Set dst (AddVB dst (LoadVector mem)));
5858   effect(TEMP src);
5859   format %{ "vpaddb  $dst,$src,$mem\t! add packed8B" %}
5860   ins_encode %{
5861     int vector_len = 0;
5862     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5863   %}
5864   ins_pipe( pipe_slow );
5865 %}
5866 
5867 instruct vadd16B(vecX dst, vecX src) %{
5868   predicate(UseAVX == 0 && n->as_Vector()->length() == 16);
5869   match(Set dst (AddVB dst src));
5870   format %{ "paddb   $dst,$src\t! add packed16B" %}
5871   ins_encode %{
5872     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
5873   %}
5874   ins_pipe( pipe_slow );
5875 %}
5876 
5877 instruct vadd16B_reg_avx(vecX dst, vecX src1, vecX src2) %{
5878   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 16);
5879   match(Set dst (AddVB src1 src2));
5880   format %{ "vpaddb  $dst,$src1,$src2\t! add packed16B" %}
5881   ins_encode %{
5882     int vector_len = 0;
5883     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5884   %}
5885   ins_pipe( pipe_slow );
5886 %}
5887 
5888 instruct vadd16B_reg_evex(vecX dst, vecX src1, vecX src2) %{
5889   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
5890   match(Set dst (AddVB src1 src2));
5891   format %{ "vpaddb  $dst,$src1,$src2\t! add packed16B" %}
5892   ins_encode %{
5893     int vector_len = 0;
5894     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5895   %}
5896   ins_pipe( pipe_slow );
5897 %}
5898 
5899 instruct vadd16B_reg_evex_special(vecX dst, vecX src1, vecX src2) %{
5900   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
5901   match(Set dst (AddVB dst src2));
5902   effect(TEMP src1);
5903   format %{ "vpaddb  $dst,$dst,$src2\t! add packed16B" %}
5904   ins_encode %{
5905     int vector_len = 0;
5906     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5907   %}
5908   ins_pipe( pipe_slow );
5909 %}
5910 
5911 instruct vadd16B_mem_avx(vecX dst, vecX src, memory mem) %{
5912   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 16);
5913   match(Set dst (AddVB src (LoadVector mem)));
5914   format %{ "vpaddb  $dst,$src,$mem\t! add packed16B" %}
5915   ins_encode %{
5916     int vector_len = 0;
5917     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5918   %}
5919   ins_pipe( pipe_slow );
5920 %}
5921 
5922 instruct vadd16B_mem_evex(vecX dst, vecX src, memory mem) %{
5923   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
5924   match(Set dst (AddVB src (LoadVector mem)));
5925   format %{ "vpaddb  $dst,$src,$mem\t! add packed16B" %}
5926   ins_encode %{
5927     int vector_len = 0;
5928     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5929   %}
5930   ins_pipe( pipe_slow );
5931 %}
5932 
5933 instruct vadd16B_mem_evex_special(vecX dst, vecX src, memory mem) %{
5934   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
5935   match(Set dst (AddVB dst (LoadVector mem)));
5936   effect(TEMP src);
5937   format %{ "vpaddb  $dst,$src,$mem\t! add packed16B" %}
5938   ins_encode %{
5939     int vector_len = 0;
5940     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5941   %}
5942   ins_pipe( pipe_slow );
5943 %}
5944 
5945 instruct vadd32B_reg_avx(vecY dst, vecY src1, vecY src2) %{
5946   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 32);
5947   match(Set dst (AddVB src1 src2));
5948   format %{ "vpaddb  $dst,$src1,$src2\t! add packed32B" %}
5949   ins_encode %{
5950     int vector_len = 1;
5951     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5952   %}
5953   ins_pipe( pipe_slow );
5954 %}
5955 
5956 instruct vadd32B_reg_evex(vecY dst, vecY src1, vecY src2) %{
5957   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
5958   match(Set dst (AddVB src1 src2));
5959   format %{ "vpaddb  $dst,$src1,$src2\t! add packed32B" %}
5960   ins_encode %{
5961     int vector_len = 1;
5962     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5963   %}
5964   ins_pipe( pipe_slow );
5965 %}
5966 
5967 instruct vadd32B_reg_evex_special(vecY dst, vecY src1, vecY src2) %{
5968   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 32);
5969   match(Set dst (AddVB dst src2));
5970   effect(TEMP src1);
5971   format %{ "vpaddb  $dst,$dst,$src2\t! add packed32B" %}
5972   ins_encode %{
5973     int vector_len = 1;
5974     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5975   %}
5976   ins_pipe( pipe_slow );
5977 %}
5978 
5979 instruct vadd32B_mem_avx(vecY dst, vecY src, memory mem) %{
5980   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 32);
5981   match(Set dst (AddVB src (LoadVector mem)));
5982   format %{ "vpaddb  $dst,$src,$mem\t! add packed32B" %}
5983   ins_encode %{
5984     int vector_len = 1;
5985     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5986   %}
5987   ins_pipe( pipe_slow );
5988 %}
5989 
5990 instruct vadd32B_mem_evex(vecY dst, vecY src, memory mem) %{
5991   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
5992   match(Set dst (AddVB src (LoadVector mem)));
5993   format %{ "vpaddb  $dst,$src,$mem\t! add packed32B" %}
5994   ins_encode %{
5995     int vector_len = 1;
5996     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5997   %}
5998   ins_pipe( pipe_slow );
5999 %}
6000 
6001 instruct vadd32B_mem_evex_special(vecY dst, vecY src, memory mem) %{
6002   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
6003   match(Set dst (AddVB dst (LoadVector mem)));
6004   effect(TEMP src);
6005   format %{ "vpaddb  $dst,$src,$mem\t! add packed32B" %}
6006   ins_encode %{
6007     int vector_len = 1;
6008     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6009   %}
6010   ins_pipe( pipe_slow );
6011 %}
6012 
6013 instruct vadd64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
6014   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
6015   match(Set dst (AddVB src1 src2));
6016   format %{ "vpaddb  $dst,$src1,$src2\t! add packed64B" %}
6017   ins_encode %{
6018     int vector_len = 2;
6019     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6020   %}
6021   ins_pipe( pipe_slow );
6022 %}
6023 
6024 instruct vadd64B_mem(vecZ dst, vecZ src, memory mem) %{
6025   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
6026   match(Set dst (AddVB src (LoadVector mem)));
6027   format %{ "vpaddb  $dst,$src,$mem\t! add packed64B" %}
6028   ins_encode %{
6029     int vector_len = 2;
6030     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6031   %}
6032   ins_pipe( pipe_slow );
6033 %}
6034 
6035 // Shorts/Chars vector add
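// Both short[] and char[] element-wise additions reduce to the same AddVS ideal
// node, so one set of rules serves both. Illustrative Java source (hypothetical
// arrays) that can produce AddVS when the loop is auto-vectorized:
//
//   for (int i = 0; i < c.length; i++) {
//     c[i] = (char)(a[i] + b[i]);
//   }
//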
6036 instruct vadd2S(vecS dst, vecS src) %{
6037   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
6038   match(Set dst (AddVS dst src));
6039   format %{ "paddw   $dst,$src\t! add packed2S" %}
6040   ins_encode %{
6041     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
6042   %}
6043   ins_pipe( pipe_slow );
6044 %}
6045 
6046 instruct vadd2S_reg_avx(vecS dst, vecS src1, vecS src2) %{
6047   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
6048   match(Set dst (AddVS src1 src2));
6049   format %{ "vpaddw  $dst,$src1,$src2\t! add packed2S" %}
6050   ins_encode %{
6051     int vector_len = 0;
6052     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6053   %}
6054   ins_pipe( pipe_slow );
6055 %}
6056 
6057 instruct vadd2S_reg_evex(vecS dst, vecS src1, vecS src2) %{
6058   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
6059   match(Set dst (AddVS src1 src2));
6060   format %{ "vpaddw  $dst,$src1,$src2\t! add packed2S" %}
6061   ins_encode %{
6062     int vector_len = 0;
6063     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6064   %}
6065   ins_pipe( pipe_slow );
6066 %}
6067 
6068 instruct vadd2S_reg_evex_special(vecS dst, vecS src1, vecS src2) %{
6069   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
6070   match(Set dst (AddVS dst src2));
6071   effect(TEMP src1);
6072   format %{ "vpaddw  $dst,$dst,$src2\t! add packed2S" %}
6073   ins_encode %{
6074     int vector_len = 0;
6075     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6076   %}
6077   ins_pipe( pipe_slow );
6078 %}
6079 
6080 instruct vadd2S_mem_avx(vecS dst, vecS src, memory mem) %{
6081   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
6082   match(Set dst (AddVS src (LoadVector mem)));
6083   format %{ "vpaddw  $dst,$src,$mem\t! add packed2S" %}
6084   ins_encode %{
6085     int vector_len = 0;
6086     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6087   %}
6088   ins_pipe( pipe_slow );
6089 %}
6090 
6091 instruct vadd2S_mem_evex(vecS dst, vecS src, memory mem) %{
6092   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
6093   match(Set dst (AddVS src (LoadVector mem)));
6094   format %{ "vpaddw  $dst,$src,$mem\t! add packed2S" %}
6095   ins_encode %{
6096     int vector_len = 0;
6097     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6098   %}
6099   ins_pipe( pipe_slow );
6100 %}
6101 
6102 instruct vadd2S_mem_evex_special(vecS dst, vecS src, memory mem) %{
6103   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
6104   match(Set dst (AddVS dst (LoadVector mem)));
6105   effect(TEMP src);
6106   format %{ "vpaddw  $dst,$src,$mem\t! add packed2S" %}
6107   ins_encode %{
6108     int vector_len = 0;
6109     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6110   %}
6111   ins_pipe( pipe_slow );
6112 %}
6113 
6114 instruct vadd4S(vecD dst, vecD src) %{
6115   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
6116   match(Set dst (AddVS dst src));
6117   format %{ "paddw   $dst,$src\t! add packed4S" %}
6118   ins_encode %{
6119     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
6120   %}
6121   ins_pipe( pipe_slow );
6122 %}
6123 
6124 instruct vadd4S_reg_avx(vecD dst, vecD src1, vecD src2) %{
6125   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
6126   match(Set dst (AddVS src1 src2));
6127   format %{ "vpaddw  $dst,$src1,$src2\t! add packed4S" %}
6128   ins_encode %{
6129     int vector_len = 0;
6130     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6131   %}
6132   ins_pipe( pipe_slow );
6133 %}
6134 
6135 instruct vadd4S_reg_evex(vecD dst, vecD src1, vecD src2) %{
6136   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
6137   match(Set dst (AddVS src1 src2));
6138   format %{ "vpaddw  $dst,$src1,$src2\t! add packed4S" %}
6139   ins_encode %{
6140     int vector_len = 0;
6141     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6142   %}
6143   ins_pipe( pipe_slow );
6144 %}
6145 
6146 instruct vadd4S_reg_evex_special(vecD dst, vecD src1, vecD src2) %{
6147   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
6148   match(Set dst (AddVS dst src2));
6149   effect(TEMP src1);
6150   format %{ "vpaddw  $dst,$dst,$src2\t! add packed4S" %}
6151   ins_encode %{
6152     int vector_len = 0;
6153     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6154   %}
6155   ins_pipe( pipe_slow );
6156 %}
6157 
6158 instruct vadd4S_mem_avx(vecD dst, vecD src, memory mem) %{
6159   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
6160   match(Set dst (AddVS src (LoadVector mem)));
6161   format %{ "vpaddw  $dst,$src,$mem\t! add packed4S" %}
6162   ins_encode %{
6163     int vector_len = 0;
6164     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6165   %}
6166   ins_pipe( pipe_slow );
6167 %}
6168 
6169 instruct vadd4S_mem_evex(vecD dst, vecD src, memory mem) %{
6170   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
6171   match(Set dst (AddVS src (LoadVector mem)));
6172   format %{ "vpaddw  $dst,$src,$mem\t! add packed4S" %}
6173   ins_encode %{
6174     int vector_len = 0;
6175     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6176   %}
6177   ins_pipe( pipe_slow );
6178 %}
6179 
6180 instruct vadd4S_mem_evex_special(vecD dst, vecD src, memory mem) %{
6181   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
6182   match(Set dst (AddVS dst (LoadVector mem)));
6183   effect(TEMP src);
6184   format %{ "vpaddw  $dst,$src,$mem\t! add packed4S" %}
6185   ins_encode %{
6186     int vector_len = 0;
6187     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6188   %}
6189   ins_pipe( pipe_slow );
6190 %}
6191 
6192 instruct vadd8S(vecX dst, vecX src) %{
6193   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
6194   match(Set dst (AddVS dst src));
6195   format %{ "paddw   $dst,$src\t! add packed8S" %}
6196   ins_encode %{
6197     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
6198   %}
6199   ins_pipe( pipe_slow );
6200 %}
6201 
6202 instruct vadd8S_reg_avx(vecX dst, vecX src1, vecX src2) %{
6203   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
6204   match(Set dst (AddVS src1 src2));
6205   format %{ "vpaddw  $dst,$src1,$src2\t! add packed8S" %}
6206   ins_encode %{
6207     int vector_len = 0;
6208     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6209   %}
6210   ins_pipe( pipe_slow );
6211 %}
6212 
6213 instruct vadd8S_reg_evex(vecX dst, vecX src1, vecX src2) %{
6214   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
6215   match(Set dst (AddVS src1 src2));
6216   format %{ "vpaddw  $dst,$src1,$src2\t! add packed8S" %}
6217   ins_encode %{
6218     int vector_len = 0;
6219     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6220   %}
6221   ins_pipe( pipe_slow );
6222 %}
6223 
6224 instruct vadd8S_reg_evex_special(vecX dst, vecX src1, vecX src2) %{
6225   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
6226   match(Set dst (AddVS dst src2));
6227   effect(TEMP src1);
6228   format %{ "vpaddw  $dst,$dst,$src2\t! add packed8S" %}
6229   ins_encode %{
6230     int vector_len = 0;
6231     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6232   %}
6233   ins_pipe( pipe_slow );
6234 %}
6235 
6236 instruct vadd8S_mem_avx(vecX dst, vecX src, memory mem) %{
6237   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
6238   match(Set dst (AddVS src (LoadVector mem)));
6239   format %{ "vpaddw  $dst,$src,$mem\t! add packed8S" %}
6240   ins_encode %{
6241     int vector_len = 0;
6242     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6243   %}
6244   ins_pipe( pipe_slow );
6245 %}
6246 
6247 instruct vadd8S_mem_evex(vecX dst, vecX src, memory mem) %{
6248   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
6249   match(Set dst (AddVS src (LoadVector mem)));
6250   format %{ "vpaddw  $dst,$src,$mem\t! add packed8S" %}
6251   ins_encode %{
6252     int vector_len = 0;
6253     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6254   %}
6255   ins_pipe( pipe_slow );
6256 %}
6257 
6258 instruct vadd8S_mem_evex_special(vecX dst, vecX src, memory mem) %{
6259   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
6260   match(Set dst (AddVS dst (LoadVector mem)));
6261   effect(TEMP src);
6262   format %{ "vpaddw  $dst,$src,$mem\t! add packed8S" %}
6263   ins_encode %{
6264     int vector_len = 0;
6265     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6266   %}
6267   ins_pipe( pipe_slow );
6268 %}
6269 
6270 instruct vadd16S_reg_avx(vecY dst, vecY src1, vecY src2) %{
6271   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
6272   match(Set dst (AddVS src1 src2));
6273   format %{ "vpaddw  $dst,$src1,$src2\t! add packed16S" %}
6274   ins_encode %{
6275     int vector_len = 1;
6276     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6277   %}
6278   ins_pipe( pipe_slow );
6279 %}
6280 
6281 instruct vadd16S_reg_evex(vecY dst, vecY src1, vecY src2) %{
6282   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
6283   match(Set dst (AddVS src1 src2));
6284   format %{ "vpaddw  $dst,$src1,$src2\t! add packed16S" %}
6285   ins_encode %{
6286     int vector_len = 1;
6287     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6288   %}
6289   ins_pipe( pipe_slow );
6290 %}
6291 
6292 instruct vadd16S_reg_evex_special(vecY dst, vecY src1, vecY src2) %{
6293   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
6294   match(Set dst (AddVS dst src2));
6295   effect(TEMP src1);
6296   format %{ "vpaddw  $dst,$dst,$src2\t! add packed16S" %}
6297   ins_encode %{
6298     int vector_len = 1;
6299     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6300   %}
6301   ins_pipe( pipe_slow );
6302 %}
6303 
6304 instruct vadd16S_mem_avx(vecY dst, vecY src, memory mem) %{
6305   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
6306   match(Set dst (AddVS src (LoadVector mem)));
6307   format %{ "vpaddw  $dst,$src,$mem\t! add packed16S" %}
6308   ins_encode %{
6309     int vector_len = 1;
6310     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6311   %}
6312   ins_pipe( pipe_slow );
6313 %}
6314 
6315 instruct vadd16S_mem_evex(vecY dst, vecY src, memory mem) %{
6316   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
6317   match(Set dst (AddVS src (LoadVector mem)));
6318   format %{ "vpaddw  $dst,$src,$mem\t! add packed16S" %}
6319   ins_encode %{
6320     int vector_len = 1;
6321     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6322   %}
6323   ins_pipe( pipe_slow );
6324 %}
6325 
6326 instruct vadd16S_mem_evex_special(vecY dst, vecY src, memory mem) %{
6327   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
6328   match(Set dst (AddVS dst (LoadVector mem)));
6329   effect(TEMP src);
6330   format %{ "vpaddw  $dst,$src,$mem\t! add packed16S" %}
6331   ins_encode %{
6332     int vector_len = 1;
6333     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6334   %}
6335   ins_pipe( pipe_slow );
6336 %}
6337 
6338 instruct vadd32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
6339   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
6340   match(Set dst (AddVS src1 src2));
6341   format %{ "vpaddw  $dst,$src1,$src2\t! add packed32S" %}
6342   ins_encode %{
6343     int vector_len = 2;
6344     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6345   %}
6346   ins_pipe( pipe_slow );
6347 %}
6348 
6349 instruct vadd32S_mem(vecZ dst, vecZ src, memory mem) %{
6350   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
6351   match(Set dst (AddVS src (LoadVector mem)));
6352   format %{ "vpaddw  $dst,$src,$mem\t! add packed32S" %}
6353   ins_encode %{
6354     int vector_len = 2;
6355     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6356   %}
6357   ins_pipe( pipe_slow );
6358 %}
6359 
6360 // Integers vector add
6361 instruct vadd2I(vecD dst, vecD src) %{
6362   predicate(n->as_Vector()->length() == 2);
6363   match(Set dst (AddVI dst src));
6364   format %{ "paddd   $dst,$src\t! add packed2I" %}
6365   ins_encode %{
6366     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
6367   %}
6368   ins_pipe( pipe_slow );
6369 %}
6370 
6371 instruct vadd2I_reg(vecD dst, vecD src1, vecD src2) %{
6372   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6373   match(Set dst (AddVI src1 src2));
6374   format %{ "vpaddd  $dst,$src1,$src2\t! add packed2I" %}
6375   ins_encode %{
6376     int vector_len = 0;
6377     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6378   %}
6379   ins_pipe( pipe_slow );
6380 %}
6381 
6382 instruct vadd2I_mem(vecD dst, vecD src, memory mem) %{
6383   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6384   match(Set dst (AddVI src (LoadVector mem)));
6385   format %{ "vpaddd  $dst,$src,$mem\t! add packed2I" %}
6386   ins_encode %{
6387     int vector_len = 0;
6388     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6389   %}
6390   ins_pipe( pipe_slow );
6391 %}
6392 
6393 instruct vadd4I(vecX dst, vecX src) %{
6394   predicate(n->as_Vector()->length() == 4);
6395   match(Set dst (AddVI dst src));
6396   format %{ "paddd   $dst,$src\t! add packed4I" %}
6397   ins_encode %{
6398     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
6399   %}
6400   ins_pipe( pipe_slow );
6401 %}
6402 
6403 instruct vadd4I_reg(vecX dst, vecX src1, vecX src2) %{
6404   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6405   match(Set dst (AddVI src1 src2));
6406   format %{ "vpaddd  $dst,$src1,$src2\t! add packed4I" %}
6407   ins_encode %{
6408     int vector_len = 0;
6409     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6410   %}
6411   ins_pipe( pipe_slow );
6412 %}
6413 
6414 instruct vadd4I_mem(vecX dst, vecX src, memory mem) %{
6415   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6416   match(Set dst (AddVI src (LoadVector mem)));
6417   format %{ "vpaddd  $dst,$src,$mem\t! add packed4I" %}
6418   ins_encode %{
6419     int vector_len = 0;
6420     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6421   %}
6422   ins_pipe( pipe_slow );
6423 %}
6424 
6425 instruct vadd8I_reg(vecY dst, vecY src1, vecY src2) %{
6426   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
6427   match(Set dst (AddVI src1 src2));
6428   format %{ "vpaddd  $dst,$src1,$src2\t! add packed8I" %}
6429   ins_encode %{
6430     int vector_len = 1;
6431     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6432   %}
6433   ins_pipe( pipe_slow );
6434 %}
6435 
6436 instruct vadd8I_mem(vecY dst, vecY src, memory mem) %{
6437   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
6438   match(Set dst (AddVI src (LoadVector mem)));
6439   format %{ "vpaddd  $dst,$src,$mem\t! add packed8I" %}
6440   ins_encode %{
6441     int vector_len = 1;
6442     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6443   %}
6444   ins_pipe( pipe_slow );
6445 %}
6446 
6447 instruct vadd16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
6448   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6449   match(Set dst (AddVI src1 src2));
6450   format %{ "vpaddd  $dst,$src1,$src2\t! add packed16I" %}
6451   ins_encode %{
6452     int vector_len = 2;
6453     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6454   %}
6455   ins_pipe( pipe_slow );
6456 %}
6457 
6458 instruct vadd16I_mem(vecZ dst, vecZ src, memory mem) %{
6459   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6460   match(Set dst (AddVI src (LoadVector mem)));
6461   format %{ "vpaddd  $dst,$src,$mem\t! add packed16I" %}
6462   ins_encode %{
6463     int vector_len = 2;
6464     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6465   %}
6466   ins_pipe( pipe_slow );
6467 %}
6468 
6469 // Longs vector add
6470 instruct vadd2L(vecX dst, vecX src) %{
6471   predicate(n->as_Vector()->length() == 2);
6472   match(Set dst (AddVL dst src));
6473   format %{ "paddq   $dst,$src\t! add packed2L" %}
6474   ins_encode %{
6475     __ paddq($dst$$XMMRegister, $src$$XMMRegister);
6476   %}
6477   ins_pipe( pipe_slow );
6478 %}
6479 
6480 instruct vadd2L_reg(vecX dst, vecX src1, vecX src2) %{
6481   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6482   match(Set dst (AddVL src1 src2));
6483   format %{ "vpaddq  $dst,$src1,$src2\t! add packed2L" %}
6484   ins_encode %{
6485     int vector_len = 0;
6486     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6487   %}
6488   ins_pipe( pipe_slow );
6489 %}
6490 
6491 instruct vadd2L_mem(vecX dst, vecX src, memory mem) %{
6492   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6493   match(Set dst (AddVL src (LoadVector mem)));
6494   format %{ "vpaddq  $dst,$src,$mem\t! add packed2L" %}
6495   ins_encode %{
6496     int vector_len = 0;
6497     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6498   %}
6499   ins_pipe( pipe_slow );
6500 %}
6501 
6502 instruct vadd4L_reg(vecY dst, vecY src1, vecY src2) %{
6503   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
6504   match(Set dst (AddVL src1 src2));
6505   format %{ "vpaddq  $dst,$src1,$src2\t! add packed4L" %}
6506   ins_encode %{
6507     int vector_len = 1;
6508     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6509   %}
6510   ins_pipe( pipe_slow );
6511 %}
6512 
6513 instruct vadd4L_mem(vecY dst, vecY src, memory mem) %{
6514   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
6515   match(Set dst (AddVL src (LoadVector mem)));
6516   format %{ "vpaddq  $dst,$src,$mem\t! add packed4L" %}
6517   ins_encode %{
6518     int vector_len = 1;
6519     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6520   %}
6521   ins_pipe( pipe_slow );
6522 %}
6523 
6524 instruct vadd8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
6525   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6526   match(Set dst (AddVL src1 src2));
6527   format %{ "vpaddq  $dst,$src1,$src2\t! add packed8L" %}
6528   ins_encode %{
6529     int vector_len = 2;
6530     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6531   %}
6532   ins_pipe( pipe_slow );
6533 %}
6534 
6535 instruct vadd8L_mem(vecZ dst, vecZ src, memory mem) %{
6536   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6537   match(Set dst (AddVL src (LoadVector mem)));
6538   format %{ "vpaddq  $dst,$src,$mem\t! add packed8L" %}
6539   ins_encode %{
6540     int vector_len = 2;
6541     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6542   %}
6543   ins_pipe( pipe_slow );
6544 %}
6545 
6546 // Floats vector add
6547 instruct vadd2F(vecD dst, vecD src) %{
6548   predicate(n->as_Vector()->length() == 2);
6549   match(Set dst (AddVF dst src));
6550   format %{ "addps   $dst,$src\t! add packed2F" %}
6551   ins_encode %{
6552     __ addps($dst$$XMMRegister, $src$$XMMRegister);
6553   %}
6554   ins_pipe( pipe_slow );
6555 %}
6556 
6557 instruct vadd2F_reg(vecD dst, vecD src1, vecD src2) %{
6558   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6559   match(Set dst (AddVF src1 src2));
6560   format %{ "vaddps  $dst,$src1,$src2\t! add packed2F" %}
6561   ins_encode %{
6562     int vector_len = 0;
6563     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6564   %}
6565   ins_pipe( pipe_slow );
6566 %}
6567 
6568 instruct vadd2F_mem(vecD dst, vecD src, memory mem) %{
6569   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6570   match(Set dst (AddVF src (LoadVector mem)));
6571   format %{ "vaddps  $dst,$src,$mem\t! add packed2F" %}
6572   ins_encode %{
6573     int vector_len = 0;
6574     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6575   %}
6576   ins_pipe( pipe_slow );
6577 %}
6578 
6579 instruct vadd4F(vecX dst, vecX src) %{
6580   predicate(n->as_Vector()->length() == 4);
6581   match(Set dst (AddVF dst src));
6582   format %{ "addps   $dst,$src\t! add packed4F" %}
6583   ins_encode %{
6584     __ addps($dst$$XMMRegister, $src$$XMMRegister);
6585   %}
6586   ins_pipe( pipe_slow );
6587 %}
6588 
6589 instruct vadd4F_reg(vecX dst, vecX src1, vecX src2) %{
6590   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6591   match(Set dst (AddVF src1 src2));
6592   format %{ "vaddps  $dst,$src1,$src2\t! add packed4F" %}
6593   ins_encode %{
6594     int vector_len = 0;
6595     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6596   %}
6597   ins_pipe( pipe_slow );
6598 %}
6599 
6600 instruct vadd4F_mem(vecX dst, vecX src, memory mem) %{
6601   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6602   match(Set dst (AddVF src (LoadVector mem)));
6603   format %{ "vaddps  $dst,$src,$mem\t! add packed4F" %}
6604   ins_encode %{
6605     int vector_len = 0;
6606     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6607   %}
6608   ins_pipe( pipe_slow );
6609 %}
6610 
6611 instruct vadd8F_reg(vecY dst, vecY src1, vecY src2) %{
6612   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6613   match(Set dst (AddVF src1 src2));
6614   format %{ "vaddps  $dst,$src1,$src2\t! add packed8F" %}
6615   ins_encode %{
6616     int vector_len = 1;
6617     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6618   %}
6619   ins_pipe( pipe_slow );
6620 %}
6621 
6622 instruct vadd8F_mem(vecY dst, vecY src, memory mem) %{
6623   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6624   match(Set dst (AddVF src (LoadVector mem)));
6625   format %{ "vaddps  $dst,$src,$mem\t! add packed8F" %}
6626   ins_encode %{
6627     int vector_len = 1;
6628     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6629   %}
6630   ins_pipe( pipe_slow );
6631 %}
6632 
6633 instruct vadd16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
6634   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6635   match(Set dst (AddVF src1 src2));
6636   format %{ "vaddps  $dst,$src1,$src2\t! add packed16F" %}
6637   ins_encode %{
6638     int vector_len = 2;
6639     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6640   %}
6641   ins_pipe( pipe_slow );
6642 %}
6643 
6644 instruct vadd16F_mem(vecZ dst, vecZ src, memory mem) %{
6645   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6646   match(Set dst (AddVF src (LoadVector mem)));
6647   format %{ "vaddps  $dst,$src,$mem\t! add packed16F" %}
6648   ins_encode %{
6649     int vector_len = 2;
6650     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6651   %}
6652   ins_pipe( pipe_slow );
6653 %}
6654 
6655 // Doubles vector add
6656 instruct vadd2D(vecX dst, vecX src) %{
6657   predicate(n->as_Vector()->length() == 2);
6658   match(Set dst (AddVD dst src));
6659   format %{ "addpd   $dst,$src\t! add packed2D" %}
6660   ins_encode %{
6661     __ addpd($dst$$XMMRegister, $src$$XMMRegister);
6662   %}
6663   ins_pipe( pipe_slow );
6664 %}
6665 
6666 instruct vadd2D_reg(vecX dst, vecX src1, vecX src2) %{
6667   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6668   match(Set dst (AddVD src1 src2));
6669   format %{ "vaddpd  $dst,$src1,$src2\t! add packed2D" %}
6670   ins_encode %{
6671     int vector_len = 0;
6672     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6673   %}
6674   ins_pipe( pipe_slow );
6675 %}
6676 
6677 instruct vadd2D_mem(vecX dst, vecX src, memory mem) %{
6678   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6679   match(Set dst (AddVD src (LoadVector mem)));
6680   format %{ "vaddpd  $dst,$src,$mem\t! add packed2D" %}
6681   ins_encode %{
6682     int vector_len = 0;
6683     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6684   %}
6685   ins_pipe( pipe_slow );
6686 %}
6687 
6688 instruct vadd4D_reg(vecY dst, vecY src1, vecY src2) %{
6689   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6690   match(Set dst (AddVD src1 src2));
6691   format %{ "vaddpd  $dst,$src1,$src2\t! add packed4D" %}
6692   ins_encode %{
6693     int vector_len = 1;
6694     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6695   %}
6696   ins_pipe( pipe_slow );
6697 %}
6698 
6699 instruct vadd4D_mem(vecY dst, vecY src, memory mem) %{
6700   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6701   match(Set dst (AddVD src (LoadVector mem)));
6702   format %{ "vaddpd  $dst,$src,$mem\t! add packed4D" %}
6703   ins_encode %{
6704     int vector_len = 1;
6705     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6706   %}
6707   ins_pipe( pipe_slow );
6708 %}
6709 
6710 instruct vadd8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
6711   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6712   match(Set dst (AddVD src1 src2));
6713   format %{ "vaddpd  $dst,$src1,$src2\t! add packed8D" %}
6714   ins_encode %{
6715     int vector_len = 2;
6716     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6717   %}
6718   ins_pipe( pipe_slow );
6719 %}
6720 
6721 instruct vadd8D_mem(vecZ dst, vecZ src, memory mem) %{
6722   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6723   match(Set dst (AddVD src (LoadVector mem)));
6724   format %{ "vaddpd  $dst,$src,$mem\t! add packed8D" %}
6725   ins_encode %{
6726     int vector_len = 2;
6727     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6728   %}
6729   ins_pipe( pipe_slow );
6730 %}
6731 
6732 // --------------------------------- SUB --------------------------------------
6733 
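// The SUB rules mirror the ADD rules above, with the same SSE/AVX/EVEX flavors
// and predicates. Illustrative Java source (hypothetical arrays) that can
// produce SubVB for the byte rules that follow:
//
//   for (int i = 0; i < c.length; i++) {
//     c[i] = (byte)(a[i] - b[i]);
//   }
//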
6734 // Bytes vector sub
6735 instruct vsub4B(vecS dst, vecS src) %{
6736   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
6737   match(Set dst (SubVB dst src));
6738   format %{ "psubb   $dst,$src\t! sub packed4B" %}
6739   ins_encode %{
6740     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
6741   %}
6742   ins_pipe( pipe_slow );
6743 %}
6744 
6745 instruct vsub4B_reg_avx(vecS dst, vecS src1, vecS src2) %{
6746   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
6747   match(Set dst (SubVB src1 src2));
6748   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed4B" %}
6749   ins_encode %{
6750     int vector_len = 0;
6751     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6752   %}
6753   ins_pipe( pipe_slow );
6754 %}
6755 
6756 instruct vsub4B_reg_evex(vecS dst, vecS src1, vecS src2) %{
6757   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
6758   match(Set dst (SubVB src1 src2));
6759   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed4B" %}
6760   ins_encode %{
6761     int vector_len = 0;
6762     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6763   %}
6764   ins_pipe( pipe_slow );
6765 %}
6766 
6767 instruct vsub4B_reg_evex_special(vecS dst, vecS src1, vecS src2) %{
6768   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
6769   match(Set dst (SubVB dst src2));
6770   effect(TEMP src1);
6771   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed4B" %}
6772   ins_encode %{
6773     int vector_len = 0;
6774     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6775   %}
6776   ins_pipe( pipe_slow );
6777 %}
6778 
6779 instruct vsub4B_mem_avx(vecS dst, vecS src, memory mem) %{
6780   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
6781   match(Set dst (SubVB src (LoadVector mem)));
6782   format %{ "vpsubb  $dst,$src,$mem\t! sub packed4B" %}
6783   ins_encode %{
6784     int vector_len = 0;
6785     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6786   %}
6787   ins_pipe( pipe_slow );
6788 %}
6789 
6790 instruct vsub4B_mem_evex(vecS dst, vecS src, memory mem) %{
6791   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
6792   match(Set dst (SubVB src (LoadVector mem)));
6793   format %{ "vpsubb  $dst,$src,$mem\t! sub packed4B" %}
6794   ins_encode %{
6795     int vector_len = 0;
6796     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6797   %}
6798   ins_pipe( pipe_slow );
6799 %}
6800 
6801 instruct vsub4B_mem_evex_special(vecS dst, vecS src, memory mem) %{
6802   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
6803   match(Set dst (SubVB dst (LoadVector mem)));
6804   effect(TEMP src);
6805   format %{ "vpsubb  $dst,$src,$mem\t! sub packed4B" %}
6806   ins_encode %{
6807     int vector_len = 0;
6808     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6809   %}
6810   ins_pipe( pipe_slow );
6811 %}
6812 
6813 instruct vsub8B(vecD dst, vecD src) %{
6814   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
6815   match(Set dst (SubVB dst src));
6816   format %{ "psubb   $dst,$src\t! sub packed8B" %}
6817   ins_encode %{
6818     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
6819   %}
6820   ins_pipe( pipe_slow );
6821 %}
6822 
6823 instruct vsub8B_reg_avx(vecD dst, vecD src1, vecD src2) %{
6824   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
6825   match(Set dst (SubVB src1 src2));
6826   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed8B" %}
6827   ins_encode %{
6828     int vector_len = 0;
6829     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6830   %}
6831   ins_pipe( pipe_slow );
6832 %}
6833 
6834 instruct vsub8B_reg_evex(vecD dst, vecD src1, vecD src2) %{
6835   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
6836   match(Set dst (SubVB src1 src2));
6837   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed8B" %}
6838   ins_encode %{
6839     int vector_len = 0;
6840     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6841   %}
6842   ins_pipe( pipe_slow );
6843 %}
6844 
6845 instruct vsub8B_reg_evex_special(vecD dst, vecD src1, vecD src2) %{
6846   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
6847   match(Set dst (SubVB dst src2));
6848   effect(TEMP src1);
6849   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed8B" %}
6850   ins_encode %{
6851     int vector_len = 0;
6852     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6853   %}
6854   ins_pipe( pipe_slow );
6855 %}
6856 
6857 instruct vsub8B_mem_avx(vecD dst, vecD src, memory mem) %{
6858   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
6859   match(Set dst (SubVB src (LoadVector mem)));
6860   format %{ "vpsubb  $dst,$src,$mem\t! sub packed8B" %}
6861   ins_encode %{
6862     int vector_len = 0;
6863     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6864   %}
6865   ins_pipe( pipe_slow );
6866 %}
6867 
6868 instruct vsub8B_mem_evex(vecD dst, vecD src, memory mem) %{
6869   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
6870   match(Set dst (SubVB src (LoadVector mem)));
6871   format %{ "vpsubb  $dst,$src,$mem\t! sub packed8B" %}
6872   ins_encode %{
6873     int vector_len = 0;
6874     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6875   %}
6876   ins_pipe( pipe_slow );
6877 %}
6878 
6879 instruct vsub8B_mem_evex_special(vecD dst, vecD src, memory mem) %{
6880   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
6881   match(Set dst (SubVB dst (LoadVector mem)));
6882   effect(TEMP src);
6883   format %{ "vpsubb  $dst,$src,$mem\t! sub packed8B" %}
6884   ins_encode %{
6885     int vector_len = 0;
6886     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6887   %}
6888   ins_pipe( pipe_slow );
6889 %}
6890 
6891 instruct vsub16B(vecX dst, vecX src) %{
6892   predicate(UseAVX == 0 && n->as_Vector()->length() == 16);
6893   match(Set dst (SubVB dst src));
6894   format %{ "psubb   $dst,$src\t! sub packed16B" %}
6895   ins_encode %{
6896     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
6897   %}
6898   ins_pipe( pipe_slow );
6899 %}
6900 
6901 instruct vsub16B_reg_avx(vecX dst, vecX src1, vecX src2) %{
6902   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 16);
6903   match(Set dst (SubVB src1 src2));
6904   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed16B" %}
6905   ins_encode %{
6906     int vector_len = 0;
6907     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6908   %}
6909   ins_pipe( pipe_slow );
6910 %}
6911 
6912 instruct vsub16B_reg_evex(vecX dst, vecX src1, vecX src2) %{
6913   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
6914   match(Set dst (SubVB src1 src2));
6915   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed16B" %}
6916   ins_encode %{
6917     int vector_len = 0;
6918     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6919   %}
6920   ins_pipe( pipe_slow );
6921 %}
6922 
6923 instruct vsub16B_reg_evex_special(vecX dst, vecX src1, vecX src2) %{
6924   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
6925   match(Set dst (SubVB dst src2));
6926   effect(TEMP src1);
6927   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed16B" %}
6928   ins_encode %{
6929     int vector_len = 0;
6930     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6931   %}
6932   ins_pipe( pipe_slow );
6933 %}
6934 
6935 instruct vsub16B_mem_avx(vecX dst, vecX src, memory mem) %{
6936   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 16);
6937   match(Set dst (SubVB src (LoadVector mem)));
6938   format %{ "vpsubb  $dst,$src,$mem\t! sub packed16B" %}
6939   ins_encode %{
6940     int vector_len = 0;
6941     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6942   %}
6943   ins_pipe( pipe_slow );
6944 %}
6945 
6946 instruct vsub16B_mem_evex(vecX dst, vecX src, memory mem) %{
6947   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
6948   match(Set dst (SubVB src (LoadVector mem)));
6949   format %{ "vpsubb  $dst,$src,$mem\t! sub packed16B" %}
6950   ins_encode %{
6951     int vector_len = 0;
6952     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6953   %}
6954   ins_pipe( pipe_slow );
6955 %}
6956 
6957 instruct vsub16B_mem_evex_special(vecX dst, vecX src, memory mem) %{
6958   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
6959   match(Set dst (SubVB dst (LoadVector mem)));
6960   effect(TEMP src);
6961   format %{ "vpsubb  $dst,$src,$mem\t! sub packed16B" %}
6962   ins_encode %{
6963     int vector_len = 0;
6964     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6965   %}
6966   ins_pipe( pipe_slow );
6967 %}
6968 
6969 instruct vsub32B_reg_avx(vecY dst, vecY src1, vecY src2) %{
6970   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 32);
6971   match(Set dst (SubVB src1 src2));
6972   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed32B" %}
6973   ins_encode %{
6974     int vector_len = 1;
6975     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6976   %}
6977   ins_pipe( pipe_slow );
6978 %}
6979 
6980 instruct vsub32B_reg_evex(vecY dst, vecY src1, vecY src2) %{
6981   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
6982   match(Set dst (SubVB src1 src2));
6983   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed32B" %}
6984   ins_encode %{
6985     int vector_len = 1;
6986     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6987   %}
6988   ins_pipe( pipe_slow );
6989 %}
6990 
6991 instruct vsub32B_reg_evex_special(vecY dst, vecY src1, vecY src2) %{
6992   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 32);
6993   match(Set dst (SubVB dst src2));
6994   effect(TEMP src1);
6995   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed32B" %}
6996   ins_encode %{
6997     int vector_len = 1;
6998     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6999   %}
7000   ins_pipe( pipe_slow );
7001 %}
7002 
7003 instruct vsub32B_mem_avx(vecY dst, vecY src, memory mem) %{
7004   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 32);
7005   match(Set dst (SubVB src (LoadVector mem)));
7006   format %{ "vpsubb  $dst,$src,$mem\t! sub packed32B" %}
7007   ins_encode %{
7008     int vector_len = 1;
7009     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7010   %}
7011   ins_pipe( pipe_slow );
7012 %}
7013 
7014 instruct vsub32B_mem_evex(vecY dst, vecY src, memory mem) %{
7015   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
7016   match(Set dst (SubVB src (LoadVector mem)));
7017   format %{ "vpsubb  $dst,$src,$mem\t! sub packed32B" %}
7018   ins_encode %{
7019     int vector_len = 1;
7020     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7021   %}
7022   ins_pipe( pipe_slow );
7023 %}
7024 
7025 instruct vsub32B_mem_evex_special(vecY dst, vecY src, memory mem) %{
7026   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 32);
7027   match(Set dst (SubVB dst (LoadVector mem)));
7028   effect(TEMP src);
7029   format %{ "vpsubb  $dst,$src,$mem\t! sub packed32B" %}
7030   ins_encode %{
7031     int vector_len = 1;
7032     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7033   %}
7034   ins_pipe( pipe_slow );
7035 %}
7036 
7037 instruct vsub64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
7038   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
7039   match(Set dst (SubVB src1 src2));
7040   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed64B" %}
7041   ins_encode %{
7042     int vector_len = 2;
7043     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7044   %}
7045   ins_pipe( pipe_slow );
7046 %}
7047 
7048 instruct vsub64B_mem(vecZ dst, vecZ src, memory mem) %{
7049   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
7050   match(Set dst (SubVB src (LoadVector mem)));
7051   format %{ "vpsubb  $dst,$src,$mem\t! sub packed64B" %}
7052   ins_encode %{
7053     int vector_len = 2;
7054     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7055   %}
7056   ins_pipe( pipe_slow );
7057 %}
7058 
7059 // Shorts/Chars vector sub
7060 instruct vsub2S(vecS dst, vecS src) %{
7061   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
7062   match(Set dst (SubVS dst src));
7063   format %{ "psubw   $dst,$src\t! sub packed2S" %}
7064   ins_encode %{
7065     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
7066   %}
7067   ins_pipe( pipe_slow );
7068 %}
7069 
7070 instruct vsub2S_reg_avx(vecS dst, vecS src1, vecS src2) %{
7071   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
7072   match(Set dst (SubVS src1 src2));
7073   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed2S" %}
7074   ins_encode %{
7075     int vector_len = 0;
7076     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7077   %}
7078   ins_pipe( pipe_slow );
7079 %}
7080 
7081 instruct vsub2S_reg_evex(vecS dst, vecS src1, vecS src2) %{
7082   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
7083   match(Set dst (SubVS src1 src2));
7084   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed2S" %}
7085   ins_encode %{
7086     int vector_len = 0;
7087     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7088   %}
7089   ins_pipe( pipe_slow );
7090 %}
7091 
7092 instruct vsub2S_reg_evex_special(vecS dst, vecS src1, vecS src2) %{
7093   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
7094   match(Set dst (SubVS dst src2));
7095   effect(TEMP src1);
7096   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed2S" %}
7097   ins_encode %{
7098     int vector_len = 0;
7099     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7100   %}
7101   ins_pipe( pipe_slow );
7102 %}
7103 
7104 instruct vsub2S_mem_avx(vecS dst, vecS src, memory mem) %{
7105   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
7106   match(Set dst (SubVS src (LoadVector mem)));
7107   format %{ "vpsubw  $dst,$src,$mem\t! sub packed2S" %}
7108   ins_encode %{
7109     int vector_len = 0;
7110     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7111   %}
7112   ins_pipe( pipe_slow );
7113 %}
7114 
7115 instruct vsub2S_mem_evex(vecS dst, vecS src, memory mem) %{
7116   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
7117   match(Set dst (SubVS src (LoadVector mem)));
7118   format %{ "vpsubw  $dst,$src,$mem\t! sub packed2S" %}
7119   ins_encode %{
7120     int vector_len = 0;
7121     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7122   %}
7123   ins_pipe( pipe_slow );
7124 %}
7125 
7126 instruct vsub2S_mem_evex_special(vecS dst, vecS src, memory mem) %{
7127   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
7128   match(Set dst (SubVS dst (LoadVector mem)));
7129   effect(TEMP src);
7130   format %{ "vpsubw  $dst,$src,$mem\t! sub packed2S" %}
7131   ins_encode %{
7132     int vector_len = 0;
7133     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7134   %}
7135   ins_pipe( pipe_slow );
7136 %}
7137 
7138 instruct vsub4S(vecD dst, vecD src) %{
7139   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
7140   match(Set dst (SubVS dst src));
7141   format %{ "psubw   $dst,$src\t! sub packed4S" %}
7142   ins_encode %{
7143     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
7144   %}
7145   ins_pipe( pipe_slow );
7146 %}
7147 
7148 instruct vsub4S_reg_avx(vecD dst, vecD src1, vecD src2) %{
7149   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
7150   match(Set dst (SubVS src1 src2));
7151   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed4S" %}
7152   ins_encode %{
7153     int vector_len = 0;
7154     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7155   %}
7156   ins_pipe( pipe_slow );
7157 %}
7158 
7159 instruct vsub4S_reg_evex(vecD dst, vecD src1, vecD src2) %{
7160   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
7161   match(Set dst (SubVS src1 src2));
7162   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed4S" %}
7163   ins_encode %{
7164     int vector_len = 0;
7165     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7166   %}
7167   ins_pipe( pipe_slow );
7168 %}
7169 
7170 instruct vsub4S_reg_evex_special(vecD dst, vecD src1, vecD src2) %{
7171   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
7172   match(Set dst (SubVS dst src2));
7173   effect(TEMP src1);
7174   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed4S" %}
7175   ins_encode %{
7176     int vector_len = 0;
7177     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7178   %}
7179   ins_pipe( pipe_slow );
7180 %}
7181 
7182 instruct vsub4S_mem_avx(vecD dst, vecD src, memory mem) %{
7183   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
7184   match(Set dst (SubVS src (LoadVector mem)));
7185   format %{ "vpsubw  $dst,$src,$mem\t! sub packed4S" %}
7186   ins_encode %{
7187     int vector_len = 0;
7188     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7189   %}
7190   ins_pipe( pipe_slow );
7191 %}
7192 
7193 instruct vsub4S_mem_evex(vecD dst, vecD src, memory mem) %{
7194   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
7195   match(Set dst (SubVS src (LoadVector mem)));
7196   format %{ "vpsubw  $dst,$src,$mem\t! sub packed4S" %}
7197   ins_encode %{
7198     int vector_len = 0;
7199     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7200   %}
7201   ins_pipe( pipe_slow );
7202 %}
7203 
7204 instruct vsub4S_mem_evex_special(vecD dst, vecD src, memory mem) %{
7205   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
7206   match(Set dst (SubVS dst (LoadVector mem)));
7207   effect(TEMP src);
7208   format %{ "vpsubw  $dst,$src,$mem\t! sub packed4S" %}
7209   ins_encode %{
7210     int vector_len = 0;
7211     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7212   %}
7213   ins_pipe( pipe_slow );
7214 %}
7215 
7216 instruct vsub8S(vecX dst, vecX src) %{
7217   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
7218   match(Set dst (SubVS dst src));
7219   format %{ "psubw   $dst,$src\t! sub packed8S" %}
7220   ins_encode %{
7221     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
7222   %}
7223   ins_pipe( pipe_slow );
7224 %}
7225 
7226 instruct vsub8S_reg_avx(vecX dst, vecX src1, vecX src2) %{
7227   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
7228   match(Set dst (SubVS src1 src2));
7229   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed8S" %}
7230   ins_encode %{
7231     int vector_len = 0;
7232     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7233   %}
7234   ins_pipe( pipe_slow );
7235 %}
7236 
7237 instruct vsub8S_reg_evex(vecX dst, vecX src1, vecX src2) %{
7238   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
7239   match(Set dst (SubVS src1 src2));
7240   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed8S" %}
7241   ins_encode %{
7242     int vector_len = 0;
7243     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7244   %}
7245   ins_pipe( pipe_slow );
7246 %}
7247 
7248 instruct vsub8S_reg_evex_special(vecX dst, vecX src1, vecX src2) %{
7249   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
7250   match(Set dst (SubVS dst src2));
7251   effect(TEMP src1);
7252   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed8S" %}
7253   ins_encode %{
7254     int vector_len = 0;
7255     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7256   %}
7257   ins_pipe( pipe_slow );
7258 %}
7259 
7260 instruct vsub8S_mem_avx(vecX dst, vecX src, memory mem) %{
7261   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
7262   match(Set dst (SubVS src (LoadVector mem)));
7263   format %{ "vpsubw  $dst,$src,$mem\t! sub packed8S" %}
7264   ins_encode %{
7265     int vector_len = 0;
7266     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7267   %}
7268   ins_pipe( pipe_slow );
7269 %}
7270 
7271 instruct vsub8S_mem_evex(vecX dst, vecX src, memory mem) %{
7272   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
7273   match(Set dst (SubVS src (LoadVector mem)));
7274   format %{ "vpsubw  $dst,$src,$mem\t! sub packed8S" %}
7275   ins_encode %{
7276     int vector_len = 0;
7277     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7278   %}
7279   ins_pipe( pipe_slow );
7280 %}
7281 
7282 instruct vsub8S_mem_evex_special(vecX dst, vecX src, memory mem) %{
7283   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
7284   match(Set dst (SubVS dst (LoadVector mem)));
7285   effect(TEMP src);
7286   format %{ "vpsubw  $dst,$src,$mem\t! sub packed8S" %}
7287   ins_encode %{
7288     int vector_len = 0;
7289     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7290   %}
7291   ins_pipe( pipe_slow );
7292 %}
7293 
7294 instruct vsub16S_reg_avx(vecY dst, vecY src1, vecY src2) %{
7295   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
7296   match(Set dst (SubVS src1 src2));
7297   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed16S" %}
7298   ins_encode %{
7299     int vector_len = 1;
7300     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7301   %}
7302   ins_pipe( pipe_slow );
7303 %}
7304 
7305 instruct vsub16S_reg_evex(vecY dst, vecY src1, vecY src2) %{
7306   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
7307   match(Set dst (SubVS src1 src2));
7308   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed16S" %}
7309   ins_encode %{
7310     int vector_len = 1;
7311     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7312   %}
7313   ins_pipe( pipe_slow );
7314 %}
7315 
7316 instruct vsub16S_reg_evex_special(vecY dst, vecY src1, vecY src2) %{
7317   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
7318   match(Set dst (SubVS dst src2));
7319   effect(TEMP src1);
7320   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed16S" %}
7321   ins_encode %{
7322     int vector_len = 1;
7323     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7324   %}
7325   ins_pipe( pipe_slow );
7326 %}
7327 
7328 instruct vsub16S_mem_avx(vecY dst, vecY src, memory mem) %{
7329   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
7330   match(Set dst (SubVS src (LoadVector mem)));
7331   format %{ "vpsubw  $dst,$src,$mem\t! sub packed16S" %}
7332   ins_encode %{
7333     int vector_len = 1;
7334     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7335   %}
7336   ins_pipe( pipe_slow );
7337 %}
7338 
7339 instruct vsub16S_mem_evex(vecY dst, vecY src, memory mem) %{
7340   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
7341   match(Set dst (SubVS src (LoadVector mem)));
7342   format %{ "vpsubw  $dst,$src,$mem\t! sub packed16S" %}
7343   ins_encode %{
7344     int vector_len = 1;
7345     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7346   %}
7347   ins_pipe( pipe_slow );
7348 %}
7349 
7350 instruct vsub16S_mem_evex_special(vecY dst, vecY src, memory mem) %{
7351   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
7352   match(Set dst (SubVS dst (LoadVector mem)));
7353   effect(TEMP src);
7354   format %{ "vpsubw  $dst,$src,$mem\t! sub packed16S" %}
7355   ins_encode %{
7356     int vector_len = 1;
7357     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7358   %}
7359   ins_pipe( pipe_slow );
7360 %}
7361 
7362 instruct vsub32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
7363   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
7364   match(Set dst (SubVS src1 src2));
7365   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed32S" %}
7366   ins_encode %{
7367     int vector_len = 2;
7368     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7369   %}
7370   ins_pipe( pipe_slow );
7371 %}
7372 
7373 instruct vsub32S_mem(vecZ dst, vecZ src, memory mem) %{
7374   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
7375   match(Set dst (SubVS src (LoadVector mem)));
7376   format %{ "vpsubw  $dst,$src,$mem\t! sub packed32S" %}
7377   ins_encode %{
7378     int vector_len = 2;
7379     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7380   %}
7381   ins_pipe( pipe_slow );
7382 %}
7383 
7384 // Integers vector sub
7385 instruct vsub2I(vecD dst, vecD src) %{
7386   predicate(n->as_Vector()->length() == 2);
7387   match(Set dst (SubVI dst src));
7388   format %{ "psubd   $dst,$src\t! sub packed2I" %}
7389   ins_encode %{
7390     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
7391   %}
7392   ins_pipe( pipe_slow );
7393 %}
7394 


7703   match(Set dst (SubVD src (LoadVector mem)));
7704   format %{ "vsubpd  $dst,$src,$mem\t! sub packed2D" %}
7705   ins_encode %{
7706     int vector_len = 0;
7707     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7708   %}
7709   ins_pipe( pipe_slow );
7710 %}
7711 
7712 instruct vsub4D_reg(vecY dst, vecY src1, vecY src2) %{
7713   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7714   match(Set dst (SubVD src1 src2));
7715   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed4D" %}
7716   ins_encode %{
7717     int vector_len = 1;
7718     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7719   %}
7720   ins_pipe( pipe_slow );
7721 %}
7722 
7723 instruct vsub4D_mem(vecY dst, vecY src, memory mem) %{
7724   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7725   match(Set dst (SubVD src (LoadVector mem)));
7726   format %{ "vsubpd  $dst,$src,$mem\t! sub packed4D" %}
7727   ins_encode %{
7728     int vector_len = 1;
7729     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7730   %}
7731   ins_pipe( pipe_slow );
7732 %}
7733 
7734 instruct vsub8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
7735   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7736   match(Set dst (SubVD src1 src2));
7737   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed8D" %}
7738   ins_encode %{
7739     int vector_len = 2;
7740     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7741   %}
7742   ins_pipe( pipe_slow );
7743 %}
7744 
7745 instruct vsub8D_mem(vecZ dst, vecZ src, memory mem) %{
7746   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7747   match(Set dst (SubVD src (LoadVector mem)));
7748   format %{ "vsubpd  $dst,$src,$mem\t! sub packed8D" %}
7749   ins_encode %{
7750     int vector_len = 2;
7751     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7752   %}
7753   ins_pipe( pipe_slow );
7754 %}
7755 
7756 // --------------------------------- MUL --------------------------------------
7757 
7758 // Shorts/Chars vector mul
7759 instruct vmul2S(vecS dst, vecS src) %{
7760   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
7761   match(Set dst (MulVS dst src));
7762   format %{ "pmullw $dst,$src\t! mul packed2S" %}
7763   ins_encode %{
7764     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
7765   %}
7766   ins_pipe( pipe_slow );
7767 %}
7768 
7769 instruct vmul2S_reg_avx(vecS dst, vecS src1, vecS src2) %{
7770   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
7771   match(Set dst (MulVS src1 src2));
7772   format %{ "vpmullw $dst,$src1,$src2\t! mul packed2S" %}
7773   ins_encode %{
7774     int vector_len = 0;
7775     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7776   %}
7777   ins_pipe( pipe_slow );
7778 %}
7779 
7780 instruct vmul2S_reg_evex(vecS dst, vecS src1, vecS src2) %{
7781   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
7782   match(Set dst (MulVS src1 src2));
7783   format %{ "vpmullw $dst,$src1,$src2\t! mul packed2S" %}
7784   ins_encode %{
7785     int vector_len = 0;
7786     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7787   %}
7788   ins_pipe( pipe_slow );
7789 %}
7790 
7791 instruct vmul2S_evex_special(vecS dst, vecS src1, vecS src2) %{
7792   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
7793   match(Set dst (MulVS dst src2));
7794   effect(TEMP src1);
7795   format %{ "vpmullw $dst,$src1,$src2\t! mul packed2S" %}
7796   ins_encode %{
7797     int vector_len = 0;
7798     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7799   %}
7800   ins_pipe( pipe_slow );
7801 %}
7802 
7803 instruct vmul2S_mem_avx(vecS dst, vecS src, memory mem) %{
7804   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
7805   match(Set dst (MulVS src (LoadVector mem)));
7806   format %{ "vpmullw $dst,$src,$mem\t! mul packed2S" %}
7807   ins_encode %{
7808     int vector_len = 0;
7809     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7810   %}
7811   ins_pipe( pipe_slow );
7812 %}
7813 
7814 instruct vmul2S_mem_evex(vecS dst, vecS src, memory mem) %{
7815   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
7816   match(Set dst (MulVS src (LoadVector mem)));
7817   format %{ "vpmullw $dst,$src,$mem\t! mul packed2S" %}
7818   ins_encode %{
7819     int vector_len = 0;
7820     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7821   %}
7822   ins_pipe( pipe_slow );
7823 %}
7824 
7825 instruct vmul2S_mem_evex_special(vecS dst, vecS src, memory mem) %{
7826   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
7827   match(Set dst (MulVS dst (LoadVector mem)));
7828   effect(TEMP src);
7829   format %{ "vpmullw $dst,$src,$mem\t! mul packed2S" %}
7830   ins_encode %{
7831     int vector_len = 0;
7832     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7833   %}
7834   ins_pipe( pipe_slow );
7835 %}
7836 
7837 instruct vmul4S(vecD dst, vecD src) %{
7838   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
7839   match(Set dst (MulVS dst src));
7840   format %{ "pmullw  $dst,$src\t! mul packed4S" %}
7841   ins_encode %{
7842     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
7843   %}
7844   ins_pipe( pipe_slow );
7845 %}
7846 
7847 instruct vmul4S_reg_avx(vecD dst, vecD src1, vecD src2) %{
7848   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
7849   match(Set dst (MulVS src1 src2));
7850   format %{ "vpmullw $dst,$src1,$src2\t! mul packed4S" %}
7851   ins_encode %{
7852     int vector_len = 0;
7853     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7854   %}
7855   ins_pipe( pipe_slow );
7856 %}
7857 
7858 instruct vmul4S_reg_evex(vecD dst, vecD src1, vecD src2) %{
7859   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
7860   match(Set dst (MulVS src1 src2));
7861   format %{ "vpmullw $dst,$src1,$src2\t! mul packed4S" %}
7862   ins_encode %{
7863     int vector_len = 0;
7864     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7865   %}
7866   ins_pipe( pipe_slow );
7867 %}
7868 
7869 instruct vmul4S_reg_evex_special(vecD dst, vecD src1, vecD src2) %{
7870   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
7871   match(Set dst (MulVS dst src2));
7872   effect(TEMP src1);
7873   format %{ "vpmullw $dst,$src1,$src2\t! mul packed4S" %}
7874   ins_encode %{
7875     int vector_len = 0;
7876     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7877   %}
7878   ins_pipe( pipe_slow );
7879 %}
7880 
7881 instruct vmul4S_mem_avx(vecD dst, vecD src, memory mem) %{
7882   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
7883   match(Set dst (MulVS src (LoadVector mem)));
7884   format %{ "vpmullw $dst,$src,$mem\t! mul packed4S" %}
7885   ins_encode %{
7886     int vector_len = 0;
7887     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7888   %}
7889   ins_pipe( pipe_slow );
7890 %}
7891 
7892 instruct vmul4S_mem_evex(vecD dst, vecD src, memory mem) %{
7893   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
7894   match(Set dst (MulVS src (LoadVector mem)));
7895   format %{ "vpmullw $dst,$src,$mem\t! mul packed4S" %}
7896   ins_encode %{
7897     int vector_len = 0;
7898     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7899   %}
7900   ins_pipe( pipe_slow );
7901 %}
7902 
7903 instruct vmul4S_mem_evex_special(vecD dst, vecD src, memory mem) %{
7904   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
7905   match(Set dst (MulVS dst (LoadVector mem)));
7906   effect(TEMP src);
7907   format %{ "vpmullw $dst,$src,$mem\t! mul packed4S" %}
7908   ins_encode %{
7909     int vector_len = 0;
7910     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7911   %}
7912   ins_pipe( pipe_slow );
7913 %}
7914 
7915 instruct vmul8S(vecX dst, vecX src) %{
7916   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
7917   match(Set dst (MulVS dst src));
7918   format %{ "pmullw  $dst,$src\t! mul packed8S" %}
7919   ins_encode %{
7920     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
7921   %}
7922   ins_pipe( pipe_slow );
7923 %}
7924 
7925 instruct vmul8S_reg_avx(vecX dst, vecX src1, vecX src2) %{
7926   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
7927   match(Set dst (MulVS src1 src2));
7928   format %{ "vpmullw $dst,$src1,$src2\t! mul packed8S" %}
7929   ins_encode %{
7930     int vector_len = 0;
7931     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7932   %}
7933   ins_pipe( pipe_slow );
7934 %}
7935 
7936 instruct vmul8S_reg_evex(vecX dst, vecX src1, vecX src2) %{
7937   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
7938   match(Set dst (MulVS src1 src2));
7939   format %{ "vpmullw $dst,$src1,$src2\t! mul packed8S" %}
7940   ins_encode %{
7941     int vector_len = 0;
7942     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7943   %}
7944   ins_pipe( pipe_slow );
7945 %}
7946 
7947 instruct vmul8S_reg_evex_special(vecX dst, vecX src1, vecX src2) %{
7948   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
7949   match(Set dst (MulVS dst src2));
7950   effect(TEMP src1);
7951   format %{ "vpmullw $dst,$src1,$src2\t! mul packed8S" %}
7952   ins_encode %{
7953     int vector_len = 0;
7954     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7955   %}
7956   ins_pipe( pipe_slow );
7957 %}
7958 
7959 instruct vmul8S_mem_avx(vecX dst, vecX src, memory mem) %{
7960   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
7961   match(Set dst (MulVS src (LoadVector mem)));
7962   format %{ "vpmullw $dst,$src,$mem\t! mul packed8S" %}
7963   ins_encode %{
7964     int vector_len = 0;
7965     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7966   %}
7967   ins_pipe( pipe_slow );
7968 %}
7969 
7970 instruct vmul8S_mem_evex(vecX dst, vecX src, memory mem) %{
7971   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
7972   match(Set dst (MulVS src (LoadVector mem)));
7973   format %{ "vpmullw $dst,$src,$mem\t! mul packed8S" %}
7974   ins_encode %{
7975     int vector_len = 0;
7976     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7977   %}
7978   ins_pipe( pipe_slow );
7979 %}
7980 
7981 instruct vmul8S_mem_evex_special(vecX dst, vecX src, memory mem) %{
7982   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
7983   match(Set dst (MulVS dst (LoadVector mem)));
7984   effect(TEMP src);
7985   format %{ "vpmullw $dst,$src,$mem\t! mul packed8S" %}
7986   ins_encode %{
7987     int vector_len = 0;
7988     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7989   %}
7990   ins_pipe( pipe_slow );
7991 %}
7992 
7993 instruct vmul16S_reg_avx(vecY dst, vecY src1, vecY src2) %{
7994   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
7995   match(Set dst (MulVS src1 src2));
7996   format %{ "vpmullw $dst,$src1,$src2\t! mul packed16S" %}
7997   ins_encode %{
7998     int vector_len = 1;
7999     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8000   %}
8001   ins_pipe( pipe_slow );
8002 %}
8003 
8004 instruct vmul16S_reg_evex(vecY dst, vecY src1, vecY src2) %{
8005   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
8006   match(Set dst (MulVS src1 src2));
8007   format %{ "vpmullw $dst,$src1,$src2\t! mul packed16S" %}
8008   ins_encode %{
8009     int vector_len = 1;
8010     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8011   %}
8012   ins_pipe( pipe_slow );
8013 %}
8014 
8015 instruct vmul16S_reg_evex_special(vecY dst, vecY src1, vecY src2) %{
8016   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
8017   match(Set dst (MulVS dst src2));
8018   effect(TEMP src1);
8019   format %{ "vpmullw $dst,$src1,$src2\t! mul packed16S" %}
8020   ins_encode %{
8021     int vector_len = 1;
8022     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8023   %}
8024   ins_pipe( pipe_slow );
8025 %}
8026 
8027 instruct vmul16S_mem_avx(vecY dst, vecY src, memory mem) %{
8028   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
8029   match(Set dst (MulVS src (LoadVector mem)));
8030   format %{ "vpmullw $dst,$src,$mem\t! mul packed16S" %}
8031   ins_encode %{
8032     int vector_len = 1;
8033     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8034   %}
8035   ins_pipe( pipe_slow );
8036 %}
8037 
8038 instruct vmul16S_mem_evex(vecY dst, vecY src, memory mem) %{
8039   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
8040   match(Set dst (MulVS src (LoadVector mem)));
8041   format %{ "vpmullw $dst,$src,$mem\t! mul packed16S" %}
8042   ins_encode %{
8043     int vector_len = 1;
8044     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8045   %}
8046   ins_pipe( pipe_slow );
8047 %}
8048 
8049 instruct vmul16S_mem_evex_special(vecY dst, vecY src, memory mem) %{
8050   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
8051   match(Set dst (MulVS dst (LoadVector mem)));
8052   effect(TEMP src);
8053   format %{ "vpmullw $dst,$src,$mem\t! mul packed16S" %}
8054   ins_encode %{
8055     int vector_len = 1;
8056     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8057   %}
8058   ins_pipe( pipe_slow );
8059 %}
8060 
8061 instruct vmul32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
8062   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
8063   match(Set dst (MulVS src1 src2));
8064   format %{ "vpmullw $dst,$src1,$src2\t! mul packed32S" %}
8065   ins_encode %{
8066     int vector_len = 2;
8067     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8068   %}
8069   ins_pipe( pipe_slow );
8070 %}
8071 
8072 instruct vmul32S_mem(vecZ dst, vecZ src, memory mem) %{
8073   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
8074   match(Set dst (MulVS src (LoadVector mem)));
8075   format %{ "vpmullw $dst,$src,$mem\t! mul packed32S" %}
8076   ins_encode %{
8077     int vector_len = 2;
8078     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8079   %}
8080   ins_pipe( pipe_slow );
8081 %}
8082 
8083 // Integers vector mul (sse4_1)
8084 instruct vmul2I(vecD dst, vecD src) %{
8085   predicate(UseSSE > 3 && n->as_Vector()->length() == 2);
8086   match(Set dst (MulVI dst src));
8087   format %{ "pmulld  $dst,$src\t! mul packed2I" %}
8088   ins_encode %{
8089     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
8090   %}
8091   ins_pipe( pipe_slow );
8092 %}
8093 


8683   %}
8684   ins_pipe( pipe_slow );
8685 %}
8686 
8687 instruct vsqrt4D_reg(vecY dst, vecY src) %{
8688   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8689   match(Set dst (SqrtVD src));
8690   format %{ "vsqrtpd  $dst,$src\t! sqrt packed4D" %}
8691   ins_encode %{
8692     int vector_len = 1;
8693     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8694   %}
8695   ins_pipe( pipe_slow );
8696 %}
8697 
8698 instruct vsqrt4D_mem(vecY dst, memory mem) %{
8699   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8700   match(Set dst (SqrtVD (LoadVector mem)));
8701   format %{ "vsqrtpd  $dst,$mem\t! sqrt packed4D" %}
8702   ins_encode %{
8703     int vector_len = 1;
8704     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
8705   %}
8706   ins_pipe( pipe_slow );
8707 %}
8708 
8709 instruct vsqrt8D_reg(vecZ dst, vecZ src) %{
8710   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8711   match(Set dst (SqrtVD src));
8712   format %{ "vsqrtpd  $dst,$src\t! sqrt packed8D" %}
8713   ins_encode %{
8714     int vector_len = 2;
8715     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8716   %}
8717   ins_pipe( pipe_slow );
8718 %}
8719 
8720 instruct vsqrt8D_mem(vecZ dst, memory mem) %{
8721   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8722   match(Set dst (SqrtVD (LoadVector mem)));
8723   format %{ "vsqrtpd  $dst,$mem\t! sqrt packed8D" %}
8724   ins_encode %{
8725     int vector_len = 2;
8726     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
8727   %}
8728   ins_pipe( pipe_slow );
8729 %}
8730 
8731 // ------------------------------ LeftShift -----------------------------------
8732 
8733 // Shorts/Chars vector left shift
8734 instruct vsll2S(vecS dst, vecS shift) %{
8735   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8736   match(Set dst (LShiftVS dst shift));
8737   format %{ "psllw   $dst,$shift\t! left shift packed2S" %}
8738   ins_encode %{
8739     __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
8740   %}
8741   ins_pipe( pipe_slow );
8742 %}
8743 
8744 instruct vsll2S_imm(vecS dst, immI8 shift) %{
8745   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8746   match(Set dst (LShiftVS dst shift));
8747   format %{ "psllw   $dst,$shift\t! left shift packed2S" %}
8748   ins_encode %{
8749     __ psllw($dst$$XMMRegister, (int)$shift$$constant);
8750   %}
8751   ins_pipe( pipe_slow );
8752 %}
8753 
8754 instruct vsll2S_reg_avx(vecS dst, vecS src, vecS shift) %{
8755   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
8756   match(Set dst (LShiftVS src shift));
8757   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
8758   ins_encode %{
8759     int vector_len = 0;
8760     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8761   %}
8762   ins_pipe( pipe_slow );
8763 %}
8764 
8765 instruct vsll2S_reg_evex(vecS dst, vecS src, vecS shift) %{
8766   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
8767   match(Set dst (LShiftVS src shift));
8768   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
8769   ins_encode %{
8770     int vector_len = 0;
8771     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8772   %}
8773   ins_pipe( pipe_slow );
8774 %}
8775 
8776 instruct vsll2S_reg_evex_special(vecS dst, vecS src, vecS shift) %{
8777   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
8778   match(Set dst (LShiftVS dst shift));
8779   effect(TEMP src);
8780   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
8781   ins_encode %{
8782     int vector_len = 0;
8783     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8784   %}
8785   ins_pipe( pipe_slow );
8786 %}
8787 
8788 instruct vsll2S_reg_imm_avx(vecS dst, vecS src, immI8 shift) %{
8789   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
8790   match(Set dst (LShiftVS src shift));
8791   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
8792   ins_encode %{
8793     int vector_len = 0;
8794     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8795   %}
8796   ins_pipe( pipe_slow );
8797 %}
8798 
8799 instruct vsll2S_reg_imm_evex(vecS dst, vecS src, immI8 shift) %{
8800   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
8801   match(Set dst (LShiftVS src shift));
8802   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
8803   ins_encode %{
8804     int vector_len = 0;
8805     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8806   %}
8807   ins_pipe( pipe_slow );
8808 %}
8809 
8810 instruct vsll2S_reg_imm_evex_special(vecS dst, vecS src, immI8 shift) %{
8811   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
8812   match(Set dst (LShiftVS dst shift));
8813   effect(TEMP src);
8814   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
8815   ins_encode %{
8816     int vector_len = 0;
8817     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8818   %}
8819   ins_pipe( pipe_slow );
8820 %}
8821 
8822 instruct vsll4S(vecD dst, vecS shift) %{
8823   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
8824   match(Set dst (LShiftVS dst shift));
8825   format %{ "psllw   $dst,$shift\t! left shift packed4S" %}
8826   ins_encode %{
8827     __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
8828   %}
8829   ins_pipe( pipe_slow );
8830 %}
8831 
8832 instruct vsll4S_imm(vecD dst, immI8 shift) %{
8833   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
8834   match(Set dst (LShiftVS dst shift));
8835   format %{ "psllw   $dst,$shift\t! left shift packed4S" %}
8836   ins_encode %{
8837     __ psllw($dst$$XMMRegister, (int)$shift$$constant);
8838   %}
8839   ins_pipe( pipe_slow );
8840 %}
8841 
8842 instruct vsll4S_reg_avx(vecD dst, vecD src, vecS shift) %{
8843   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
8844   match(Set dst (LShiftVS src shift));
8845   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
8846   ins_encode %{
8847     int vector_len = 0;
8848     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8849   %}
8850   ins_pipe( pipe_slow );
8851 %}
8852 
8853 instruct vsll4S_reg_evex(vecD dst, vecD src, vecS shift) %{
8854   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
8855   match(Set dst (LShiftVS src shift));
8856   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
8857   ins_encode %{
8858     int vector_len = 0;
8859     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8860   %}
8861   ins_pipe( pipe_slow );
8862 %}
8863 
8864 instruct vsll4S_reg_evex_special(vecD dst, vecD src, vecS shift) %{
8865   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
8866   match(Set dst (LShiftVS dst shift));
8867   effect(TEMP src);
8868   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
8869   ins_encode %{
8870     int vector_len = 0;
8871     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8872   %}
8873   ins_pipe( pipe_slow );
8874 %}
8875 
8876 instruct vsll4S_reg_imm_avx(vecD dst, vecD src, immI8 shift) %{
8877   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
8878   match(Set dst (LShiftVS src shift));
8879   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
8880   ins_encode %{
8881     int vector_len = 0;
8882     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8883   %}
8884   ins_pipe( pipe_slow );
8885 %}
8886 
8887 instruct vsll4S_reg_imm_evex(vecD dst, vecD src, immI8 shift) %{
8888   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
8889   match(Set dst (LShiftVS src shift));
8890   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
8891   ins_encode %{
8892     int vector_len = 0;
8893     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8894   %}
8895   ins_pipe( pipe_slow );
8896 %}
8897 
8898 instruct vsll4S_reg_imm_evex_special(vecD dst, vecD src, immI8 shift) %{
8899   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
8900   match(Set dst (LShiftVS dst shift));
8901   effect(TEMP src);
8902   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
8903   ins_encode %{
8904     int vector_len = 0;
8905     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8906   %}
8907   ins_pipe( pipe_slow );
8908 %}
8909 
8910 instruct vsll8S(vecX dst, vecS shift) %{
8911   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
8912   match(Set dst (LShiftVS dst shift));
8913   format %{ "psllw   $dst,$shift\t! left shift packed8S" %}
8914   ins_encode %{
8915     __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
8916   %}
8917   ins_pipe( pipe_slow );
8918 %}
8919 
8920 instruct vsll8S_imm(vecX dst, immI8 shift) %{
8921   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
8922   match(Set dst (LShiftVS dst shift));
8923   format %{ "psllw   $dst,$shift\t! left shift packed8S" %}
8924   ins_encode %{
8925     __ psllw($dst$$XMMRegister, (int)$shift$$constant);
8926   %}
8927   ins_pipe( pipe_slow );
8928 %}
8929 
8930 instruct vsll8S_reg_avx(vecX dst, vecX src, vecS shift) %{
8931   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
8932   match(Set dst (LShiftVS src shift));
8933   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
8934   ins_encode %{
8935     int vector_len = 0;
8936     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8937   %}
8938   ins_pipe( pipe_slow );
8939 %}
8940 
8941 instruct vsll8S_reg_evex(vecX dst, vecX src, vecS shift) %{
8942   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
8943   match(Set dst (LShiftVS src shift));
8944   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
8945   ins_encode %{
8946     int vector_len = 0;
8947     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8948   %}
8949   ins_pipe( pipe_slow );
8950 %}
8951 
8952 instruct vsll8S_reg_evex_special(vecX dst, vecX src, vecS shift) %{
8953   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
8954   match(Set dst (LShiftVS dst shift));
8955   effect(TEMP src);
8956   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
8957   ins_encode %{
8958     int vector_len = 0;
8959     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8960   %}
8961   ins_pipe( pipe_slow );
8962 %}
8963 
8964 instruct vsll8S_reg_imm_avx(vecX dst, vecX src, immI8 shift) %{
8965   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
8966   match(Set dst (LShiftVS src shift));
8967   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
8968   ins_encode %{
8969     int vector_len = 0;
8970     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8971   %}
8972   ins_pipe( pipe_slow );
8973 %}
8974 
8975 instruct vsll8S_reg_imm_evex(vecX dst, vecX src, immI8 shift) %{
8976   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
8977   match(Set dst (LShiftVS src shift));
8978   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
8979   ins_encode %{
8980     int vector_len = 0;
8981     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8982   %}
8983   ins_pipe( pipe_slow );
8984 %}
8985 
8986 instruct vsll8S_reg_imm_evex_special(vecX dst, vecX src, immI8 shift) %{
8987   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
8988   match(Set dst (LShiftVS dst shift));
8989   effect(TEMP src);
8990   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
8991   ins_encode %{
8992     int vector_len = 0;
8993     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8994   %}
8995   ins_pipe( pipe_slow );
8996 %}
8997 
8998 instruct vsll16S_reg_avx(vecY dst, vecY src, vecS shift) %{
8999   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
9000   match(Set dst (LShiftVS src shift));
9001   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
9002   ins_encode %{
9003     int vector_len = 1;
9004     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9005   %}
9006   ins_pipe( pipe_slow );
9007 %}
9008 
9009 instruct vsll16S_reg_evex(vecY dst, vecY src, vecS shift) %{
9010   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
9011   match(Set dst (LShiftVS src shift));
9012   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
9013   ins_encode %{
9014     int vector_len = 1;
9015     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9016   %}
9017   ins_pipe( pipe_slow );
9018 %}
9019 
9020 instruct vsll16S_reg_evex_special(vecY dst, vecY src, vecS shift) %{
9021   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
9022   match(Set dst (LShiftVS dst shift));
9023   effect(TEMP src);
9024   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
9025   ins_encode %{
9026     int vector_len = 1;
9027     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9028   %}
9029   ins_pipe( pipe_slow );
9030 %}
9031 
9032 instruct vsll16S_reg_imm_avx(vecY dst, vecY src, immI8 shift) %{
9033   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
9034   match(Set dst (LShiftVS src shift));
9035   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
9036   ins_encode %{
9037     int vector_len = 1;
9038     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9039   %}
9040   ins_pipe( pipe_slow );
9041 %}
9042 
9043 instruct vsll16S_reg_imm_evex(vecY dst, vecY src, immI8 shift) %{
9044   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
9045   match(Set dst (LShiftVS src shift));
9046   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
9047   ins_encode %{
9048     int vector_len = 1;
9049     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9050   %}
9051   ins_pipe( pipe_slow );
9052 %}
9053 
9054 instruct vsll16S_reg_imm_evex_special(vecY dst, vecY src, immI8 shift) %{
9055   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
9056   match(Set dst (LShiftVS dst shift));
9057   effect(TEMP src);
9058   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
9059   ins_encode %{
9060     int vector_len = 1;
9061     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9062   %}
9063   ins_pipe( pipe_slow );
9064 %}
9065 
9066 instruct vsll32S_reg(vecZ dst, vecZ src, vecS shift) %{
9067   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
9068   match(Set dst (LShiftVS src shift));
9069   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed32S" %}
9070   ins_encode %{
9071     int vector_len = 2;
9072     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9073   %}
9074   ins_pipe( pipe_slow );
9075 %}
9076 
9077 instruct vsll32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
9078   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
9079   match(Set dst (LShiftVS src shift));
9080   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed32S" %}
9081   ins_encode %{
9082     int vector_len = 2;
9083     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9084   %}
9085   ins_pipe( pipe_slow );
9086 %}
9087 
9088 // Integers vector left shift
9089 instruct vsll2I(vecD dst, vecS shift) %{
9090   predicate(n->as_Vector()->length() == 2);
9091   match(Set dst (LShiftVI dst shift));
9092   format %{ "pslld   $dst,$shift\t! left shift packed2I" %}
9093   ins_encode %{
9094     __ pslld($dst$$XMMRegister, $shift$$XMMRegister);
9095   %}
9096   ins_pipe( pipe_slow );
9097 %}
9098 


9267   %}
9268   ins_pipe( pipe_slow );
9269 %}
9270 
9271 instruct vsll4L_reg_imm(vecY dst, vecY src, immI8 shift) %{
9272   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
9273   match(Set dst (LShiftVL src shift));
9274   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed4L" %}
9275   ins_encode %{
9276     int vector_len = 1;
9277     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9278   %}
9279   ins_pipe( pipe_slow );
9280 %}
9281 
9282 instruct vsll8L_reg(vecZ dst, vecZ src, vecS shift) %{
9283   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
9284   match(Set dst (LShiftVL src shift));
9285   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed8L" %}
9286   ins_encode %{
9287     int vector_len = 2;
9288     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9289   %}
9290   ins_pipe( pipe_slow );
9291 %}
9292 
9293 instruct vsll8L_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
9294   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
9295   match(Set dst (LShiftVL src shift));
9296   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed8L" %}
9297   ins_encode %{
9298     int vector_len = 2;
9299     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9300   %}
9301   ins_pipe( pipe_slow );
9302 %}
9303 
9304 // ----------------------- LogicalRightShift -----------------------------------
9305 
9306 // Shorts vector logical right shift produces an incorrect Java result
9307 // for negative data because Java code converts short values to int with
9308 // sign extension before the shift. Char vectors are fine, though, since
9309 // chars are unsigned values.
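//
// Illustrative Java (not part of this file), assuming a source loop of the
// form a[i] = (short)(b[i] >>> 1):
//   short s = (short) -4;             // lane bit pattern 0xFFFC
//   int   widened = s;                // sign-extended to 0xFFFFFFFC
//   int   shifted = widened >>> 1;    // 0x7FFFFFFE
//   short scalar  = (short) shifted;  // 0xFFFE, i.e. -2
// A packed 16-bit psrlw on the 0xFFFC lane would produce 0x7FFE (32766)
// instead, so a vectorized URShiftVS would not match the scalar Java result
// for negative shorts. Char lanes are zero-extended, so both paths agree.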
9310 
9311 instruct vsrl2S(vecS dst, vecS shift) %{
9312   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
9313   match(Set dst (URShiftVS dst shift));
9314   format %{ "psrlw   $dst,$shift\t! logical right shift packed2S" %}
9315   ins_encode %{
9316     __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
9317   %}
9318   ins_pipe( pipe_slow );
9319 %}
9320 
9321 instruct vsrl2S_imm(vecS dst, immI8 shift) %{
9322   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
9323   match(Set dst (URShiftVS dst shift));
9324   format %{ "psrlw   $dst,$shift\t! logical right shift packed2S" %}
9325   ins_encode %{
9326     __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
9327   %}
9328   ins_pipe( pipe_slow );
9329 %}
9330 
9331 instruct vsrl2S_reg_avx(vecS dst, vecS src, vecS shift) %{
9332   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
9333   match(Set dst (URShiftVS src shift));
9334   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
9335   ins_encode %{
9336     int vector_len = 0;
9337     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9338   %}
9339   ins_pipe( pipe_slow );
9340 %}
9341 
9342 instruct vsrl2S_reg_evex(vecS dst, vecS src, vecS shift) %{
9343   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
9344   match(Set dst (URShiftVS src shift));
9345   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
9346   ins_encode %{
9347     int vector_len = 0;
9348     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9349   %}
9350   ins_pipe( pipe_slow );
9351 %}
9352 
9353 instruct vsrl2S_reg_evex_special(vecS dst, vecS src, vecS shift) %{
9354   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
9355   match(Set dst (URShiftVS dst shift));
9356   effect(TEMP src);
9357   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
9358   ins_encode %{
9359     int vector_len = 0;
9360     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9361   %}
9362   ins_pipe( pipe_slow );
9363 %}
9364 
9365 instruct vsrl2S_reg_imm_avx(vecS dst, vecS src, immI8 shift) %{
9366   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
9367   match(Set dst (URShiftVS src shift));
9368   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
9369   ins_encode %{
9370     int vector_len = 0;
9371     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9372   %}
9373   ins_pipe( pipe_slow );
9374 %}
9375 
9376 instruct vsrl2S_reg_imm_evex(vecS dst, vecS src, immI8 shift) %{
9377   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
9378   match(Set dst (URShiftVS src shift));
9379   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
9380   ins_encode %{
9381     int vector_len = 0;
9382     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9383   %}
9384   ins_pipe( pipe_slow );
9385 %}
9386 
9387 instruct vsrl2S_reg_imm_evex_special(vecS dst, vecS src, immI8 shift) %{
9388   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
9389   match(Set dst (URShiftVS dst shift));
9390   effect(TEMP src);
9391   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
9392   ins_encode %{
9393     int vector_len = 0;
9394     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9395   %}
9396   ins_pipe( pipe_slow );
9397 %}
9398 
9399 instruct vsrl4S(vecD dst, vecS shift) %{
9400   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
9401   match(Set dst (URShiftVS dst shift));
9402   format %{ "psrlw   $dst,$shift\t! logical right shift packed4S" %}
9403   ins_encode %{
9404     __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
9405   %}
9406   ins_pipe( pipe_slow );
9407 %}
9408 
9409 instruct vsrl4S_imm(vecD dst, immI8 shift) %{
9410   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
9411   match(Set dst (URShiftVS dst shift));
9412   format %{ "psrlw   $dst,$shift\t! logical right shift packed4S" %}
9413   ins_encode %{
9414     __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
9415   %}
9416   ins_pipe( pipe_slow );
9417 %}
9418 
9419 instruct vsrl4S_reg_avx(vecD dst, vecD src, vecS shift) %{
9420   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
9421   match(Set dst (URShiftVS src shift));
9422   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
9423   ins_encode %{
9424     int vector_len = 0;
9425     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9426   %}
9427   ins_pipe( pipe_slow );
9428 %}
9429 
9430 instruct vsrl4S_reg_evex(vecD dst, vecD src, vecS shift) %{
9431   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
9432   match(Set dst (URShiftVS src shift));
9433   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
9434   ins_encode %{
9435     int vector_len = 0;
9436     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9437   %}
9438   ins_pipe( pipe_slow );
9439 %}
9440 
9441 instruct vsrl4S_reg_evex_special(vecD dst, vecD src, vecS shift) %{
9442   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
9443   match(Set dst (URShiftVS dst shift));
9444   effect(TEMP src);
9445   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
9446   ins_encode %{
9447     int vector_len = 0;
9448     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9449   %}
9450   ins_pipe( pipe_slow );
9451 %}
9452 
9453 instruct vsrl4S_reg_imm_avx(vecD dst, vecD src, immI8 shift) %{
9454   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
9455   match(Set dst (URShiftVS src shift));
9456   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
9457   ins_encode %{
9458     int vector_len = 0;
9459     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9460   %}
9461   ins_pipe( pipe_slow );
9462 %}
9463 
9464 instruct vsrl4S_reg_imm_evex(vecD dst, vecD src, immI8 shift) %{
9465   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
9466   match(Set dst (URShiftVS src shift));
9467   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
9468   ins_encode %{
9469     int vector_len = 0;
9470     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9471   %}
9472   ins_pipe( pipe_slow );
9473 %}
9474 
9475 instruct vsrl4S_reg_imm_evex_special(vecD dst, vecD src, immI8 shift) %{
9476   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
9477   match(Set dst (URShiftVS dst shift));
9478   effect(TEMP src);
9479   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
9480   ins_encode %{
9481     int vector_len = 0;
9482     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9483   %}
9484   ins_pipe( pipe_slow );
9485 %}
9486 
9487 instruct vsrl8S(vecX dst, vecS shift) %{
9488   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
9489   match(Set dst (URShiftVS dst shift));
9490   format %{ "psrlw   $dst,$shift\t! logical right shift packed8S" %}
9491   ins_encode %{
9492     __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
9493   %}
9494   ins_pipe( pipe_slow );
9495 %}
9496 
9497 instruct vsrl8S_imm(vecX dst, immI8 shift) %{
9498   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
9499   match(Set dst (URShiftVS dst shift));
9500   format %{ "psrlw   $dst,$shift\t! logical right shift packed8S" %}
9501   ins_encode %{
9502     __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
9503   %}
9504   ins_pipe( pipe_slow );
9505 %}
9506 
9507 instruct vsrl8S_reg_avx(vecX dst, vecX src, vecS shift) %{
9508   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
9509   match(Set dst (URShiftVS src shift));
9510   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
9511   ins_encode %{
9512     int vector_len = 0;
9513     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9514   %}
9515   ins_pipe( pipe_slow );
9516 %}
9517 
9518 instruct vsrl8S_reg_evex(vecX dst, vecX src, vecS shift) %{
9519   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
9520   match(Set dst (URShiftVS src shift));
9521   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
9522   ins_encode %{
9523     int vector_len = 0;
9524     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9525   %}
9526   ins_pipe( pipe_slow );
9527 %}
9528 
9529 instruct vsrl8S_reg_evex_special(vecX dst, vecX src, vecS shift) %{
9530   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
9531   match(Set dst (URShiftVS dst shift));
9532   effect(TEMP src);
9533   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
9534   ins_encode %{
9535     int vector_len = 0;
9536     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9537   %}
9538   ins_pipe( pipe_slow );
9539 %}
9540 
9541 instruct vsrl8S_reg_imm_avx(vecX dst, vecX src, immI8 shift) %{
9542   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
9543   match(Set dst (URShiftVS src shift));
9544   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
9545   ins_encode %{
9546     int vector_len = 0;
9547     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9548   %}
9549   ins_pipe( pipe_slow );
9550 %}
9551 
9552 instruct vsrl8S_reg_imm_evex(vecX dst, vecX src, immI8 shift) %{
9553   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
9554   match(Set dst (URShiftVS src shift));
9555   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
9556   ins_encode %{
9557     int vector_len = 0;
9558     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9559   %}
9560   ins_pipe( pipe_slow );
9561 %}
9562 
9563 instruct vsrl8S_reg_imm_evex_special(vecX dst, vecX src, immI8 shift) %{
9564   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
9565   match(Set dst (URShiftVS dst shift));
9566   effect(TEMP src);
9567   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
9568   ins_encode %{
9569     int vector_len = 0;
9570     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9571   %}
9572   ins_pipe( pipe_slow );
9573 %}
9574 
9575 instruct vsrl16S_reg_avx(vecY dst, vecY src, vecS shift) %{
9576   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
9577   match(Set dst (URShiftVS src shift));
9578   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
9579   ins_encode %{
9580     int vector_len = 1;
9581     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9582   %}
9583   ins_pipe( pipe_slow );
9584 %}
9585 
9586 instruct vsrl16S_reg_evex(vecY dst, vecY src, vecS shift) %{
9587   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
9588   match(Set dst (URShiftVS src shift));
9589   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
9590   ins_encode %{
9591     int vector_len = 1;
9592     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9593   %}
9594   ins_pipe( pipe_slow );
9595 %}
9596 
9597 instruct vsrl16S_reg_evex_special(vecY dst, vecY src, vecS shift) %{
9598   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
9599   match(Set dst (URShiftVS dst shift));
9600   effect(TEMP src);
9601   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
9602   ins_encode %{
9603     int vector_len = 1;
9604     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9605   %}
9606   ins_pipe( pipe_slow );
9607 %}
9608 
9609 instruct vsrl16S_reg_imm_avx(vecY dst, vecY src, immI8 shift) %{
9610   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
9611   match(Set dst (URShiftVS src shift));
9612   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
9613   ins_encode %{
9614     int vector_len = 1;
9615     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9616   %}
9617   ins_pipe( pipe_slow );
9618 %}
9619 
9620 instruct vsrl16S_reg_imm_evex(vecY dst, vecY src, immI8 shift) %{
9621   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
9622   match(Set dst (URShiftVS src shift));
9623   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
9624   ins_encode %{
9625     int vector_len = 1;
9626     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9627   %}
9628   ins_pipe( pipe_slow );
9629 %}
9630 
9631 instruct vsrl16S_reg_imm_evex_special(vecY dst, vecY src, immI8 shift) %{
9632   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
9633   match(Set dst (URShiftVS dst shift));
9634   effect(TEMP src);
9635   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
9636   ins_encode %{
9637     int vector_len = 1;
9638     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9639   %}
9640   ins_pipe( pipe_slow );
9641 %}
9642 
9643 instruct vsrl32S_reg(vecZ dst, vecZ src, vecS shift) %{
9644   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
9645   match(Set dst (URShiftVS src shift));
9646   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed32S" %}
9647   ins_encode %{
9648     int vector_len = 2;
9649     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9650   %}
9651   ins_pipe( pipe_slow );
9652 %}
9653 
9654 instruct vsrl32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
9655   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
9656   match(Set dst (URShiftVS src shift));
9657   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed32S" %}
9658   ins_encode %{
9659     int vector_len = 2;
9660     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9661   %}
9662   ins_pipe( pipe_slow );
9663 %}
9664 
9665 // Integers vector logical right shift
9666 instruct vsrl2I(vecD dst, vecS shift) %{
9667   predicate(n->as_Vector()->length() == 2);
9668   match(Set dst (URShiftVI dst shift));
9669   format %{ "psrld   $dst,$shift\t! logical right shift packed2I" %}
9670   ins_encode %{
9671     __ psrld($dst$$XMMRegister, $shift$$XMMRegister);
9672   %}
9673   ins_pipe( pipe_slow );
9674 %}
9675 


9865     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9866   %}
9867   ins_pipe( pipe_slow );
9868 %}
9869 
9870 instruct vsrl8L_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
9871   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
9872   match(Set dst (URShiftVL src shift));
9873   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed8L" %}
9874   ins_encode %{
9875     int vector_len = 2;
9876     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9877   %}
9878   ins_pipe( pipe_slow );
9879 %}
9880 
9881 // ------------------- ArithmeticRightShift -----------------------------------
9882 
9883 // Shorts/Chars vector arithmetic right shift
9884 instruct vsra2S(vecS dst, vecS shift) %{
9885   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
9886   match(Set dst (RShiftVS dst shift));
9887   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed2S" %}
9888   ins_encode %{
9889     __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
9890   %}
9891   ins_pipe( pipe_slow );
9892 %}
9893 
9894 instruct vsra2S_imm(vecS dst, immI8 shift) %{
9895   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
9896   match(Set dst (RShiftVS dst shift));
9897   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed2S" %}
9898   ins_encode %{
9899     __ psraw($dst$$XMMRegister, (int)$shift$$constant);
9900   %}
9901   ins_pipe( pipe_slow );
9902 %}
9903 
9904 instruct vsra2S_reg_avx(vecS dst, vecS src, vecS shift) %{
9905   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
9906   match(Set dst (RShiftVS src shift));
9907   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
9908   ins_encode %{
9909     int vector_len = 0;
9910     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9911   %}
9912   ins_pipe( pipe_slow );
9913 %}
9914 
9915 instruct vsra2S_reg_evex(vecS dst, vecS src, vecS shift) %{
9916   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
9917   match(Set dst (RShiftVS src shift));
9918   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
9919   ins_encode %{
9920     int vector_len = 0;
9921     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9922   %}
9923   ins_pipe( pipe_slow );
9924 %}
9925 
9926 instruct vsra2S_reg_evex_special(vecS dst, vecS src, vecS shift) %{
9927   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
9928   match(Set dst (RShiftVS dst shift));
9929   effect(TEMP src);
9930   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
9931   ins_encode %{
9932     int vector_len = 0;
9933     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9934   %}
9935   ins_pipe( pipe_slow );
9936 %}
9937 
9938 instruct vsra2S_reg_imm_avx(vecS dst, vecS src, immI8 shift) %{
9939   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
9940   match(Set dst (RShiftVS src shift));
9941   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
9942   ins_encode %{
9943     int vector_len = 0;
9944     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9945   %}
9946   ins_pipe( pipe_slow );
9947 %}
9948 
9949 instruct vsra2S_reg_imm_evex(vecS dst, vecS src, immI8 shift) %{
9950   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
9951   match(Set dst (RShiftVS src shift));
9952   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
9953   ins_encode %{
9954     int vector_len = 0;
9955     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9956   %}
9957   ins_pipe( pipe_slow );
9958 %}
9959 
9960 instruct vsra2S_reg_imm_evex_special(vecS dst, vecS src, immI8 shift) %{
9961   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
9962   match(Set dst (RShiftVS dst shift));
9963   effect(TEMP src);
9964   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
9965   ins_encode %{
9966     int vector_len = 0;
9967     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9968   %}
9969   ins_pipe( pipe_slow );
9970 %}
9971 
9972 instruct vsra4S(vecD dst, vecS shift) %{
9973   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
9974   match(Set dst (RShiftVS dst shift));
9975   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed4S" %}
9976   ins_encode %{
9977     __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
9978   %}
9979   ins_pipe( pipe_slow );
9980 %}
9981 
9982 instruct vsra4S_imm(vecD dst, immI8 shift) %{
9983   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
9984   match(Set dst (RShiftVS dst shift));
9985   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed4S" %}
9986   ins_encode %{
9987     __ psraw($dst$$XMMRegister, (int)$shift$$constant);
9988   %}
9989   ins_pipe( pipe_slow );
9990 %}
9991 
9992 instruct vsra4S_reg_avx(vecD dst, vecD src, vecS shift) %{
9993   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
9994   match(Set dst (RShiftVS src shift));
9995   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
9996   ins_encode %{
9997     int vector_len = 0;
9998     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9999   %}
10000   ins_pipe( pipe_slow );
10001 %}
10002 
10003 instruct vsra4S_reg_evex(vecD dst, vecD src, vecS shift) %{
10004   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
10005   match(Set dst (RShiftVS src shift));
10006   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
10007   ins_encode %{
10008     int vector_len = 0;
10009     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10010   %}
10011   ins_pipe( pipe_slow );
10012 %}
10013 
10014 instruct vsra4S_reg_evex_special(vecD dst, vecD src, vecS shift) %{
10015   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
10016   match(Set dst (RShiftVS dst shift));
10017   effect(TEMP src);
10018   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
10019   ins_encode %{
10020     int vector_len = 0;
10021     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10022   %}
10023   ins_pipe( pipe_slow );
10024 %}
10025 
10026 instruct vsra4S_reg_imm_avx(vecD dst, vecD src, immI8 shift) %{
10027   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
10028   match(Set dst (RShiftVS src shift));
10029   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
10030   ins_encode %{
10031     int vector_len = 0;
10032     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10033   %}
10034   ins_pipe( pipe_slow );
10035 %}
10036 
10037 instruct vsra4S_reg_imm_evex(vecD dst, vecD src, immI8 shift) %{
10038   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
10039   match(Set dst (RShiftVS src shift));
10040   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
10041   ins_encode %{
10042     int vector_len = 0;
10043     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10044   %}
10045   ins_pipe( pipe_slow );
10046 %}
10047 
10048 instruct vsra4S_reg_imm_evex_special(vecD dst, vecD src, immI8 shift) %{
10049   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
10050   match(Set dst (RShiftVS dst shift));
10051   effect(TEMP src);
10052   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
10053   ins_encode %{
10054     int vector_len = 0;
10055     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10056   %}
10057   ins_pipe( pipe_slow );
10058 %}
10059 
10060 instruct vsra8S(vecX dst, vecS shift) %{
10061   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
10062   match(Set dst (RShiftVS dst shift));
10063   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed8S" %}
10064   ins_encode %{
10065     __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
10066   %}
10067   ins_pipe( pipe_slow );
10068 %}
10069 
10070 instruct vsra8S_imm(vecX dst, immI8 shift) %{
10071   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
10072   match(Set dst (RShiftVS dst shift));
10073   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed8S" %}
10074   ins_encode %{
10075     __ psraw($dst$$XMMRegister, (int)$shift$$constant);
10076   %}
10077   ins_pipe( pipe_slow );
10078 %}
10079 
10080 instruct vsra8S_reg_avx(vecX dst, vecX src, vecS shift) %{
10081   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
10082   match(Set dst (RShiftVS src shift));
10083   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
10084   ins_encode %{
10085     int vector_len = 0;
10086     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10087   %}
10088   ins_pipe( pipe_slow );
10089 %}
10090 
10091 instruct vsra8S_reg_evex(vecX dst, vecX src, vecS shift) %{
10092   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
10093   match(Set dst (RShiftVS src shift));
10094   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
10095   ins_encode %{
10096     int vector_len = 0;
10097     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10098   %}
10099   ins_pipe( pipe_slow );
10100 %}
10101 
10102 instruct vsra8S_reg_evex_special(vecX dst, vecX src, vecS shift) %{
10103   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
10104   match(Set dst (RShiftVS dst shift));
10105   effect(TEMP src);
10106   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
10107   ins_encode %{
10108     int vector_len = 0;
10109     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10110   %}
10111   ins_pipe( pipe_slow );
10112 %}
10113 
10114 instruct vsra8S_reg_imm_avx(vecX dst, vecX src, immI8 shift) %{
10115   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
10116   match(Set dst (RShiftVS src shift));
10117   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
10118   ins_encode %{
10119     int vector_len = 0;
10120     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10121   %}
10122   ins_pipe( pipe_slow );
10123 %}
10124 
10125 instruct vsra8S_reg_imm_evex(vecX dst, vecX src, immI8 shift) %{
10126   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
10127   match(Set dst (RShiftVS src shift));
10128   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
10129   ins_encode %{
10130     int vector_len = 0;
10131     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10132   %}
10133   ins_pipe( pipe_slow );
10134 %}
10135 
10136 instruct vsra8S_reg_imm_evex_special(vecX dst, vecX src, immI8 shift) %{
10137   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
10138   match(Set dst (RShiftVS dst shift));
10139   effect(TEMP src);
10140   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
10141   ins_encode %{
10142     int vector_len = 0;
10143     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10144   %}
10145   ins_pipe( pipe_slow );
10146 %}
10147 
10148 instruct vsra16S_reg_avx(vecY dst, vecY src, vecS shift) %{
10149   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
10150   match(Set dst (RShiftVS src shift));
10151   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
10152   ins_encode %{
10153     int vector_len = 1;
10154     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10155   %}
10156   ins_pipe( pipe_slow );
10157 %}
10158 
10159 instruct vsra16S_reg_evex(vecY dst, vecY src, vecS shift) %{
10160   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
10161   match(Set dst (RShiftVS src shift));
10162   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
10163   ins_encode %{
10164     int vector_len = 1;
10165     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10166   %}
10167   ins_pipe( pipe_slow );
10168 %}
10169 
10170 instruct vsra16S_reg_evex_special(vecY dst, vecY src, vecS shift) %{
10171   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
10172   match(Set dst (RShiftVS dst shift));
10173   effect(TEMP src);
10174   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
10175   ins_encode %{
10176     int vector_len = 1;
10177     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10178   %}
10179   ins_pipe( pipe_slow );
10180 %}
10181 
10182 instruct vsra16S_reg_imm_avx(vecY dst, vecY src, immI8 shift) %{
10183   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
10184   match(Set dst (RShiftVS src shift));
10185   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
10186   ins_encode %{
10187     int vector_len = 1;
10188     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10189   %}
10190   ins_pipe( pipe_slow );
10191 %}
10192 
10193 instruct vsra16S_reg_imm_evex(vecY dst, vecY src, immI8 shift) %{
10194   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
10195   match(Set dst (RShiftVS src shift));
10196   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
10197   ins_encode %{
10198     int vector_len = 1;
10199     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10200   %}
10201   ins_pipe( pipe_slow );
10202 %}
10203 
10204 instruct vsra16S_reg_imm_evex_special(vecY dst, vecY src, immI8 shift) %{
10205   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
10206   match(Set dst (RShiftVS dst shift));
10207   effect(TEMP src);
10208   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
10209   ins_encode %{
10210     int vector_len = 1;
10211     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10212   %}
10213   ins_pipe( pipe_slow );
10214 %}
10215 
10216 instruct vsra32S_reg(vecZ dst, vecZ src, vecS shift) %{
10217   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
10218   match(Set dst (RShiftVS src shift));
10219   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed32S" %}
10220   ins_encode %{
10221     int vector_len = 2;
10222     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10223   %}
10224   ins_pipe( pipe_slow );
10225 %}
10226 
10227 instruct vsra32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
10228   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
10229   match(Set dst (RShiftVS src shift));
10230   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed32S" %}
10231   ins_encode %{
10232     int vector_len = 2;
10233     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10234   %}
10235   ins_pipe( pipe_slow );
10236 %}
10237 
10238 // Integers vector arithmetic right shift
10239 instruct vsra2I(vecD dst, vecS shift) %{
10240   predicate(n->as_Vector()->length() == 2);
10241   match(Set dst (RShiftVI dst shift));
10242   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed2I" %}
10243   ins_encode %{
10244     __ psrad($dst$$XMMRegister, $shift$$XMMRegister);
10245   %}
10246   ins_pipe( pipe_slow );
10247 %}
10248 

