src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp

rev 61241 : manual merge with vectorIntrinsics

  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "oops/methodData.hpp"
  29 #include "opto/c2_MacroAssembler.hpp"
  30 #include "opto/intrinsicnode.hpp"
  31 #include "opto/opcodes.hpp"
  32 #include "runtime/biasedLocking.hpp"
  33 #include "runtime/objectMonitor.hpp"
  34 #include "runtime/stubRoutines.hpp"
  35 
  36 void C2_MacroAssembler::setvectmask(Register dst, Register src) {
  37   guarantee(PostLoopMultiversioning, "must be");
  38   Assembler::movl(dst, 1);
  39   Assembler::shlxl(dst, dst, src);
  40   Assembler::decl(dst);
  41   Assembler::kmovdl(k1, dst);
  42   Assembler::movl(dst, src);
  43 }
  44 
  45 void C2_MacroAssembler::restorevectmask() {
  46   guarantee(PostLoopMultiversioning, "must be");
  47   Assembler::knotwl(k1, k0);
  48 }
  49 
  50 #if INCLUDE_RTM_OPT
  51 
  52 // Update rtm_counters based on abort status
  53 // input: abort_status
  54 //        rtm_counters (RTMLockingCounters*)
  55 // flags are killed
 837   if (dst != src) {
 838     movdqu(dst, src);
 839   }
 840   if (opcode == Op_AbsVF) {
 841     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), scr);
 842   } else {
 843     assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
 844     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scr);
 845   }
 846 }
 847 
 848 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
 849   if (opcode == Op_AbsVF) {
 850     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, scr);
 851   } else {
 852     assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
 853     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, scr);
 854   }
 855 }
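
An illustrative aside on the constants used above: abs/neg of packed floats is done purely with bitwise ops on the IEEE-754 sign bit, which is presumably what vector_float_sign_mask (per-lane 0x7fffffff) and vector_float_sign_flip (per-lane 0x80000000) encode. A minimal scalar C++ sketch of the same idea (helper names are illustrative, not part of the source):

  #include <cstdint>
  #include <cstring>

  static float abs_via_mask(float f) {        // same role as andps with the sign mask
    uint32_t bits;
    std::memcpy(&bits, &f, sizeof(bits));
    bits &= 0x7fffffffu;                      // clear the sign bit
    std::memcpy(&f, &bits, sizeof(f));
    return f;
  }

  static float neg_via_flip(float f) {        // same role as xorps with the sign flip constant
    uint32_t bits;
    std::memcpy(&bits, &f, sizeof(bits));
    bits ^= 0x80000000u;                      // flip the sign bit
    std::memcpy(&f, &bits, sizeof(f));
    return f;
  }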
 856 
 857 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
 858   if (sign) {
 859     pmovsxbw(dst, src);
 860   } else {
 861     pmovzxbw(dst, src);
 862   }
 863 }
 864 
 865 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
 866   if (sign) {
 867     vpmovsxbw(dst, src, vector_len);
 868   } else {
 869     vpmovzxbw(dst, src, vector_len);
 870   }
 871 }
 872 
 873 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src) {
 874   if (opcode == Op_RShiftVI) {
 875     psrad(dst, src);
 876   } else if (opcode == Op_LShiftVI) {
 877     pslld(dst, src);
 878   } else {
 879     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
 880     psrld(dst, src);
 881   }
 882 }
 883 
 884 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
 885   if (opcode == Op_RShiftVI) {
 886     vpsrad(dst, nds, src, vector_len);
 887   } else if (opcode == Op_LShiftVI) {
 888     vpslld(dst, nds, src, vector_len);
 889   } else {
 890     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
 891     vpsrld(dst, nds, src, vector_len);
 892   }
 893 }
 894 
 895 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src) {
 896   if ((opcode == Op_RShiftVS) || (opcode == Op_RShiftVB)) {
 897     psraw(dst, src);
 898   } else if ((opcode == Op_LShiftVS) || (opcode == Op_LShiftVB)) {
 899     psllw(dst, src);
 900   } else {
 901     assert(((opcode == Op_URShiftVS) || (opcode == Op_URShiftVB)),"opcode should be one of Op_URShiftVS or Op_URShiftVB");
 902     psrlw(dst, src);
 903   }
 904 }
 905 
 906 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
 907   if ((opcode == Op_RShiftVS) || (opcode == Op_RShiftVB)) {
 908     vpsraw(dst, nds, src, vector_len);
 909   } else if ((opcode == Op_LShiftVS) || (opcode == Op_LShiftVB)) {
 910     vpsllw(dst, nds, src, vector_len);
 911   } else {
 912     assert(((opcode == Op_URShiftVS) || (opcode == Op_URShiftVB)),"opcode should be one of Op_URShiftVS or Op_URShiftVB");
 913     vpsrlw(dst, nds, src, vector_len);
 914   }
 915 }
 916 
 917 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src) {
 918   if (opcode == Op_RShiftVL) {
 919     psrlq(dst, src);  // using srl to implement sra on pre-avx512 systems
 920   } else if (opcode == Op_LShiftVL) {
 921     psllq(dst, src);
 922   } else {
 923     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
 924     psrlq(dst, src);
 925   }
 926 }
 927 
 928 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
 929   if (opcode == Op_RShiftVL) {
 930     evpsraq(dst, nds, src, vector_len);
 931   } else if (opcode == Op_LShiftVL) {
 932     vpsllq(dst, nds, src, vector_len);
 933   } else {
 934     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
 935     vpsrlq(dst, nds, src, vector_len);
 936   }
 937 }
 938 
 939 // Reductions for vectors of ints, longs, floats, and doubles.
 940 
 941 void C2_MacroAssembler::reduce_operation_128(int opcode, XMMRegister dst, XMMRegister src) {
 942   int vector_len = Assembler::AVX_128bit;
 943 
 944   switch (opcode) {
 945     case Op_AndReductionV:  pand(dst, src); break;
 946     case Op_OrReductionV:   por (dst, src); break;
 947     case Op_XorReductionV:  pxor(dst, src); break;
 948 
 949     case Op_AddReductionVF: addss(dst, src); break;
 950     case Op_AddReductionVD: addsd(dst, src); break;
 951     case Op_AddReductionVI: paddd(dst, src); break;
 952     case Op_AddReductionVL: paddq(dst, src); break;
 953 
 954     case Op_MulReductionVF: mulss(dst, src); break;
 955     case Op_MulReductionVD: mulsd(dst, src); break;
 956     case Op_MulReductionVI: pmulld(dst, src); break;
 957     case Op_MulReductionVL: vpmullq(dst, dst, src, vector_len); break;
 958 
 959     default: assert(false, "wrong opcode");
 960   }
 961 }
 962 
 963 void C2_MacroAssembler::reduce_operation_256(int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
 964   int vector_len = Assembler::AVX_256bit;
 965 
 966   switch (opcode) {
 967     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
 968     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
 969     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
 970 
 971     case Op_AddReductionVI: vpaddd(dst, src1, src2, vector_len); break;
 972     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
 973 
 974     case Op_MulReductionVI: vpmulld(dst, src1, src2, vector_len); break;
 975     case Op_MulReductionVL: vpmullq(dst, src1, src2, vector_len); break;
 976 
 977     default: assert(false, "wrong opcode");
 978   }
 979 }
 980 
 981 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
 982                                   XMMRegister dst, XMMRegister src,
 983                                   XMMRegister vtmp1, XMMRegister vtmp2) {
 984   switch (opcode) {
 985     case Op_AddReductionVF:
 986     case Op_MulReductionVF:
 987       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
 988       break;
 989 
 990     case Op_AddReductionVD:
 991     case Op_MulReductionVD:
 992       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
 993       break;
 994 
 995     default: assert(false, "wrong opcode");
 996   }
 997 }
 998 
 999 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1000                                 Register dst, Register src1, XMMRegister src2,
1001                                 XMMRegister vtmp1, XMMRegister vtmp2) {
1002   switch (vlen) {
1003     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1004     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1005     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1006     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1007 
1008     default: assert(false, "wrong vector length");
1009   }
1010 }
1011 
1012 #ifdef _LP64
1013 void C2_MacroAssembler::reduceL(int opcode, int vlen,
1014                                 Register dst, Register src1, XMMRegister src2,
1015                                 XMMRegister vtmp1, XMMRegister vtmp2) {
1016   switch (vlen) {
1017     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1018     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1050       reduce2D(opcode, dst, src, vtmp1);
1051       break;
1052     case 4:
1053       reduce4D(opcode, dst, src, vtmp1, vtmp2);
1054       break;
1055     case 8:
1056       reduce8D(opcode, dst, src, vtmp1, vtmp2);
1057       break;
1058     default: assert(false, "wrong vector length");
1059   }
1060 }
1061 
1062 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1063   if (opcode == Op_AddReductionVI) {
1064     if (vtmp1 != src2) {
1065       movdqu(vtmp1, src2);
1066     }
1067     phaddd(vtmp1, vtmp1);
1068   } else {
1069     pshufd(vtmp1, src2, 0x1);
1070     reduce_operation_128(opcode, vtmp1, src2);
1071   }
1072   movdl(vtmp2, src1);
1073   reduce_operation_128(opcode, vtmp1, vtmp2);
1074   movdl(dst, vtmp1);
1075 }
1076 
1077 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1078   if (opcode == Op_AddReductionVI) {
1079     if (vtmp1 != src2) {
1080       movdqu(vtmp1, src2);
1081     }
1082     phaddd(vtmp1, src2);
1083     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1084   } else {
1085     pshufd(vtmp2, src2, 0xE);
1086     reduce_operation_128(opcode, vtmp2, src2);
1087     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1088   }
1089 }
1090 
1091 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1092   if (opcode == Op_AddReductionVI) {
1093     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
1094     vextracti128_high(vtmp2, vtmp1);
1095     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
1096     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1097   } else {
1098     vextracti128_high(vtmp1, src2);
1099     reduce_operation_128(opcode, vtmp1, src2);
1100     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1101   }
1102 }
1103 
1104 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1105   vextracti64x4_high(vtmp2, src2);
1106   reduce_operation_256(opcode, vtmp2, vtmp2, src2);
1107   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1108 }
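
The reduceNI helpers above all follow the same shape: fold the upper half of the vector onto the lower half (vextract*_high / pshufd plus the 128-bit op) until one lane remains, then combine the incoming scalar src1 and move the result to a general-purpose register. A scalar C++ sketch of that pattern, using add as the example op (names are illustrative only):

  #include <cstdint>

  static int32_t reduce_add(const int32_t* lanes, int n, int32_t src1) {  // n is 2, 4, 8 or 16
    int32_t acc[16];
    for (int i = 0; i < n; i++) acc[i] = lanes[i];
    for (int half = n / 2; half >= 1; half /= 2) {   // vextract*_high / pshufd + paddd per level
      for (int i = 0; i < half; i++) acc[i] += acc[i + half];
    }
    return acc[0] + src1;                            // movdl(vtmp2, src1), final paddd, movdl(dst, ...)
  }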
1109 
1110 #ifdef _LP64
1111 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1112   pshufd(vtmp2, src2, 0xE);
1113   reduce_operation_128(opcode, vtmp2, src2);
1114   movdq(vtmp1, src1);
1115   reduce_operation_128(opcode, vtmp1, vtmp2);
1116   movdq(dst, vtmp1);
1117 }
1118 
1119 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1120   vextracti128_high(vtmp1, src2);
1121   reduce_operation_128(opcode, vtmp1, src2);
1122   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1123 }
1124 
1125 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1126   vextracti64x4_high(vtmp2, src2);
1127   reduce_operation_256(opcode, vtmp2, vtmp2, src2);
1128   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1129 }
1130 #endif // _LP64
1131 
1132 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
1133   reduce_operation_128(opcode, dst, src);
1134   pshufd(vtmp, src, 0x1);
1135   reduce_operation_128(opcode, dst, vtmp);
1136 }
1137 
1138 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
1139   reduce2F(opcode, dst, src, vtmp);
1140   pshufd(vtmp, src, 0x2);
1141   reduce_operation_128(opcode, dst, vtmp);
1142   pshufd(vtmp, src, 0x3);
1143   reduce_operation_128(opcode, dst, vtmp);
1144 }
1145 
1146 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1147   reduce4F(opcode, dst, src, vtmp2);
1148   vextractf128_high(vtmp2, src);
1149   reduce4F(opcode, dst, vtmp2, vtmp1);
1150 }
1151 
1152 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1153   reduce8F(opcode, dst, src, vtmp1, vtmp2);
1154   vextracti64x4_high(vtmp1, src);
1155   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
1156 }
1157 
1158 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
1159   reduce_operation_128(opcode, dst, src);
1160   pshufd(vtmp, src, 0xE);
1161   reduce_operation_128(opcode, dst, vtmp);
1162 }
1163 
1164 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1165   reduce2D(opcode, dst, src, vtmp2);
1166   vextractf128_high(vtmp2, src);
1167   reduce2D(opcode, dst, vtmp2, vtmp1);
1168 }
1169 
1170 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1171   reduce4D(opcode, dst, src, vtmp1, vtmp2);
1172   vextracti64x4_high(vtmp1, src);
1173   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
1174 }
1175 
1176 //-------------------------------------------------------------------------------------------
1177 
1178 // IndexOf for constant substrings with size >= 8 chars
1179 // which don't need to be loaded through the stack.
1180 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
1181                                          Register cnt1, Register cnt2,
1182                                          int int_cnt2,  Register result,
1183                                          XMMRegister vec, Register tmp,
1184                                          int ae) {
1185   ShortBranchVerifier sbv(this);
1186   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
1187   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
1188 
1189   // This method uses the pcmpestri instruction with bound registers
1190   //   inputs:
1191   //     xmm - substring
1192   //     rax - substring length (elements count)
1193   //     mem - scanned string

  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "oops/methodData.hpp"
  29 #include "opto/c2_MacroAssembler.hpp"
  30 #include "opto/intrinsicnode.hpp"
  31 #include "opto/opcodes.hpp"
  32 #include "runtime/biasedLocking.hpp"
  33 #include "runtime/objectMonitor.hpp"
  34 #include "runtime/stubRoutines.hpp"
  35 
  36 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
  37   switch (vlen_in_bytes) {
  38     case  4: // fall-through
  39     case  8: // fall-through
  40     case 16: return Assembler::AVX_128bit;
  41     case 32: return Assembler::AVX_256bit;
  42     case 64: return Assembler::AVX_512bit;
  43 
  44     default: {
  45       ShouldNotReachHere();
  46       return Assembler::AVX_NoVec;
  47     }
  48   }
  49 }
  50 
  51 void C2_MacroAssembler::setvectmask(Register dst, Register src) {
  52   guarantee(PostLoopMultiversioning, "must be");
  53   Assembler::movl(dst, 1);
  54   Assembler::shlxl(dst, dst, src);
  55   Assembler::decl(dst);
  56   Assembler::kmovdl(k1, dst);
  57   Assembler::movl(dst, src);
  58 }
  59 
  60 void C2_MacroAssembler::restorevectmask() {
  61   guarantee(PostLoopMultiversioning, "must be");
  62   Assembler::knotwl(k1, k0);
  63 }
  64 
  65 #if INCLUDE_RTM_OPT
  66 
  67 // Update rtm_counters based on abort status
  68 // input: abort_status
  69 //        rtm_counters (RTMLockingCounters*)
  70 // flags are killed
 852   if (dst != src) {
 853     movdqu(dst, src);
 854   }
 855   if (opcode == Op_AbsVF) {
 856     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), scr);
 857   } else {
 858     assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
 859     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scr);
 860   }
 861 }
 862 
 863 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
 864   if (opcode == Op_AbsVF) {
 865     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, scr);
 866   } else {
 867     assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
 868     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, scr);
 869   }
 870 }
 871 
 872 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
 873   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 874 
 875   if (opcode == Op_MinV) {
 876     if (elem_bt == T_BYTE) {
 877       pminsb(dst, src);
 878     } else if (elem_bt == T_SHORT) {
 879       pminsw(dst, src);
 880     } else if (elem_bt == T_INT) {
 881       pminsd(dst, src);
 882     } else {
 883       assert(elem_bt == T_LONG, "required");
 884       assert(tmp == xmm0, "required");
 885       movdqu(xmm0, dst);
 886       pcmpgtq(xmm0, src);
 887       blendvpd(dst, src);  // xmm0 as mask
 888     }
 889   } else { // opcode == Op_MaxV
 890     if (elem_bt == T_BYTE) {
 891       pmaxsb(dst, src);
 892     } else if (elem_bt == T_SHORT) {
 893       pmaxsw(dst, src);
 894     } else if (elem_bt == T_INT) {
 895       pmaxsd(dst, src);
 896     } else {
 897       assert(elem_bt == T_LONG, "required");
 898       assert(tmp == xmm0, "required");
 899       movdqu(xmm0, src);
 900       pcmpgtq(xmm0, dst);
 901       blendvpd(dst, src);  // xmm0 as mask
 902     }
 903   }
 904 }
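
For T_LONG there is no pminsq/pmaxsq below AVX-512, so the code above builds a compare mask with pcmpgtq and lets blendvpd, which keys off the sign bit of each qword in xmm0, pick the winning lane. A per-lane scalar sketch of the min case (the max case mirrors it by comparing src against dst; names are illustrative only):

  #include <cstdint>

  static int64_t min_lane(int64_t dst, int64_t src) {
    int64_t mask = (dst > src) ? -1 : 0;   // movdqu(xmm0, dst); pcmpgtq(xmm0, src)
    return mask ? src : dst;               // blendvpd(dst, src) with xmm0 as the implicit mask
  }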
 905 
 906 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
 907                                  XMMRegister dst, XMMRegister src1, XMMRegister src2,
 908                                  int vlen_enc) {
 909   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 910 
 911   if (opcode == Op_MinV) {
 912     if (elem_bt == T_BYTE) {
 913       vpminsb(dst, src1, src2, vlen_enc);
 914     } else if (elem_bt == T_SHORT) {
 915       vpminsw(dst, src1, src2, vlen_enc);
 916     } else if (elem_bt == T_INT) {
 917       vpminsd(dst, src1, src2, vlen_enc);
 918     } else {
 919       assert(elem_bt == T_LONG, "required");
 920       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
 921         vpminsq(dst, src1, src2, vlen_enc);
 922       } else {
 923         vpcmpgtq(dst, src1, src2, vlen_enc);
 924         vblendvpd(dst, src1, src2, dst, vlen_enc);
 925       }
 926     }
 927   } else { // opcode == Op_MaxV
 928     if (elem_bt == T_BYTE) {
 929       vpmaxsb(dst, src1, src2, vlen_enc);
 930     } else if (elem_bt == T_SHORT) {
 931       vpmaxsw(dst, src1, src2, vlen_enc);
 932     } else if (elem_bt == T_INT) {
 933       vpmaxsd(dst, src1, src2, vlen_enc);
 934     } else {
 935       assert(elem_bt == T_LONG, "required");
 936       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
 937         vpmaxsq(dst, src1, src2, vlen_enc);
 938       } else {
 939         vpcmpgtq(dst, src1, src2, vlen_enc);
 940         vblendvpd(dst, src2, src1, dst, vlen_enc);
 941       }
 942     }
 943   }
 944 }
 945 
 946 // Float/Double min max
 947 
 948 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
 949                                    XMMRegister dst, XMMRegister a, XMMRegister b,
 950                                    XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
 951                                    int vlen_enc) {
 952   assert(UseAVX > 0, "required");
 953   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
 954          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
 955   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
 956 
 957   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
 958   bool is_double_word = is_double_word_type(elem_bt);
 959 
 960   if (!is_double_word && is_min) {
 961     vblendvps(atmp, a, b, a, vlen_enc);
 962     vblendvps(btmp, b, a, a, vlen_enc);
 963     vminps(tmp, atmp, btmp, vlen_enc);
 964     vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
 965     vblendvps(dst, tmp, atmp, btmp, vlen_enc);
 966   } else if (!is_double_word && !is_min) {
 967     vblendvps(btmp, b, a, b, vlen_enc);
 968     vblendvps(atmp, a, b, b, vlen_enc);
 969     vmaxps(tmp, atmp, btmp, vlen_enc);
 970     vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
 971     vblendvps(dst, tmp, atmp, btmp, vlen_enc);
 972   } else if (is_double_word && is_min) {
 973     vblendvpd(atmp, a, b, a, vlen_enc);
 974     vblendvpd(btmp, b, a, a, vlen_enc);
 975     vminpd(tmp, atmp, btmp, vlen_enc);
 976     vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
 977     vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
 978   } else {
 979     assert(is_double_word && !is_min, "sanity");
 980     vblendvpd(btmp, b, a, b, vlen_enc);
 981     vblendvpd(atmp, a, b, b, vlen_enc);
 982     vmaxpd(tmp, atmp, btmp, vlen_enc);
 983     vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
 984     vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
 985   }
 986 }
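
The blend/compare choreography above exists because vminps/vmaxps alone do not give Java's Math.min/max semantics: NaN must win and -0.0 must order below +0.0. A scalar C++ statement of the semantics the sequence appears to preserve (illustrative only):

  #include <cmath>

  static float java_min(float a, float b) {
    if (std::isnan(a) || std::isnan(b)) return NAN;              // UNORD_Q compare + final blend
    if (a == 0.0f && b == 0.0f) return std::signbit(a) ? a : b;  // sign-keyed operand swap (vblendvps with a as mask)
    return (a < b) ? a : b;                                      // vminps on the reordered operands
  }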
 987 
 988 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
 989                                     XMMRegister dst, XMMRegister a, XMMRegister b,
 990                                     KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
 991                                     int vlen_enc) {
 992   assert(UseAVX > 2, "required");
 993   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
 994          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
 995   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
 996 
 997   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
 998   bool is_double_word = is_double_word_type(elem_bt);
 999   bool merge = true;
1000 
1001   if (!is_double_word && is_min) {
1002     evpmovd2m(ktmp, a, vlen_enc);
1003     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1004     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1005     vminps(dst, atmp, btmp, vlen_enc);
1006     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1007     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1008   } else if (!is_double_word && !is_min) {
1009     evpmovd2m(ktmp, b, vlen_enc);
1010     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1011     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1012     vmaxps(dst, atmp, btmp, vlen_enc);
1013     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1014     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1015   } else if (is_double_word && is_min) {
1016     evpmovq2m(ktmp, a, vlen_enc);
1017     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1018     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1019     vminpd(dst, atmp, btmp, vlen_enc);
1020     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1021     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1022   } else {
1023     assert(is_double_word && !is_min, "sanity");
1024     evpmovq2m(ktmp, b, vlen_enc);
1025     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1026     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1027     vmaxpd(dst, atmp, btmp, vlen_enc);
1028     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1029     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1030   }
1031 }
1032 
1033 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1034   if (sign) {
1035     pmovsxbw(dst, src);
1036   } else {
1037     pmovzxbw(dst, src);
1038   }
1039 }
1040 
1041 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1042   if (sign) {
1043     vpmovsxbw(dst, src, vector_len);
1044   } else {
1045     vpmovzxbw(dst, src, vector_len);
1046   }
1047 }
1048 
1049 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1050   if (sign) {
1051     vpmovsxbd(dst, src, vector_len);
1052   } else {
1053     vpmovzxbd(dst, src, vector_len);
1054   }
1055 }
1056 
1057 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1058   if (sign) {
1059     vpmovsxwd(dst, src, vector_len);
1060   } else {
1061     vpmovzxwd(dst, src, vector_len);
1062   }
1063 }
1064 
1065 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1066   switch (opcode) {
1067     case Op_RShiftVI:  psrad(dst, shift); break;
1068     case Op_LShiftVI:  pslld(dst, shift); break;
1069     case Op_URShiftVI: psrld(dst, shift); break;
1070 
1071     default: assert(false, "%s", NodeClassNames[opcode]);
1072   }
1073 }
1074 
1075 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1076   switch (opcode) {
1077     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1078     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1079     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1080 
1081     default: assert(false, "%s", NodeClassNames[opcode]);
1082   }
1083 }
1084 
1085 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1086   switch (opcode) {
1087     case Op_RShiftVB:  // fall-through
1088     case Op_RShiftVS:  psraw(dst, shift); break;
1089 
1090     case Op_LShiftVB:  // fall-through
1091     case Op_LShiftVS:  psllw(dst, shift);   break;
1092 
1093     case Op_URShiftVS: // fall-through
1094     case Op_URShiftVB: psrlw(dst, shift);  break;
1095 
1096     default: assert(false, "%s", NodeClassNames[opcode]);
1097   }
1098 }
1099 
1100 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1101   switch (opcode) {
1102     case Op_RShiftVB:  // fall-through
1103     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1104 
1105     case Op_LShiftVB:  // fall-through
1106     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1107 
1108     case Op_URShiftVS: // fall-through
1109     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1110 
1111     default: assert(false, "%s", NodeClassNames[opcode]);
1112   }
1113 }
1114 
1115 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1116   switch (opcode) {
1117     case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
1118     case Op_LShiftVL:  psllq(dst, shift); break;
1119     case Op_URShiftVL: psrlq(dst, shift); break;
1120 
1121     default: assert(false, "%s", NodeClassNames[opcode]);
1122   }
1123 }
1124 
1125 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1126   switch (opcode) {
1127     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1128     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1129     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1130 
1131     default: assert(false, "%s", NodeClassNames[opcode]);
1132   }
1133 }
1134 
1135 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1136   switch (opcode) {
1137     case Op_RShiftVB:  // fall-through
1138     case Op_RShiftVS:  // fall-through
1139     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1140 
1141     case Op_LShiftVB:  // fall-through
1142     case Op_LShiftVS:  // fall-through
1143     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1144 
1145     case Op_URShiftVB: // fall-through
1146     case Op_URShiftVS: // fall-through
1147     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1148 
1149     default: assert(false, "%s", NodeClassNames[opcode]);
1150   }
1151 }
1152 
1153 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1154   switch (opcode) {
1155     case Op_RShiftVB:  // fall-through
1156     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1157 
1158     case Op_LShiftVB:  // fall-through
1159     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1160 
1161     case Op_URShiftVB: // fall-through
1162     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1163 
1164     default: assert(false, "%s", NodeClassNames[opcode]);
1165   }
1166 }
1167 
1168 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1169   assert(UseAVX >= 2, "required");
1170   switch (opcode) {
1171     case Op_RShiftVL: {
1172       if (UseAVX > 2) {
1173         assert(tmp == xnoreg, "not used");
1174         if (!VM_Version::supports_avx512vl()) {
1175           vlen_enc = Assembler::AVX_512bit;
1176         }
1177         evpsravq(dst, src, shift, vlen_enc);
1178       } else {
1179         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1180         vpsrlvq(dst, src, shift, vlen_enc);
1181         vpsrlvq(tmp, tmp, shift, vlen_enc);
1182         vpxor(dst, dst, tmp, vlen_enc);
1183         vpsubq(dst, dst, tmp, vlen_enc);
1184       }
1185       break;
1186     }
1187     case Op_LShiftVL: {
1188       assert(tmp == xnoreg, "not used");
1189       vpsllvq(dst, src, shift, vlen_enc);
1190       break;
1191     }
1192     case Op_URShiftVL: {
1193       assert(tmp == xnoreg, "not used");
1194       vpsrlvq(dst, src, shift, vlen_enc);
1195       break;
1196     }
1197     default: assert(false, "%s", NodeClassNames[opcode]);
1198   }
1199 }
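
The AVX2 fallback in the Op_RShiftVL case above emulates a variable 64-bit arithmetic right shift, which has no instruction before AVX-512, by shifting logically and then sign-extending with the identically shifted sign mask. Scalar C++ equivalent of one lane (illustrative only):

  #include <cstdint>

  static int64_t sra_emulated(int64_t x, unsigned n) {   // n in [0, 63]
    uint64_t m = 0x8000000000000000ull >> n;             // vector_long_sign_mask lane shifted by n (vpsrlvq on tmp)
    uint64_t v = (uint64_t)x >> n;                        // vpsrlvq on the value
    return (int64_t)((v ^ m) - m);                        // vpxor + vpsubq: propagate the original sign into the top n bits
  }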
1200 
1201 // Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst
1202 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) {
1203   assert(opcode == Op_LShiftVB ||
1204          opcode == Op_RShiftVB ||
1205          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1206   bool sign = (opcode != Op_URShiftVB);
1207   assert(vector_len == 0, "required");
1208   vextendbd(sign, dst, src, 1);
1209   vpmovzxbd(vtmp, shift, 1);
1210   varshiftd(opcode, dst, dst, vtmp, 1);
1211   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, scratch);
1212   vextracti128_high(vtmp, dst);
1213   vpackusdw(dst, dst, vtmp, 0);
1214 }
1215 
1216 // Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst
1217 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) {
1218   assert(opcode == Op_LShiftVB ||
1219          opcode == Op_RShiftVB ||
1220          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1221   bool sign = (opcode != Op_URShiftVB);
1222   int ext_vector_len = vector_len + 1;
1223   vextendbw(sign, dst, src, ext_vector_len);
1224   vpmovzxbw(vtmp, shift, ext_vector_len);
1225   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1226   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, scratch);
1227   if (vector_len == 0) {
1228     vextracti128_high(vtmp, dst);
1229     vpackuswb(dst, dst, vtmp, vector_len);
1230   } else {
1231     vextracti64x4_high(vtmp, dst);
1232     vpackuswb(dst, dst, vtmp, vector_len);
1233     vpermq(dst, dst, 0xD8, vector_len);
1234   }
1235 }
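
Both byte-shift helpers above use the same widen/shift/narrow scheme: there is no packed byte shift on x86, so each byte is extended to a wider lane, shifted there with the per-lane count, masked back to 8 bits and repacked. A per-lane scalar sketch following the dword variant in varshiftbw (illustrative only):

  #include <cstdint>

  static int8_t sra_byte(int8_t value, uint8_t count) {  // signed >> of one byte lane, count in [0, 7]
    int32_t wide = (int32_t)value;                       // vextendbd(sign=true, ...)
    int32_t shifted = wide >> count;                     // vpsravd with the zero-extended shift counts
    return (int8_t)(shifted & 0xFF);                     // vpand(vector_int_to_byte_mask) + vpackusdw
  }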
1236 
1237 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1238   switch(typ) {
1239     case T_BYTE:
1240       pinsrb(dst, val, idx);
1241       break;
1242     case T_SHORT:
1243       pinsrw(dst, val, idx);
1244       break;
1245     case T_INT:
1246       pinsrd(dst, val, idx);
1247       break;
1248     case T_LONG:
1249       pinsrq(dst, val, idx);
1250       break;
1251     default:
1252       assert(false,"Should not reach here.");
1253       break;
1254   }
1255 }
1256 
1257 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1258   switch(typ) {
1259     case T_BYTE:
1260       vpinsrb(dst, src, val, idx);
1261       break;
1262     case T_SHORT:
1263       vpinsrw(dst, src, val, idx);
1264       break;
1265     case T_INT:
1266       vpinsrd(dst, src, val, idx);
1267       break;
1268     case T_LONG:
1269       vpinsrq(dst, src, val, idx);
1270       break;
1271     default:
1272       assert(false,"Should not reach here.");
1273       break;
1274   }
1275 }
1276 
1277 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1278   switch(typ) {
1279     case T_INT:
1280       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1281       break;
1282     case T_FLOAT:
1283       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1284       break;
1285     case T_LONG:
1286       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1287       break;
1288     case T_DOUBLE:
1289       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1290       break;
1291     default:
1292       assert(false,"Should not reach here.");
1293       break;
1294   }
1295 }
1296 
1297 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1298   switch(typ) {
1299     case T_INT:
1300       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1301       break;
1302     case T_FLOAT:
1303       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1304       break;
1305     case T_LONG:
1306       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1307       break;
1308     case T_DOUBLE:
1309       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1310       break;
1311     default:
1312       assert(false,"Should not reach here.");
1313       break;
1314   }
1315 }
1316 
1317 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1318   switch(typ) {
1319     case T_INT:
1320       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1321       break;
1322     case T_FLOAT:
1323       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1324       break;
1325     case T_LONG:
1326       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1327       break;
1328     case T_DOUBLE:
1329       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1330       break;
1331     default:
1332       assert(false,"Should not reach here.");
1333       break;
1334   }
1335 }
1336 
1337 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt) {
1338   if (vlen_in_bytes <= 16) {
1339     pxor (dst, dst);
1340     psubb(dst, src);
1341     switch (elem_bt) {
1342       case T_BYTE:   /* nothing to do */ break;
1343       case T_SHORT:  pmovsxbw(dst, dst); break;
1344       case T_INT:    pmovsxbd(dst, dst); break;
1345       case T_FLOAT:  pmovsxbd(dst, dst); break;
1346       case T_LONG:   pmovsxbq(dst, dst); break;
1347       case T_DOUBLE: pmovsxbq(dst, dst); break;
1348 
1349       default: assert(false, "%s", type2name(elem_bt));
1350     }
1351   } else {
1352     int vlen_enc = vector_length_encoding(vlen_in_bytes);
1353 
1354     vpxor (dst, dst, dst, vlen_enc);
1355     vpsubb(dst, dst, src, vlen_enc);
1356     switch (elem_bt) {
1357       case T_BYTE:   /* nothing to do */            break;
1358       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
1359       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
1360       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
1361       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
1362       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1363 
1364       default: assert(false, "%s", type2name(elem_bt));
1365     }
1366   }
1367 }
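
load_vector_mask above turns a boolean vector into an all-bits mask: assuming the incoming lanes are 0/1 bytes, the pxor/psubb pair negates them to 0x00/0xFF, and the pmovsx* step sign-extends that byte so every bit of the destination lane is set. Per-lane scalar sketch (illustrative only):

  #include <cstdint>

  static int32_t mask_lane_from_byte(uint8_t b) {   // b is 0 or 1
    int8_t all_or_nothing = (int8_t)(0 - b);        // pxor(dst, dst); psubb(dst, src) -> 0x00 or 0xFF
    return (int32_t)all_or_nothing;                 // pmovsxbd -> 0x00000000 or 0xFFFFFFFF
  }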
1368 
1369 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, Register scratch, int vlen_in_bytes) {
1370   ExternalAddress addr(StubRoutines::x86::vector_iota_indices());
1371   if (vlen_in_bytes <= 16) {
1372     movdqu(dst, addr, scratch);
1373   } else if (vlen_in_bytes == 32) {
1374     vmovdqu(dst, addr, scratch);
1375   } else {
1376     assert(vlen_in_bytes == 64, "%d", vlen_in_bytes);
1377     evmovdqub(dst, k0, addr, false /*merge*/, Assembler::AVX_512bit, scratch);
1378   }
1379 }
1380 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1381 
1382 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1383   int vector_len = Assembler::AVX_128bit;
1384 
1385   switch (opcode) {
1386     case Op_AndReductionV:  pand(dst, src); break;
1387     case Op_OrReductionV:   por (dst, src); break;
1388     case Op_XorReductionV:  pxor(dst, src); break;
1389     case Op_MinReductionV:
1390       switch (typ) {
1391         case T_BYTE:        pminsb(dst, src); break;
1392         case T_SHORT:       pminsw(dst, src); break;
1393         case T_INT:         pminsd(dst, src); break;
1394         case T_LONG:        assert(UseAVX > 2, "required");
1395                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1396         default:            assert(false, "wrong type");
1397       }
1398       break;
1399     case Op_MaxReductionV:
1400       switch (typ) {
1401         case T_BYTE:        pmaxsb(dst, src); break;
1402         case T_SHORT:       pmaxsw(dst, src); break;
1403         case T_INT:         pmaxsd(dst, src); break;
1404         case T_LONG:        assert(UseAVX > 2, "required");
1405                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1406         default:            assert(false, "wrong type");
1407       }
1408       break;
1409     case Op_AddReductionVF: addss(dst, src); break;
1410     case Op_AddReductionVD: addsd(dst, src); break;
1411     case Op_AddReductionVI:
1412       switch (typ) {
1413         case T_BYTE:        paddb(dst, src); break;
1414         case T_SHORT:       paddw(dst, src); break;
1415         case T_INT:         paddd(dst, src); break;
1416         default:            assert(false, "wrong type");
1417       }
1418       break;
1419     case Op_AddReductionVL: paddq(dst, src); break;
1420     case Op_MulReductionVF: mulss(dst, src); break;
1421     case Op_MulReductionVD: mulsd(dst, src); break;
1422     case Op_MulReductionVI:
1423       switch (typ) {
1424         case T_SHORT:       pmullw(dst, src); break;
1425         case T_INT:         pmulld(dst, src); break;
1426         default:            assert(false, "wrong type");
1427       }
1428       break;
1429     case Op_MulReductionVL: assert(UseAVX > 2, "required");
1430                             vpmullq(dst, dst, src, vector_len); break;
1431     default:                assert(false, "wrong opcode");
1432   }
1433 }
1434 
1435 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1436   int vector_len = Assembler::AVX_256bit;
1437 
1438   switch (opcode) {
1439     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
1440     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
1441     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
1442     case Op_MinReductionV:
1443       switch (typ) {
1444         case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
1445         case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
1446         case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
1447         case T_LONG:        assert(UseAVX > 2, "required");
1448                             vpminsq(dst, src1, src2, vector_len); break;
1449         default:            assert(false, "wrong type");
1450       }
1451       break;
1452     case Op_MaxReductionV:
1453       switch (typ) {
1454         case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
1455         case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
1456         case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
1457         case T_LONG:        assert(UseAVX > 2, "required");
1458                             vpmaxsq(dst, src1, src2, vector_len); break;
1459         default:            assert(false, "wrong type");
1460       }
1461       break;
1462     case Op_AddReductionVI:
1463       switch (typ) {
1464         case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
1465         case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
1466         case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
1467         default:            assert(false, "wrong type");
1468       }
1469       break;
1470     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
1471     case Op_MulReductionVI:
1472       switch (typ) {
1473         case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
1474         case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
1475         default:            assert(false, "wrong type");
1476       }
1477       break;
1478     case Op_MulReductionVL: vpmullq(dst, src1, src2, vector_len); break;
1479     default:                assert(false, "wrong opcode");
1480   }
1481 }
1482 
1483 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1484                                   XMMRegister dst, XMMRegister src,
1485                                   XMMRegister vtmp1, XMMRegister vtmp2) {
1486   switch (opcode) {
1487     case Op_AddReductionVF:
1488     case Op_MulReductionVF:
1489       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1490       break;
1491 
1492     case Op_AddReductionVD:
1493     case Op_MulReductionVD:
1494       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1495       break;
1496 
1497     default: assert(false, "wrong opcode");
1498   }
1499 }
1500 
1501 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1502                              Register dst, Register src1, XMMRegister src2,
1503                              XMMRegister vtmp1, XMMRegister vtmp2) {
1504   switch (vlen) {
1505     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1506     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1507     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1508     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1509 
1510     default: assert(false, "wrong vector length");
1511   }
1512 }
1513 
1514 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1515                              Register dst, Register src1, XMMRegister src2,
1516                              XMMRegister vtmp1, XMMRegister vtmp2) {
1517   switch (vlen) {
1518     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1519     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1520     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1521     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1522 
1523     default: assert(false, "wrong vector length");
1524   }
1525 }
1526 
1527 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1528                              Register dst, Register src1, XMMRegister src2,
1529                              XMMRegister vtmp1, XMMRegister vtmp2) {
1530   switch (vlen) {
1531     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1532     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1533     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1534     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1535 
1536     default: assert(false, "wrong vector length");
1537   }
1538 }
1539 
1540 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1541                              Register dst, Register src1, XMMRegister src2,
1542                              XMMRegister vtmp1, XMMRegister vtmp2) {
1543   switch (vlen) {
1544     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1545     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1546     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1547     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1548 
1549     default: assert(false, "wrong vector length");
1550   }
1551 }
1552 
1553 #ifdef _LP64
1554 void C2_MacroAssembler::reduceL(int opcode, int vlen,
1555                              Register dst, Register src1, XMMRegister src2,
1556                              XMMRegister vtmp1, XMMRegister vtmp2) {
1557   switch (vlen) {
1558     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1559     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1591       reduce2D(opcode, dst, src, vtmp1);
1592       break;
1593     case 4:
1594       reduce4D(opcode, dst, src, vtmp1, vtmp2);
1595       break;
1596     case 8:
1597       reduce8D(opcode, dst, src, vtmp1, vtmp2);
1598       break;
1599     default: assert(false, "wrong vector length");
1600   }
1601 }
1602 
1603 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1604   if (opcode == Op_AddReductionVI) {
1605     if (vtmp1 != src2) {
1606       movdqu(vtmp1, src2);
1607     }
1608     phaddd(vtmp1, vtmp1);
1609   } else {
1610     pshufd(vtmp1, src2, 0x1);
1611     reduce_operation_128(T_INT, opcode, vtmp1, src2);
1612   }
1613   movdl(vtmp2, src1);
1614   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1615   movdl(dst, vtmp1);
1616 }
1617 
1618 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1619   if (opcode == Op_AddReductionVI) {
1620     if (vtmp1 != src2) {
1621       movdqu(vtmp1, src2);
1622     }
1623     phaddd(vtmp1, src2);
1624     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1625   } else {
1626     pshufd(vtmp2, src2, 0xE);
1627     reduce_operation_128(T_INT, opcode, vtmp2, src2);
1628     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1629   }
1630 }
1631 
1632 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1633   if (opcode == Op_AddReductionVI) {
1634     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
1635     vextracti128_high(vtmp2, vtmp1);
1636     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
1637     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1638   } else {
1639     vextracti128_high(vtmp1, src2);
1640     reduce_operation_128(T_INT, opcode, vtmp1, src2);
1641     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1642   }
1643 }
1644 
1645 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1646   vextracti64x4_high(vtmp2, src2);
1647   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
1648   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1649 }
1650 
1651 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1652   pshufd(vtmp2, src2, 0x1);
1653   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
1654   movdqu(vtmp1, vtmp2);
1655   psrldq(vtmp1, 2);
1656   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
1657   movdqu(vtmp2, vtmp1);
1658   psrldq(vtmp2, 1);
1659   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
1660   movdl(vtmp2, src1);
1661   pmovsxbd(vtmp1, vtmp1);
1662   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1663   pextrb(dst, vtmp1, 0x0);
1664   movsbl(dst, dst);
1665 }
1666 
1667 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1668   pshufd(vtmp1, src2, 0xE);
1669   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
1670   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1671 }
1672 
1673 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1674   vextracti128_high(vtmp2, src2);
1675   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
1676   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1677 }
1678 
1679 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1680   vextracti64x4_high(vtmp1, src2);
1681   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
1682   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1683 }
1684 
1685 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1686   pmovsxbw(vtmp2, src2);
1687   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1688 }
1689 
1690 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1691   if (UseAVX > 1) {
1692     int vector_len = Assembler::AVX_256bit;
1693     vpmovsxbw(vtmp1, src2, vector_len);
1694     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1695   } else {
1696     pmovsxbw(vtmp2, src2);
1697     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1698     pshufd(vtmp2, src2, 0x1);
1699     pmovsxbw(vtmp2, src2);
1700     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
1701   }
1702 }
1703 
1704 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1705   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
1706     int vector_len = Assembler::AVX_512bit;
1707     vpmovsxbw(vtmp1, src2, vector_len);
1708     reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1709   } else {
1710     assert(UseAVX >= 2,"Should not reach here.");
1711     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
1712     vextracti128_high(vtmp2, src2);
1713     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
1714   }
1715 }
1716 
1717 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1718   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
1719   vextracti64x4_high(vtmp2, src2);
1720   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
1721 }
1722 
1723 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1724   if (opcode == Op_AddReductionVI) {
1725     if (vtmp1 != src2) {
1726       movdqu(vtmp1, src2);
1727     }
1728     phaddw(vtmp1, vtmp1);
1729     phaddw(vtmp1, vtmp1);
1730   } else {
1731     pshufd(vtmp2, src2, 0x1);
1732     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
1733     movdqu(vtmp1, vtmp2);
1734     psrldq(vtmp1, 2);
1735     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
1736   }
1737   movdl(vtmp2, src1);
1738   pmovsxwd(vtmp1, vtmp1);
1739   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1740   pextrw(dst, vtmp1, 0x0);
1741   movswl(dst, dst);
1742 }
1743 
1744 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1745   if (opcode == Op_AddReductionVI) {
1746     if (vtmp1 != src2) {
1747       movdqu(vtmp1, src2);
1748     }
1749     phaddw(vtmp1, src2);
1750   } else {
1751     pshufd(vtmp1, src2, 0xE);
1752     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
1753   }
1754   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1755 }
1756 
1757 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1758   if (opcode == Op_AddReductionVI) {
1759     int vector_len = Assembler::AVX_256bit;
1760     vphaddw(vtmp2, src2, src2, vector_len);
1761     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
1762   } else {
1763     vextracti128_high(vtmp2, src2);
1764     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
1765   }
1766   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1767 }
1768 
1769 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1770   int vector_len = Assembler::AVX_256bit;
1771   vextracti64x4_high(vtmp1, src2);
1772   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
1773   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1774 }
1775 
1776 #ifdef _LP64
1777 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1778   pshufd(vtmp2, src2, 0xE);
1779   reduce_operation_128(T_LONG, opcode, vtmp2, src2);
1780   movdq(vtmp1, src1);
1781   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
1782   movdq(dst, vtmp1);
1783 }
1784 
1785 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1786   vextracti128_high(vtmp1, src2);
1787   reduce_operation_128(T_LONG, opcode, vtmp1, src2);
1788   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1789 }
1790 
1791 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1792   vextracti64x4_high(vtmp2, src2);
1793   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
1794   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1795 }
1796 #endif // _LP64
1797 
1798 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
1799   reduce_operation_128(T_FLOAT, opcode, dst, src);
1800   pshufd(vtmp, src, 0x1);
1801   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
1802 }
1803 
1804 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
1805   reduce2F(opcode, dst, src, vtmp);
1806   pshufd(vtmp, src, 0x2);
1807   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
1808   pshufd(vtmp, src, 0x3);
1809   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
1810 }
1811 
1812 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1813   reduce4F(opcode, dst, src, vtmp2);
1814   vextractf128_high(vtmp2, src);
1815   reduce4F(opcode, dst, vtmp2, vtmp1);
1816 }
1817 
1818 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1819   reduce8F(opcode, dst, src, vtmp1, vtmp2);
1820   vextracti64x4_high(vtmp1, src);
1821   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
1822 }
1823 
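// Double reductions use the same ordered, element-by-element accumulation.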
1824 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
1825   reduce_operation_128(T_DOUBLE, opcode, dst, src);
1826   pshufd(vtmp, src, 0xE);
1827   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
1828 }
1829 
1830 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1831   reduce2D(opcode, dst, src, vtmp2);
1832   vextractf128_high(vtmp2, src);
1833   reduce2D(opcode, dst, vtmp2, vtmp1);
1834 }
1835 
1836 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1837   reduce4D(opcode, dst, src, vtmp1, vtmp2);
1838   vextracti64x4_high(vtmp1, src);
1839   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
1840 }
1841 
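// Min/max reduction over a float vector. Each loop iteration halves the live
// width: the upper 256 or 128 bits are extracted (i == 3, i == 2), or the
// upper elements of the final 128-bit lane are permuted down (i == 1, i == 0,
// using permconst), and the two halves are combined with vminmax_fp. When
// is_dst_valid, dst already holds a value to include, so it is folded in with
// one more 128-bit step at the end.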
1842 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
1843                                           XMMRegister dst, XMMRegister src,
1844                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
1845                                           XMMRegister xmm_0, XMMRegister xmm_1) {
1846   int permconst[] = {1, 14};
1847   XMMRegister wsrc = src;
1848   XMMRegister wdst = xmm_0;
1849   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
1850 
1851   int vlen_enc = Assembler::AVX_128bit;
1852   if (vlen == 16) {
1853     vlen_enc = Assembler::AVX_256bit;
1854   }
1855 
1856   for (int i = log2(vlen) - 1; i >=0; i--) {
1857     if (i == 0 && !is_dst_valid) {
1858       wdst = dst;
1859     }
1860     if (i == 3) {
1861       vextracti64x4_high(wtmp, wsrc);
1862     } else if (i == 2) {
1863       vextracti128_high(wtmp, wsrc);
1864     } else { // i = [0,1]
1865       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
1866     }
1867     vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
1868     wsrc = wdst;
1869     vlen_enc = Assembler::AVX_128bit;
1870   }
1871   if (is_dst_valid) {
1872     vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
1873   }
1874 }
1875 
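// Doubles use the same halving strategy: extract the upper 256 or 128 bits,
// then swap the two doubles of the final 128-bit lane with vpermilpd.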
1876 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
1877                                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
1878                                         XMMRegister xmm_0, XMMRegister xmm_1) {
1879   XMMRegister wsrc = src;
1880   XMMRegister wdst = xmm_0;
1881   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
1882   int vlen_enc = Assembler::AVX_128bit;
1883   if (vlen == 8) {
1884     vlen_enc = Assembler::AVX_256bit;
1885   }
1886   for (int i = log2(vlen) - 1; i >=0; i--) {
1887     if (i == 0 && !is_dst_valid) {
1888       wdst = dst;
1889     }
1890     if (i == 1) {
1891       vextracti128_high(wtmp, wsrc);
1892     } else if (i == 2) {
1893       vextracti64x4_high(wtmp, wsrc);
1894     } else {
1895       assert(i == 0, "%d", i);
1896       vpermilpd(wtmp, wsrc, 1, vlen_enc);
1897     }
1898     vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
1899     wsrc = wdst;
1900     vlen_enc = Assembler::AVX_128bit;
1901   }
1902   if (is_dst_valid) {
1903     vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
1904   }
1905 }
1906 
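// Extract the element at index idx of a 128-bit vector into a GPR.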
1907 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
1908   switch (bt) {
1909     case T_BYTE:  pextrb(dst, src, idx); break;
1910     case T_SHORT: pextrw(dst, src, idx); break;
1911     case T_INT:   pextrd(dst, src, idx); break;
1912     case T_LONG:  pextrq(dst, src, idx); break;
1913 
1914     default:
1915       assert(false,"Should not reach here.");
1916       break;
1917   }
1918 }
1919 
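// Return the register holding the 128-bit lane that contains elemindex: the
// lowest lane is used in place (src); higher lanes are first extracted into
// dst (lane 1 requires AVX, lanes 2 and 3 require AVX-512).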
1920 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
1921   int esize =  type2aelembytes(typ);
1922   int elem_per_lane = 16/esize;
1923   int lane = elemindex / elem_per_lane;
1924   int eindex = elemindex % elem_per_lane;
1925 
1926   if (lane >= 2) {
1927     assert(UseAVX > 2, "required");
1928     vextractf32x4(dst, src, lane & 3);
1929     return dst;
1930   } else if (lane > 0) {
1931     assert(UseAVX > 0, "required");
1932     vextractf128(dst, src, lane);
1933     return dst;
1934   } else {
1935     return src;
1936   }
1937 }
1938 
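// Move an integral element into a GPR, sign-extending byte and short values.
// Element 0 of the lane needs no extract and is handled with a plain move.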
1939 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
1940   int esize =  type2aelembytes(typ);
1941   int elem_per_lane = 16/esize;
1942   int eindex = elemindex % elem_per_lane;
1943   assert(is_integral_type(typ),"required");
1944 
1945   if (eindex == 0) {
1946     if (typ == T_LONG) {
1947       movq(dst, src);
1948     } else {
1949       movdl(dst, src);
1950       if (typ == T_BYTE)
1951         movsbl(dst, dst);
1952       else if (typ == T_SHORT)
1953         movswl(dst, dst);
1954     }
1955   } else {
1956     extract(typ, dst, src, eindex);
1957   }
1958 }
1959 
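// Move a float or double element into the low bits of dst: shuffle (float) or
// byte-shift (double) the requested element down to position 0, then clear
// the bits above it (floats are masked with vector_32_bit_mask).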
1960 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, Register tmp, XMMRegister vtmp) {
1961   int esize =  type2aelembytes(typ);
1962   int elem_per_lane = 16/esize;
1963   int eindex = elemindex % elem_per_lane;
1964   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
1965 
1966   if (eindex == 0) {
1967     movq(dst, src);
1968   } else {
1969     if (typ == T_FLOAT) {
1970       if (UseAVX == 0) {
1971         movdqu(dst, src);
1972         pshufps(dst, dst, eindex);
1973       } else {
1974         vpshufps(dst, src, src, eindex, Assembler::AVX_128bit);
1975       }
1976     } else {
1977       if (UseAVX == 0) {
1978         movdqu(dst, src);
1979         psrldq(dst, eindex*esize);
1980       } else {
1981         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
1982       }
1983       movq(dst, dst);
1984     }
1985   }
1986   // Zero upper bits
1987   if (typ == T_FLOAT) {
1988     if (UseAVX == 0) {
1989       assert((vtmp != xnoreg) && (tmp != noreg), "required.");
1990       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), tmp);
1991       pand(dst, vtmp);
1992     } else {
1993       assert((tmp != noreg), "required.");
1994       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, tmp);
1995     }
1996   }
1997 }
1998 
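// AVX-512 masked compare against a memory operand, dispatched on element
// width; T_FLOAT and T_DOUBLE share the integer compare of the same width.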
1999 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral adr, int comparison, int vector_len, Register scratch) {
2000   switch(typ) {
2001     case T_BYTE:
2002       evpcmpb(kdmask, ksmask, src1, adr, comparison, vector_len, scratch);
2003       break;
2004     case T_SHORT:
2005       evpcmpw(kdmask, ksmask, src1, adr, comparison, vector_len, scratch);
2006       break;
2007     case T_INT:
2008     case T_FLOAT:
2009       evpcmpd(kdmask, ksmask, src1, adr, comparison, vector_len, scratch);
2010       break;
2011     case T_LONG:
2012     case T_DOUBLE:
2013       evpcmpq(kdmask, ksmask, src1, adr, comparison, vector_len, scratch);
2014       break;
2015     default:
2016       assert(false,"Should not reach here.");
2017       break;
2018   }
2019 }
2020 
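// AVX-512 masked blend: a per-element select between src1 and src2 under
// kmask, dispatched on element width.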
2021 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2022   switch(typ) {
2023     case T_BYTE:
2024       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2025       break;
2026     case T_SHORT:
2027       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2028       break;
2029     case T_INT:
2030     case T_FLOAT:
2031       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2032       break;
2033     case T_LONG:
2034     case T_DOUBLE:
2035       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2036       break;
2037     default:
2038       assert(false,"Should not reach here.");
2039       break;
2040   }
2041 }
2042 
2043 //-------------------------------------------------------------------------------------------
2044 
2045 // IndexOf for constant substrings with size >= 8 chars
2046 // which don't need to be loaded through the stack.
2047 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2048                                          Register cnt1, Register cnt2,
2049                                          int int_cnt2,  Register result,
2050                                          XMMRegister vec, Register tmp,
2051                                          int ae) {
2052   ShortBranchVerifier sbv(this);
2053   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2054   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2055 
2056   // This method uses the pcmpestri instruction with bound registers
2057   //   inputs:
2058   //     xmm - substring
2059   //     rax - substring length (elements count)
2060   //     mem - scanned string

