< prev index next >

src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp

Print this page
rev 61241 : manual merge with vectorIntrinsics

@@ -31,10 +31,25 @@
 #include "opto/opcodes.hpp"
 #include "runtime/biasedLocking.hpp"
 #include "runtime/objectMonitor.hpp"
 #include "runtime/stubRoutines.hpp"
 
+inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) { // Maps a vector size in bytes to the matching Assembler::AvxVectorLen value.
+  switch (vlen_in_bytes) {
+    case  4: // fall-through
+    case  8: // fall-through
+    case 16: return Assembler::AVX_128bit; // 4-, 8- and 16-byte vectors all share the 128-bit encoding
+    case 32: return Assembler::AVX_256bit;
+    case 64: return Assembler::AVX_512bit;
+
+    default: {
+      ShouldNotReachHere(); // any other size is rejected here
+      return Assembler::AVX_NoVec; // presumably only present so every path returns a value
+    }
+  }
+}
+
 void C2_MacroAssembler::setvectmask(Register dst, Register src) {
   guarantee(PostLoopMultiversioning, "must be");
   Assembler::movl(dst, 1);
   Assembler::shlxl(dst, dst, src);
   Assembler::decl(dst);

@@ -852,10 +867,171 @@
     assert((opcode == Op_NegVF),"opcode should be Op_NegF");
     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, scr);
   }
 }
 
+void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) { // Emits a two-operand packed integer min/max: dst = min|max(dst, src); tmp is only checked on the T_LONG paths.
+  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
+
+  if (opcode == Op_MinV) {
+    if (elem_bt == T_BYTE) {
+      pminsb(dst, src);
+    } else if (elem_bt == T_SHORT) {
+      pminsw(dst, src);
+    } else if (elem_bt == T_INT) {
+      pminsd(dst, src);
+    } else {
+      assert(elem_bt == T_LONG, "required"); // long min is built from a compare plus a blend instead of a single min instruction
+      assert(tmp == xmm0, "required"); // the blend below takes its mask from xmm0, so the temp must be xmm0
+      movdqu(xmm0, dst); // xmm0 = copy of dst
+      pcmpgtq(xmm0, src); // xmm0 = per-lane mask of (dst > src)
+      blendvpd(dst, src);  // xmm0 as mask: take src in the lanes where dst > src
+    }
+  } else { // opcode == Op_MaxV
+    if (elem_bt == T_BYTE) {
+      pmaxsb(dst, src);
+    } else if (elem_bt == T_SHORT) {
+      pmaxsw(dst, src);
+    } else if (elem_bt == T_INT) {
+      pmaxsd(dst, src);
+    } else {
+      assert(elem_bt == T_LONG, "required"); // long max uses the same compare-and-blend scheme as long min
+      assert(tmp == xmm0, "required"); // the blend below takes its mask from xmm0, so the temp must be xmm0
+      movdqu(xmm0, src); // xmm0 = copy of src
+      pcmpgtq(xmm0, dst); // xmm0 = per-lane mask of (src > dst)
+      blendvpd(dst, src);  // xmm0 as mask: take src in the lanes where src > dst
+    }
+  }
+}
+
+void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt, // Emits a three-operand packed integer min/max: dst = min|max(src1, src2).
+                                 XMMRegister dst, XMMRegister src1, XMMRegister src2,
+                                 int vlen_enc) { // vlen_enc is passed through unchanged to every emitted instruction
+  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
+
+  if (opcode == Op_MinV) {
+    if (elem_bt == T_BYTE) {
+      vpminsb(dst, src1, src2, vlen_enc);
+    } else if (elem_bt == T_SHORT) {
+      vpminsw(dst, src1, src2, vlen_enc);
+    } else if (elem_bt == T_INT) {
+      vpminsd(dst, src1, src2, vlen_enc);
+    } else {
+      assert(elem_bt == T_LONG, "required");
+      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) { // vpminsq is used for 512-bit vectors, or for shorter ones when AVX512VL is reported
+        vpminsq(dst, src1, src2, vlen_enc);
+      } else {
+        vpcmpgtq(dst, src1, src2, vlen_enc); // dst = per-lane mask of (src1 > src2); NOTE(review): dst is overwritten before the blend reads src1/src2 - presumably callers keep dst distinct from both, confirm
+        vblendvpd(dst, src1, src2, dst, vlen_enc); // take src2 where the mask is set, src1 elsewhere
+      }
+    }
+  } else { // opcode == Op_MaxV
+    if (elem_bt == T_BYTE) {
+      vpmaxsb(dst, src1, src2, vlen_enc);
+    } else if (elem_bt == T_SHORT) {
+      vpmaxsw(dst, src1, src2, vlen_enc);
+    } else if (elem_bt == T_INT) {
+      vpmaxsd(dst, src1, src2, vlen_enc);
+    } else {
+      assert(elem_bt == T_LONG, "required");
+      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) { // same availability check as the min case
+        vpmaxsq(dst, src1, src2, vlen_enc);
+      } else {
+        vpcmpgtq(dst, src1, src2, vlen_enc); // dst = per-lane mask of (src1 > src2)
+        vblendvpd(dst, src2, src1, dst, vlen_enc); // operands swapped relative to min: take src1 where the mask is set, src2 elsewhere
+      }
+    }
+  }
+}
+
+// Float/Double min max
+
+void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt, // Emits an AVX float/double vector min or max of a and b into dst using three vector temps.
+                                   XMMRegister dst, XMMRegister a, XMMRegister b,
+                                   XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, // tmp, atmp and btmp are all overwritten
+                                   int vlen_enc) {
+  assert(UseAVX > 0, "required");
+  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
+         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
+  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
+
+  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV); // the reduction opcodes share the element-wise sequences
+  bool is_double_word = is_double_word_type(elem_bt); // selects the pd (double) versus ps (float) instruction forms
+
+  if (!is_double_word && is_min) { // NOTE(review): the operand pre-swap and NaN fix-up presumably give Java Math.min/max results for signed zeros and NaN - confirm
+    vblendvps(atmp, a, b, a, vlen_enc); // atmp = b in lanes where a's sign bit is set, else a
+    vblendvps(btmp, b, a, a, vlen_enc); // btmp = a in lanes where a's sign bit is set, else b
+    vminps(tmp, atmp, btmp, vlen_enc); // tmp = lane-wise min of the reordered operands
+    vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc); // btmp = mask of lanes where atmp is unordered with itself (NaN)
+    vblendvps(dst, tmp, atmp, btmp, vlen_enc); // dst = atmp in the NaN lanes, tmp elsewhere
+  } else if (!is_double_word && !is_min) {
+    vblendvps(btmp, b, a, b, vlen_enc); // btmp = a in lanes where b's sign bit is set, else b
+    vblendvps(atmp, a, b, b, vlen_enc); // atmp = b in lanes where b's sign bit is set, else a
+    vmaxps(tmp, atmp, btmp, vlen_enc); // tmp = lane-wise max of the reordered operands
+    vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc); // btmp = mask of lanes where atmp is NaN
+    vblendvps(dst, tmp, atmp, btmp, vlen_enc); // dst = atmp in the NaN lanes, tmp elsewhere
+  } else if (is_double_word && is_min) {
+    vblendvpd(atmp, a, b, a, vlen_enc); // double version of the float min sequence above
+    vblendvpd(btmp, b, a, a, vlen_enc);
+    vminpd(tmp, atmp, btmp, vlen_enc);
+    vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
+    vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
+  } else {
+    assert(is_double_word && !is_min, "sanity");
+    vblendvpd(btmp, b, a, b, vlen_enc); // double version of the float max sequence above
+    vblendvpd(atmp, a, b, b, vlen_enc);
+    vmaxpd(tmp, atmp, btmp, vlen_enc);
+    vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
+    vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
+  }
+}
+
+void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt, // AVX-512 variant of the float/double vector min/max that uses an opmask register instead of a vector mask.
+                                    XMMRegister dst, XMMRegister a, XMMRegister b,
+                                    KRegister ktmp, XMMRegister atmp, XMMRegister btmp, // ktmp, atmp and btmp are all overwritten
+                                    int vlen_enc) {
+  assert(UseAVX > 2, "required");
+  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
+         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
+  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
+
+  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV); // the reduction opcodes share the element-wise sequences
+  bool is_double_word = is_double_word_type(elem_bt); // selects the q/pd (double) versus d/ps (float) instruction forms
+  bool merge = true; // passed as the merge flag to the masked instructions below
+
+  if (!is_double_word && is_min) {
+    evpmovd2m(ktmp, a, vlen_enc); // ktmp = sign bit of each 32-bit lane of a
+    evblendmps(atmp, ktmp, a, b, merge, vlen_enc); // atmp = b in the masked lanes, a elsewhere
+    evblendmps(btmp, ktmp, b, a, merge, vlen_enc); // btmp = a in the masked lanes, b elsewhere
+    vminps(dst, atmp, btmp, vlen_enc); // dst = lane-wise min of the reordered operands
+    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); // ktmp = lanes where atmp is unordered with itself (NaN)
+    evmovdqul(dst, ktmp, atmp, merge, vlen_enc); // overwrite only the NaN lanes of dst with atmp
+  } else if (!is_double_word && !is_min) {
+    evpmovd2m(ktmp, b, vlen_enc); // for max the mask comes from the sign bits of b
+    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
+    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
+    vmaxps(dst, atmp, btmp, vlen_enc);
+    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); // ktmp = lanes where atmp is NaN
+    evmovdqul(dst, ktmp, atmp, merge, vlen_enc); // overwrite only the NaN lanes of dst with atmp
+  } else if (is_double_word && is_min) {
+    evpmovq2m(ktmp, a, vlen_enc); // double version of the float min sequence above, on 64-bit lanes
+    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
+    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
+    vminpd(dst, atmp, btmp, vlen_enc);
+    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
+    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
+  } else {
+    assert(is_double_word && !is_min, "sanity");
+    evpmovq2m(ktmp, b, vlen_enc); // double version of the float max sequence above, on 64-bit lanes
+    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
+    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
+    vmaxpd(dst, atmp, btmp, vlen_enc);
+    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
+    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
+  }
+}
+
 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
   if (sign) {
     pmovsxbw(dst, src);
   } else {
     pmovzxbw(dst, src);

@@ -868,114 +1044,440 @@
   } else {
     vpmovzxbw(dst, src, vector_len);
   }
 }
 
-void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src) {
-  if (opcode == Op_RShiftVI) {
-    psrad(dst, src);
-  } else if (opcode == Op_LShiftVI) {
-    pslld(dst, src);
+void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { // Widens packed bytes in src to dwords in dst, sign- or zero-extending according to 'sign'.
+  if (sign) {
+    vpmovsxbd(dst, src, vector_len); // sign-extending form
   } else {
-    assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
-    psrld(dst, src);
+    vpmovzxbd(dst, src, vector_len); // zero-extending form
   }
 }
 
-void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
-  if (opcode == Op_RShiftVI) {
-    vpsrad(dst, nds, src, vector_len);
-  } else if (opcode == Op_LShiftVI) {
-    vpslld(dst, nds, src, vector_len);
+void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { // Widens packed words in src to dwords in dst, sign- or zero-extending according to 'sign'.
+  if (sign) {
+    vpmovsxwd(dst, src, vector_len); // sign-extending form
   } else {
-    assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
-    vpsrld(dst, nds, src, vector_len);
+    vpmovzxwd(dst, src, vector_len); // zero-extending form
   }
 }
 
-void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src) {
-  if ((opcode == Op_RShiftVS) || (opcode == Op_RShiftVB)) {
-    psraw(dst, src);
-  } else if ((opcode == Op_LShiftVS) || (opcode == Op_LShiftVB)) {
-    psllw(dst, src);
-  } else {
-    assert(((opcode == Op_URShiftVS) || (opcode == Op_URShiftVB)),"opcode should be one of Op_URShiftVS or Op_URShiftVB");
-    psrlw(dst, src);
+void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) { // Two-operand dword shift: shifts dst in place by the count held in 'shift'.
+  switch (opcode) {
+    case Op_RShiftVI:  psrad(dst, shift); break; // arithmetic right shift
+    case Op_LShiftVI:  pslld(dst, shift); break; // left shift
+    case Op_URShiftVI: psrld(dst, shift); break; // logical right shift
+
+    default: assert(false, "%s", NodeClassNames[opcode]); // any other opcode trips the assert, reporting the node class name
   }
 }
 
-void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
-  if ((opcode == Op_RShiftVS) || (opcode == Op_RShiftVB)) {
-    vpsraw(dst, nds, src, vector_len);
-  } else if ((opcode == Op_LShiftVS) || (opcode == Op_LShiftVB)) {
-    vpsllw(dst, nds, src, vector_len);
+void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { // Three-operand dword shift: dst = src shifted by the count held in 'shift'.
+  switch (opcode) {
+    case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break; // arithmetic right shift
+    case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break; // left shift
+    case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break; // logical right shift
+
+    default: assert(false, "%s", NodeClassNames[opcode]); // any other opcode trips the assert, reporting the node class name
+  }
+}
+
+void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) { // Two-operand word shift; the byte-vector opcodes are routed to the same word instructions as the short-vector ones.
+  switch (opcode) {
+    case Op_RShiftVB:  // fall-through
+    case Op_RShiftVS:  psraw(dst, shift); break; // arithmetic right shift
+
+    case Op_LShiftVB:  // fall-through
+    case Op_LShiftVS:  psllw(dst, shift);   break; // left shift
+
+    case Op_URShiftVS: // fall-through
+    case Op_URShiftVB: psrlw(dst, shift);  break; // logical right shift
+
+    default: assert(false, "%s", NodeClassNames[opcode]); // any other opcode trips the assert, reporting the node class name
+  }
+}
+
+void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { // Three-operand word shift: dst = src shifted by 'shift'; byte-vector opcodes share the word instructions.
+  switch (opcode) {
+    case Op_RShiftVB:  // fall-through
+    case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break; // arithmetic right shift
+
+    case Op_LShiftVB:  // fall-through
+    case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break; // left shift
+
+    case Op_URShiftVS: // fall-through
+    case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break; // logical right shift
+
+    default: assert(false, "%s", NodeClassNames[opcode]); // any other opcode trips the assert, reporting the node class name
+  }
+}
+
+void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) { // Two-operand quadword shift: shifts dst in place by the count held in 'shift'.
+  switch (opcode) {
+    case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems; NOTE(review): only the logical shift is emitted here - presumably the caller fixes up the sign bits, confirm
+    case Op_LShiftVL:  psllq(dst, shift); break; // left shift
+    case Op_URShiftVL: psrlq(dst, shift); break; // logical right shift
+
+    default: assert(false, "%s", NodeClassNames[opcode]); // any other opcode trips the assert, reporting the node class name
+  }
+}
+
+void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { // Three-operand quadword shift: dst = src shifted by the count held in 'shift'.
+  switch (opcode) {
+    case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break; // arithmetic right shift; NOTE(review): the ev-prefixed form presumably needs AVX-512 - confirm callers only reach this case when it is available
+    case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break; // left shift
+    case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break; // logical right shift
+
+    default: assert(false, "%s", NodeClassNames[opcode]); // any other opcode trips the assert, reporting the node class name
+  }
+}
+
+void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { // Variable (per-lane count) dword shift; byte and short opcodes are accepted as well and use the same dword instructions.
+  switch (opcode) {
+    case Op_RShiftVB:  // fall-through
+    case Op_RShiftVS:  // fall-through
+    case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break; // arithmetic right shift
+
+    case Op_LShiftVB:  // fall-through
+    case Op_LShiftVS:  // fall-through
+    case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break; // left shift
+
+    case Op_URShiftVB: // fall-through
+    case Op_URShiftVS: // fall-through
+    case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break; // logical right shift
+
+    default: assert(false, "%s", NodeClassNames[opcode]); // any other opcode trips the assert, reporting the node class name
+  }
+}
+
+void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { // Variable (per-lane count) word shift; byte opcodes share the word instructions.
+  switch (opcode) {
+    case Op_RShiftVB:  // fall-through
+    case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break; // arithmetic right shift
+
+    case Op_LShiftVB:  // fall-through
+    case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break; // left shift
+
+    case Op_URShiftVB: // fall-through
+    case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break; // logical right shift
+
+    default: assert(false, "%s", NodeClassNames[opcode]); // any other opcode trips the assert, reporting the node class name
+  }
+}
+
+void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) { // Variable (per-lane count) quadword shift; tmp is only used by the AVX2 arithmetic-right-shift emulation and must be xnoreg otherwise.
+  assert(UseAVX >= 2, "required");
+  switch (opcode) {
+    case Op_RShiftVL: {
+      if (UseAVX > 2) {
+        assert(tmp == xnoreg, "not used");
+        if (!VM_Version::supports_avx512vl()) {
+          vlen_enc = Assembler::AVX_512bit; // without AVX512VL the instruction is emitted at full 512-bit width regardless of the requested length
+        }
+        evpsravq(dst, src, shift, vlen_enc);
   } else {
-    assert(((opcode == Op_URShiftVS) || (opcode == Op_URShiftVB)),"opcode should be one of Op_URShiftVS or Op_URShiftVB");
-    vpsrlw(dst, nds, src, vector_len);
+        vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask())); // tmp = sign mask constant (presumably the top bit of every 64-bit lane - confirm against the stub)
+        vpsrlvq(dst, src, shift, vlen_enc); // dst = src logically shifted right
+        vpsrlvq(tmp, tmp, shift, vlen_enc); // tmp = sign mask shifted by the same counts
+        vpxor(dst, dst, tmp, vlen_enc); // xor followed by subtract of the shifted mask converts the logical result into an arithmetic one
+        vpsubq(dst, dst, tmp, vlen_enc);
+      }
+      break;
+    }
+    case Op_LShiftVL: {
+      assert(tmp == xnoreg, "not used");
+      vpsllvq(dst, src, shift, vlen_enc); // left shift
+      break;
+    }
+    case Op_URShiftVL: {
+      assert(tmp == xnoreg, "not used");
+      vpsrlvq(dst, src, shift, vlen_enc); // logical right shift
+      break;
+    }
+    default: assert(false, "%s", NodeClassNames[opcode]); // any other opcode trips the assert, reporting the node class name
   }
 }
 
-void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src) {
-  if (opcode == Op_RShiftVL) {
-    psrlq(dst, src);  // using srl to implement sra on pre-avs512 systems
-  } else if (opcode == Op_LShiftVL) {
-    psllq(dst, src);
+// Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst
+void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) {
+  assert(opcode == Op_LShiftVB ||
+         opcode == Op_RShiftVB ||
+         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
+  bool sign = (opcode != Op_URShiftVB); // zero-extend the source only for the unsigned right shift
+  assert(vector_len == 0, "required");
+  vextendbd(sign, dst, src, 1); // widen source bytes to dwords; the literal 1 is presumably Assembler::AVX_256bit - confirm
+  vpmovzxbd(vtmp, shift, 1); // widen the shift counts to dwords, always zero-extended
+  varshiftd(opcode, dst, dst, vtmp, 1); // per-lane dword shift
+  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, scratch); // mask each dword with the int-to-byte constant before packing
+  vextracti128_high(vtmp, dst); // vtmp = upper half of the shifted dwords
+  vpackusdw(dst, dst, vtmp, 0); // pack both halves of dwords down to words in dst
+}
+
+// Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst
+void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) {
+  assert(opcode == Op_LShiftVB ||
+         opcode == Op_RShiftVB ||
+         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
+  bool sign = (opcode != Op_URShiftVB); // zero-extend the source only for the unsigned right shift
+  int ext_vector_len = vector_len + 1; // the widened intermediate uses the next vector length encoding up
+  vextendbw(sign, dst, src, ext_vector_len); // widen source bytes to words
+  vpmovzxbw(vtmp, shift, ext_vector_len); // widen the shift counts to words, always zero-extended
+  varshiftw(opcode, dst, dst, vtmp, ext_vector_len); // per-lane word shift
+  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, scratch); // mask each word with the short-to-byte constant before packing
+  if (vector_len == 0) {
+    vextracti128_high(vtmp, dst); // vtmp = upper half of the widened result
+    vpackuswb(dst, dst, vtmp, vector_len); // pack both halves of words down to bytes
   } else {
-    assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
-    psrlq(dst, src);
+    vextracti64x4_high(vtmp, dst); // vtmp = upper 256 bits of the widened result
+    vpackuswb(dst, dst, vtmp, vector_len); // pack both halves of words down to bytes
+    vpermq(dst, dst, 0xD8, vector_len); // reorder the quadwords after the pack (presumably because the pack works within 128-bit lanes - confirm)
+  }
+}
+
+void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) { // Inserts general-register value 'val' into element 'idx' of dst, choosing the pinsr form by element type.
+  switch(typ) {
+    case T_BYTE:
+      pinsrb(dst, val, idx);
+      break;
+    case T_SHORT:
+      pinsrw(dst, val, idx);
+      break;
+    case T_INT:
+      pinsrd(dst, val, idx);
+      break;
+    case T_LONG:
+      pinsrq(dst, val, idx);
+      break;
+    default:
+      assert(false,"Should not reach here."); // only the four integral element types above are handled
+      break;
+  }
+}
+
+void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) { // Three-operand counterpart of insert(): emits the vpinsr form for the element type with src as the extra vector operand.
+  switch(typ) {
+    case T_BYTE:
+      vpinsrb(dst, src, val, idx);
+      break;
+    case T_SHORT:
+      vpinsrw(dst, src, val, idx);
+      break;
+    case T_INT:
+      vpinsrd(dst, src, val, idx);
+      break;
+    case T_LONG:
+      vpinsrq(dst, src, val, idx);
+      break;
+    default:
+      assert(false,"Should not reach here."); // only the four integral element types above are handled
+      break;
+  }
+}
+
+void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) { // Emits a vector-masked gather from base + idx * element size into dst.
+  switch(typ) {
+    case T_INT:
+      vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len); // 4-byte elements: index scaled by 4
+      break;
+    case T_FLOAT:
+      vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
+      break;
+    case T_LONG:
+      vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len); // 8-byte elements: index scaled by 8
+      break;
+    case T_DOUBLE:
+      vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
+      break;
+    default:
+      assert(false,"Should not reach here."); // no byte/short gather is provided here
+      break;
+  }
+}
+
+void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) { // Emits an opmask-controlled gather from base + idx * element size into dst.
+  switch(typ) {
+    case T_INT:
+      evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len); // 4-byte elements: index scaled by 4
+      break;
+    case T_FLOAT:
+      evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
+      break;
+    case T_LONG:
+      evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len); // 8-byte elements: index scaled by 8
+      break;
+    case T_DOUBLE:
+      evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
+      break;
+    default:
+      assert(false,"Should not reach here."); // no byte/short gather is provided here
+      break;
+  }
+}
+
+void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) { // Emits an opmask-controlled scatter of src to base + idx * element size.
+  switch(typ) {
+    case T_INT:
+      evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len); // 4-byte elements: index scaled by 4
+      break;
+    case T_FLOAT:
+      evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
+      break;
+    case T_LONG:
+      evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len); // 8-byte elements: index scaled by 8
+      break;
+    case T_DOUBLE:
+      evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
+      break;
+    default:
+      assert(false,"Should not reach here."); // no byte/short scatter is provided here
+      break;
   }
 }
 
-void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
-  if (opcode == Op_RShiftVL) {
-    evpsraq(dst, nds, src, vector_len);
-  } else if (opcode == Op_LShiftVL) {
-    vpsllq(dst, nds, src, vector_len);
+void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt) { // Negates the bytes of src and sign-extends them to elem_bt-sized lanes in dst.
+  if (vlen_in_bytes <= 16) { // vectors of up to 16 bytes use the two-operand SSE forms
+    pxor (dst, dst); // dst = 0
+    psubb(dst, src); // dst = 0 - src per byte; NOTE(review): presumably src holds 0/1 bytes so 1 becomes 0xFF - confirm
+    switch (elem_bt) {
+      case T_BYTE:   /* nothing to do */ break;
+      case T_SHORT:  pmovsxbw(dst, dst); break; // sign-extend bytes to the element width
+      case T_INT:    pmovsxbd(dst, dst); break;
+      case T_FLOAT:  pmovsxbd(dst, dst); break; // float shares the 4-byte lane width with int
+      case T_LONG:   pmovsxbq(dst, dst); break;
+      case T_DOUBLE: pmovsxbq(dst, dst); break; // double shares the 8-byte lane width with long
+
+      default: assert(false, "%s", type2name(elem_bt));
+    }
   } else {
-    assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
-    vpsrlq(dst, nds, src, vector_len);
+    int vlen_enc = vector_length_encoding(vlen_in_bytes); // larger vectors use the three-operand forms at the matching encoding
+
+    vpxor (dst, dst, dst, vlen_enc); // dst = 0
+    vpsubb(dst, dst, src, vlen_enc); // dst = 0 - src per byte
+    switch (elem_bt) {
+      case T_BYTE:   /* nothing to do */            break;
+      case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break; // sign-extend bytes to the element width
+      case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
+      case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
+      case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
+      case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
+
+      default: assert(false, "%s", type2name(elem_bt));
+    }
   }
 }
 
-// Reductions for vectors of ints, longs, floats, and doubles.
+void C2_MacroAssembler::load_iota_indices(XMMRegister dst, Register scratch, int vlen_in_bytes) { // Loads the constant at StubRoutines::x86::vector_iota_indices() into dst using a move sized for the vector.
+  ExternalAddress addr(StubRoutines::x86::vector_iota_indices());
+  if (vlen_in_bytes <= 16) {
+    movdqu(dst, addr, scratch); // up to 16 bytes: movdqu
+  } else if (vlen_in_bytes == 32) {
+    vmovdqu(dst, addr, scratch); // 32 bytes: vmovdqu
+  } else {
+    assert(vlen_in_bytes == 64, "%d", vlen_in_bytes); // the only remaining supported size is 64 bytes
+    evmovdqub(dst, k0, addr, false /*merge*/, Assembler::AVX_512bit, scratch); // 64 bytes: 512-bit move with k0 passed as the mask operand
+  }
+}
+// Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
 
-void C2_MacroAssembler::reduce_operation_128(int opcode, XMMRegister dst, XMMRegister src) {
+void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { // Emits one 128-bit combining step, dst = dst <op> src, for the reduction opcode; typ selects the lane width where the opcode alone does not.
   int vector_len = Assembler::AVX_128bit;
 
   switch (opcode) {
     case Op_AndReductionV:  pand(dst, src); break;
     case Op_OrReductionV:   por (dst, src); break;
     case Op_XorReductionV:  pxor(dst, src); break;
-
+    case Op_MinReductionV:
+      switch (typ) {
+        case T_BYTE:        pminsb(dst, src); break;
+        case T_SHORT:       pminsw(dst, src); break;
+        case T_INT:         pminsd(dst, src); break;
+        case T_LONG:        assert(UseAVX > 2, "required"); // the long min is only emitted when UseAVX > 2
+                            vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
+        default:            assert(false, "wrong type");
+      }
+      break;
+    case Op_MaxReductionV:
+      switch (typ) {
+        case T_BYTE:        pmaxsb(dst, src); break;
+        case T_SHORT:       pmaxsw(dst, src); break;
+        case T_INT:         pmaxsd(dst, src); break;
+        case T_LONG:        assert(UseAVX > 2, "required"); // the long max is only emitted when UseAVX > 2
+                            vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
+        default:            assert(false, "wrong type");
+      }
+      break;
     case Op_AddReductionVF: addss(dst, src); break;
     case Op_AddReductionVD: addsd(dst, src); break;
-    case Op_AddReductionVI: paddd(dst, src); break;
+    case Op_AddReductionVI: // this opcode now covers byte, short and int lanes, selected by typ
+      switch (typ) {
+        case T_BYTE:        paddb(dst, src); break;
+        case T_SHORT:       paddw(dst, src); break;
+        case T_INT:         paddd(dst, src); break;
+        default:            assert(false, "wrong type");
+      }
+      break;
     case Op_AddReductionVL: paddq(dst, src); break;
-
     case Op_MulReductionVF: mulss(dst, src); break;
     case Op_MulReductionVD: mulsd(dst, src); break;
-    case Op_MulReductionVI: pmulld(dst, src); break;
-    case Op_MulReductionVL: vpmullq(dst, dst, src, vector_len); break;
-
+    case Op_MulReductionVI: // only short and int lanes are handled; T_BYTE falls into the "wrong type" assert
+      switch (typ) {
+        case T_SHORT:       pmullw(dst, src); break;
+        case T_INT:         pmulld(dst, src); break;
+        default:            assert(false, "wrong type");
+      }
+      break;
+    case Op_MulReductionVL: assert(UseAVX > 2, "required"); // the long multiply is only emitted when UseAVX > 2
+                            vpmullq(dst, dst, src, vector_len); break;
     default: assert(false, "wrong opcode");
   }
 }
 
-void C2_MacroAssembler::reduce_operation_256(int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
+void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) { // Emits one 256-bit combining step, dst = src1 <op> src2, for the reduction opcode; typ selects the lane width where the opcode alone does not.
   int vector_len = Assembler::AVX_256bit;
 
   switch (opcode) {
     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
-
-    case Op_AddReductionVI: vpaddd(dst, src1, src2, vector_len); break;
+    case Op_MinReductionV:
+      switch (typ) {
+        case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
+        case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
+        case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
+        case T_LONG:        assert(UseAVX > 2, "required"); // the long min is only emitted when UseAVX > 2
+                            vpminsq(dst, src1, src2, vector_len); break;
+        default:            assert(false, "wrong type");
+      }
+      break;
+    case Op_MaxReductionV:
+      switch (typ) {
+        case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
+        case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
+        case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
+        case T_LONG:        assert(UseAVX > 2, "required"); // the long max is only emitted when UseAVX > 2
+                            vpmaxsq(dst, src1, src2, vector_len); break;
+        default:            assert(false, "wrong type");
+      }
+      break;
+    case Op_AddReductionVI: // this opcode now covers byte, short and int lanes, selected by typ
+      switch (typ) {
+        case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
+        case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
+        case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
+        default:            assert(false, "wrong type");
+      }
+      break;
     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
-
-    case Op_MulReductionVI: vpmulld(dst, src1, src2, vector_len); break;
+    case Op_MulReductionVI: // only short and int lanes are handled; T_BYTE falls into the "wrong type" assert
+      switch (typ) {
+        case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
+        case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
+        default:            assert(false, "wrong type");
+      }
+      break;
     case Op_MulReductionVL: vpmullq(dst, src1, src2, vector_len); break;
-
     default: assert(false, "wrong opcode");
   }
 }
 
 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,

@@ -994,10 +1496,49 @@
 
     default: assert(false, "wrong opcode");
   }
 }
 
+void C2_MacroAssembler::reduceB(int opcode, int vlen, // Dispatches a byte-vector reduction to the helper named for the given vlen.
+                             Register dst, Register src1, XMMRegister src2,
+                             XMMRegister vtmp1, XMMRegister vtmp2) { // all operands are forwarded unchanged to the selected helper
+  switch (vlen) {
+    case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
+    case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
+    case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
+    case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
+
+    default: assert(false, "wrong vector length"); // only lengths 8, 16, 32 and 64 are supported
+  }
+}
+
+void C2_MacroAssembler::mulreduceB(int opcode, int vlen, // Dispatches a byte-vector multiply reduction to the mulreduce helper named for the given vlen.
+                             Register dst, Register src1, XMMRegister src2,
+                             XMMRegister vtmp1, XMMRegister vtmp2) { // all operands are forwarded unchanged to the selected helper
+  switch (vlen) {
+    case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
+    case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
+    case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
+    case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
+
+    default: assert(false, "wrong vector length"); // only lengths 8, 16, 32 and 64 are supported
+  }
+}
+
+void C2_MacroAssembler::reduceS(int opcode, int vlen, // Dispatches a short-vector reduction to the helper named for the given vlen.
+                             Register dst, Register src1, XMMRegister src2,
+                             XMMRegister vtmp1, XMMRegister vtmp2) { // all operands are forwarded unchanged to the selected helper
+  switch (vlen) {
+    case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
+    case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
+    case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
+    case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
+
+    default: assert(false, "wrong vector length"); // only lengths 4, 8, 16 and 32 are supported
+  }
+}
+
 void C2_MacroAssembler::reduceI(int opcode, int vlen,
                                 Register dst, Register src1, XMMRegister src2,
                                 XMMRegister vtmp1, XMMRegister vtmp2) {
   switch (vlen) {
     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;

@@ -1065,14 +1606,14 @@
       movdqu(vtmp1, src2);
     }
     phaddd(vtmp1, vtmp1);
   } else {
     pshufd(vtmp1, src2, 0x1);
-    reduce_operation_128(opcode, vtmp1, src2);
+    reduce_operation_128(T_INT, opcode, vtmp1, src2);
   }
   movdl(vtmp2, src1);
-  reduce_operation_128(opcode, vtmp1, vtmp2);
+  reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
   movdl(dst, vtmp1);
 }
 
 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
   if (opcode == Op_AddReductionVI) {

@@ -1081,11 +1622,11 @@
     }
     phaddd(vtmp1, src2);
     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
   } else {
     pshufd(vtmp2, src2, 0xE);
-    reduce_operation_128(opcode, vtmp2, src2);
+    reduce_operation_128(T_INT, opcode, vtmp2, src2);
     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
   }
 }
 
 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {

@@ -1094,55 +1635,180 @@
     vextracti128_high(vtmp2, vtmp1);
     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
   } else {
     vextracti128_high(vtmp1, src2);
-    reduce_operation_128(opcode, vtmp1, src2);
+    reduce_operation_128(T_INT, opcode, vtmp1, src2);
     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
   }
 }
 
 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { // Halves src2 once, then hands the remaining work to reduce8I.
   vextracti64x4_high(vtmp2, src2); // vtmp2 <- upper half of src2 (per the helper's name)
-  reduce_operation_256(opcode, vtmp2, vtmp2, src2);
+  reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2); // presumably vtmp2 = vtmp2 op src2 per int lane -- confirm against reduce_operation_256
   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); // vtmp2 is both the input and a scratch register of the callee
 }
 
+void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { // Folds the low 8 bytes of src2 and the scalar src1 into dst; assumes reduce_operation_128(bt, op, a, b) computes a = a op b lane-wise -- TODO confirm.
+  pshufd(vtmp2, src2, 0x1); // dword 1 of src2 (bytes 4..7) lands in dword 0 of vtmp2
+  reduce_operation_128(T_BYTE, opcode, vtmp2, src2); // bytes 0..3 of vtmp2: byte[i] op byte[i+4]
+  movdqu(vtmp1, vtmp2);
+  psrldq(vtmp1, 2); // byte shift right by 2: bytes 2..3 line up with bytes 0..1
+  reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); // bytes 0..1 of vtmp1 carry the partial results
+  movdqu(vtmp2, vtmp1);
+  psrldq(vtmp2, 1); // byte shift right by 1: byte 1 lines up with byte 0
+  reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); // byte 0 of vtmp1 is the reduction of the 8 input bytes
+  movdl(vtmp2, src1); // scalar input goes into the low dword of vtmp2
+  pmovsxbd(vtmp1, vtmp1); // sign-extend the low bytes to ints so the scalar can be combined at int width
+  reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
+  pextrb(dst, vtmp1, 0x0); // only the low byte of the combined value is kept
+  movsbl(dst, dst); // sign-extend that byte to 32 bits
+}
+
+void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { // Folds the upper 8 bytes of src2 onto the lower 8, then delegates to reduce8B.
+  pshufd(vtmp1, src2, 0xE); // dwords 2,3 of src2 (upper 64 bits) land in dwords 0,1 of vtmp1
+  reduce_operation_128(T_BYTE, opcode, vtmp1, src2); // presumably vtmp1 = vtmp1 op src2 per byte lane -- confirm against reduce_operation_128
+  reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); // vtmp1 serves as both input and scratch of the callee
+}
+
+void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { // Folds the upper 16 bytes of src2 onto the lower 16, then delegates to reduce16B.
+  vextracti128_high(vtmp2, src2); // vtmp2 <- upper 128 bits of src2 (per the helper's name)
+  reduce_operation_128(T_BYTE, opcode, vtmp2, src2); // presumably vtmp2 = vtmp2 op src2 per byte lane -- confirm against reduce_operation_128
+  reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2); // vtmp2 serves as both input and scratch of the callee
+}
+
+void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { // Folds the upper 32 bytes of src2 onto the lower 32, then delegates to reduce32B.
+  vextracti64x4_high(vtmp1, src2); // vtmp1 <- upper 256 bits of src2 (per the helper's name)
+  reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2); // presumably vtmp1 = vtmp1 op src2 per byte lane -- confirm against reduce_operation_256
+  reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); // vtmp1 serves as both input and scratch of the callee
+}
+
+void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { // Reduces the low 8 bytes of src2 by first widening them to shorts.
+  pmovsxbw(vtmp2, src2); // sign-extend the low 8 bytes of src2 into 8 shorts
+  reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); // the short reduction does the actual work; vtmp2 is input and scratch
+}
+
+// Reduces the 16 bytes in the low 128 bits of src2, together with the scalar
+// src1, into dst. The bytes are sign-extended to shorts first and the short
+// reduction routines do the work (used for the multiply reduction, per the
+// mulreduce name). vtmp1 and vtmp2 are clobbered.
+void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
+  if (UseAVX > 1) {
+    // AVX2: widen all 16 bytes into 16 shorts held in one 256-bit register.
+    int vector_len = Assembler::AVX_256bit;
+    vpmovsxbw(vtmp1, src2, vector_len);
+    reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
+  } else {
+    // Low 8 bytes: widen to shorts and reduce together with src1.
+    pmovsxbw(vtmp2, src2);
+    reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
+    // High 8 bytes: 0xE moves dwords 2,3 (the upper 64 bits) of src2 into the
+    // low 64 bits of vtmp2. The widening must then read that shuffled copy;
+    // widening src2 again would process the low 8 bytes a second time and
+    // never touch the high 8 bytes.
+    pshufd(vtmp2, src2, 0xE);
+    pmovsxbw(vtmp2, vtmp2);
+    // dst already holds the partial result, so it is fed back in as the scalar.
+    reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
+  }
+}
+
+// Reduces 32 bytes of src2 together with the scalar src1 into dst, working at
+// short width. With AVX-512BW the bytes are widened in one step; otherwise the
+// two 16-byte halves are processed one after the other via mulreduce16B.
+void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
+  const bool widen_in_one_step = UseAVX > 2 && VM_Version::supports_avx512bw();
+  if (!widen_in_one_step) {
+    assert(UseAVX >= 2,"Should not reach here.");
+    // Low half first; its result in dst then acts as the scalar input
+    // when the high half is reduced.
+    mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
+    vextracti128_high(vtmp2, src2);
+    mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
+    return;
+  }
+  // Sign-extend all 32 bytes into 32 shorts in a 512-bit register.
+  vpmovsxbw(vtmp1, src2, Assembler::AVX_512bit);
+  reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
+}
+
+void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { // Reduces src2 as two 32-byte halves, chaining the partial result through dst.
+  mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); // lower 32 bytes combined with the scalar src1
+  vextracti64x4_high(vtmp2, src2); // vtmp2 <- upper 256 bits of src2 (per the helper's name)
+  mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); // upper 32 bytes combined with the partial result already in dst
+}
+
+void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { // Folds the low 4 shorts of src2 and the scalar src1 into dst; assumes reduce_operation_128(bt, op, a, b) computes a = a op b lane-wise -- TODO confirm.
+  if (opcode == Op_AddReductionVI) {
+    if (vtmp1 != src2) { // skip the copy when the caller already passed src2 in vtmp1
+      movdqu(vtmp1, src2);
+    }
+    phaddw(vtmp1, vtmp1); // horizontal add: word 0 = s0+s1, word 1 = s2+s3
+    phaddw(vtmp1, vtmp1); // second pass: word 0 = s0+s1+s2+s3
+  } else {
+    pshufd(vtmp2, src2, 0x1); // dword 1 of src2 (shorts 2..3) lands in dword 0 of vtmp2
+    reduce_operation_128(T_SHORT, opcode, vtmp2, src2); // shorts 0..1 of vtmp2: short[i] op short[i+2]
+    movdqu(vtmp1, vtmp2);
+    psrldq(vtmp1, 2); // byte shift right by 2: short 1 lines up with short 0
+    reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2); // short 0 of vtmp1 is the reduction of the 4 input shorts
+  }
+  movdl(vtmp2, src1); // scalar input goes into the low dword of vtmp2
+  pmovsxwd(vtmp1, vtmp1); // sign-extend the low shorts to ints so the scalar can be combined at int width
+  reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
+  pextrw(dst, vtmp1, 0x0); // only the low 16 bits of the combined value are kept
+  movswl(dst, dst); // sign-extend those 16 bits to 32 bits
+}
+
+void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { // Folds 8 shorts of src2 down to 4, then delegates to reduce4S.
+  if (opcode == Op_AddReductionVI) {
+    if (vtmp1 != src2) { // skip the copy when the caller already passed src2 in vtmp1
+      movdqu(vtmp1, src2);
+    }
+    phaddw(vtmp1, src2); // horizontal add: words 0..3 of vtmp1 become the four adjacent-pair sums
+  } else {
+    pshufd(vtmp1, src2, 0xE); // dwords 2,3 of src2 (shorts 4..7) land in the low 64 bits of vtmp1
+    reduce_operation_128(T_SHORT, opcode, vtmp1, src2); // presumably vtmp1 = vtmp1 op src2 per short lane -- confirm against reduce_operation_128
+  }
+  reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); // vtmp1 serves as both input and scratch of the callee
+}
+
+void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { // Folds 16 shorts of src2 down to 8, then delegates to reduce8S.
+  if (opcode == Op_AddReductionVI) {
+    int vector_len = Assembler::AVX_256bit;
+    vphaddw(vtmp2, src2, src2, vector_len); // horizontal add works within each 128-bit lane separately
+    vpermq(vtmp2, vtmp2, 0xD8, vector_len); // 0xD8 orders qwords as 0,2,1,3 so both lanes' pair sums sit in the low 128 bits
+  } else {
+    vextracti128_high(vtmp2, src2); // vtmp2 <- upper 128 bits of src2 (per the helper's name)
+    reduce_operation_128(T_SHORT, opcode, vtmp2, src2); // presumably vtmp2 = vtmp2 op src2 per short lane -- confirm against reduce_operation_128
+  }
+  reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); // vtmp2 serves as both input and scratch of the callee
+}
+
+// Reduces 32 shorts of src2 together with the scalar src1 into dst: the upper
+// 256 bits are folded onto the lower 256 bits, and reduce16S finishes the job.
+// vtmp1 and vtmp2 are clobbered.
+void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
+  vextracti64x4_high(vtmp1, src2);
+  reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
+  // vtmp1 is passed as both the input and a scratch register of the callee.
+  reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
+}
+
 #ifdef _LP64
 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { // Folds the two longs of src2 and the scalar src1 into dst.
   pshufd(vtmp2, src2, 0xE); // dwords 2,3 of src2 (the upper long) land in the low 64 bits of vtmp2
-  reduce_operation_128(opcode, vtmp2, src2);
+  reduce_operation_128(T_LONG, opcode, vtmp2, src2); // presumably vtmp2 = vtmp2 op src2 per long lane -- confirm against reduce_operation_128
   movdq(vtmp1, src1); // scalar input goes into vtmp1
-  reduce_operation_128(opcode, vtmp1, vtmp2);
+  reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2); // combine the scalar with the vector partial result
   movdq(dst, vtmp1); // result leaves through the general-purpose register dst
 }
 
 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { // Folds 4 longs of src2 down to 2, then delegates to reduce2L.
   vextracti128_high(vtmp1, src2); // vtmp1 <- upper 128 bits of src2 (per the helper's name)
-  reduce_operation_128(opcode, vtmp1, src2);
+  reduce_operation_128(T_LONG, opcode, vtmp1, src2); // presumably vtmp1 = vtmp1 op src2 per long lane -- confirm against reduce_operation_128
   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2); // vtmp1 serves as both input and scratch of the callee
 }
 
 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { // Folds 8 longs of src2 down to 4, then delegates to reduce4L.
   vextracti64x4_high(vtmp2, src2); // vtmp2 <- upper 256 bits of src2 (per the helper's name)
-  reduce_operation_256(opcode, vtmp2, vtmp2, src2);
+  reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2); // presumably vtmp2 = vtmp2 op src2 per long lane -- confirm against reduce_operation_256
   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2); // vtmp2 serves as both input and scratch of the callee
 }
 #endif // _LP64
 
 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { // Combines floats 0 and 1 of src into dst, one element at a time; dst is not initialised here, so it carries the incoming value.
-  reduce_operation_128(opcode, dst, src);
+  reduce_operation_128(T_FLOAT, opcode, dst, src); // presumably dst = dst op src[0] -- confirm against reduce_operation_128
   pshufd(vtmp, src, 0x1); // float 1 of src lands in element 0 of vtmp
-  reduce_operation_128(opcode, dst, vtmp);
+  reduce_operation_128(T_FLOAT, opcode, dst, vtmp); // presumably dst = dst op src[1]
 }
 
 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { // Combines floats 0..3 of src into dst in index order.
   reduce2F(opcode, dst, src, vtmp); // floats 0 and 1
   pshufd(vtmp, src, 0x2); // float 2 of src lands in element 0 of vtmp
-  reduce_operation_128(opcode, dst, vtmp);
+  reduce_operation_128(T_FLOAT, opcode, dst, vtmp); // presumably dst = dst op src[2] -- confirm against reduce_operation_128
   pshufd(vtmp, src, 0x3); // float 3 of src lands in element 0 of vtmp
-  reduce_operation_128(opcode, dst, vtmp);
+  reduce_operation_128(T_FLOAT, opcode, dst, vtmp); // presumably dst = dst op src[3]
 }
 
 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
   reduce4F(opcode, dst, src, vtmp2);
   vextractf128_high(vtmp2, src);

@@ -1154,13 +1820,13 @@
   vextracti64x4_high(vtmp1, src);
   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
 }
 
 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { // Combines doubles 0 and 1 of src into dst, one element at a time; dst is not initialised here, so it carries the incoming value.
-  reduce_operation_128(opcode, dst, src);
+  reduce_operation_128(T_DOUBLE, opcode, dst, src); // presumably dst = dst op src[0] -- confirm against reduce_operation_128
   pshufd(vtmp, src, 0xE); // dwords 2,3 of src (double 1) land in the low 64 bits of vtmp
-  reduce_operation_128(opcode, dst, vtmp);
+  reduce_operation_128(T_DOUBLE, opcode, dst, vtmp); // presumably dst = dst op src[1]
 }
 
 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
   reduce2D(opcode, dst, src, vtmp2);
   vextractf128_high(vtmp2, src);

@@ -1171,10 +1837,211 @@
   reduce4D(opcode, dst, src, vtmp1, vtmp2);
   vextracti64x4_high(vtmp1, src);
   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
 }
 
+void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid, // Float min/max reduction by repeated halving; log2(vlen) steps over src.
+                                          XMMRegister dst, XMMRegister src,
+                                          XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, // passed through to vminmax_fp unchanged
+                                          XMMRegister xmm_0, XMMRegister xmm_1) {
+  int permconst[] = {1, 14}; // vpermilps selectors for the last two steps: step 0 uses 1, step 1 uses 14
+  XMMRegister wsrc = src; // current input of the step; switches to the previous step's output
+  XMMRegister wdst = xmm_0; // output register of each step
+  XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; // falls back to xmm_0 when no second work register is supplied
+
+  int vlen_enc = Assembler::AVX_128bit;
+  if (vlen == 16) { // only the first step of a 16-element reduction operates at 256 bits
+    vlen_enc = Assembler::AVX_256bit;
+  }
+
+  for (int i = log2(vlen) - 1; i >=0; i--) {
+    if (i == 0 && !is_dst_valid) { // no incoming value to merge: the last step can write dst directly
+      wdst = dst;
+    }
+    if (i == 3) {
+      vextracti64x4_high(wtmp, wsrc); // upper 256 bits (per the helper's name)
+    } else if (i == 2) {
+      vextracti128_high(wtmp, wsrc); // upper 128 bits (per the helper's name)
+    } else { // i = [0,1]
+      vpermilps(wtmp, wsrc, permconst[i], vlen_enc); // in-lane shuffle brings the partner elements to the low positions
+    }
+    vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); // presumably wdst = min/max(wtmp, wsrc) -- confirm against vminmax_fp
+    wsrc = wdst;
+    vlen_enc = Assembler::AVX_128bit; // every step after the first operates at 128 bits
+  }
+  if (is_dst_valid) { // merge the reduced value with the value already held in dst
+    vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
+  }
+}
+
+void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src, // Double min/max reduction by repeated halving; log2(vlen) steps over src.
+                                        XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, // passed through to vminmax_fp unchanged
+                                        XMMRegister xmm_0, XMMRegister xmm_1) {
+  XMMRegister wsrc = src; // current input of the step; switches to the previous step's output
+  XMMRegister wdst = xmm_0; // output register of each step
+  XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; // falls back to xmm_0 when no second work register is supplied
+  int vlen_enc = Assembler::AVX_128bit;
+  if (vlen == 8) { // only the first step of an 8-element reduction operates at 256 bits
+    vlen_enc = Assembler::AVX_256bit;
+  }
+  for (int i = log2(vlen) - 1; i >=0; i--) {
+    if (i == 0 && !is_dst_valid) { // no incoming value to merge: the last step can write dst directly
+      wdst = dst;
+    }
+    if (i == 1) {
+      vextracti128_high(wtmp, wsrc); // upper 128 bits (per the helper's name)
+    } else if (i == 2) {
+      vextracti64x4_high(wtmp, wsrc); // upper 256 bits (per the helper's name)
+    } else {
+      assert(i == 0, "%d", i);
+      vpermilpd(wtmp, wsrc, 1, vlen_enc); // selector 1 puts double 1 in position 0
+    }
+    vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); // presumably wdst = min/max(wtmp, wsrc) -- confirm against vminmax_fp
+    wsrc = wdst;
+    vlen_enc = Assembler::AVX_128bit; // every step after the first operates at 128 bits
+  }
+  if (is_dst_valid) { // merge the reduced value with the value already held in dst
+    vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
+  }
+}
+
+// Emits the pextr* instruction whose width matches `bt`, copying element
+// `idx` of `src` into `dst`. Only T_BYTE, T_SHORT, T_INT and T_LONG are
+// handled; any other type trips the assert.
+void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
+  if (bt == T_BYTE) {
+    pextrb(dst, src, idx);
+  } else if (bt == T_SHORT) {
+    pextrw(dst, src, idx);
+  } else if (bt == T_INT) {
+    pextrd(dst, src, idx);
+  } else if (bt == T_LONG) {
+    pextrq(dst, src, idx);
+  } else {
+    assert(false,"Should not reach here.");
+  }
+}
+
+// Returns a register holding the 128-bit lane of `src` that contains element
+// `elemindex` of type `typ`. For lane 0 no code is emitted and `src` itself is
+// returned; for any other lane the lane is extracted into `dst`, which is then
+// returned. Lanes 2 and 3 require UseAVX > 2, lane 1 requires UseAVX > 0.
+XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
+  int esize =  type2aelembytes(typ);
+  int elem_per_lane = 16/esize;        // elements per 128-bit lane
+  int lane = elemindex / elem_per_lane;
+
+  if (lane >= 2) {
+    assert(UseAVX > 2, "required");
+    vextractf32x4(dst, src, lane & 3);
+    return dst;
+  } else if (lane > 0) {
+    assert(UseAVX > 0, "required");
+    vextractf128(dst, src, lane);
+    return dst;
+  } else {
+    return src;
+  }
+}
+
+// Copies integral element `elemindex` (taken modulo the number of elements in
+// 16 bytes) from the low 128 bits of `src` into the general-purpose register
+// `dst`. T_BYTE and T_SHORT results are sign-extended to 32 bits regardless of
+// the element position: pextrb/pextrw zero-extend, so the extension has to be
+// applied after the extract path as well as after the movdl path.
+void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
+  int esize =  type2aelembytes(typ);
+  int elem_per_lane = 16/esize;
+  int eindex = elemindex % elem_per_lane;
+  assert(is_integral_type(typ),"required");
+
+  if (eindex == 0) {
+    // Element 0 sits in the low bits, so a plain register move is enough.
+    if (typ == T_LONG) {
+      movq(dst, src);
+    } else {
+      movdl(dst, src);
+    }
+  } else {
+    extract(typ, dst, src, eindex);
+  }
+
+  // Sub-int elements: sign-extend to a full 32-bit value.
+  if (typ == T_BYTE) {
+    movsbl(dst, dst);
+  } else if (typ == T_SHORT) {
+    movswl(dst, dst);
+  }
+}
+
+void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, Register tmp, XMMRegister vtmp) { // Places float/double element `elemindex` (modulo the elements per 16 bytes) of src in the low element of dst.
+  int esize =  type2aelembytes(typ);
+  int elem_per_lane = 16/esize; // elements per 128 bits
+  int eindex = elemindex % elem_per_lane;
+  assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
+
+  if (eindex == 0) {
+    movq(dst, src); // copies the low 64 bits
+  } else {
+    if (typ == T_FLOAT) {
+      if (UseAVX == 0) {
+        movdqu(dst, src); // copy first: the SSE shuffle below works in place on dst
+        pshufps(dst, dst, eindex); // low two selector bits = eindex, so element eindex lands in position 0
+      } else {
+        vpshufps(dst, src, src, eindex, Assembler::AVX_128bit); // same shuffle without the preliminary copy
+      }
+    } else {
+      if (UseAVX == 0) {
+        movdqu(dst, src); // copy first: the SSE shift below works in place on dst
+        psrldq(dst, eindex*esize); // byte shift right moves the element down to bit 0
+      } else {
+        vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit); // same shift without the preliminary copy
+      }
+      movq(dst, dst); // keeps only the low 64 bits of dst
+    }
+  }
+  // Zero upper bits
+  if (typ == T_FLOAT) {
+    if (UseAVX == 0) {
+      assert((vtmp != xnoreg) && (tmp != noreg), "required.");
+      movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), tmp); // load the mask into vtmp; tmp is the scratch register for the address
+      pand(dst, vtmp); // presumably keeps only the low 32 bits (per the stub's name)
+    } else {
+      assert((tmp != noreg), "required.");
+      vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, tmp); // AND with the mask straight from memory; vtmp is not used on this path
+    }
+  }
+}
+
+// Emits the evpcmp{b,w,d,q} variant whose element width matches `typ`,
+// forwarding every operand unchanged. T_FLOAT shares the dword form with
+// T_INT and T_DOUBLE shares the qword form with T_LONG; any other type trips
+// the assert.
+void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral adr, int comparison, int vector_len, Register scratch) {
+  if (typ == T_BYTE) {
+    evpcmpb(kdmask, ksmask, src1, adr, comparison, vector_len, scratch);
+  } else if (typ == T_SHORT) {
+    evpcmpw(kdmask, ksmask, src1, adr, comparison, vector_len, scratch);
+  } else if (typ == T_INT || typ == T_FLOAT) {
+    evpcmpd(kdmask, ksmask, src1, adr, comparison, vector_len, scratch);
+  } else if (typ == T_LONG || typ == T_DOUBLE) {
+    evpcmpq(kdmask, ksmask, src1, adr, comparison, vector_len, scratch);
+  } else {
+    assert(false,"Should not reach here.");
+  }
+}
+
+// Emits the evpblendm{b,w,d,q} variant whose element width matches `typ`,
+// forwarding every operand unchanged. T_FLOAT shares the dword form with
+// T_INT and T_DOUBLE shares the qword form with T_LONG; any other type trips
+// the assert.
+void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
+  const bool dword_elems = (typ == T_INT)  || (typ == T_FLOAT);
+  const bool qword_elems = (typ == T_LONG) || (typ == T_DOUBLE);
+
+  if (typ == T_BYTE) {
+    evpblendmb(dst, kmask, src1, src2, merge, vector_len);
+  } else if (typ == T_SHORT) {
+    evpblendmw(dst, kmask, src1, src2, merge, vector_len);
+  } else if (dword_elems) {
+    evpblendmd(dst, kmask, src1, src2, merge, vector_len);
+  } else if (qword_elems) {
+    evpblendmq(dst, kmask, src1, src2, merge, vector_len);
+  } else {
+    assert(false,"Should not reach here.");
+  }
+}
+
 //-------------------------------------------------------------------------------------------
 
 // IndexOf for constant substrings with size >= 8 chars
 // which don't need to be loaded through stack.
 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
< prev index next >