//
// Copyright (c) 2011, 2018, Oracle and/or its affiliates. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
// This code is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License version 2 only, as
// published by the Free Software Foundation.
//
// This code is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// version 2 for more details (a copy is included in the LICENSE file that
// accompanied this code).
//
// You should have received a copy of the GNU General Public License version
// 2 along with this work; if not, write to the Free Software Foundation,
// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
//
// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
// or visit www.oracle.com if you need additional information or have any
// questions.
//
//

// X86 Common Architecture Description File

//----------REGISTER DEFINITION BLOCK------------------------------------------
// This information is used by the matcher and the register allocator to
// describe individual registers and classes of registers within the target
// architecture.

register %{
//----------Architecture Description Register Definitions----------------------
// General Registers
// "reg_def"  name ( register save type, C convention save type,
//                   ideal register type, encoding );
// Register Save Types:
//
// NS  = No-Save:       The register allocator assumes that these registers
//                      can be used without saving upon entry to the method, &
//                      that they do not need to be saved at call sites.
//
// SOC = Save-On-Call:  The register allocator assumes that these registers
//                      can be used without saving upon entry to the method,
//                      but that they must be saved at call sites.
//
// SOE = Save-On-Entry: The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, but they do not need to be saved at call
//                      sites.
//
// AS  = Always-Save:   The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, & that they must be saved at call sites.
//
// Ideal Register Type is used to determine how to save & restore a
// register.  Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get
// spilled with LoadP/StoreP.  If the register supports both, use Op_RegI.
//
// The encoding number is the actual bit-pattern placed into the opcodes.

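// For illustration only (this is not one of the definitions below): a
// hypothetical general-purpose register entry would look like
//   reg_def RAX(SOC, SOC, Op_RegI, 0, rax->as_VMReg());
// i.e. save-on-call under both conventions, spilled via LoadI/StoreI,
// with hardware encoding 0.
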
// XMM registers.  512-bit registers, i.e. 16 words each, labeled (a)-p.
// Word a in each register holds a Float, words ab hold a Double.
// The whole registers are used in SSE4.2 intrinsics,
// array copy stubs and superword operations (see the UseSSE42Intrinsics,
// UseXMMForArrayCopy and UseSuperWord flags).
// For pre-EVEX architectures:
//      XMM8-XMM15 must be encoded with REX (VEX for UseAVX).
// For EVEX-enabled architectures:
//      XMM8-XMM31 must be encoded with REX (EVEX for UseAVX).
//
// Linux ABI:   No register preserved across function calls
//              XMM0-XMM7 might hold parameters
// Windows ABI: XMM6-XMM15 preserved across function calls
//              (XMM16-XMM31 are volatile)
//              XMM0-XMM3 might hold parameters

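// In the reg_def tables below, each 512-bit register is described as 16
// consecutive 32-bit VMReg slots: XMM0 names word (a), and XMM0b-XMM0p
// name words (b)-(p) via as_VMReg()->next(1..15). The register classes
// further down then select just the slots a value needs: one word for a
// Float, two for a Double, four for a 128-bit vector, and so on.
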
reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
reg_def XMM0b( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(1));
reg_def XMM0c( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(2));
reg_def XMM0d( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(3));
reg_def XMM0e( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(4));
reg_def XMM0f( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(5));
reg_def XMM0g( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(6));
reg_def XMM0h( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(7));
reg_def XMM0i( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(8));
reg_def XMM0j( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(9));
reg_def XMM0k( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(10));
reg_def XMM0l( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(11));
reg_def XMM0m( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(12));
reg_def XMM0n( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(13));
reg_def XMM0o( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(14));
reg_def XMM0p( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(15));

reg_def XMM1 ( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg());
reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(1));
reg_def XMM1c( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(2));
reg_def XMM1d( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(3));
reg_def XMM1e( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(4));
reg_def XMM1f( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(5));
reg_def XMM1g( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(6));
reg_def XMM1h( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(7));
reg_def XMM1i( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(8));
reg_def XMM1j( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(9));
reg_def XMM1k( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(10));
reg_def XMM1l( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(11));
reg_def XMM1m( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(12));
reg_def XMM1n( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(13));
reg_def XMM1o( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(14));
reg_def XMM1p( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(15));

reg_def XMM2 ( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg());
reg_def XMM2b( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(1));
reg_def XMM2c( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(2));
reg_def XMM2d( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(3));
reg_def XMM2e( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(4));
reg_def XMM2f( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(5));
reg_def XMM2g( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(6));
reg_def XMM2h( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(7));
reg_def XMM2i( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(8));
reg_def XMM2j( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(9));
reg_def XMM2k( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(10));
reg_def XMM2l( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(11));
reg_def XMM2m( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(12));
reg_def XMM2n( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(13));
reg_def XMM2o( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(14));
reg_def XMM2p( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(15));

reg_def XMM3 ( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg());
reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(1));
reg_def XMM3c( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(2));
reg_def XMM3d( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(3));
reg_def XMM3e( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(4));
reg_def XMM3f( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(5));
reg_def XMM3g( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(6));
reg_def XMM3h( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(7));
reg_def XMM3i( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(8));
reg_def XMM3j( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(9));
reg_def XMM3k( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(10));
reg_def XMM3l( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(11));
reg_def XMM3m( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(12));
reg_def XMM3n( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(13));
reg_def XMM3o( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(14));
reg_def XMM3p( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(15));

reg_def XMM4 ( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg());
reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(1));
reg_def XMM4c( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(2));
reg_def XMM4d( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(3));
reg_def XMM4e( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(4));
reg_def XMM4f( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(5));
reg_def XMM4g( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(6));
reg_def XMM4h( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(7));
reg_def XMM4i( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(8));
reg_def XMM4j( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(9));
reg_def XMM4k( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(10));
reg_def XMM4l( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(11));
reg_def XMM4m( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(12));
reg_def XMM4n( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(13));
reg_def XMM4o( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(14));
reg_def XMM4p( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(15));

reg_def XMM5 ( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg());
reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(1));
reg_def XMM5c( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(2));
reg_def XMM5d( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(3));
reg_def XMM5e( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(4));
reg_def XMM5f( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(5));
reg_def XMM5g( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(6));
reg_def XMM5h( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(7));
reg_def XMM5i( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(8));
reg_def XMM5j( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(9));
reg_def XMM5k( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(10));
reg_def XMM5l( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(11));
reg_def XMM5m( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(12));
reg_def XMM5n( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(13));
reg_def XMM5o( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(14));
reg_def XMM5p( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(15));

reg_def XMM6 ( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg());
reg_def XMM6b( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(1));
reg_def XMM6c( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(2));
reg_def XMM6d( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(3));
reg_def XMM6e( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(4));
reg_def XMM6f( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(5));
reg_def XMM6g( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(6));
reg_def XMM6h( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(7));
reg_def XMM6i( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(8));
reg_def XMM6j( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(9));
reg_def XMM6k( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(10));
reg_def XMM6l( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(11));
reg_def XMM6m( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(12));
reg_def XMM6n( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(13));
reg_def XMM6o( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(14));
reg_def XMM6p( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(15));

reg_def XMM7 ( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg());
reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(1));
reg_def XMM7c( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(2));
reg_def XMM7d( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(3));
reg_def XMM7e( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(4));
reg_def XMM7f( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(5));
reg_def XMM7g( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(6));
reg_def XMM7h( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(7));
reg_def XMM7i( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(8));
reg_def XMM7j( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(9));
reg_def XMM7k( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(10));
reg_def XMM7l( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(11));
reg_def XMM7m( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(12));
reg_def XMM7n( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(13));
reg_def XMM7o( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(14));
reg_def XMM7p( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(15));

#ifdef _LP64

reg_def XMM8 ( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg());
reg_def XMM8b( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(1));
reg_def XMM8c( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(2));
reg_def XMM8d( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(3));
reg_def XMM8e( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(4));
reg_def XMM8f( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(5));
reg_def XMM8g( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(6));
reg_def XMM8h( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(7));
reg_def XMM8i( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(8));
reg_def XMM8j( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(9));
reg_def XMM8k( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(10));
reg_def XMM8l( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(11));
reg_def XMM8m( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(12));
reg_def XMM8n( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(13));
reg_def XMM8o( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(14));
reg_def XMM8p( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(15));

reg_def XMM9 ( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg());
reg_def XMM9b( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(1));
reg_def XMM9c( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(2));
reg_def XMM9d( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(3));
reg_def XMM9e( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(4));
reg_def XMM9f( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(5));
reg_def XMM9g( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(6));
reg_def XMM9h( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(7));
reg_def XMM9i( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(8));
reg_def XMM9j( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(9));
reg_def XMM9k( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(10));
reg_def XMM9l( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(11));
reg_def XMM9m( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(12));
reg_def XMM9n( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(13));
reg_def XMM9o( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(14));
reg_def XMM9p( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(15));

reg_def XMM10 ( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg());
reg_def XMM10b( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(1));
reg_def XMM10c( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(2));
reg_def XMM10d( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(3));
reg_def XMM10e( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(4));
reg_def XMM10f( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(5));
reg_def XMM10g( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(6));
reg_def XMM10h( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(7));
reg_def XMM10i( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(8));
reg_def XMM10j( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(9));
reg_def XMM10k( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(10));
reg_def XMM10l( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(11));
reg_def XMM10m( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(12));
reg_def XMM10n( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(13));
reg_def XMM10o( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(14));
reg_def XMM10p( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(15));

reg_def XMM11 ( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg());
reg_def XMM11b( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(1));
reg_def XMM11c( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(2));
reg_def XMM11d( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(3));
reg_def XMM11e( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(4));
reg_def XMM11f( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(5));
reg_def XMM11g( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(6));
reg_def XMM11h( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(7));
reg_def XMM11i( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(8));
reg_def XMM11j( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(9));
reg_def XMM11k( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(10));
reg_def XMM11l( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(11));
reg_def XMM11m( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(12));
reg_def XMM11n( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(13));
reg_def XMM11o( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(14));
reg_def XMM11p( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(15));

reg_def XMM12 ( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg());
reg_def XMM12b( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(1));
reg_def XMM12c( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(2));
reg_def XMM12d( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(3));
reg_def XMM12e( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(4));
reg_def XMM12f( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(5));
reg_def XMM12g( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(6));
reg_def XMM12h( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(7));
reg_def XMM12i( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(8));
reg_def XMM12j( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(9));
reg_def XMM12k( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(10));
reg_def XMM12l( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(11));
reg_def XMM12m( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(12));
reg_def XMM12n( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(13));
reg_def XMM12o( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(14));
reg_def XMM12p( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(15));

reg_def XMM13 ( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg());
reg_def XMM13b( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(1));
reg_def XMM13c( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(2));
reg_def XMM13d( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(3));
reg_def XMM13e( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(4));
reg_def XMM13f( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(5));
reg_def XMM13g( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(6));
reg_def XMM13h( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(7));
reg_def XMM13i( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(8));
reg_def XMM13j( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(9));
reg_def XMM13k( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(10));
reg_def XMM13l( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(11));
reg_def XMM13m( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(12));
reg_def XMM13n( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(13));
reg_def XMM13o( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(14));
reg_def XMM13p( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(15));

reg_def XMM14 ( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg());
reg_def XMM14b( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(1));
reg_def XMM14c( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(2));
reg_def XMM14d( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(3));
reg_def XMM14e( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(4));
reg_def XMM14f( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(5));
reg_def XMM14g( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(6));
reg_def XMM14h( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(7));
reg_def XMM14i( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(8));
reg_def XMM14j( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(9));
reg_def XMM14k( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(10));
reg_def XMM14l( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(11));
reg_def XMM14m( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(12));
reg_def XMM14n( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(13));
reg_def XMM14o( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(14));
reg_def XMM14p( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(15));

reg_def XMM15 ( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg());
reg_def XMM15b( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(1));
reg_def XMM15c( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(2));
reg_def XMM15d( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(3));
reg_def XMM15e( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(4));
reg_def XMM15f( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(5));
reg_def XMM15g( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(6));
reg_def XMM15h( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(7));
reg_def XMM15i( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(8));
reg_def XMM15j( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(9));
reg_def XMM15k( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(10));
reg_def XMM15l( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(11));
reg_def XMM15m( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(12));
reg_def XMM15n( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(13));
reg_def XMM15o( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(14));
reg_def XMM15p( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(15));

reg_def XMM16 ( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg());
reg_def XMM16b( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(1));
reg_def XMM16c( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(2));
reg_def XMM16d( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(3));
reg_def XMM16e( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(4));
reg_def XMM16f( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(5));
reg_def XMM16g( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(6));
reg_def XMM16h( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(7));
reg_def XMM16i( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(8));
reg_def XMM16j( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(9));
reg_def XMM16k( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(10));
reg_def XMM16l( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(11));
reg_def XMM16m( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(12));
reg_def XMM16n( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(13));
reg_def XMM16o( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(14));
reg_def XMM16p( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(15));

reg_def XMM17 ( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg());
reg_def XMM17b( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(1));
reg_def XMM17c( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(2));
reg_def XMM17d( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(3));
reg_def XMM17e( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(4));
reg_def XMM17f( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(5));
reg_def XMM17g( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(6));
reg_def XMM17h( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(7));
reg_def XMM17i( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(8));
reg_def XMM17j( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(9));
reg_def XMM17k( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(10));
reg_def XMM17l( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(11));
reg_def XMM17m( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(12));
reg_def XMM17n( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(13));
reg_def XMM17o( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(14));
reg_def XMM17p( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(15));

reg_def XMM18 ( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg());
reg_def XMM18b( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(1));
reg_def XMM18c( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(2));
reg_def XMM18d( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(3));
reg_def XMM18e( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(4));
reg_def XMM18f( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(5));
reg_def XMM18g( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(6));
reg_def XMM18h( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(7));
reg_def XMM18i( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(8));
reg_def XMM18j( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(9));
reg_def XMM18k( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(10));
reg_def XMM18l( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(11));
reg_def XMM18m( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(12));
reg_def XMM18n( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(13));
reg_def XMM18o( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(14));
reg_def XMM18p( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(15));

reg_def XMM19 ( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg());
reg_def XMM19b( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(1));
reg_def XMM19c( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(2));
reg_def XMM19d( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(3));
reg_def XMM19e( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(4));
reg_def XMM19f( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(5));
reg_def XMM19g( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(6));
reg_def XMM19h( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(7));
reg_def XMM19i( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(8));
reg_def XMM19j( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(9));
reg_def XMM19k( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(10));
reg_def XMM19l( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(11));
reg_def XMM19m( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(12));
reg_def XMM19n( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(13));
reg_def XMM19o( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(14));
reg_def XMM19p( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(15));

reg_def XMM20 ( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg());
reg_def XMM20b( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(1));
reg_def XMM20c( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(2));
reg_def XMM20d( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(3));
reg_def XMM20e( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(4));
reg_def XMM20f( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(5));
reg_def XMM20g( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(6));
reg_def XMM20h( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(7));
reg_def XMM20i( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(8));
reg_def XMM20j( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(9));
reg_def XMM20k( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(10));
reg_def XMM20l( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(11));
reg_def XMM20m( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(12));
reg_def XMM20n( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(13));
reg_def XMM20o( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(14));
reg_def XMM20p( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(15));

reg_def XMM21 ( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg());
reg_def XMM21b( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(1));
reg_def XMM21c( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(2));
reg_def XMM21d( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(3));
reg_def XMM21e( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(4));
reg_def XMM21f( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(5));
reg_def XMM21g( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(6));
reg_def XMM21h( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(7));
reg_def XMM21i( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(8));
reg_def XMM21j( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(9));
reg_def XMM21k( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(10));
reg_def XMM21l( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(11));
reg_def XMM21m( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(12));
reg_def XMM21n( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(13));
reg_def XMM21o( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(14));
reg_def XMM21p( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(15));

reg_def XMM22 ( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg());
reg_def XMM22b( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(1));
reg_def XMM22c( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(2));
reg_def XMM22d( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(3));
reg_def XMM22e( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(4));
reg_def XMM22f( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(5));
reg_def XMM22g( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(6));
reg_def XMM22h( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(7));
reg_def XMM22i( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(8));
reg_def XMM22j( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(9));
reg_def XMM22k( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(10));
reg_def XMM22l( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(11));
reg_def XMM22m( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(12));
reg_def XMM22n( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(13));
reg_def XMM22o( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(14));
reg_def XMM22p( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(15));

reg_def XMM23 ( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg());
reg_def XMM23b( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(1));
reg_def XMM23c( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(2));
reg_def XMM23d( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(3));
reg_def XMM23e( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(4));
reg_def XMM23f( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(5));
reg_def XMM23g( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(6));
reg_def XMM23h( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(7));
reg_def XMM23i( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(8));
reg_def XMM23j( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(9));
reg_def XMM23k( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(10));
reg_def XMM23l( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(11));
reg_def XMM23m( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(12));
reg_def XMM23n( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(13));
reg_def XMM23o( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(14));
reg_def XMM23p( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(15));

reg_def XMM24 ( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg());
reg_def XMM24b( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(1));
reg_def XMM24c( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(2));
reg_def XMM24d( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(3));
reg_def XMM24e( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(4));
reg_def XMM24f( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(5));
reg_def XMM24g( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(6));
reg_def XMM24h( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(7));
reg_def XMM24i( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(8));
reg_def XMM24j( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(9));
reg_def XMM24k( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(10));
reg_def XMM24l( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(11));
reg_def XMM24m( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(12));
reg_def XMM24n( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(13));
reg_def XMM24o( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(14));
reg_def XMM24p( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(15));

reg_def XMM25 ( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg());
reg_def XMM25b( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(1));
reg_def XMM25c( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(2));
reg_def XMM25d( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(3));
reg_def XMM25e( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(4));
reg_def XMM25f( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(5));
reg_def XMM25g( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(6));
reg_def XMM25h( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(7));
reg_def XMM25i( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(8));
reg_def XMM25j( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(9));
reg_def XMM25k( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(10));
reg_def XMM25l( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(11));
reg_def XMM25m( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(12));
reg_def XMM25n( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(13));
reg_def XMM25o( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(14));
reg_def XMM25p( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(15));

reg_def XMM26 ( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg());
reg_def XMM26b( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(1));
reg_def XMM26c( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(2));
reg_def XMM26d( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(3));
reg_def XMM26e( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(4));
reg_def XMM26f( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(5));
reg_def XMM26g( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(6));
reg_def XMM26h( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(7));
reg_def XMM26i( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(8));
reg_def XMM26j( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(9));
reg_def XMM26k( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(10));
reg_def XMM26l( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(11));
reg_def XMM26m( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(12));
reg_def XMM26n( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(13));
reg_def XMM26o( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(14));
reg_def XMM26p( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(15));

reg_def XMM27 ( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg());
reg_def XMM27b( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(1));
reg_def XMM27c( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(2));
reg_def XMM27d( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(3));
reg_def XMM27e( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(4));
reg_def XMM27f( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(5));
reg_def XMM27g( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(6));
reg_def XMM27h( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(7));
reg_def XMM27i( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(8));
reg_def XMM27j( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(9));
reg_def XMM27k( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(10));
reg_def XMM27l( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(11));
reg_def XMM27m( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(12));
reg_def XMM27n( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(13));
reg_def XMM27o( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(14));
reg_def XMM27p( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(15));

reg_def XMM28 ( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg());
reg_def XMM28b( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(1));
reg_def XMM28c( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(2));
reg_def XMM28d( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(3));
reg_def XMM28e( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(4));
reg_def XMM28f( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(5));
reg_def XMM28g( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(6));
reg_def XMM28h( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(7));
reg_def XMM28i( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(8));
reg_def XMM28j( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(9));
reg_def XMM28k( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(10));
reg_def XMM28l( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(11));
reg_def XMM28m( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(12));
reg_def XMM28n( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(13));
reg_def XMM28o( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(14));
reg_def XMM28p( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(15));

reg_def XMM29 ( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg());
reg_def XMM29b( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(1));
reg_def XMM29c( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(2));
reg_def XMM29d( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(3));
reg_def XMM29e( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(4));
reg_def XMM29f( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(5));
reg_def XMM29g( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(6));
reg_def XMM29h( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(7));
reg_def XMM29i( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(8));
reg_def XMM29j( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(9));
reg_def XMM29k( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(10));
reg_def XMM29l( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(11));
reg_def XMM29m( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(12));
reg_def XMM29n( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(13));
reg_def XMM29o( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(14));
reg_def XMM29p( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(15));

reg_def XMM30 ( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg());
reg_def XMM30b( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(1));
reg_def XMM30c( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(2));
reg_def XMM30d( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(3));
reg_def XMM30e( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(4));
reg_def XMM30f( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(5));
reg_def XMM30g( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(6));
reg_def XMM30h( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(7));
reg_def XMM30i( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(8));
reg_def XMM30j( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(9));
reg_def XMM30k( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(10));
reg_def XMM30l( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(11));
reg_def XMM30m( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(12));
reg_def XMM30n( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(13));
reg_def XMM30o( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(14));
reg_def XMM30p( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(15));

reg_def XMM31 ( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg());
reg_def XMM31b( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(1));
reg_def XMM31c( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(2));
reg_def XMM31d( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(3));
reg_def XMM31e( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(4));
reg_def XMM31f( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(5));
reg_def XMM31g( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(6));
reg_def XMM31h( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(7));
reg_def XMM31i( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(8));
reg_def XMM31j( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(9));
reg_def XMM31k( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(10));
reg_def XMM31l( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(11));
reg_def XMM31m( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(12));
reg_def XMM31n( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(13));
reg_def XMM31o( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(14));
reg_def XMM31p( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(15));

#endif // _LP64

#ifdef _LP64
reg_def RFLAGS(SOC, SOC, 0, 16, VMRegImpl::Bad());
#else
reg_def RFLAGS(SOC, SOC, 0, 8, VMRegImpl::Bad());
#endif // _LP64

alloc_class chunk1(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
                   XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
                   XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
                   XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
                   XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
                   XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
                   XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
                   XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
#ifdef _LP64
                  ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
                   XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
                   XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
                   XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
                   XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
                   XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
                   XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
                   XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
                  ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
                   XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
                   XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
                   XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
                   XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
                   XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
                   XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
                   XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
                   XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
                   XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
                   XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
                   XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
                   XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
                   XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
                   XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
                   XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
#endif
                      );

// The flags allocation class should be last.
alloc_class chunk2(RFLAGS);

// Singleton class for condition codes
reg_class int_flags(RFLAGS);

// Class for pre-EVEX float registers
reg_class float_reg_legacy(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7
#ifdef _LP64
                   ,XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15
#endif
                    );

// Class for EVEX float registers
reg_class float_reg_evex(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7
#ifdef _LP64
                   ,XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15,
                    XMM16,
                    XMM17,
                    XMM18,
                    XMM19,
                    XMM20,
                    XMM21,
                    XMM22,
                    XMM23,
                    XMM24,
                    XMM25,
                    XMM26,
                    XMM27,
                    XMM28,
                    XMM29,
                    XMM30,
                    XMM31
#endif
                    );

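// A note on reg_class_dynamic (a sketch of the ADLC semantics as used
// here): the class resolves to its first argument when the trailing
// %{ ... %} predicate holds, and to the second otherwise. So below,
// float_reg behaves as float_reg_evex on CPUs where
// VM_Version::supports_evex() is true, and as float_reg_legacy elsewhere.
// The same pattern repeats for the double and vector classes that follow.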
reg_class_dynamic float_reg(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() %} );

// Class for pre-EVEX double registers
reg_class double_reg_legacy(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b
#ifdef _LP64
                    ,XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b
#endif
                     );

// Class for EVEX double registers
reg_class double_reg_evex(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b
#ifdef _LP64
                    ,XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b,
                     XMM16, XMM16b,
                     XMM17, XMM17b,
                     XMM18, XMM18b,
                     XMM19, XMM19b,
                     XMM20, XMM20b,
                     XMM21, XMM21b,
                     XMM22, XMM22b,
                     XMM23, XMM23b,
                     XMM24, XMM24b,
                     XMM25, XMM25b,
                     XMM26, XMM26b,
                     XMM27, XMM27b,
                     XMM28, XMM28b,
                     XMM29, XMM29b,
                     XMM30, XMM30b,
                     XMM31, XMM31b
#endif
                     );

reg_class_dynamic double_reg(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() %} );

// Class for pre-EVEX 32bit vector registers
reg_class vectors_reg_legacy(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7
#ifdef _LP64
                     ,XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15
#endif
                      );

// Class for EVEX 32bit vector registers
reg_class vectors_reg_evex(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7
#ifdef _LP64
                     ,XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15,
                      XMM16,
                      XMM17,
                      XMM18,
                      XMM19,
                      XMM20,
                      XMM21,
                      XMM22,
                      XMM23,
                      XMM24,
                      XMM25,
                      XMM26,
                      XMM27,
                      XMM28,
                      XMM29,
                      XMM30,
                      XMM31
#endif
                      );

reg_class_dynamic vectors_reg(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_evex() %} );

// Class for pre-EVEX 64bit vector registers
reg_class vectord_reg_legacy(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b
#ifdef _LP64
                     ,XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b
#endif
                      );

// Class for EVEX 64bit vector registers
reg_class vectord_reg_evex(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b
#ifdef _LP64
                     ,XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b,
                      XMM16, XMM16b,
                      XMM17, XMM17b,
                      XMM18, XMM18b,
                      XMM19, XMM19b,
                      XMM20, XMM20b,
                      XMM21, XMM21b,
                      XMM22, XMM22b,
                      XMM23, XMM23b,
                      XMM24, XMM24b,
                      XMM25, XMM25b,
                      XMM26, XMM26b,
                      XMM27, XMM27b,
                      XMM28, XMM28b,
                      XMM29, XMM29b,
                      XMM30, XMM30b,
                      XMM31, XMM31b
#endif
                      );

reg_class_dynamic vectord_reg(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_evex() %} );

// Class for pre-EVEX 128bit vector registers
reg_class vectorx_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,
                      XMM1,  XMM1b,  XMM1c,  XMM1d,
                      XMM2,  XMM2b,  XMM2c,  XMM2d,
                      XMM3,  XMM3b,  XMM3c,  XMM3d,
                      XMM4,  XMM4b,  XMM4c,  XMM4d,
                      XMM5,  XMM5b,  XMM5c,  XMM5d,
                      XMM6,  XMM6b,  XMM6c,  XMM6d,
                      XMM7,  XMM7b,  XMM7c,  XMM7d
#ifdef _LP64
                     ,XMM8,  XMM8b,  XMM8c,  XMM8d,
                      XMM9,  XMM9b,  XMM9c,  XMM9d,
                      XMM10, XMM10b, XMM10c, XMM10d,
                      XMM11, XMM11b, XMM11c, XMM11d,
                      XMM12, XMM12b, XMM12c, XMM12d,
                      XMM13, XMM13b, XMM13c, XMM13d,
                      XMM14, XMM14b, XMM14c, XMM14d,
                      XMM15, XMM15b, XMM15c, XMM15d
#endif
                      );

// Class for EVEX 128bit vector registers
reg_class vectorx_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,
                      XMM1,  XMM1b,  XMM1c,  XMM1d,
                      XMM2,  XMM2b,  XMM2c,  XMM2d,
                      XMM3,  XMM3b,  XMM3c,  XMM3d,
                      XMM4,  XMM4b,  XMM4c,  XMM4d,
                      XMM5,  XMM5b,  XMM5c,  XMM5d,
                      XMM6,  XMM6b,  XMM6c,  XMM6d,
                      XMM7,  XMM7b,  XMM7c,  XMM7d
#ifdef _LP64
                     ,XMM8,  XMM8b,  XMM8c,  XMM8d,
                      XMM9,  XMM9b,  XMM9c,  XMM9d,
                      XMM10, XMM10b, XMM10c, XMM10d,
                      XMM11, XMM11b, XMM11c, XMM11d,
                      XMM12, XMM12b, XMM12c, XMM12d,
                      XMM13, XMM13b, XMM13c, XMM13d,
                      XMM14, XMM14b, XMM14c, XMM14d,
                      XMM15, XMM15b, XMM15c, XMM15d,
                      XMM16, XMM16b, XMM16c, XMM16d,
                      XMM17, XMM17b, XMM17c, XMM17d,
                      XMM18, XMM18b, XMM18c, XMM18d,
                      XMM19, XMM19b, XMM19c, XMM19d,
                      XMM20, XMM20b, XMM20c, XMM20d,
                      XMM21, XMM21b, XMM21c, XMM21d,
                      XMM22, XMM22b, XMM22c, XMM22d,
                      XMM23, XMM23b, XMM23c, XMM23d,
                      XMM24, XMM24b, XMM24c, XMM24d,
                      XMM25, XMM25b, XMM25c, XMM25d,
                      XMM26, XMM26b, XMM26c, XMM26d,
                      XMM27, XMM27b, XMM27c, XMM27d,
                      XMM28, XMM28b, XMM28c, XMM28d,
                      XMM29, XMM29b, XMM29c, XMM29d,
                      XMM30, XMM30b, XMM30c, XMM30d,
                      XMM31, XMM31b, XMM31c, XMM31d
#endif
                      );

reg_class_dynamic vectorx_reg(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_evex() %} );

 973 // Class for all 256bit vector registers
 974 reg_class vectory_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 975                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 976                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 977                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 978                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
 979                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
 980                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
 981                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
 982 #ifdef _LP64
 983                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
 984                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
 985                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
 986                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
 987                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
 988                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 989                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 990                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h
 991 #endif
 992                       );
 993 
 994 // Class for all 256bit vector registers
 995 reg_class vectory_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 996                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 997                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 998                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 999                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
1000                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
1001                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
1002                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
1003 #ifdef _LP64
1004                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
1005                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
1006                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
1007                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
1008                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
1009                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
1010                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
1011                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h,
1012                       XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h,
1013                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h,
1014                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h,
1015                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h,
1016                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h,
1017                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h,
1018                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h,
1019                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h,
1020                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h,
1021                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h,
1022                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h,
1023                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h,
1024                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h,
1025                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h,
1026                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h,
1027                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h
1028 #endif
1029                       );
1030 
1031 reg_class_dynamic vectory_reg(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_evex() %} );
1032 
1033 // Class for all 512bit vector registers
1034 reg_class vectorz_reg(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
1035                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
1036                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
1037                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
1038                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
1039                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
1040                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
1041                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
1042 #ifdef _LP64
1043                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
1044                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
1045                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
1046                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
1047                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
1048                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
1049                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
1050                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
1051                      ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
1052                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
1053                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
1054                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
1055                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
1056                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
1057                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
1058                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
1059                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
1060                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
1061                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
1062                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
1063                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
1064                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
1065                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
1066                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
1067 #endif
1068                       );
1069 
1070 reg_class xmm0_reg(XMM0, XMM0b, XMM0c, XMM0d);
1071 reg_class ymm0_reg(XMM0, XMM0b, XMM0c, XMM0d, XMM0e, XMM0f, XMM0g, XMM0h);
1072 reg_class zmm0_reg(XMM0, XMM0b, XMM0c, XMM0d, XMM0e, XMM0f, XMM0g, XMM0h, XMM0i, XMM0j, XMM0k, XMM0l, XMM0m, XMM0n, XMM0o, XMM0p);
1073 
1074 reg_class xmm1_reg(XMM1, XMM1b, XMM1c, XMM1d);
1075 reg_class ymm1_reg(XMM1, XMM1b, XMM1c, XMM1d, XMM1e, XMM1f, XMM1g, XMM1h);
1076 reg_class zmm1_reg(XMM1, XMM1b, XMM1c, XMM1d, XMM1e, XMM1f, XMM1g, XMM1h, XMM1i, XMM1j, XMM1k, XMM1l, XMM1m, XMM1n, XMM1o, XMM1p);
1077 
1078 reg_class xmm2_reg(XMM2, XMM2b, XMM2c, XMM2d);
1079 reg_class ymm2_reg(XMM2, XMM2b, XMM2c, XMM2d, XMM2e, XMM2f, XMM2g, XMM2h);
1080 reg_class zmm2_reg(XMM2, XMM2b, XMM2c, XMM2d, XMM2e, XMM2f, XMM2g, XMM2h, XMM2i, XMM2j, XMM2k, XMM2l, XMM2m, XMM2n, XMM2o, XMM2p);
1081 
1082 reg_class xmm3_reg(XMM3, XMM3b, XMM3c, XMM3d);
1083 reg_class ymm3_reg(XMM3, XMM3b, XMM3c, XMM3d, XMM3e, XMM3f, XMM3g, XMM3h);
1084 reg_class zmm3_reg(XMM3, XMM3b, XMM3c, XMM3d, XMM3e, XMM3f, XMM3g, XMM3h, XMM3i, XMM3j, XMM3k, XMM3l, XMM3m, XMM3n, XMM3o, XMM3p);
1085 
1086 reg_class xmm4_reg(XMM4, XMM4b, XMM4c, XMM4d);
1087 reg_class ymm4_reg(XMM4, XMM4b, XMM4c, XMM4d, XMM4e, XMM4f, XMM4g, XMM4h);
1088 reg_class zmm4_reg(XMM4, XMM4b, XMM4c, XMM4d, XMM4e, XMM4f, XMM4g, XMM4h, XMM4i, XMM4j, XMM4k, XMM4l, XMM4m, XMM4n, XMM4o, XMM4p);
1089 
1090 reg_class xmm5_reg(XMM5, XMM5b, XMM5c, XMM5d);
1091 reg_class ymm5_reg(XMM5, XMM5b, XMM5c, XMM5d, XMM5e, XMM5f, XMM5g, XMM5h);
1092 reg_class zmm5_reg(XMM5, XMM5b, XMM5c, XMM5d, XMM5e, XMM5f, XMM5g, XMM5h, XMM5i, XMM5j, XMM5k, XMM5l, XMM5m, XMM5n, XMM5o, XMM5p);
1093 
1094 reg_class xmm6_reg(XMM6, XMM6b, XMM6c, XMM6d);
1095 reg_class ymm6_reg(XMM6, XMM6b, XMM6c, XMM6d, XMM6e, XMM6f, XMM6g, XMM6h);
1096 reg_class zmm6_reg(XMM6, XMM6b, XMM6c, XMM6d, XMM6e, XMM6f, XMM6g, XMM6h, XMM6i, XMM6j, XMM6k, XMM6l, XMM6m, XMM6n, XMM6o, XMM6p);
1097 
1098 reg_class xmm7_reg(XMM7, XMM7b, XMM7c, XMM7d);
1099 reg_class ymm7_reg(XMM7, XMM7b, XMM7c, XMM7d, XMM7e, XMM7f, XMM7g, XMM7h);
1100 reg_class zmm7_reg(XMM7, XMM7b, XMM7c, XMM7d, XMM7e, XMM7f, XMM7g, XMM7h, XMM7i, XMM7j, XMM7k, XMM7l, XMM7m, XMM7n, XMM7o, XMM7p);
1101 
1102 #ifdef _LP64
1103 
1104 reg_class xmm8_reg(XMM8, XMM8b, XMM8c, XMM8d);
1105 reg_class ymm8_reg(XMM8, XMM8b, XMM8c, XMM8d, XMM8e, XMM8f, XMM8g, XMM8h);
1106 reg_class zmm8_reg(XMM8, XMM8b, XMM8c, XMM8d, XMM8e, XMM8f, XMM8g, XMM8h, XMM8i, XMM8j, XMM8k, XMM8l, XMM8m, XMM8n, XMM8o, XMM8p);
1107 
1108 reg_class xmm9_reg(XMM9, XMM9b, XMM9c, XMM9d);
1109 reg_class ymm9_reg(XMM9, XMM9b, XMM9c, XMM9d, XMM9e, XMM9f, XMM9g, XMM9h);
1110 reg_class zmm9_reg(XMM9, XMM9b, XMM9c, XMM9d, XMM9e, XMM9f, XMM9g, XMM9h, XMM9i, XMM9j, XMM9k, XMM9l, XMM9m, XMM9n, XMM9o, XMM9p);
1111 
1112 reg_class xmm10_reg(XMM10, XMM10b, XMM10c, XMM10d);
1113 reg_class ymm10_reg(XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h);
1114 reg_class zmm10_reg(XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p);
1115 
1116 reg_class xmm11_reg(XMM11, XMM11b, XMM11c, XMM11d);
1117 reg_class ymm11_reg(XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h);
1118 reg_class zmm11_reg(XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p);
1119 
1120 reg_class xmm12_reg(XMM12, XMM12b, XMM12c, XMM12d);
1121 reg_class ymm12_reg(XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h);
1122 reg_class zmm12_reg(XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p);
1123 
1124 reg_class xmm13_reg(XMM13, XMM13b, XMM13c, XMM13d);
1125 reg_class ymm13_reg(XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h);
1126 reg_class zmm13_reg(XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p);
1127 
1128 reg_class xmm14_reg(XMM14, XMM14b, XMM14c, XMM14d);
1129 reg_class ymm14_reg(XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h);
1130 reg_class zmm14_reg(XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p);
1131 
1132 reg_class xmm15_reg(XMM15, XMM15b, XMM15c, XMM15d);
1133 reg_class ymm15_reg(XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h);
1134 reg_class zmm15_reg(XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p);
1135 
1136 reg_class xmm16_reg(XMM16, XMM16b, XMM16c, XMM16d);
1137 reg_class ymm16_reg(XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h);
1138 reg_class zmm16_reg(XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p);
1139 
1140 reg_class xmm17_reg(XMM17, XMM17b, XMM17c, XMM17d);
1141 reg_class ymm17_reg(XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h);
1142 reg_class zmm17_reg(XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p);
1143 
1144 reg_class xmm18_reg(XMM18, XMM18b, XMM18c, XMM18d);
1145 reg_class ymm18_reg(XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h);
1146 reg_class zmm18_reg(XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p);
1147 
1148 reg_class xmm19_reg(XMM19, XMM19b, XMM19c, XMM19d);
1149 reg_class ymm19_reg(XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h);
1150 reg_class zmm19_reg(XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p);
1151 
1152 reg_class xmm20_reg(XMM20, XMM20b, XMM20c, XMM20d);
1153 reg_class ymm20_reg(XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h);
1154 reg_class zmm20_reg(XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p);
1155 
1156 reg_class xmm21_reg(XMM21, XMM21b, XMM21c, XMM21d);
1157 reg_class ymm21_reg(XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h);
1158 reg_class zmm21_reg(XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p);
1159 
1160 reg_class xmm22_reg(XMM22, XMM22b, XMM22c, XMM22d);
1161 reg_class ymm22_reg(XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h);
1162 reg_class zmm22_reg(XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p);
1163 
1164 reg_class xmm23_reg(XMM23, XMM23b, XMM23c, XMM23d);
1165 reg_class ymm23_reg(XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h);
1166 reg_class zmm23_reg(XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p);
1167 
1168 reg_class xmm24_reg(XMM24, XMM24b, XMM24c, XMM24d);
1169 reg_class ymm24_reg(XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h);
1170 reg_class zmm24_reg(XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p);
1171 
1172 reg_class xmm25_reg(XMM25, XMM25b, XMM25c, XMM25d);
1173 reg_class ymm25_reg(XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h);
1174 reg_class zmm25_reg(XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p);
1175 
1176 reg_class xmm26_reg(XMM26, XMM26b, XMM26c, XMM26d);
1177 reg_class ymm26_reg(XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h);
1178 reg_class zmm26_reg(XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p);
1179 
1180 reg_class xmm27_reg(XMM27, XMM27b, XMM27c, XMM27d);
1181 reg_class ymm27_reg(XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h);
1182 reg_class zmm27_reg(XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p);
1183 
1184 reg_class xmm28_reg(XMM28, XMM28b, XMM28c, XMM28d);
1185 reg_class ymm28_reg(XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h);
1186 reg_class zmm28_reg(XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p);
1187 
1188 reg_class xmm29_reg(XMM29, XMM29b, XMM29c, XMM29d);
1189 reg_class ymm29_reg(XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h);
1190 reg_class zmm29_reg(XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p);
1191 
1192 reg_class xmm30_reg(XMM30, XMM30b, XMM30c, XMM30d);
1193 reg_class ymm30_reg(XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h);
1194 reg_class zmm30_reg(XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p);
1195 
1196 reg_class xmm31_reg(XMM31, XMM31b, XMM31c, XMM31d);
1197 reg_class ymm31_reg(XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h);
1198 reg_class zmm31_reg(XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p);
1199 
1200 #endif
1201 
1202 %}
1203 
1204 
1205 //----------SOURCE BLOCK-------------------------------------------------------
1206 // This is a block of C++ code which provides values, functions, and
// definitions necessary in the rest of the architecture description.
1208 
1209 source_hpp %{
1210 // Header information of the source block.
1211 // Method declarations/definitions which are used outside
1212 // the ad-scope can conveniently be defined here.
1213 //
1214 // To keep related declarations/definitions/uses close together,
1215 // we switch between source %{ }% and source_hpp %{ }% freely as needed.
1216 
1217 class NativeJump;
1218 
1219 class CallStubImpl {
1220 
1221   //--------------------------------------------------------------
1222   //---<  Used for optimization in Compile::shorten_branches  >---
1223   //--------------------------------------------------------------
1224 
1225  public:
1226   // Size of call trampoline stub.
1227   static uint size_call_trampoline() {
1228     return 0; // no call trampolines on this platform
1229   }
1230 
1231   // number of relocations needed by a call trampoline stub
1232   static uint reloc_call_trampoline() {
1233     return 0; // no call trampolines on this platform
1234   }
1235 };
1236 
1237 class HandlerImpl {
1238 
1239  public:
1240 
1241   static int emit_exception_handler(CodeBuffer &cbuf);
1242   static int emit_deopt_handler(CodeBuffer& cbuf);
1243 
1244   static uint size_exception_handler() {
1245     // NativeCall instruction size is the same as NativeJump.
    // The exception handler starts out as a jump and can be patched to
    // a call by deoptimization.  (4932387)
1248     // Note that this value is also credited (in output.cpp) to
1249     // the size of the code section.
1250     return NativeJump::instruction_size;
1251   }
1252 
1253 #ifdef _LP64
1254   static uint size_deopt_handler() {
    // three 5-byte instructions plus one move for the unreachable address.
1256     return 15+3;
1257   }
1258 #else
1259   static uint size_deopt_handler() {
1260     // NativeCall instruction size is the same as NativeJump.
    // The exception handler starts out as a jump and can be patched to
    // a call by deoptimization.  (4932387)
1263     // Note that this value is also credited (in output.cpp) to
1264     // the size of the code section.
1265     return 5 + NativeJump::instruction_size; // pushl(); jmp;
1266   }
1267 #endif
1268 };
1269 
1270 %} // end source_hpp
1271 
1272 source %{
1273 
1274 #include "opto/addnode.hpp"
1275 
1276 // Emit exception handler code.
1277 // Stuff framesize into a register and call a VM stub routine.
1278 int HandlerImpl::emit_exception_handler(CodeBuffer& cbuf) {
1279 
1280   // Note that the code buffer's insts_mark is always relative to insts.
1281   // That's why we must use the macroassembler to generate a handler.
1282   MacroAssembler _masm(&cbuf);
1283   address base = __ start_a_stub(size_exception_handler());
1284   if (base == NULL) {
1285     ciEnv::current()->record_failure("CodeCache is full");
1286     return 0;  // CodeBuffer::expand failed
1287   }
1288   int offset = __ offset();
1289   __ jump(RuntimeAddress(OptoRuntime::exception_blob()->entry_point()));
1290   assert(__ offset() - offset <= (int) size_exception_handler(), "overflow");
1291   __ end_a_stub();
1292   return offset;
1293 }
1294 
1295 // Emit deopt handler code.
1296 int HandlerImpl::emit_deopt_handler(CodeBuffer& cbuf) {
1297 
1298   // Note that the code buffer's insts_mark is always relative to insts.
1299   // That's why we must use the macroassembler to generate a handler.
1300   MacroAssembler _masm(&cbuf);
1301   address base = __ start_a_stub(size_deopt_handler());
1302   if (base == NULL) {
1303     ciEnv::current()->record_failure("CodeCache is full");
1304     return 0;  // CodeBuffer::expand failed
1305   }
1306   int offset = __ offset();
1307 
1308 #ifdef _LP64
1309   address the_pc = (address) __ pc();
1310   Label next;
1311   // push a "the_pc" on the stack without destroying any registers
1312   // as they all may be live.
1313 
1314   // push address of "next"
1315   __ call(next, relocInfo::none); // reloc none is fine since it is a disp32
1316   __ bind(next);
1317   // adjust it so it matches "the_pc"
1318   __ subptr(Address(rsp, 0), __ offset() - offset);
1319 #else
1320   InternalAddress here(__ pc());
1321   __ pushptr(here.addr());
1322 #endif
1323 
1324   __ jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
1325   assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow %d", (__ offset() - offset));
1326   __ end_a_stub();
1327   return offset;
1328 }
1329 
1330 
1331 //=============================================================================
1332 
1333   // Float masks come from different places depending on platform.
1334 #ifdef _LP64
1335   static address float_signmask()  { return StubRoutines::x86::float_sign_mask(); }
1336   static address float_signflip()  { return StubRoutines::x86::float_sign_flip(); }
1337   static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
1338   static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
1339   static address vector_float_signmask() { return StubRoutines::x86::vector_float_sign_mask(); }
1340   static address vector_float_signflip() { return StubRoutines::x86::vector_float_sign_flip(); }
1341   static address vector_double_signmask() { return StubRoutines::x86::vector_double_sign_mask(); }
1342   static address vector_double_signflip() { return StubRoutines::x86::vector_double_sign_flip(); }
1343   static address vector_all_bits_set() { return StubRoutines::x86::vector_all_bits_set(); }
1344   static address vector_byte_bitset() { return StubRoutines::x86::vector_byte_bitset(); }
1345   static address vector_long_perm_mask() { return StubRoutines::x86::vector_long_perm_mask(); }
1346   static address vector_short_to_byte_mask() { return StubRoutines::x86::vector_short_to_byte_mask(); }
1347   static address vector_int_to_byte_mask() { return StubRoutines::x86::vector_int_to_byte_mask(); }
1348   static address vector_int_to_short_mask() { return StubRoutines::x86::vector_int_to_short_mask(); }
1349   static address vector_32_bit_mask() { return StubRoutines::x86::vector_32_bit_mask(); }
1350   static address vector_64_bit_mask() { return StubRoutines::x86::vector_64_bit_mask(); }
1351   static address vector_all_ones_mask() { return StubRoutines::x86::vector_all_ones_mask(); }
1352   static address vector_int_shufflemask() { return StubRoutines::x86::vector_int_shuffle_mask(); }
1353   static address vector_int_sizemask() { return StubRoutines::x86::vector_int_size_mask(); }
1354   static address vector_short_shufflemask() { return StubRoutines::x86::vector_short_shuffle_mask(); }
1355   static address vector_short_sizemask() { return StubRoutines::x86::vector_short_size_mask(); }
1356   static address vector_long_shufflemask() { return StubRoutines::x86::vector_long_shuffle_mask(); }
1357   static address vector_long_sizemask() { return StubRoutines::x86::vector_long_size_mask(); }
1358 #else
1359   static address float_signmask()  { return (address)float_signmask_pool; }
1360   static address float_signflip()  { return (address)float_signflip_pool; }
1361   static address double_signmask() { return (address)double_signmask_pool; }
1362   static address double_signflip() { return (address)double_signflip_pool; }
1363 #endif
1364 
1365 
1366 const bool Matcher::match_rule_supported(int opcode) {
1367   if (!has_match_rule(opcode))
1368     return false;
1369 
1370   bool ret_value = true;
1371   switch (opcode) {
1372     case Op_PopCountI:
1373     case Op_PopCountL:
1374       if (!UsePopCountInstruction)
1375         ret_value = false;
1376       break;
1377     case Op_PopCountVI:
1378       if (!UsePopCountInstruction || !VM_Version::supports_vpopcntdq())
1379         ret_value = false;
1380       break;
1381     case Op_MulVI:
1382     case Op_MulVL:
      if ((UseSSE < 4) && (UseAVX < 1)) // only with SSE4.1 or AVX
1384         ret_value = false;
1385       break;
1386     case Op_MulReductionVL:
1387       if (VM_Version::supports_avx512dq() == false)
1388         ret_value = false;
1389       break;
1390     case Op_AddReductionVL:
      if (UseAVX < 3) // only with EVEX; vector connectivity becomes an issue here
1392         ret_value = false;
1393       break;
1394     case Op_MulReductionVI:
1395       if (UseSSE < 4) // requires at least SSE4
1396         ret_value = false;
1397       break;
1398     case Op_AddReductionVF:
1399     case Op_AddReductionVD:
1400     case Op_MulReductionVF:
1401     case Op_MulReductionVD:
1402       if (UseSSE < 1) // requires at least SSE
1403         ret_value = false;
1404       break;
1405     case Op_SqrtVD:
1406     case Op_SqrtVF:
1407       if (UseAVX < 1) // enabled for AVX only
1408         ret_value = false;
1409       break;
1410     case Op_CompareAndSwapL:
1411 #ifdef _LP64
1412     case Op_CompareAndSwapP:
1413 #endif
1414       if (!VM_Version::supports_cx8())
1415         ret_value = false;
1416       break;
1417     case Op_CMoveVF:
1418     case Op_CMoveVD:
1419       if (UseAVX < 1 || UseAVX > 2)
1420         ret_value = false;
1421       break;
1422     case Op_StrIndexOf:
1423       if (!UseSSE42Intrinsics)
1424         ret_value = false;
1425       break;
1426     case Op_StrIndexOfChar:
1427       if (!UseSSE42Intrinsics)
1428         ret_value = false;
1429       break;
1430     case Op_OnSpinWait:
1431       if (VM_Version::supports_on_spin_wait() == false)
1432         ret_value = false;
1433       break;
1434   }
1435 
  return ret_value;  // By default, match rules are supported.
1437 }
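
// Illustrative example (hypothetical flag settings): with -XX:UseAVX=1,
// Op_CMoveVF is accepted above (1 <= UseAVX <= 2), while Op_PopCountVI is
// still rejected unless the CPU also reports AVX512_VPOPCNTDQ support.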
1438 
1439 const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt, int op_arity) {
  // Identify extra cases that we might want to provide match rules for,
  // e.g. Op_* vector nodes and other intrinsics, while guarding with vlen.
1442   bool ret_value = match_rule_supported(opcode);
1443   if (ret_value) {
1444     int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
1445     if (!vector_size_supported(bt, vlen)) {
1446       ret_value = false;
1447     } else if (size_in_bits > 256 && UseAVX <= 2) {
1448       // Only AVX512 supports 512-bit vectors
1449       ret_value = false;
1450     } else if (UseAVX == 0 && size_in_bits > 128) {
1451       // Only AVX supports 256-bit vectors
1452       ret_value = false;
1453     } else if (is_subword_type(bt) && size_in_bits == 512 && VM_Version::supports_avx512bw() == false) {
      // Byte and short types are not supported with AVX512 unless AVX512BW is available.
1455       ret_value = false;
1456     } else {
      switch (opcode) {
1458         case Op_AbsV:
1459           if (is_integral_type(bt) && UseSSE < 3) { ret_value = false; }
1460           else if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) { ret_value = false; }
1461           else if (bt == T_LONG && UseAVX <= 2) { ret_value = false; } // Implementation limitation
1462           break;
1463         case Op_AddVB:
1464         case Op_SubVB:
1465           if ((vlen == 64) && (VM_Version::supports_avx512bw() == false))
1466             ret_value = false;
1467           break;
1468         case Op_MaxV:
1469         case Op_MinV:
1470           if (UseSSE < 4 && (bt == T_BYTE || bt == T_INT || bt == T_LONG))
1471             ret_value = false;
1472           break;
1473         case Op_MulVB:
1474           if (size_in_bits <= 128 && UseSSE < 4) { ret_value = false; }
1475           else if (size_in_bits > 256 && UseAVX < 2) { ret_value = false; }
1476           break;
1477         case Op_LShiftVI:
1478         case Op_RShiftVI:
1479         case Op_URShiftVI:
1480           if (op_arity == 2 && UseAVX <= 1)
1481             ret_value  = false;
1482           break;
1483         case Op_LShiftVL:
1484         case Op_RShiftVL:
1485         case Op_URShiftVL:
1486           if (op_arity == 2 && UseAVX <= 1)
1487             ret_value  = false;
1488           break;
1489         case Op_URShiftVS:
1490         case Op_RShiftVS:
1491         case Op_LShiftVS:
1492         case Op_MulVS:
1493         case Op_AddVS:
1494         case Op_SubVS:
1495           if ((vlen == 32) && (VM_Version::supports_avx512bw() == false))
1496             ret_value = false;
1497           break;
1498         case Op_CMoveVF:
1499           if (vlen != 8)
1500             ret_value  = false;
1501           break;
1502         case Op_CMoveVD:
1503           if (vlen != 4)
1504             ret_value  = false;
1505           break;
1506         case Op_AddReductionVI:
1507           if (bt == T_INT && UseSSE < 3) { ret_value = false; }
1508           else if (is_subword_type(bt) && UseSSE <= 3) { ret_value = false; }
1509           break;
1510         case Op_AndReductionV:
1511         case Op_OrReductionV:
1512         case Op_XorReductionV:
1513           if (bt == T_BYTE && UseSSE <= 3) { ret_value = false; }
1514           break;
1515         case Op_VectorMaskCmp:
1516           if (UseAVX <= 0) { ret_value = false; }
1517           else if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) { ret_value = false; }
1518           break;
1519         case Op_MinReductionV:
1520         case Op_MaxReductionV:
1521           if ((bt == T_INT || bt == T_LONG || bt == T_BYTE) && UseSSE <= 3) { ret_value = false; }
1522           else if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) { ret_value = false; }
1523           break;
1524         case Op_VectorBlend:
1525           if (UseSSE <= 3 && UseAVX == 0) { ret_value = false; }
1526           else if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) { ret_value = false; }
1527           break;
1528         case Op_VectorTest:
1529           if (UseAVX <= 0) { ret_value = false; }
1530           else if (size_in_bits != 128 && size_in_bits != 256) { ret_value = false; } // Implementation limitation
1531           break;
1532         case Op_VectorLoadMask:
1533           if (UseSSE <= 3) { ret_value = false; }
1534           else if (vlen == 1 || vlen == 2) { ret_value = false; } // Implementation limitation
1535           else if (size_in_bits >= 256 && UseAVX < 2) { ret_value = false; } // Implementation limitation
1536           break;
1537         case Op_VectorLoadShuffle:
1538         case Op_VectorRearrange:
1539           if (vlen == 1 || vlen == 2) { ret_value = false; } // Implementation limitation due to how shuffle is loaded
1540           else if (size_in_bits >= 256 && UseAVX < 2) { ret_value = false; } // Implementation limitation
1541           else if (bt == T_BYTE && size_in_bits >= 256 && !VM_Version::supports_avx512vbmi())  { ret_value = false; } // Implementation limitation
1542           else if (bt == T_SHORT && size_in_bits >= 256 && !VM_Version::supports_avx512vlbw())  { ret_value = false; } // Implementation limitation
1543           break;
1544         case Op_VectorStoreMask:
1545           if (UseAVX < 0) { ret_value = false; } // Implementation limitation
1546           else if ((size_in_bits >= 256 || bt == T_LONG || bt == T_DOUBLE) && UseAVX < 2) { ret_value = false; } // Implementation limitation
1547           else if (vlen == 1 || vlen == 2) { ret_value = false; } // Implementation limitation
1548           else if (size_in_bits == 512 && !VM_Version::supports_avx512bw()) { ret_value = false; } // Implementation limitation
1549           break;
1550         case Op_VectorCastB2X:
1551           if (UseAVX <= 0) { ret_value = false; }
1552           else if (size_in_bits >= 256 && UseAVX < 2) { ret_value = false; }
1553           break;
1554         case Op_VectorCastS2X:
1555           if (UseAVX <= 0) { ret_value = false; }
1556           else if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) { ret_value = false; }
1557           else if (is_integral_type(bt) && vlen * type2aelembytes(T_SHORT) * BitsPerByte == 256 && UseAVX < 2) { ret_value = false; }
1558           break;
1559         case Op_VectorCastI2X:
1560           if (UseAVX <= 0) { ret_value = false; }
1561           else if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) { ret_value = false; }
1562           else if (is_integral_type(bt) && vlen * type2aelembytes(T_INT) * BitsPerByte == 256 && UseAVX < 2) { ret_value = false; }
1563           break;
1564         case Op_VectorCastL2X:
1565           if (UseAVX <= 0) { ret_value = false; }
1566           else if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) { ret_value = false; }
1567           else if (is_integral_type(bt) && vlen * type2aelembytes(T_LONG) * BitsPerByte == 256 && UseAVX < 2) { ret_value = false; }
1568           else if (!is_integral_type(bt) && !VM_Version::supports_avx512dq()) { ret_value = false; }
1569           break;
1570         case Op_VectorCastF2X:
1571           // Casts from FP to integral types require special fixup logic not easily
1572           // implementable with vectors.
1573           if (UseAVX <= 0) { ret_value = false; }
1574           else if (bt != T_DOUBLE) { ret_value = false; } // Implementation limitation
1575           break;
1576         case Op_VectorCastD2X:
1577           // Casts from FP to integral types require special fixup logic not easily
1578           // implementable with vectors.
1579           if (UseAVX <= 0) { ret_value = false; }
1580           else if (bt != T_FLOAT) { ret_value = false; } // Implementation limitation
1581           break;
1582         case Op_VectorReinterpret:
1583           if (size_in_bits >= 256 && UseAVX < 2) { ret_value = false; }
1584           break;
1585         case Op_MulReductionVI:
          if (bt == T_BYTE && size_in_bits == 512 && !VM_Version::supports_avx512bw()) { ret_value = false; }
1587           break;
1588         default:
1589           break;
1590       }
1591     }
1592   }
1593   if (ret_value) {
1594     assert(is_java_primitive(bt) && (vlen > 0) && is_power_of_2(vlen) &&
1595            vector_size_supported(bt, vlen), "must be supported");
1596   }
1597 
  return ret_value;  // By default, match rules are supported.
1599 }
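
// Illustrative example: Op_AddVB with vlen == 64 (a 512-bit byte vector) is
// rejected above unless AVX512BW is available, since 512-bit byte/short
// operations require the BW extension.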
1600 
1601 const bool Matcher::has_predicated_vectors(void) {
1602   bool ret_value = false;
1603   if (UseAVX > 2) {
1604     ret_value = VM_Version::supports_avx512vl();
1605   }
1606 
1607   return ret_value;
1608 }
1609 
1610 const int Matcher::float_pressure(int default_pressure_threshold) {
1611   int float_pressure_threshold = default_pressure_threshold;
1612 #ifdef _LP64
1613   if (UseAVX > 2) {
    // Increase the pressure threshold on machines with AVX3, which have
    // twice as many XMM registers.
1616     float_pressure_threshold = default_pressure_threshold * 2;
1617   }
1618 #endif
1619   return float_pressure_threshold;
1620 }
1621 
1622 // Max vector size in bytes. 0 if not supported.
1623 const int Matcher::vector_width_in_bytes(BasicType bt) {
1624   assert(is_java_primitive(bt), "only primitive type vectors");
1625   if (UseSSE < 2) return 0;
1626   // SSE2 supports 128bit vectors for all types.
1627   // AVX2 supports 256bit vectors for all types.
  // AVX-512/EVEX supports 512bit vectors for all types.
1629   int size = (UseAVX > 1) ? (1 << UseAVX) * 8 : 16;
1630   // AVX1 supports 256bit vectors only for FLOAT and DOUBLE.
1631   if (UseAVX > 0 && (bt == T_FLOAT || bt == T_DOUBLE))
1632     size = (UseAVX > 2) ? 64 : 32;
1633   // Use flag to limit vector size.
1634   size = MIN2(size,(int)MaxVectorSize);
1635   // Minimum 2 values in vector (or 4 for bytes).
1636   switch (bt) {
1637   case T_DOUBLE:
1638   case T_LONG:
1639     if (size < 16) return 0;
1640     break;
1641   case T_FLOAT:
1642   case T_INT:
1643     if (size < 8) return 0;
1644     break;
  case T_BOOLEAN:
  case T_CHAR:
  case T_BYTE:
  case T_SHORT:
    if (size < 4) return 0;
    break;
1657   default:
1658     ShouldNotReachHere();
1659   }
1660   return size;
1661 }
1662 
// Limits on vector size (number of elements) loaded into a vector register.
1664 const int Matcher::max_vector_size(const BasicType bt) {
1665   return vector_width_in_bytes(bt)/type2aelembytes(bt);
1666 }
1667 const int Matcher::min_vector_size(const BasicType bt) {
1668   int max_size = max_vector_size(bt);
  // The minimum size which can be loaded into a vector is 4 bytes.
1670   int size = (type2aelembytes(bt) == 1) ? 4 : 2;
1671   return MIN2(size,max_size);
1672 }
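
// Worked example (hypothetical flag settings): with UseAVX == 2 and
// MaxVectorSize == 32, vector_width_in_bytes(T_INT) is (1 << 2) * 8 = 32,
// so max_vector_size(T_INT) is 32 / 4 = 8 elements and
// min_vector_size(T_INT) is 2 elements.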
1673 
// Vector ideal reg corresponding to specified size in bytes.
1675 const uint Matcher::vector_ideal_reg(int size) {
1676   assert(MaxVectorSize >= size, "");
1677   switch(size) {
1678     case  4: return Op_VecS;
1679     case  8: return Op_VecD;
1680     case 16: return Op_VecX;
1681     case 32: return Op_VecY;
1682     case 64: return Op_VecZ;
1683   }
1684   ShouldNotReachHere();
1685   return 0;
1686 }
1687 
// Only the lowest bits of the xmm reg are used for the vector shift count.
1689 const uint Matcher::vector_shift_count_ideal_reg(int size) {
1690   return Op_VecS;
1691 }
1692 
// x86 supports misaligned vector stores/loads.
1694 const bool Matcher::misaligned_vectors_ok() {
1695   return !AlignVector; // can be changed by flag
1696 }
1697 
// x86 AES instructions are compatible with SunJCE expanded
// keys, hence we do not need to pass the original key to stubs.
1700 const bool Matcher::pass_original_key_for_aes() {
1701   return false;
1702 }
1703 
1704 
1705 const bool Matcher::convi2l_type_required = true;
1706 
1707 // Check for shift by small constant as well
1708 static bool clone_shift(Node* shift, Matcher* matcher, Matcher::MStack& mstack, VectorSet& address_visited) {
1709   if (shift->Opcode() == Op_LShiftX && shift->in(2)->is_Con() &&
1710       shift->in(2)->get_int() <= 3 &&
1711       // Are there other uses besides address expressions?
1712       !matcher->is_visited(shift)) {
1713     address_visited.set(shift->_idx); // Flag as address_visited
1714     mstack.push(shift->in(2), Matcher::Visit);
1715     Node *conv = shift->in(1);
1716 #ifdef _LP64
    // Allow the Matcher to match the rule which bypasses the
    // ConvI2L operation for an array index on LP64
    // if the index value is positive.
1720     if (conv->Opcode() == Op_ConvI2L &&
1721         conv->as_Type()->type()->is_long()->_lo >= 0 &&
1722         // Are there other uses besides address expressions?
1723         !matcher->is_visited(conv)) {
1724       address_visited.set(conv->_idx); // Flag as address_visited
1725       mstack.push(conv->in(1), Matcher::Pre_Visit);
1726     } else
1727 #endif
1728       mstack.push(conv, Matcher::Pre_Visit);
1729     return true;
1730   }
1731   return false;
1732 }
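
// A shift amount of 0..3 maps directly to the x86 scale factors 1/2/4/8
// available in a SIB byte; e.g. base + (index << 2) addresses a jint array
// element and folds into the single addressing mode [base + index*4].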
1733 
1734 // Should the Matcher clone shifts on addressing modes, expecting them
1735 // to be subsumed into complex addressing expressions or compute them
1736 // into registers?
1737 bool Matcher::clone_address_expressions(AddPNode* m, Matcher::MStack& mstack, VectorSet& address_visited) {
1738   Node *off = m->in(AddPNode::Offset);
1739   if (off->is_Con()) {
1740     address_visited.test_set(m->_idx); // Flag as address_visited
1741     Node *adr = m->in(AddPNode::Address);
1742 
1743     // Intel can handle 2 adds in addressing mode
1744     // AtomicAdd is not an addressing expression.
1745     // Cheap to find it by looking for screwy base.
1746     if (adr->is_AddP() &&
1747         !adr->in(AddPNode::Base)->is_top() &&
1748         // Are there other uses besides address expressions?
1749         !is_visited(adr)) {
1750       address_visited.set(adr->_idx); // Flag as address_visited
1751       Node *shift = adr->in(AddPNode::Offset);
1752       if (!clone_shift(shift, this, mstack, address_visited)) {
1753         mstack.push(shift, Pre_Visit);
1754       }
1755       mstack.push(adr->in(AddPNode::Address), Pre_Visit);
1756       mstack.push(adr->in(AddPNode::Base), Pre_Visit);
1757     } else {
1758       mstack.push(adr, Pre_Visit);
1759     }
1760 
1761     // Clone X+offset as it also folds into most addressing expressions
1762     mstack.push(off, Visit);
1763     mstack.push(m->in(AddPNode::Base), Pre_Visit);
1764     return true;
1765   } else if (clone_shift(off, this, mstack, address_visited)) {
1766     address_visited.test_set(m->_idx); // Flag as address_visited
1767     mstack.push(m->in(AddPNode::Address), Pre_Visit);
1768     mstack.push(m->in(AddPNode::Base), Pre_Visit);
1769     return true;
1770   }
1771   return false;
1772 }
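
// For example, an expression such as (base + (index << 3)) + 16 can be
// subsumed into the single addressing mode [base + index*8 + 16], so both
// the shift and the constant offset are cloned rather than computed into
// registers.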
1773 
1774 void Compile::reshape_address(AddPNode* addp) {
1775 }
1776 
1777 // Helper methods for MachSpillCopyNode::implementation().
1778 static int vec_mov_helper(CodeBuffer *cbuf, bool do_size, int src_lo, int dst_lo,
1779                           int src_hi, int dst_hi, uint ireg, outputStream* st) {
  // In the 64-bit VM, size calculation is very complex, so instructions are
  // emitted into a scratch buffer there to obtain the size.
1782   LP64_ONLY( assert(!do_size, "this method calculates size only for 32-bit VM"); )
1783   assert(ireg == Op_VecS || // 32bit vector
1784          (src_lo & 1) == 0 && (src_lo + 1) == src_hi &&
1785          (dst_lo & 1) == 0 && (dst_lo + 1) == dst_hi,
1786          "no non-adjacent vector moves" );
1787   if (cbuf) {
1788     MacroAssembler _masm(cbuf);
1789     int offset = __ offset();
1790     switch (ireg) {
1791     case Op_VecS: // copy whole register
1792     case Op_VecD:
1793     case Op_VecX:
1794       __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
1795       break;
1796     case Op_VecY:
1797       __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
1798       break;
1799     case Op_VecZ:
1800       __ evmovdquq(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2);
1801       break;
1802     default:
1803       ShouldNotReachHere();
1804     }
1805     int size = __ offset() - offset;
1806 #ifdef ASSERT
1807     // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
    assert(!do_size || size == 4, "incorrect size calculation");
1809 #endif
1810     return size;
1811 #ifndef PRODUCT
1812   } else if (!do_size) {
1813     switch (ireg) {
1814     case Op_VecS:
1815     case Op_VecD:
1816     case Op_VecX:
1817       st->print("movdqu  %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
1818       break;
1819     case Op_VecY:
1820     case Op_VecZ:
1821       st->print("vmovdqu %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
1822       break;
1823     default:
1824       ShouldNotReachHere();
1825     }
1826 #endif
1827   }
1828   // VEX_2bytes prefix is used if UseAVX > 0, and it takes the same 2 bytes as SIMD prefix.
1829   return (UseAVX > 2) ? 6 : 4;
1830 }
1831 
1832 static int vec_spill_helper(CodeBuffer *cbuf, bool do_size, bool is_load,
1833                             int stack_offset, int reg, uint ireg, outputStream* st) {
  // In the 64-bit VM, size calculation is very complex, so instructions are
  // emitted into a scratch buffer there to obtain the size.
1836   LP64_ONLY( assert(!do_size, "this method calculates size only for 32-bit VM"); )
1837   if (cbuf) {
1838     MacroAssembler _masm(cbuf);
1839     int offset = __ offset();
1840     if (is_load) {
1841       switch (ireg) {
1842       case Op_VecS:
1843         __ movdl(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1844         break;
1845       case Op_VecD:
1846         __ movq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1847         break;
1848       case Op_VecX:
1849         __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1850         break;
1851       case Op_VecY:
1852         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1853         break;
1854       case Op_VecZ:
1855         __ evmovdquq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2);
1856         break;
1857       default:
1858         ShouldNotReachHere();
1859       }
1860     } else { // store
1861       switch (ireg) {
1862       case Op_VecS:
1863         __ movdl(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1864         break;
1865       case Op_VecD:
1866         __ movq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1867         break;
1868       case Op_VecX:
1869         __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1870         break;
1871       case Op_VecY:
1872         __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1873         break;
1874       case Op_VecZ:
1875         __ evmovdquq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2);
1876         break;
1877       default:
1878         ShouldNotReachHere();
1879       }
1880     }
1881     int size = __ offset() - offset;
1882 #ifdef ASSERT
1883     int offset_size = (stack_offset == 0) ? 0 : ((stack_offset < 0x80) ? 1 : (UseAVX > 2) ? 6 : 4);
1884     // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
    assert(!do_size || size == (5+offset_size), "incorrect size calculation");
1886 #endif
1887     return size;
1888 #ifndef PRODUCT
1889   } else if (!do_size) {
1890     if (is_load) {
1891       switch (ireg) {
1892       case Op_VecS:
1893         st->print("movd    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
1894         break;
1895       case Op_VecD:
1896         st->print("movq    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
1897         break;
1898        case Op_VecX:
1899         st->print("movdqu  %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
1900         break;
1901       case Op_VecY:
1902       case Op_VecZ:
1903         st->print("vmovdqu %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
1904         break;
1905       default:
1906         ShouldNotReachHere();
1907       }
1908     } else { // store
1909       switch (ireg) {
1910       case Op_VecS:
1911         st->print("movd    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1912         break;
1913       case Op_VecD:
1914         st->print("movq    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1915         break;
1916        case Op_VecX:
1917         st->print("movdqu  [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1918         break;
1919       case Op_VecY:
1920       case Op_VecZ:
1921         st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1922         break;
1923       default:
1924         ShouldNotReachHere();
1925       }
1926     }
1927 #endif
1928   }
1929   bool is_single_byte = false;
1930   int vec_len = 0;
1931   if ((UseAVX > 2) && (stack_offset != 0)) {
1932     int tuple_type = Assembler::EVEX_FVM;
1933     int input_size = Assembler::EVEX_32bit;
1934     switch (ireg) {
1935     case Op_VecS:
1936       tuple_type = Assembler::EVEX_T1S;
1937       break;
1938     case Op_VecD:
1939       tuple_type = Assembler::EVEX_T1S;
1940       input_size = Assembler::EVEX_64bit;
1941       break;
1942     case Op_VecX:
1943       break;
1944     case Op_VecY:
1945       vec_len = 1;
1946       break;
1947     case Op_VecZ:
1948       vec_len = 2;
1949       break;
1950     }
1951     is_single_byte = Assembler::query_compressed_disp_byte(stack_offset, true, vec_len, tuple_type, input_size, 0);
1952   }
1953   int offset_size = 0;
1954   int size = 5;
1955   if (UseAVX > 2 ) {
1956     if (VM_Version::supports_avx512novl() && (vec_len == 2)) {
1957       offset_size = (stack_offset == 0) ? 0 : ((is_single_byte) ? 1 : 4);
1958       size += 2; // Need an additional two bytes for EVEX encoding
1959     } else if (VM_Version::supports_avx512novl() && (vec_len < 2)) {
1960       offset_size = (stack_offset == 0) ? 0 : ((stack_offset <= 127) ? 1 : 4);
1961     } else {
1962       offset_size = (stack_offset == 0) ? 0 : ((is_single_byte) ? 1 : 4);
      size += 2; // Need an additional two bytes for EVEX encoding
1964     }
1965   } else {
1966     offset_size = (stack_offset == 0) ? 0 : ((stack_offset <= 127) ? 1 : 4);
1967   }
1968   // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
1969   return size+offset_size;
1970 }
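
// Size estimate example (hypothetical inputs): with UseAVX == 0 and
// stack_offset == 16, the base instruction is 5 bytes plus a 1-byte
// displacement, so vec_spill_helper returns 6.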
1971 
1972 static inline jint replicate4_imm(int con, int width) {
1973   // Load a constant of "width" (in bytes) and replicate it to fill 32bit.
1974   assert(width == 1 || width == 2, "only byte or short types here");
1975   int bit_width = width * 8;
1976   jint val = con;
1977   val &= (1 << bit_width) - 1;  // mask off sign bits
1978   while(bit_width < 32) {
1979     val |= (val << bit_width);
1980     bit_width <<= 1;
1981   }
1982   return val;
1983 }
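
// For example, replicate4_imm(0xAB, 1) yields 0xABABABAB and
// replicate4_imm(0x1234, 2) yields 0x12341234.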
1984 
1985 static inline jlong replicate8_imm(int con, int width) {
1986   // Load a constant of "width" (in bytes) and replicate it to fill 64bit.
1987   assert(width == 1 || width == 2 || width == 4, "only byte, short or int types here");
1988   int bit_width = width * 8;
1989   jlong val = con;
1990   val &= (((jlong) 1) << bit_width) - 1;  // mask off sign bits
1991   while(bit_width < 64) {
1992     val |= (val << bit_width);
1993     bit_width <<= 1;
1994   }
1995   return val;
1996 }
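
// For example, replicate8_imm(0xAB, 1) yields 0xABABABABABABABAB and
// replicate8_imm(0x12345678, 4) yields 0x1234567812345678.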
1997 
1998 #ifndef PRODUCT
1999   void MachNopNode::format(PhaseRegAlloc*, outputStream* st) const {
2000     st->print("nop \t# %d bytes pad for loops and calls", _count);
2001   }
2002 #endif
2003 
2004   void MachNopNode::emit(CodeBuffer &cbuf, PhaseRegAlloc*) const {
2005     MacroAssembler _masm(&cbuf);
2006     __ nop(_count);
2007   }
2008 
2009   uint MachNopNode::size(PhaseRegAlloc*) const {
2010     return _count;
2011   }
2012 
2013 #ifndef PRODUCT
2014   void MachBreakpointNode::format(PhaseRegAlloc*, outputStream* st) const {
2015     st->print("# breakpoint");
2016   }
2017 #endif
2018 
2019   void MachBreakpointNode::emit(CodeBuffer &cbuf, PhaseRegAlloc* ra_) const {
2020     MacroAssembler _masm(&cbuf);
2021     __ int3();
2022   }
2023 
2024   uint MachBreakpointNode::size(PhaseRegAlloc* ra_) const {
2025     return MachNode::size(ra_);
2026   }
2027 
2028 %}
2029 
2030 encode %{
2031 
2032   enc_class call_epilog %{
2033     if (VerifyStackAtCalls) {
2034       // Check that stack depth is unchanged: find majik cookie on stack
2035       int framesize = ra_->reg2offset_unchecked(OptoReg::add(ra_->_matcher._old_SP, -3*VMRegImpl::slots_per_word));
2036       MacroAssembler _masm(&cbuf);
2037       Label L;
2038       __ cmpptr(Address(rsp, framesize), (int32_t)0xbadb100d);
2039       __ jccb(Assembler::equal, L);
2040       // Die if stack mismatch
2041       __ int3();
2042       __ bind(L);
2043     }
2044   %}
2045 
2046 %}
2047 
2048 
2049 //----------OPERANDS-----------------------------------------------------------
2050 // Operand definitions must precede instruction definitions for correct parsing
// in the ADLC because operands constitute user-defined types which are used in
2052 // instruction definitions.
2053 
2054 operand immU1() %{
2055   predicate(n->get_int() >= 0 && n->get_int() < nth_bit(1));
2056   match(ConI);
2057 
2058   op_cost(0);
2059   format %{ %}
2060   interface(CONST_INTER);
2061 %}
2062 
2063 operand immU2() %{
2064   predicate(n->get_int() >= 0 && n->get_int() < nth_bit(2));
2065   match(ConI);
2066 
2067   op_cost(0);
2068   format %{ %}
2069   interface(CONST_INTER);
2070 %}
2071 
2072 operand immU3() %{
2073   predicate(n->get_int() >= 0 && n->get_int() < nth_bit(3));
2074   match(ConI);
2075 
2076   op_cost(0);
2077   format %{ %}
2078   interface(CONST_INTER);
2079 %}
2080 
2081 operand immU4() %{
2082   predicate(n->get_int() >= 0 && n->get_int() < nth_bit(4));
2083   match(ConI);
2084 
2085   op_cost(0);
2086   format %{ %}
2087   interface(CONST_INTER);
2088 %}
2089 
2090 operand immU5() %{
2091   predicate(n->get_int() >= 0 && n->get_int() < nth_bit(5));
2092   match(ConI);
2093 
2094   op_cost(0);
2095   format %{ %}
2096   interface(CONST_INTER);
2097 %}
2098 
2099 operand immU6() %{
2100   predicate(n->get_int() >= 0 && n->get_int() < nth_bit(6));
2101   match(ConI);
2102 
2103   op_cost(0);
2104   format %{ %}
2105   interface(CONST_INTER);
2106 %}
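
// Each immUn operand above matches an unsigned constant that fits in n bits
// (nth_bit(n) == 1 << n); e.g. immU5 covers 0..31, the legal range of a
// 32-bit shift count.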
2107 
// This one applies only to EVEX, so there is only one version.
2109 operand vecZ() %{
2110   constraint(ALLOC_IN_RC(vectorz_reg));
2111   match(VecZ);
2112 
2113   format %{ %}
2114   interface(REG_INTER);
2115 %}
2116 
2117 // Comparison Code for FP conditional move
2118 operand cmpOp_vcmppd() %{
2119   match(Bool);
2120 
2121   predicate(n->as_Bool()->_test._test != BoolTest::overflow &&
2122             n->as_Bool()->_test._test != BoolTest::no_overflow);
2123   format %{ "" %}
2124   interface(COND_INTER) %{
2125     equal        (0x0, "eq");
2126     less         (0x1, "lt");
2127     less_equal   (0x2, "le");
2128     not_equal    (0xC, "ne");
2129     greater_equal(0xD, "ge");
2130     greater      (0xE, "gt");
2131     // TODO: adlc fails to compile without the next two lines, reporting:
2132     // x86_64.ad(13987) Syntax Error: :In operand cmpOp_vcmppd: Do not support this encode constant: ' %{
2133     // equal' for overflow.
2134     overflow     (0x20, "o");  // not really supported by the instruction
2135     no_overflow  (0x21, "no"); // not really supported by the instruction
2136   %}
2137 %}
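
// The encodings above appear to be the extended (VEX) vcmppd comparison
// predicate immediates: 0x0 = EQ_OQ, 0x1 = LT_OS, 0x2 = LE_OS, 0xC = NEQ_OQ,
// 0xD = GE_OS, 0xE = GT_OS.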
2138 
2139 
2140 // INSTRUCTIONS -- Platform independent definitions (same for 32- and 64-bit)
2141 
2142 // ============================================================================
2143 
2144 instruct ShouldNotReachHere() %{
2145   match(Halt);
2146   format %{ "ud2\t# ShouldNotReachHere" %}
2147   ins_encode %{
2148     __ ud2();
2149   %}
2150   ins_pipe(pipe_slow);
2151 %}
2152 
2153 // =================================EVEX special===============================
2154 
2155 instruct setMask(rRegI dst, rRegI src) %{
2156   predicate(Matcher::has_predicated_vectors());
2157   match(Set dst (SetVectMaskI  src));
2158   effect(TEMP dst);
2159   format %{ "setvectmask   $dst, $src" %}
2160   ins_encode %{
2161     __ setvectmask($dst$$Register, $src$$Register);
2162   %}
2163   ins_pipe(pipe_slow);
2164 %}
2165 
2166 // ============================================================================
2167 
2168 instruct addF_reg(regF dst, regF src) %{
2169   predicate((UseSSE>=1) && (UseAVX == 0));
2170   match(Set dst (AddF dst src));
2171 
2172   format %{ "addss   $dst, $src" %}
2173   ins_cost(150);
2174   ins_encode %{
2175     __ addss($dst$$XMMRegister, $src$$XMMRegister);
2176   %}
2177   ins_pipe(pipe_slow);
2178 %}
2179 
2180 instruct addF_mem(regF dst, memory src) %{
2181   predicate((UseSSE>=1) && (UseAVX == 0));
2182   match(Set dst (AddF dst (LoadF src)));
2183 
2184   format %{ "addss   $dst, $src" %}
2185   ins_cost(150);
2186   ins_encode %{
2187     __ addss($dst$$XMMRegister, $src$$Address);
2188   %}
2189   ins_pipe(pipe_slow);
2190 %}
2191 
2192 instruct addF_imm(regF dst, immF con) %{
2193   predicate((UseSSE>=1) && (UseAVX == 0));
2194   match(Set dst (AddF dst con));
2195   format %{ "addss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2196   ins_cost(150);
2197   ins_encode %{
2198     __ addss($dst$$XMMRegister, $constantaddress($con));
2199   %}
2200   ins_pipe(pipe_slow);
2201 %}
2202 
2203 instruct addF_reg_reg(regF dst, regF src1, regF src2) %{
2204   predicate(UseAVX > 0);
2205   match(Set dst (AddF src1 src2));
2206 
2207   format %{ "vaddss  $dst, $src1, $src2" %}
2208   ins_cost(150);
2209   ins_encode %{
2210     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2211   %}
2212   ins_pipe(pipe_slow);
2213 %}
2214 
2215 instruct addF_reg_mem(regF dst, regF src1, memory src2) %{
2216   predicate(UseAVX > 0);
2217   match(Set dst (AddF src1 (LoadF src2)));
2218 
2219   format %{ "vaddss  $dst, $src1, $src2" %}
2220   ins_cost(150);
2221   ins_encode %{
2222     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2223   %}
2224   ins_pipe(pipe_slow);
2225 %}
2226 
2227 instruct addF_reg_imm(regF dst, regF src, immF con) %{
2228   predicate(UseAVX > 0);
2229   match(Set dst (AddF src con));
2230 
2231   format %{ "vaddss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2232   ins_cost(150);
2233   ins_encode %{
2234     __ vaddss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2235   %}
2236   ins_pipe(pipe_slow);
2237 %}
2238 
2239 instruct addD_reg(regD dst, regD src) %{
2240   predicate((UseSSE>=2) && (UseAVX == 0));
2241   match(Set dst (AddD dst src));
2242 
2243   format %{ "addsd   $dst, $src" %}
2244   ins_cost(150);
2245   ins_encode %{
2246     __ addsd($dst$$XMMRegister, $src$$XMMRegister);
2247   %}
2248   ins_pipe(pipe_slow);
2249 %}
2250 
2251 instruct addD_mem(regD dst, memory src) %{
2252   predicate((UseSSE>=2) && (UseAVX == 0));
2253   match(Set dst (AddD dst (LoadD src)));
2254 
2255   format %{ "addsd   $dst, $src" %}
2256   ins_cost(150);
2257   ins_encode %{
2258     __ addsd($dst$$XMMRegister, $src$$Address);
2259   %}
2260   ins_pipe(pipe_slow);
2261 %}
2262 
2263 instruct addD_imm(regD dst, immD con) %{
2264   predicate((UseSSE>=2) && (UseAVX == 0));
2265   match(Set dst (AddD dst con));
2266   format %{ "addsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2267   ins_cost(150);
2268   ins_encode %{
2269     __ addsd($dst$$XMMRegister, $constantaddress($con));
2270   %}
2271   ins_pipe(pipe_slow);
2272 %}
2273 
2274 instruct addD_reg_reg(regD dst, regD src1, regD src2) %{
2275   predicate(UseAVX > 0);
2276   match(Set dst (AddD src1 src2));
2277 
2278   format %{ "vaddsd  $dst, $src1, $src2" %}
2279   ins_cost(150);
2280   ins_encode %{
2281     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2282   %}
2283   ins_pipe(pipe_slow);
2284 %}
2285 
2286 instruct addD_reg_mem(regD dst, regD src1, memory src2) %{
2287   predicate(UseAVX > 0);
2288   match(Set dst (AddD src1 (LoadD src2)));
2289 
2290   format %{ "vaddsd  $dst, $src1, $src2" %}
2291   ins_cost(150);
2292   ins_encode %{
2293     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2294   %}
2295   ins_pipe(pipe_slow);
2296 %}
2297 
2298 instruct addD_reg_imm(regD dst, regD src, immD con) %{
2299   predicate(UseAVX > 0);
2300   match(Set dst (AddD src con));
2301 
2302   format %{ "vaddsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2303   ins_cost(150);
2304   ins_encode %{
2305     __ vaddsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2306   %}
2307   ins_pipe(pipe_slow);
2308 %}
2309 
2310 instruct subF_reg(regF dst, regF src) %{
2311   predicate((UseSSE>=1) && (UseAVX == 0));
2312   match(Set dst (SubF dst src));
2313 
2314   format %{ "subss   $dst, $src" %}
2315   ins_cost(150);
2316   ins_encode %{
2317     __ subss($dst$$XMMRegister, $src$$XMMRegister);
2318   %}
2319   ins_pipe(pipe_slow);
2320 %}
2321 
2322 instruct subF_mem(regF dst, memory src) %{
2323   predicate((UseSSE>=1) && (UseAVX == 0));
2324   match(Set dst (SubF dst (LoadF src)));
2325 
2326   format %{ "subss   $dst, $src" %}
2327   ins_cost(150);
2328   ins_encode %{
2329     __ subss($dst$$XMMRegister, $src$$Address);
2330   %}
2331   ins_pipe(pipe_slow);
2332 %}
2333 
2334 instruct subF_imm(regF dst, immF con) %{
2335   predicate((UseSSE>=1) && (UseAVX == 0));
2336   match(Set dst (SubF dst con));
2337   format %{ "subss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2338   ins_cost(150);
2339   ins_encode %{
2340     __ subss($dst$$XMMRegister, $constantaddress($con));
2341   %}
2342   ins_pipe(pipe_slow);
2343 %}
2344 
2345 instruct subF_reg_reg(regF dst, regF src1, regF src2) %{
2346   predicate(UseAVX > 0);
2347   match(Set dst (SubF src1 src2));
2348 
2349   format %{ "vsubss  $dst, $src1, $src2" %}
2350   ins_cost(150);
2351   ins_encode %{
2352     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2353   %}
2354   ins_pipe(pipe_slow);
2355 %}
2356 
2357 instruct subF_reg_mem(regF dst, regF src1, memory src2) %{
2358   predicate(UseAVX > 0);
2359   match(Set dst (SubF src1 (LoadF src2)));
2360 
2361   format %{ "vsubss  $dst, $src1, $src2" %}
2362   ins_cost(150);
2363   ins_encode %{
2364     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2365   %}
2366   ins_pipe(pipe_slow);
2367 %}
2368 
2369 instruct subF_reg_imm(regF dst, regF src, immF con) %{
2370   predicate(UseAVX > 0);
2371   match(Set dst (SubF src con));
2372 
2373   format %{ "vsubss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2374   ins_cost(150);
2375   ins_encode %{
2376     __ vsubss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2377   %}
2378   ins_pipe(pipe_slow);
2379 %}
2380 
2381 instruct subD_reg(regD dst, regD src) %{
2382   predicate((UseSSE>=2) && (UseAVX == 0));
2383   match(Set dst (SubD dst src));
2384 
2385   format %{ "subsd   $dst, $src" %}
2386   ins_cost(150);
2387   ins_encode %{
2388     __ subsd($dst$$XMMRegister, $src$$XMMRegister);
2389   %}
2390   ins_pipe(pipe_slow);
2391 %}
2392 
2393 instruct subD_mem(regD dst, memory src) %{
2394   predicate((UseSSE>=2) && (UseAVX == 0));
2395   match(Set dst (SubD dst (LoadD src)));
2396 
2397   format %{ "subsd   $dst, $src" %}
2398   ins_cost(150);
2399   ins_encode %{
2400     __ subsd($dst$$XMMRegister, $src$$Address);
2401   %}
2402   ins_pipe(pipe_slow);
2403 %}
2404 
2405 instruct subD_imm(regD dst, immD con) %{
2406   predicate((UseSSE>=2) && (UseAVX == 0));
2407   match(Set dst (SubD dst con));
2408   format %{ "subsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2409   ins_cost(150);
2410   ins_encode %{
2411     __ subsd($dst$$XMMRegister, $constantaddress($con));
2412   %}
2413   ins_pipe(pipe_slow);
2414 %}
2415 
2416 instruct subD_reg_reg(regD dst, regD src1, regD src2) %{
2417   predicate(UseAVX > 0);
2418   match(Set dst (SubD src1 src2));
2419 
2420   format %{ "vsubsd  $dst, $src1, $src2" %}
2421   ins_cost(150);
2422   ins_encode %{
2423     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2424   %}
2425   ins_pipe(pipe_slow);
2426 %}
2427 
2428 instruct subD_reg_mem(regD dst, regD src1, memory src2) %{
2429   predicate(UseAVX > 0);
2430   match(Set dst (SubD src1 (LoadD src2)));
2431 
2432   format %{ "vsubsd  $dst, $src1, $src2" %}
2433   ins_cost(150);
2434   ins_encode %{
2435     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2436   %}
2437   ins_pipe(pipe_slow);
2438 %}
2439 
2440 instruct subD_reg_imm(regD dst, regD src, immD con) %{
2441   predicate(UseAVX > 0);
2442   match(Set dst (SubD src con));
2443 
2444   format %{ "vsubsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2445   ins_cost(150);
2446   ins_encode %{
2447     __ vsubsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2448   %}
2449   ins_pipe(pipe_slow);
2450 %}
2451 
2452 instruct mulF_reg(regF dst, regF src) %{
2453   predicate((UseSSE>=1) && (UseAVX == 0));
2454   match(Set dst (MulF dst src));
2455 
2456   format %{ "mulss   $dst, $src" %}
2457   ins_cost(150);
2458   ins_encode %{
2459     __ mulss($dst$$XMMRegister, $src$$XMMRegister);
2460   %}
2461   ins_pipe(pipe_slow);
2462 %}
2463 
2464 instruct mulF_mem(regF dst, memory src) %{
2465   predicate((UseSSE>=1) && (UseAVX == 0));
2466   match(Set dst (MulF dst (LoadF src)));
2467 
2468   format %{ "mulss   $dst, $src" %}
2469   ins_cost(150);
2470   ins_encode %{
2471     __ mulss($dst$$XMMRegister, $src$$Address);
2472   %}
2473   ins_pipe(pipe_slow);
2474 %}
2475 
2476 instruct mulF_imm(regF dst, immF con) %{
2477   predicate((UseSSE>=1) && (UseAVX == 0));
2478   match(Set dst (MulF dst con));
2479   format %{ "mulss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2480   ins_cost(150);
2481   ins_encode %{
2482     __ mulss($dst$$XMMRegister, $constantaddress($con));
2483   %}
2484   ins_pipe(pipe_slow);
2485 %}
2486 
2487 instruct mulF_reg_reg(regF dst, regF src1, regF src2) %{
2488   predicate(UseAVX > 0);
2489   match(Set dst (MulF src1 src2));
2490 
2491   format %{ "vmulss  $dst, $src1, $src2" %}
2492   ins_cost(150);
2493   ins_encode %{
2494     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2495   %}
2496   ins_pipe(pipe_slow);
2497 %}
2498 
2499 instruct mulF_reg_mem(regF dst, regF src1, memory src2) %{
2500   predicate(UseAVX > 0);
2501   match(Set dst (MulF src1 (LoadF src2)));
2502 
2503   format %{ "vmulss  $dst, $src1, $src2" %}
2504   ins_cost(150);
2505   ins_encode %{
2506     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2507   %}
2508   ins_pipe(pipe_slow);
2509 %}
2510 
2511 instruct mulF_reg_imm(regF dst, regF src, immF con) %{
2512   predicate(UseAVX > 0);
2513   match(Set dst (MulF src con));
2514 
2515   format %{ "vmulss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2516   ins_cost(150);
2517   ins_encode %{
2518     __ vmulss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2519   %}
2520   ins_pipe(pipe_slow);
2521 %}
2522 
2523 instruct mulD_reg(regD dst, regD src) %{
2524   predicate((UseSSE>=2) && (UseAVX == 0));
2525   match(Set dst (MulD dst src));
2526 
2527   format %{ "mulsd   $dst, $src" %}
2528   ins_cost(150);
2529   ins_encode %{
2530     __ mulsd($dst$$XMMRegister, $src$$XMMRegister);
2531   %}
2532   ins_pipe(pipe_slow);
2533 %}
2534 
2535 instruct mulD_mem(regD dst, memory src) %{
2536   predicate((UseSSE>=2) && (UseAVX == 0));
2537   match(Set dst (MulD dst (LoadD src)));
2538 
2539   format %{ "mulsd   $dst, $src" %}
2540   ins_cost(150);
2541   ins_encode %{
2542     __ mulsd($dst$$XMMRegister, $src$$Address);
2543   %}
2544   ins_pipe(pipe_slow);
2545 %}
2546 
2547 instruct mulD_imm(regD dst, immD con) %{
2548   predicate((UseSSE>=2) && (UseAVX == 0));
2549   match(Set dst (MulD dst con));
2550   format %{ "mulsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2551   ins_cost(150);
2552   ins_encode %{
2553     __ mulsd($dst$$XMMRegister, $constantaddress($con));
2554   %}
2555   ins_pipe(pipe_slow);
2556 %}
2557 
2558 instruct mulD_reg_reg(regD dst, regD src1, regD src2) %{
2559   predicate(UseAVX > 0);
2560   match(Set dst (MulD src1 src2));
2561 
2562   format %{ "vmulsd  $dst, $src1, $src2" %}
2563   ins_cost(150);
2564   ins_encode %{
2565     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2566   %}
2567   ins_pipe(pipe_slow);
2568 %}
2569 
2570 instruct mulD_reg_mem(regD dst, regD src1, memory src2) %{
2571   predicate(UseAVX > 0);
2572   match(Set dst (MulD src1 (LoadD src2)));
2573 
2574   format %{ "vmulsd  $dst, $src1, $src2" %}
2575   ins_cost(150);
2576   ins_encode %{
2577     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2578   %}
2579   ins_pipe(pipe_slow);
2580 %}
2581 
2582 instruct mulD_reg_imm(regD dst, regD src, immD con) %{
2583   predicate(UseAVX > 0);
2584   match(Set dst (MulD src con));
2585 
2586   format %{ "vmulsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2587   ins_cost(150);
2588   ins_encode %{
2589     __ vmulsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2590   %}
2591   ins_pipe(pipe_slow);
2592 %}
2593 
2594 instruct divF_reg(regF dst, regF src) %{
2595   predicate((UseSSE>=1) && (UseAVX == 0));
2596   match(Set dst (DivF dst src));
2597 
2598   format %{ "divss   $dst, $src" %}
2599   ins_cost(150);
2600   ins_encode %{
2601     __ divss($dst$$XMMRegister, $src$$XMMRegister);
2602   %}
2603   ins_pipe(pipe_slow);
2604 %}
2605 
2606 instruct divF_mem(regF dst, memory src) %{
2607   predicate((UseSSE>=1) && (UseAVX == 0));
2608   match(Set dst (DivF dst (LoadF src)));
2609 
2610   format %{ "divss   $dst, $src" %}
2611   ins_cost(150);
2612   ins_encode %{
2613     __ divss($dst$$XMMRegister, $src$$Address);
2614   %}
2615   ins_pipe(pipe_slow);
2616 %}
2617 
2618 instruct divF_imm(regF dst, immF con) %{
2619   predicate((UseSSE>=1) && (UseAVX == 0));
2620   match(Set dst (DivF dst con));
2621   format %{ "divss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2622   ins_cost(150);
2623   ins_encode %{
2624     __ divss($dst$$XMMRegister, $constantaddress($con));
2625   %}
2626   ins_pipe(pipe_slow);
2627 %}
2628 
2629 instruct divF_reg_reg(regF dst, regF src1, regF src2) %{
2630   predicate(UseAVX > 0);
2631   match(Set dst (DivF src1 src2));
2632 
2633   format %{ "vdivss  $dst, $src1, $src2" %}
2634   ins_cost(150);
2635   ins_encode %{
2636     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2637   %}
2638   ins_pipe(pipe_slow);
2639 %}
2640 
2641 instruct divF_reg_mem(regF dst, regF src1, memory src2) %{
2642   predicate(UseAVX > 0);
2643   match(Set dst (DivF src1 (LoadF src2)));
2644 
2645   format %{ "vdivss  $dst, $src1, $src2" %}
2646   ins_cost(150);
2647   ins_encode %{
2648     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2649   %}
2650   ins_pipe(pipe_slow);
2651 %}
2652 
2653 instruct divF_reg_imm(regF dst, regF src, immF con) %{
2654   predicate(UseAVX > 0);
2655   match(Set dst (DivF src con));
2656 
2657   format %{ "vdivss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2658   ins_cost(150);
2659   ins_encode %{
2660     __ vdivss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2661   %}
2662   ins_pipe(pipe_slow);
2663 %}
2664 
2665 instruct divD_reg(regD dst, regD src) %{
2666   predicate((UseSSE>=2) && (UseAVX == 0));
2667   match(Set dst (DivD dst src));
2668 
2669   format %{ "divsd   $dst, $src" %}
2670   ins_cost(150);
2671   ins_encode %{
2672     __ divsd($dst$$XMMRegister, $src$$XMMRegister);
2673   %}
2674   ins_pipe(pipe_slow);
2675 %}
2676 
2677 instruct divD_mem(regD dst, memory src) %{
2678   predicate((UseSSE>=2) && (UseAVX == 0));
2679   match(Set dst (DivD dst (LoadD src)));
2680 
2681   format %{ "divsd   $dst, $src" %}
2682   ins_cost(150);
2683   ins_encode %{
2684     __ divsd($dst$$XMMRegister, $src$$Address);
2685   %}
2686   ins_pipe(pipe_slow);
2687 %}
2688 
2689 instruct divD_imm(regD dst, immD con) %{
2690   predicate((UseSSE>=2) && (UseAVX == 0));
2691   match(Set dst (DivD dst con));
2692   format %{ "divsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2693   ins_cost(150);
2694   ins_encode %{
2695     __ divsd($dst$$XMMRegister, $constantaddress($con));
2696   %}
2697   ins_pipe(pipe_slow);
2698 %}
2699 
2700 instruct divD_reg_reg(regD dst, regD src1, regD src2) %{
2701   predicate(UseAVX > 0);
2702   match(Set dst (DivD src1 src2));
2703 
2704   format %{ "vdivsd  $dst, $src1, $src2" %}
2705   ins_cost(150);
2706   ins_encode %{
2707     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2708   %}
2709   ins_pipe(pipe_slow);
2710 %}
2711 
2712 instruct divD_reg_mem(regD dst, regD src1, memory src2) %{
2713   predicate(UseAVX > 0);
2714   match(Set dst (DivD src1 (LoadD src2)));
2715 
2716   format %{ "vdivsd  $dst, $src1, $src2" %}
2717   ins_cost(150);
2718   ins_encode %{
2719     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2720   %}
2721   ins_pipe(pipe_slow);
2722 %}
2723 
2724 instruct divD_reg_imm(regD dst, regD src, immD con) %{
2725   predicate(UseAVX > 0);
2726   match(Set dst (DivD src con));
2727 
2728   format %{ "vdivsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2729   ins_cost(150);
2730   ins_encode %{
2731     __ vdivsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2732   %}
2733   ins_pipe(pipe_slow);
2734 %}
2735 
2736 instruct absF_reg(regF dst) %{
2737   predicate((UseSSE>=1) && (UseAVX == 0));
2738   match(Set dst (AbsF dst));
2739   ins_cost(150);
2740   format %{ "andps   $dst, [0x7fffffff]\t# abs float by sign masking" %}
2741   ins_encode %{
2742     __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
2743   %}
2744   ins_pipe(pipe_slow);
2745 %}
2746 
2747 instruct absF_reg_reg(regF dst, regF src) %{
2748   predicate(VM_Version::supports_avxonly());
2749   match(Set dst (AbsF src));
2750   ins_cost(150);
2751   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
2752   ins_encode %{
2753     int vector_len = 0;
2754     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
2755               ExternalAddress(float_signmask()), vector_len);
2756   %}
2757   ins_pipe(pipe_slow);
2758 %}
2759 
2760 #ifdef _LP64
2761 instruct absF_reg_reg_evex(regF dst, regF src) %{
2762   predicate(UseAVX > 2 && VM_Version::supports_avx512vl());
2763   match(Set dst (AbsF src));
2764   ins_cost(150);
2765   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
2766   ins_encode %{
2767     int vector_len = 0;
2768     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
2769               ExternalAddress(float_signmask()), vector_len);
2770   %}
2771   ins_pipe(pipe_slow);
2772 %}
2773 
2774 instruct absF_reg_reg_evex_special(regF dst, regF src1, regF src2) %{
2775   predicate(VM_Version::supports_avx512novl());
2776   match(Set dst (AbsF src1));
2777   effect(TEMP src2);
2778   ins_cost(150);
2779   format %{ "vabsss  $dst, $src1, $src2, [0x7fffffff]\t# abs float by sign masking" %}
2780   ins_encode %{
2781     int vector_len = 0;
2782     __ vabsss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister,
2783               ExternalAddress(float_signmask()), vector_len);
2784   %}
2785   ins_pipe(pipe_slow);
2786 %}
2787 #else // _LP64
2788 instruct absF_reg_reg_evex(regF dst, regF src) %{
2789   predicate(UseAVX > 2);
2790   match(Set dst (AbsF src));
2791   ins_cost(150);
2792   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
2793   ins_encode %{
2794     int vector_len = 0;
2795     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
2796               ExternalAddress(float_signmask()), vector_len);
2797   %}
2798   ins_pipe(pipe_slow);
2799 %}
2800 #endif
2801 
2802 instruct absD_reg(regD dst) %{
2803   predicate((UseSSE>=2) && (UseAVX == 0));
2804   match(Set dst (AbsD dst));
2805   ins_cost(150);
2806   format %{ "andpd   $dst, [0x7fffffffffffffff]\t"
2807             "# abs double by sign masking" %}
2808   ins_encode %{
2809     __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
2810   %}
2811   ins_pipe(pipe_slow);
2812 %}
2813 
2814 instruct absD_reg_reg(regD dst, regD src) %{
2815   predicate(VM_Version::supports_avxonly());
2816   match(Set dst (AbsD src));
2817   ins_cost(150);
2818   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
2819             "# abs double by sign masking" %}
2820   ins_encode %{
2821     int vector_len = 0;
2822     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
2823               ExternalAddress(double_signmask()), vector_len);
2824   %}
2825   ins_pipe(pipe_slow);
2826 %}
2827 
2828 #ifdef _LP64
2829 instruct absD_reg_reg_evex(regD dst, regD src) %{
2830   predicate(UseAVX > 2 && VM_Version::supports_avx512vl());
2831   match(Set dst (AbsD src));
2832   ins_cost(150);
2833   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
2834             "# abs double by sign masking" %}
2835   ins_encode %{
2836     int vector_len = 0;
2837     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
2838               ExternalAddress(double_signmask()), vector_len);
2839   %}
2840   ins_pipe(pipe_slow);
2841 %}
2842 
2843 instruct absD_reg_reg_evex_special(regD dst, regD src1, regD src2) %{
2844   predicate(VM_Version::supports_avx512novl());
2845   match(Set dst (AbsD src1));
2846   effect(TEMP src2);
2847   ins_cost(150);
2848   format %{ "vabssd  $dst, $src1, $src2, [0x7fffffffffffffff]\t# abs double by sign masking" %}
2849   ins_encode %{
2850     int vector_len = 0;
2851     __ vabssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister,
2852               ExternalAddress(double_signmask()), vector_len);
2853   %}
2854   ins_pipe(pipe_slow);
2855 %}
2856 #else // _LP64
2857 instruct absD_reg_reg_evex(regD dst, regD src) %{
2858   predicate(UseAVX > 2);
2859   match(Set dst (AbsD src));
2860   ins_cost(150);
2861   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
2862             "# abs double by sign masking" %}
2863   ins_encode %{
2864     int vector_len = 0;
2865     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
2866               ExternalAddress(double_signmask()), vector_len);
2867   %}
2868   ins_pipe(pipe_slow);
2869 %}
2870 #endif
2871 
2872 instruct negI_rReg_2(rRegI dst, rFlagsReg cr)
2873 %{
2874   match(Set dst (NegI dst));
2875   effect(KILL cr);
2876 
2877   format %{ "negl    $dst\t# int" %}
2878   ins_encode %{
2879     __ negl($dst$$Register);
2880   %}
2881   ins_pipe(ialu_reg);
2882 %}
2883 
2884 instruct negL_rReg_2(rRegL dst, rFlagsReg cr)
2885 %{
2886   match(Set dst (NegL dst));
2887   effect(KILL cr);
2888 
2889   format %{ "negq    $dst\t# long" %}
2890   ins_encode %{
2891     __ negq($dst$$Register);
2892   %}
2893   ins_pipe(ialu_reg);
2894 %}
2895 
2896 instruct negF_reg(regF dst) %{
2897   predicate((UseSSE>=1) && (UseAVX == 0));
2898   match(Set dst (NegF dst));
2899   ins_cost(150);
2900   format %{ "xorps   $dst, [0x80000000]\t# neg float by sign flipping" %}
2901   ins_encode %{
2902     __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
2903   %}
2904   ins_pipe(pipe_slow);
2905 %}
2906 
2907 instruct negF_reg_reg(regF dst, regF src) %{
2908   predicate(UseAVX > 0);
2909   match(Set dst (NegF src));
2910   ins_cost(150);
2911   format %{ "vnegatess  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
2912   ins_encode %{
2913     __ vnegatess($dst$$XMMRegister, $src$$XMMRegister,
2914                  ExternalAddress(float_signflip()));
2915   %}
2916   ins_pipe(pipe_slow);
2917 %}
2918 
2919 instruct negD_reg(regD dst) %{
2920   predicate((UseSSE>=2) && (UseAVX == 0));
2921   match(Set dst (NegD dst));
2922   ins_cost(150);
2923   format %{ "xorpd   $dst, [0x8000000000000000]\t"
2924             "# neg double by sign flipping" %}
2925   ins_encode %{
2926     __ xorpd($dst$$XMMRegister, ExternalAddress(double_signflip()));
2927   %}
2928   ins_pipe(pipe_slow);
2929 %}
2930 
2931 instruct negD_reg_reg(regD dst, regD src) %{
2932   predicate(UseAVX > 0);
2933   match(Set dst (NegD src));
2934   ins_cost(150);
2935   format %{ "vnegatesd  $dst, $src, [0x8000000000000000]\t"
2936             "# neg double by sign flipping" %}
2937   ins_encode %{
2938     __ vnegatesd($dst$$XMMRegister, $src$$XMMRegister,
2939                  ExternalAddress(double_signflip()));
2940   %}
2941   ins_pipe(pipe_slow);
2942 %}
2943 
2944 instruct sqrtF_reg(regF dst, regF src) %{
2945   predicate(UseSSE>=1);
2946   match(Set dst (SqrtF src));
2947 
2948   format %{ "sqrtss  $dst, $src" %}
2949   ins_cost(150);
2950   ins_encode %{
2951     __ sqrtss($dst$$XMMRegister, $src$$XMMRegister);
2952   %}
2953   ins_pipe(pipe_slow);
2954 %}
2955 
2956 instruct sqrtF_mem(regF dst, memory src) %{
2957   predicate(UseSSE>=1);
2958   match(Set dst (SqrtF (LoadF src)));
2959 
2960   format %{ "sqrtss  $dst, $src" %}
2961   ins_cost(150);
2962   ins_encode %{
2963     __ sqrtss($dst$$XMMRegister, $src$$Address);
2964   %}
2965   ins_pipe(pipe_slow);
2966 %}
2967 
2968 instruct sqrtF_imm(regF dst, immF con) %{
2969   predicate(UseSSE>=1);
2970   match(Set dst (SqrtF con));
2971 
2972   format %{ "sqrtss  $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2973   ins_cost(150);
2974   ins_encode %{
2975     __ sqrtss($dst$$XMMRegister, $constantaddress($con));
2976   %}
2977   ins_pipe(pipe_slow);
2978 %}
2979 
2980 instruct sqrtD_reg(regD dst, regD src) %{
2981   predicate(UseSSE>=2);
2982   match(Set dst (SqrtD src));
2983 
2984   format %{ "sqrtsd  $dst, $src" %}
2985   ins_cost(150);
2986   ins_encode %{
2987     __ sqrtsd($dst$$XMMRegister, $src$$XMMRegister);
2988   %}
2989   ins_pipe(pipe_slow);
2990 %}
2991 
2992 instruct sqrtD_mem(regD dst, memory src) %{
2993   predicate(UseSSE>=2);
2994   match(Set dst (SqrtD (LoadD src)));
2995 
2996   format %{ "sqrtsd  $dst, $src" %}
2997   ins_cost(150);
2998   ins_encode %{
2999     __ sqrtsd($dst$$XMMRegister, $src$$Address);
3000   %}
3001   ins_pipe(pipe_slow);
3002 %}
3003 
3004 instruct sqrtD_imm(regD dst, immD con) %{
3005   predicate(UseSSE>=2);
3006   match(Set dst (SqrtD con));
3007   format %{ "sqrtsd  $dst, [$constantaddress]\t# load from constant table: double=$con" %}
3008   ins_cost(150);
3009   ins_encode %{
3010     __ sqrtsd($dst$$XMMRegister, $constantaddress($con));
3011   %}
3012   ins_pipe(pipe_slow);
3013 %}
3014 
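// Spin-wait hint: OnSpinWait is generated for the Thread.onSpinWait()
// intrinsic and is implemented here with the x86 PAUSE instruction.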
3015 instruct onspinwait() %{
3016   match(OnSpinWait);
3017   ins_cost(200);
3018 
3019   format %{
3020     $$template
3021     if (os::is_MP()) {
3022       $$emit$$"pause\t! membar_onspinwait"
3023     } else {
3024       $$emit$$"MEMBAR-onspinwait ! (empty encoding)"
3025     }
3026   %}
3027   ins_encode %{
3028     __ pause();
3029   %}
3030   ins_pipe(pipe_slow);
3031 %}
3032 
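// Fused multiply-add: FmaD/FmaF are generated for the Math.fma() intrinsics
// when UseFMA is enabled.  Note that the result overwrites $c.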
3033 // a * b + c
3034 instruct fmaD_reg(regD a, regD b, regD c) %{
3035   predicate(UseFMA);
3036   match(Set c (FmaD  c (Binary a b)));
3037   format %{ "fmasd $a,$b,$c\t# $c = $a * $b + $c" %}
3038   ins_cost(150);
3039   ins_encode %{
3040     __ fmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
3041   %}
3042   ins_pipe( pipe_slow );
3043 %}
3044 
3045 // a * b + c
3046 instruct fmaF_reg(regF a, regF b, regF c) %{
3047   predicate(UseFMA);
3048   match(Set c (FmaF  c (Binary a b)));
3049   format %{ "fmass $a,$b,$c\t# $c = $a * $b + $c" %}
3050   ins_cost(150);
3051   ins_encode %{
3052     __ fmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
3053   %}
3054   ins_pipe( pipe_slow );
3055 %}
3056 
3057 // ====================VECTOR INSTRUCTIONS=====================================
3058 
3059 instruct reinterpretS(vecS dst) %{
3060   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 4 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 4);
3061   match(Set dst (VectorReinterpret dst));
3062   ins_cost(125);
3063   format %{ " # reinterpret $dst" %}
3064   ins_encode %{
3065     // empty
3066   %}
3067   ins_pipe( pipe_slow );
3068 %}
3069 
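// Widening reinterprets mask the result with vector_32_bit_mask /
// vector_64_bit_mask so that bits beyond the source vector's length are
// zeroed.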
3070 instruct reinterpretS2D(vecD dst, vecS src, rRegL scratch) %{
3071   predicate(UseAVX == 0 && n->bottom_type()->is_vect()->length_in_bytes() == 8 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 4);
3072   match(Set dst (VectorReinterpret src));
3073   ins_cost(125);
3074   effect(TEMP dst, TEMP scratch);
3075   format %{ " # reinterpret $dst,$src" %}
3076   ins_encode %{
3077     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_32_bit_mask()), $scratch$$Register);
3078     __ pand($dst$$XMMRegister, $src$$XMMRegister);
3079   %}
3080   ins_pipe( pipe_slow );
3081 %}
3082 
3083 instruct reinterpretS2D_avx(vecD dst, vecS src, rRegL scratch) %{
3084   predicate(UseAVX > 0 && n->bottom_type()->is_vect()->length_in_bytes() == 8 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 4);
3085   match(Set dst (VectorReinterpret src));
3086   ins_cost(125);
3087   effect(TEMP dst, TEMP scratch);
3088   format %{ " # reinterpret $dst,$src" %}
3089   ins_encode %{
3090     int vector_len = 0;
3091     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_32_bit_mask()), vector_len, $scratch$$Register);
3092   %}
3093   ins_pipe( pipe_slow );
3094 %}
3095 
3096 instruct reinterpretS2X(vecX dst, vecS src, rRegL scratch) %{
3097   predicate(UseAVX == 0 && n->bottom_type()->is_vect()->length_in_bytes() == 16 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 4);
3098   match(Set dst (VectorReinterpret src));
3099   ins_cost(125);
3100   effect(TEMP dst, TEMP scratch);
3101   format %{ " # reinterpret $dst,$src" %}
3102   ins_encode %{
3103     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_32_bit_mask()), $scratch$$Register);
3104     __ pand($dst$$XMMRegister, $src$$XMMRegister);
3105   %}
3106   ins_pipe( pipe_slow );
3107 %}
3108 
3109 instruct reinterpretS2X_avx(vecX dst, vecS src, rRegL scratch) %{
3110   predicate(UseAVX > 0 && n->bottom_type()->is_vect()->length_in_bytes() == 16 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 4);
3111   match(Set dst (VectorReinterpret src));
3112   ins_cost(125);
3113   effect(TEMP scratch);
3114   format %{ " # reinterpret $dst,$src" %}
3115   ins_encode %{
3116     int vector_len = 0;
3117     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_32_bit_mask()), vector_len, $scratch$$Register);
3118   %}
3119   ins_pipe( pipe_slow );
3120 %}
3121 
3122 instruct reinterpretS2Y(vecY dst, vecS src, rRegL scratch) %{
3123   predicate(UseAVX >= 2 && n->bottom_type()->is_vect()->length_in_bytes() == 32 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 4);
3124   match(Set dst (VectorReinterpret src));
3125   ins_cost(125);
3126   effect(TEMP scratch);
3127   format %{ " # reinterpret $dst,$src" %}
3128   ins_encode %{
3129     int vector_len = 1;
3130     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_32_bit_mask()), vector_len, $scratch$$Register);
3131   %}
3132   ins_pipe( pipe_slow );
3133 %}
3134 
3135 instruct reinterpretS2Z(vecZ dst, vecS src, rRegL scratch) %{
3136   predicate(UseAVX > 2 && n->bottom_type()->is_vect()->length_in_bytes() == 64 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 4);
3137   match(Set dst (VectorReinterpret src));
3138   ins_cost(125);
3139   effect(TEMP scratch);
3140   format %{ " # reinterpret $dst,$src" %}
3141   ins_encode %{
3142     int vector_len = 2;
3143     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_32_bit_mask()), vector_len, $scratch$$Register);
3144   %}
3145   ins_pipe( pipe_slow );
3146 %}
3147 
3148 instruct reinterpretD2S(vecS dst, vecD src) %{
3149   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 4 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 8);
3150   match(Set dst (VectorReinterpret src));
3151   ins_cost(125);
3152   format %{ " # reinterpret $dst,$src" %}
3153   ins_encode %{
3154     // If the registers are the same, no move is needed.
3155     if ($dst$$XMMRegister != $src$$XMMRegister) {
3156       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
3157     }
3158   %}
3159   ins_pipe( pipe_slow );
3160 %}
3161 
3162 instruct reinterpretD(vecD dst) %{
3163   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 8 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 8);
3164   match(Set dst (VectorReinterpret dst));
3165   ins_cost(125);
3166   format %{ " # reinterpret $dst" %}
3167   ins_encode %{
3168     // empty
3169   %}
3170   ins_pipe( pipe_slow );
3171 %}
3172 
3173 instruct reinterpretD2X(vecX dst, vecD src, rRegL scratch) %{
3174   predicate(UseAVX == 0 && n->bottom_type()->is_vect()->length_in_bytes() == 16 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 8);
3175   match(Set dst (VectorReinterpret src));
3176   ins_cost(125);
3177   effect(TEMP dst, TEMP scratch);
3178   format %{ " # reinterpret $dst,$src" %}
3179   ins_encode %{
3180     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_64_bit_mask()), $scratch$$Register);
3181     __ pand($dst$$XMMRegister, $src$$XMMRegister);
3182   %}
3183   ins_pipe( pipe_slow );
3184 %}
3185 
3186 instruct reinterpretD2X_avx(vecX dst, vecD src, rRegL scratch) %{
3187   predicate(UseAVX > 0 && n->bottom_type()->is_vect()->length_in_bytes() == 16 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 8);
3188   match(Set dst (VectorReinterpret src));
3189   ins_cost(125);
3190   effect(TEMP dst, TEMP scratch);
3191   format %{ " # reinterpret $dst,$src" %}
3192   ins_encode %{
3193     int vector_len = 0;
3194     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_64_bit_mask()), vector_len, $scratch$$Register);
3195   %}
3196   ins_pipe( pipe_slow );
3197 %}
3198 
3199 instruct reinterpretD2Y(vecY dst, vecD src, rRegL scratch) %{
3200   predicate(UseAVX >= 2 && n->bottom_type()->is_vect()->length_in_bytes() == 32 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 8);
3201   match(Set dst (VectorReinterpret src));
3202   ins_cost(125);
3203   effect(TEMP scratch);
3204   format %{ " # reinterpret $dst,$src" %}
3205   ins_encode %{
3206     int vector_len = 1;
3207     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_64_bit_mask()), vector_len, $scratch$$Register);
3208   %}
3209   ins_pipe( pipe_slow );
3210 %}
3211 
3212 instruct reinterpretD2Z(vecZ dst, vecD src, rRegL scratch) %{
3213   predicate(UseAVX > 2 && n->bottom_type()->is_vect()->length_in_bytes() == 64 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 8);
3214   match(Set dst (VectorReinterpret src));
3215   ins_cost(125);
3216   effect(TEMP scratch);
3217   format %{ " # reinterpret $dst,$src" %}
3218   ins_encode %{
3219     int vector_len = 2;
3220     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_64_bit_mask()), vector_len, $scratch$$Register);
3221   %}
3222   ins_pipe( pipe_slow );
3223 %}
3224 
3225 instruct reinterpretX2S(vecS dst, vecX src) %{
3226   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 4 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 16);
3227   match(Set dst (VectorReinterpret src));
3228   ins_cost(125);
3229   format %{ " # reinterpret $dst,$src" %}
3230   ins_encode %{
3231     // If the registers are the same, no move is needed.
3232     if ($dst$$XMMRegister != $src$$XMMRegister) {
3233       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
3234     }
3235   %}
3236   ins_pipe( pipe_slow );
3237 %}
3238 
3239 instruct reinterpretX2D(vecD dst, vecX src) %{
3240   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 8 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 16);
3241   match(Set dst (VectorReinterpret src));
3242   ins_cost(125);
3243   format %{ " # reinterpret $dst,$src" %}
3244   ins_encode %{
3245     // If the registers are the same, no move is needed.
3246     if ($dst$$XMMRegister != $src$$XMMRegister) {
3247       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
3248     }
3249   %}
3250   ins_pipe( pipe_slow );
3251 %}
3252 
3253 instruct reinterpretX(vecX dst) %{
3254   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 16 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 16);
3255   match(Set dst (VectorReinterpret dst));
3256   ins_cost(125);
3257   format %{ " # reinterpret $dst" %}
3258   ins_encode %{
3259     // empty
3260   %}
3261   ins_pipe( pipe_slow );
3262 %}
3263 
3264 instruct reinterpretX2Y(vecY dst, vecX src) %{
3265   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 32 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 16);
3266   match(Set dst (VectorReinterpret src));
3267   ins_cost(125);
3268   effect(TEMP dst);
3269   format %{ " # reinterpret $dst,$src" %}
3270   ins_encode %{
3271     int vector_len = 1;
3272     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3273     __ movdqu($dst$$XMMRegister, $src$$XMMRegister);  // only the low 128 bits need to be moved
3274   %}
3275   ins_pipe( pipe_slow );
3276 %}
3277 
3278 instruct reinterpretX2Z(vecZ dst, vecX src) %{
3279   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 64 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 16);
3280   match(Set dst (VectorReinterpret src));
3281   ins_cost(125);
3282   effect(TEMP dst);
3283   format %{ " # reinterpret $dst,$src" %}
3284   ins_encode %{
3285     int vector_len = 2;
3286     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3287     __ movdqu($dst$$XMMRegister, $src$$XMMRegister);  // only the low 128 bits need to be moved
3288   %}
3289   ins_pipe( pipe_slow );
3290 %}
3291 
3292 instruct reinterpretY2S(vecS dst, vecY src) %{
3293   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 4 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 32);
3294   match(Set dst (VectorReinterpret src));
3295   ins_cost(125);
3296   format %{ " # reinterpret $dst,$src" %}
3297   ins_encode %{
3298     // If the registers are the same, no move is needed.
3299     if ($dst$$XMMRegister != $src$$XMMRegister) {
3300       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
3301     }
3302   %}
3303   ins_pipe( pipe_slow );
3304 %}
3305 
3306 instruct reinterpretY2D(vecD dst, vecY src) %{
3307   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 8 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 32);
3308   match(Set dst (VectorReinterpret src));
3309   ins_cost(125);
3310   format %{ " # reinterpret $dst,$src" %}
3311   ins_encode %{
3312     // If the registers are the same, no move is needed.
3313     if ($dst$$XMMRegister != $src$$XMMRegister) {
3314       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
3315     }
3316   %}
3317   ins_pipe( pipe_slow );
3318 %}
3319 
3320 instruct reinterpretY2X(vecX dst, vecY src) %{
3321   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 16 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 32);
3322   match(Set dst (VectorReinterpret src));
3323   ins_cost(125);
3324   format %{ " # reinterpret $dst,$src" %}
3325   ins_encode %{
3326     // If the registers are the same, no move is needed.
3327     if ($dst$$XMMRegister != $src$$XMMRegister) {
3328       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
3329     }
3330   %}
3331   ins_pipe( pipe_slow );
3332 %}
3333 
3334 instruct reinterpretY(vecY dst) %{
3335   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 32 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 32);
3336   match(Set dst (VectorReinterpret dst));
3337   ins_cost(125);
3338   format %{ " # reinterpret $dst" %}
3339   ins_encode %{
3340     // empty
3341   %}
3342   ins_pipe( pipe_slow );
3343 %}
3344 
3345 instruct reinterpretY2Z(vecZ dst, vecY src) %{
3346   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 64 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 32);
3347   match(Set dst (VectorReinterpret src));
3348   ins_cost(125);
3349   effect(TEMP dst);
3350   format %{ " # reinterpret $dst,$src" %}
3351   ins_encode %{
3352     int vector_len = 2;
3353     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3354     __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
3355   %}
3356   ins_pipe( pipe_slow );
3357 %}
3358 
3359 instruct reinterpretZ2S(vecS dst, vecZ src) %{
3360   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 4 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 64);
3361   match(Set dst (VectorReinterpret src));
3362   ins_cost(125);
3363   format %{ " # reinterpret $dst,$src" %}
3364   ins_encode %{
3365     // If the registers are the same, no move is needed.
3366     if ($dst$$XMMRegister != $src$$XMMRegister) {
3367       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
3368     }
3369   %}
3370   ins_pipe( pipe_slow );
3371 %}
3372 
3373 instruct reinterpretZ2D(vecD dst, vecZ src) %{
3374   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 8 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 64);
3375   match(Set dst (VectorReinterpret src));
3376   ins_cost(125);
3377   format %{ " # reinterpret $dst,$src" %}
3378   ins_encode %{
3379     // If the registers are the same, no move is needed.
3380     if ($dst$$XMMRegister != $src$$XMMRegister) {
3381       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
3382     }
3383   %}
3384   ins_pipe( pipe_slow );
3385 %}
3386 
3387 instruct reinterpretZ2X(vecX dst, vecZ src) %{
3388   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 16 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 64);
3389   match(Set dst (VectorReinterpret src));
3390   ins_cost(125);
3391   format %{ " # reinterpret $dst,$src" %}
3392   ins_encode %{
3393     // If the registers are the same, no move is needed.
3394     if ($dst$$XMMRegister != $src$$XMMRegister) {
3395       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
3396     }
3397   %}
3398   ins_pipe( pipe_slow );
3399 %}
3400 
3401 instruct reinterpretZ2Y(vecY dst, vecZ src) %{
3402   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 32 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 64);
3403   match(Set dst (VectorReinterpret src));
3404   ins_cost(125);
3405   format %{ " # reinterpret $dst,$src" %}
3406   ins_encode %{
3407     // If the registers are the same, no move is needed.
3408     if ($dst$$XMMRegister != $src$$XMMRegister) {
3409       __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
3410     }
3411   %}
3412   ins_pipe( pipe_slow );
3413 %}
3414 
3415 instruct reinterpretZ(vecZ dst) %{
3416   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 64 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 64);
3417   match(Set dst (VectorReinterpret dst));
3418   ins_cost(125);
3419   format %{ " # reinterpret $dst" %}
3420   ins_encode %{
3421     // empty
3422   %}
3423   ins_pipe( pipe_slow );
3424 %}
3425 
3426 // ==========
3427 
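// Sub-word vector loads and stores (1 and 2 bytes) have no direct XMM
// memory form, so they go through a temporary general-purpose register.
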
3428 // Load vectors (1 byte long)
3429 instruct loadV1(vecS dst, memory mem, rRegI tmp) %{
3430   predicate(n->as_LoadVector()->memory_size() == 1);
3431   match(Set dst (LoadVector mem));
3432   ins_cost(125);
3433   effect(TEMP tmp);
3434   format %{ "movzbl $tmp,$mem\n\t"
3435             "movd $dst,$tmp\t! load vector (1 byte)" %}
3436   ins_encode %{
3437     __ movzbl($tmp$$Register, $mem$$Address);
3438     __ movdl($dst$$XMMRegister, $tmp$$Register);
3439   %}
3440   ins_pipe( pipe_slow );
3441 %}
3442 
3443 // Load vectors (2 bytes long)
3444 instruct loadV2(vecS dst, memory mem, rRegI tmp) %{
3445   predicate(n->as_LoadVector()->memory_size() == 2);
3446   match(Set dst (LoadVector mem));
3447   ins_cost(125);
3448   effect(TEMP tmp);
3449   format %{ "movzwl $tmp,$mem\n\t"
3450             "movd $dst,$tmp\t! load vector (2 bytes)" %}
3451   ins_encode %{
3452     __ movzwl($tmp$$Register, $mem$$Address);
3453     __ movdl($dst$$XMMRegister, $tmp$$Register);
3454   %}
3455   ins_pipe( pipe_slow );
3456 %}
3457 
3458 // Load vectors (4 bytes long)
3459 instruct loadV4(vecS dst, memory mem) %{
3460   predicate(n->as_LoadVector()->memory_size() == 4);
3461   match(Set dst (LoadVector mem));
3462   ins_cost(125);
3463   format %{ "movd    $dst,$mem\t! load vector (4 bytes)" %}
3464   ins_encode %{
3465     __ movdl($dst$$XMMRegister, $mem$$Address);
3466   %}
3467   ins_pipe( pipe_slow );
3468 %}
3469 
3470 // Load vectors (8 bytes long)
3471 instruct loadV8(vecD dst, memory mem) %{
3472   predicate(n->as_LoadVector()->memory_size() == 8);
3473   match(Set dst (LoadVector mem));
3474   ins_cost(125);
3475   format %{ "movq    $dst,$mem\t! load vector (8 bytes)" %}
3476   ins_encode %{
3477     __ movq($dst$$XMMRegister, $mem$$Address);
3478   %}
3479   ins_pipe( pipe_slow );
3480 %}
3481 
3482 // Load vectors (16 bytes long)
3483 instruct loadV16(vecX dst, memory mem) %{
3484   predicate(n->as_LoadVector()->memory_size() == 16);
3485   match(Set dst (LoadVector mem));
3486   ins_cost(125);
3487   format %{ "movdqu  $dst,$mem\t! load vector (16 bytes)" %}
3488   ins_encode %{
3489     __ movdqu($dst$$XMMRegister, $mem$$Address);
3490   %}
3491   ins_pipe( pipe_slow );
3492 %}
3493 
3494 // Load vectors (32 bytes long)
3495 instruct loadV32(vecY dst, memory mem) %{
3496   predicate(n->as_LoadVector()->memory_size() == 32);
3497   match(Set dst (LoadVector mem));
3498   ins_cost(125);
3499   format %{ "vmovdqu $dst,$mem\t! load vector (32 bytes)" %}
3500   ins_encode %{
3501     __ vmovdqu($dst$$XMMRegister, $mem$$Address);
3502   %}
3503   ins_pipe( pipe_slow );
3504 %}
3505 
3506 // Load vectors (64 bytes long)
3507 instruct loadV64_dword(vecZ dst, memory mem) %{
3508   predicate(n->as_LoadVector()->memory_size() == 64 && n->as_LoadVector()->element_size() <= 4);
3509   match(Set dst (LoadVector mem));
3510   ins_cost(125);
3511   format %{ "vmovdqul $dst k0,$mem\t! load vector (64 bytes)" %}
3512   ins_encode %{
3513     int vector_len = 2;
3514     __ evmovdqul($dst$$XMMRegister, $mem$$Address, vector_len);
3515   %}
3516   ins_pipe( pipe_slow );
3517 %}
3518 
3519 // Load vectors (64 bytes long)
3520 instruct loadV64_qword(vecZ dst, memory mem) %{
3521   predicate(n->as_LoadVector()->memory_size() == 64 && n->as_LoadVector()->element_size() > 4);
3522   match(Set dst (LoadVector mem));
3523   ins_cost(125);
3524   format %{ "vmovdquq $dst k0,$mem\t! load vector (64 bytes)" %}
3525   ins_encode %{
3526     int vector_len = 2;
3527     __ evmovdquq($dst$$XMMRegister, $mem$$Address, vector_len);
3528   %}
3529   ins_pipe( pipe_slow );
3530 %}
3531 
3532 // Store vectors
3533 instruct storeV1(memory mem, vecS src, rRegI tmp) %{
3534   predicate(n->as_StoreVector()->memory_size() == 1);
3535   match(Set mem (StoreVector mem src));
3536   ins_cost(145);
3537   effect(TEMP tmp);
3538   format %{ "movd $tmp,$src\n\t"
3539             "movb $mem,$tmp\t! store vector (1 byte)" %}
3540   ins_encode %{
3541     __ movdl($tmp$$Register, $src$$XMMRegister);
3542     __ movb($mem$$Address, $tmp$$Register);
3543   %}
3544   ins_pipe( pipe_slow );
3545 %}
3546 
3547 instruct storeV2(memory mem, vecS src, rRegI tmp) %{
3548   predicate(n->as_StoreVector()->memory_size() == 2);
3549   match(Set mem (StoreVector mem src));
3550   ins_cost(145);
3551   effect(TEMP tmp);
3552   format %{ "movd $tmp,$src\n\t"
3553             "movw $mem,$tmp\t! store vector (2 bytes)" %}
3554   ins_encode %{
3555     __ movdl($tmp$$Register, $src$$XMMRegister);
3556     __ movw($mem$$Address, $tmp$$Register);
3557   %}
3558   ins_pipe( pipe_slow );
3559 %}
3560 
3561 instruct storeV4(memory mem, vecS src) %{
3562   predicate(n->as_StoreVector()->memory_size() == 4);
3563   match(Set mem (StoreVector mem src));
3564   ins_cost(145);
3565   format %{ "movd    $mem,$src\t! store vector (4 bytes)" %}
3566   ins_encode %{
3567     __ movdl($mem$$Address, $src$$XMMRegister);
3568   %}
3569   ins_pipe( pipe_slow );
3570 %}
3571 
3572 instruct storeV8(memory mem, vecD src) %{
3573   predicate(n->as_StoreVector()->memory_size() == 8);
3574   match(Set mem (StoreVector mem src));
3575   ins_cost(145);
3576   format %{ "movq    $mem,$src\t! store vector (8 bytes)" %}
3577   ins_encode %{
3578     __ movq($mem$$Address, $src$$XMMRegister);
3579   %}
3580   ins_pipe( pipe_slow );
3581 %}
3582 
3583 instruct storeV16(memory mem, vecX src) %{
3584   predicate(n->as_StoreVector()->memory_size() == 16);
3585   match(Set mem (StoreVector mem src));
3586   ins_cost(145);
3587   format %{ "movdqu  $mem,$src\t! store vector (16 bytes)" %}
3588   ins_encode %{
3589     __ movdqu($mem$$Address, $src$$XMMRegister);
3590   %}
3591   ins_pipe( pipe_slow );
3592 %}
3593 
3594 instruct storeV32(memory mem, vecY src) %{
3595   predicate(n->as_StoreVector()->memory_size() == 32);
3596   match(Set mem (StoreVector mem src));
3597   ins_cost(145);
3598   format %{ "vmovdqu $mem,$src\t! store vector (32 bytes)" %}
3599   ins_encode %{
3600     __ vmovdqu($mem$$Address, $src$$XMMRegister);
3601   %}
3602   ins_pipe( pipe_slow );
3603 %}
3604 
3605 instruct storeV64_dword(memory mem, vecZ src) %{
3606   predicate(n->as_StoreVector()->memory_size() == 64 && n->as_StoreVector()->element_size() <= 4);
3607   match(Set mem (StoreVector mem src));
3608   ins_cost(145);
3609   format %{ "vmovdqul $mem k0,$src\t! store vector (64 bytes)" %}
3610   ins_encode %{
3611     int vector_len = 2;
3612     __ evmovdqul($mem$$Address, $src$$XMMRegister, vector_len);
3613   %}
3614   ins_pipe( pipe_slow );
3615 %}
3616 
3617 instruct storeV64_qword(memory mem, vecZ src) %{
3618   predicate(n->as_StoreVector()->memory_size() == 64 && n->as_StoreVector()->element_size() > 4);
3619   match(Set mem (StoreVector mem src));
3620   ins_cost(145);
3621   format %{ "vmovdquq $mem k0,$src\t! store vector (64 bytes)" %}
3622   ins_encode %{
3623     int vector_len = 2;
3624     __ evmovdquq($mem$$Address, $src$$XMMRegister, vector_len);
3625   %}
3626   ins_pipe( pipe_slow );
3627 %}
3628 
3629 // ====================LEGACY REPLICATE=======================================
3630 
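// The immediate (splat-constant) forms below materialize the replicated value
// in the constant table via replicate8_imm() (defined near the top of this
// file) and broadcast it from there.
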
3631 instruct Repl4B_mem(vecS dst, memory mem) %{
3632   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3633   match(Set dst (ReplicateB (LoadB mem)));
3634   format %{ "punpcklbw $dst,$mem\n\t"
3635             "pshuflw $dst,$dst,0x00\t! replicate4B" %}
3636   ins_encode %{
3637     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
3638     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3639   %}
3640   ins_pipe( pipe_slow );
3641 %}
3642 
3643 instruct Repl8B_mem(vecD dst, memory mem) %{
3644   predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3645   match(Set dst (ReplicateB (LoadB mem)));
3646   format %{ "punpcklbw $dst,$mem\n\t"
3647             "pshuflw $dst,$dst,0x00\t! replicate8B" %}
3648   ins_encode %{
3649     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
3650     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3651   %}
3652   ins_pipe( pipe_slow );
3653 %}
3654 
3655 instruct Repl16B(vecX dst, rRegI src) %{
3656   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3657   match(Set dst (ReplicateB src));
3658   format %{ "movd    $dst,$src\n\t"
3659             "punpcklbw $dst,$dst\n\t"
3660             "pshuflw $dst,$dst,0x00\n\t"
3661             "punpcklqdq $dst,$dst\t! replicate16B" %}
3662   ins_encode %{
3663     __ movdl($dst$$XMMRegister, $src$$Register);
3664     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3665     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3666     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3667   %}
3668   ins_pipe( pipe_slow );
3669 %}
3670 
3671 instruct Repl16B_mem(vecX dst, memory mem) %{
3672   predicate(n->as_Vector()->length() == 16 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3673   match(Set dst (ReplicateB (LoadB mem)));
3674   format %{ "punpcklbw $dst,$mem\n\t"
3675             "pshuflw $dst,$dst,0x00\n\t"
3676             "punpcklqdq $dst,$dst\t! replicate16B" %}
3677   ins_encode %{
3678     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
3679     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3680     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3681   %}
3682   ins_pipe( pipe_slow );
3683 %}
3684 
3685 instruct Repl32B(vecY dst, rRegI src) %{
3686   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3687   match(Set dst (ReplicateB src));
3688   format %{ "movd    $dst,$src\n\t"
3689             "punpcklbw $dst,$dst\n\t"
3690             "pshuflw $dst,$dst,0x00\n\t"
3691             "punpcklqdq $dst,$dst\n\t"
3692             "vinserti128_high $dst,$dst\t! replicate32B" %}
3693   ins_encode %{
3694     __ movdl($dst$$XMMRegister, $src$$Register);
3695     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3696     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3697     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3698     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3699   %}
3700   ins_pipe( pipe_slow );
3701 %}
3702 
3703 instruct Repl32B_mem(vecY dst, memory mem) %{
3704   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3705   match(Set dst (ReplicateB (LoadB mem)));
3706   format %{ "punpcklbw $dst,$mem\n\t"
3707             "pshuflw $dst,$dst,0x00\n\t"
3708             "punpcklqdq $dst,$dst\n\t"
3709             "vinserti128_high $dst,$dst\t! replicate32B" %}
3710   ins_encode %{
3711     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
3712     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3713     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3714     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3715   %}
3716   ins_pipe( pipe_slow );
3717 %}
3718 
3719 instruct Repl16B_imm(vecX dst, immI con) %{
3720   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3721   match(Set dst (ReplicateB con));
3722   format %{ "movq    $dst,[$constantaddress]\n\t"
3723             "punpcklqdq $dst,$dst\t! replicate16B($con)" %}
3724   ins_encode %{
3725     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3726     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3727   %}
3728   ins_pipe( pipe_slow );
3729 %}
3730 
3731 instruct Repl32B_imm(vecY dst, immI con) %{
3732   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3733   match(Set dst (ReplicateB con));
3734   format %{ "movq    $dst,[$constantaddress]\n\t"
3735             "punpcklqdq $dst,$dst\n\t"
3736             "vinserti128_high $dst,$dst\t! replicate32B($con)" %}
  ins_encode %{
    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl4S(vecD dst, rRegI src) %{
  predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vlbw());
  match(Set dst (ReplicateS src));
  format %{ "movd    $dst,$src\n\t"
            "pshuflw $dst,$dst,0x00\t! replicate4S" %}
  ins_encode %{
    __ movdl($dst$$XMMRegister, $src$$Register);
    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl4S_mem(vecD dst, memory mem) %{
  predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
  match(Set dst (ReplicateS (LoadS mem)));
  format %{ "pshuflw $dst,$mem,0x00\t! replicate4S" %}
  ins_encode %{
    __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl8S(vecX dst, rRegI src) %{
  predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vlbw());
  match(Set dst (ReplicateS src));
  format %{ "movd    $dst,$src\n\t"
            "pshuflw $dst,$dst,0x00\n\t"
            "punpcklqdq $dst,$dst\t! replicate8S" %}
  ins_encode %{
    __ movdl($dst$$XMMRegister, $src$$Register);
    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl8S_mem(vecX dst, memory mem) %{
  predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
  match(Set dst (ReplicateS (LoadS mem)));
  format %{ "pshuflw $dst,$mem,0x00\n\t"
            "punpcklqdq $dst,$dst\t! replicate8S" %}
  ins_encode %{
    __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl8S_imm(vecX dst, immI con) %{
  predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vlbw());
  match(Set dst (ReplicateS con));
  format %{ "movq    $dst,[$constantaddress]\n\t"
            "punpcklqdq $dst,$dst\t! replicate8S($con)" %}
  ins_encode %{
    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl16S(vecY dst, rRegI src) %{
  predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
  match(Set dst (ReplicateS src));
  format %{ "movd    $dst,$src\n\t"
            "pshuflw $dst,$dst,0x00\n\t"
            "punpcklqdq $dst,$dst\n\t"
            "vinserti128_high $dst,$dst\t! replicate16S" %}
  ins_encode %{
    __ movdl($dst$$XMMRegister, $src$$Register);
    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl16S_mem(vecY dst, memory mem) %{
  predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
  match(Set dst (ReplicateS (LoadS mem)));
  format %{ "pshuflw $dst,$mem,0x00\n\t"
            "punpcklqdq $dst,$dst\n\t"
            "vinserti128_high $dst,$dst\t! replicate16S" %}
  ins_encode %{
    __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl16S_imm(vecY dst, immI con) %{
  predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
  match(Set dst (ReplicateS con));
  format %{ "movq    $dst,[$constantaddress]\n\t"
            "punpcklqdq $dst,$dst\n\t"
            "vinserti128_high $dst,$dst\t! replicate16S($con)" %}
  ins_encode %{
    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl4I(vecX dst, rRegI src) %{
  predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
  match(Set dst (ReplicateI src));
  format %{ "movd    $dst,$src\n\t"
            "pshufd  $dst,$dst,0x00\t! replicate4I" %}
  ins_encode %{
    __ movdl($dst$$XMMRegister, $src$$Register);
    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl4I_mem(vecX dst, memory mem) %{
  predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
  match(Set dst (ReplicateI (LoadI mem)));
  format %{ "pshufd  $dst,$mem,0x00\t! replicate4I" %}
  ins_encode %{
    __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl8I(vecY dst, rRegI src) %{
  predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
  match(Set dst (ReplicateI src));
  format %{ "movd    $dst,$src\n\t"
            "pshufd  $dst,$dst,0x00\n\t"
            "vinserti128_high $dst,$dst\t! replicate8I" %}
  ins_encode %{
    __ movdl($dst$$XMMRegister, $src$$Register);
    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl8I_mem(vecY dst, memory mem) %{
  predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
  match(Set dst (ReplicateI (LoadI mem)));
  format %{ "pshufd  $dst,$mem,0x00\n\t"
            "vinserti128_high $dst,$dst\t! replicate8I" %}
  ins_encode %{
    __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl4I_imm(vecX dst, immI con) %{
  predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
  match(Set dst (ReplicateI con));
  format %{ "movq    $dst,[$constantaddress]\t! replicate4I($con)\n\t"
            "punpcklqdq $dst,$dst" %}
  ins_encode %{
    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl8I_imm(vecY dst, immI con) %{
  predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
  match(Set dst (ReplicateI con));
  format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
            "punpcklqdq $dst,$dst\n\t"
            "vinserti128_high $dst,$dst" %}
  ins_encode %{
    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// Long values can be loaded into an xmm register directly from memory.
instruct Repl2L_mem(vecX dst, memory mem) %{
  predicate(n->as_Vector()->length() == 2 && !VM_Version::supports_avx512vlbw());
  match(Set dst (ReplicateL (LoadL mem)));
  format %{ "movq    $dst,$mem\n\t"
            "punpcklqdq $dst,$dst\t! replicate2L" %}
  ins_encode %{
    __ movq($dst$$XMMRegister, $mem$$Address);
    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// Replicate long (8 byte) scalar to be vector
#ifdef _LP64
instruct Repl4L(vecY dst, rRegL src) %{
  predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
  match(Set dst (ReplicateL src));
  format %{ "movdq   $dst,$src\n\t"
            "punpcklqdq $dst,$dst\n\t"
            "vinserti128_high $dst,$dst\t! replicate4L" %}
  ins_encode %{
    __ movdq($dst$$XMMRegister, $src$$Register);
    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
#else // _LP64
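// On 32-bit, a long lives in a GPR pair, so the 64-bit lane is first assembled
// in the xmm register: movdl moves the low half, a second movdl plus punpckldq
// splice in the high half (the low qword becomes hi:lo) before the broadcast
// steps below.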
instruct Repl4L(vecY dst, eRegL src, regD tmp) %{
  predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
  match(Set dst (ReplicateL src));
  effect(TEMP dst, USE src, TEMP tmp);
  format %{ "movdl   $dst,$src.lo\n\t"
            "movdl   $tmp,$src.hi\n\t"
            "punpckldq $dst,$tmp\n\t"
            "punpcklqdq $dst,$dst\n\t"
            "vinserti128_high $dst,$dst\t! replicate4L" %}
  ins_encode %{
    __ movdl($dst$$XMMRegister, $src$$Register);
    __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
    __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
#endif // _LP64

instruct Repl4L_imm(vecY dst, immL con) %{
  predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
  match(Set dst (ReplicateL con));
  format %{ "movq    $dst,[$constantaddress]\n\t"
            "punpcklqdq $dst,$dst\n\t"
            "vinserti128_high $dst,$dst\t! replicate4L($con)" %}
  ins_encode %{
    __ movq($dst$$XMMRegister, $constantaddress($con));
    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl4L_mem(vecY dst, memory mem) %{
  predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
  match(Set dst (ReplicateL (LoadL mem)));
  format %{ "movq    $dst,$mem\n\t"
            "punpcklqdq $dst,$dst\n\t"
            "vinserti128_high $dst,$dst\t! replicate4L" %}
  ins_encode %{
    __ movq($dst$$XMMRegister, $mem$$Address);
    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl2F_mem(vecD dst, memory mem) %{
  predicate(n->as_Vector()->length() == 2 && UseAVX > 0 && !VM_Version::supports_avx512vl());
  match(Set dst (ReplicateF (LoadF mem)));
  format %{ "pshufd  $dst,$mem,0x00\t! replicate2F" %}
  ins_encode %{
    __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl4F_mem(vecX dst, memory mem) %{
  predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
  match(Set dst (ReplicateF (LoadF mem)));
  format %{ "pshufd  $dst,$mem,0x00\t! replicate4F" %}
  ins_encode %{
    __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl8F(vecY dst, regF src) %{
  predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
  match(Set dst (ReplicateF src));
  format %{ "pshufd  $dst,$src,0x00\n\t"
            "vinsertf128_high $dst,$dst\t! replicate8F" %}
  ins_encode %{
    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
    __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl8F_mem(vecY dst, memory mem) %{
  predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
  match(Set dst (ReplicateF (LoadF mem)));
  format %{ "pshufd  $dst,$mem,0x00\n\t"
            "vinsertf128_high $dst,$dst\t! replicate8F" %}
  ins_encode %{
    __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
    __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

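// Note: the float/double zero replicates below use xorps/xorpd rather than
// pxor, presumably to keep the result in the floating-point bypass domain;
// either way, the xor-with-self zeroing idiom is dependency-breaking on
// modern cores.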
instruct Repl2F_zero(vecD dst, immF0 zero) %{
  predicate(n->as_Vector()->length() == 2 && UseAVX < 3);
  match(Set dst (ReplicateF zero));
  format %{ "xorps   $dst,$dst\t! replicate2F zero" %}
  ins_encode %{
    __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( fpu_reg_reg );
%}

instruct Repl4F_zero(vecX dst, immF0 zero) %{
  predicate(n->as_Vector()->length() == 4 && UseAVX < 3);
  match(Set dst (ReplicateF zero));
  format %{ "xorps   $dst,$dst\t! replicate4F zero" %}
  ins_encode %{
    __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( fpu_reg_reg );
%}

instruct Repl8F_zero(vecY dst, immF0 zero) %{
  predicate(n->as_Vector()->length() == 8 && UseAVX < 3);
  match(Set dst (ReplicateF zero));
  format %{ "vxorps  $dst,$dst,$dst\t! replicate8F zero" %}
  ins_encode %{
    int vector_len = 1;
    __ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( fpu_reg_reg );
%}

instruct Repl2D_mem(vecX dst, memory mem) %{
  predicate(n->as_Vector()->length() == 2 && UseAVX > 0 && !VM_Version::supports_avx512vl());
  match(Set dst (ReplicateD (LoadD mem)));
  format %{ "pshufd  $dst,$mem,0x44\t! replicate2D" %}
  ins_encode %{
    __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl4D(vecY dst, regD src) %{
  predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
  match(Set dst (ReplicateD src));
  format %{ "pshufd  $dst,$src,0x44\n\t"
            "vinsertf128_high $dst,$dst\t! replicate4D" %}
  ins_encode %{
    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
    __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl4D_mem(vecY dst, memory mem) %{
  predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
  match(Set dst (ReplicateD (LoadD mem)));
  format %{ "pshufd  $dst,$mem,0x44\n\t"
            "vinsertf128_high $dst,$dst\t! replicate4D" %}
  ins_encode %{
    __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
    __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// Replicate double (8 byte) scalar zero to be vector
instruct Repl2D_zero(vecX dst, immD0 zero) %{
  predicate(n->as_Vector()->length() == 2 && UseAVX < 3);
  match(Set dst (ReplicateD zero));
  format %{ "xorpd   $dst,$dst\t! replicate2D zero" %}
  ins_encode %{
    __ xorpd($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( fpu_reg_reg );
%}

instruct Repl4D_zero(vecY dst, immD0 zero) %{
  predicate(n->as_Vector()->length() == 4 && UseAVX < 3);
  match(Set dst (ReplicateD zero));
  format %{ "vxorpd  $dst,$dst,$dst,vect256\t! replicate4D zero" %}
  ins_encode %{
    int vector_len = 1;
    __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( fpu_reg_reg );
%}

// ====================GENERIC REPLICATE==========================================

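// The pre-AVX2 replicates below broadcast a scalar through a shuffle ladder.
// Worked example for a byte b held in the low 8 bits of a GPR:
//   movdl      dst, src         // dst word 0 = ?b  (? = stale upper byte)
//   punpcklbw  dst, dst         // dst word 0 = bb  (byte doubled into a word)
//   pshuflw    dst, dst, 0x00   // word 0 splat -> low qword = 8 x b
//   punpcklqdq dst, dst         // low qword copied high -> 16 x b
// Stale upper GPR bits never survive, since only word 0 feeds the pshuflw;
// narrower vectors and wider elements simply stop the ladder early.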
// Replicate byte scalar to be vector
instruct Repl4B(vecS dst, rRegI src) %{
  predicate(n->as_Vector()->length() == 4);
  match(Set dst (ReplicateB src));
  format %{ "movd    $dst,$src\n\t"
            "punpcklbw $dst,$dst\n\t"
            "pshuflw $dst,$dst,0x00\t! replicate4B" %}
  ins_encode %{
    __ movdl($dst$$XMMRegister, $src$$Register);
    __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl8B(vecD dst, rRegI src) %{
  predicate(n->as_Vector()->length() == 8);
  match(Set dst (ReplicateB src));
  format %{ "movd    $dst,$src\n\t"
            "punpcklbw $dst,$dst\n\t"
            "pshuflw $dst,$dst,0x00\t! replicate8B" %}
  ins_encode %{
    __ movdl($dst$$XMMRegister, $src$$Register);
    __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
  %}
  ins_pipe( pipe_slow );
%}

// Replicate byte scalar immediate to be vector by loading from const table.
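// The replicate4_imm/replicate8_imm helpers pack the low 'width' bytes of the
// immediate across 4 or 8 bytes, so the splat becomes a single constant-table
// load; e.g. replicate8_imm(0x41, 1) yields 0x4141414141414141.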
instruct Repl4B_imm(vecS dst, immI con) %{
  predicate(n->as_Vector()->length() == 4);
  match(Set dst (ReplicateB con));
  format %{ "movdl   $dst,[$constantaddress]\t! replicate4B($con)" %}
  ins_encode %{
    __ movdl($dst$$XMMRegister, $constantaddress(replicate4_imm($con$$constant, 1)));
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl8B_imm(vecD dst, immI con) %{
  predicate(n->as_Vector()->length() == 8);
  match(Set dst (ReplicateB con));
  format %{ "movq    $dst,[$constantaddress]\t! replicate8B($con)" %}
  ins_encode %{
    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
  %}
  ins_pipe( pipe_slow );
%}

// Replicate byte scalar zero to be vector
instruct Repl4B_zero(vecS dst, immI0 zero) %{
  predicate(n->as_Vector()->length() == 4);
  match(Set dst (ReplicateB zero));
  format %{ "pxor    $dst,$dst\t! replicate4B zero" %}
  ins_encode %{
    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( fpu_reg_reg );
%}

instruct Repl8B_zero(vecD dst, immI0 zero) %{
  predicate(n->as_Vector()->length() == 8);
  match(Set dst (ReplicateB zero));
  format %{ "pxor    $dst,$dst\t! replicate8B zero" %}
  ins_encode %{
    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( fpu_reg_reg );
%}

instruct Repl16B_zero(vecX dst, immI0 zero) %{
  predicate(n->as_Vector()->length() == 16);
  match(Set dst (ReplicateB zero));
  format %{ "pxor    $dst,$dst\t! replicate16B zero" %}
  ins_encode %{
    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( fpu_reg_reg );
%}

instruct Repl32B_zero(vecY dst, immI0 zero) %{
  predicate(n->as_Vector()->length() == 32);
  match(Set dst (ReplicateB zero));
  format %{ "vpxor   $dst,$dst,$dst\t! replicate32B zero" %}
  ins_encode %{
    // MacroAssembler::vpxor falls back to vxorpd on plain AVX, which lacks a 256-bit vpxor (AVX2 adds it).
    int vector_len = 1;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( fpu_reg_reg );
%}

// Replicate char/short (2 byte) scalar to be vector
instruct Repl2S(vecS dst, rRegI src) %{
  predicate(n->as_Vector()->length() == 2);
  match(Set dst (ReplicateS src));
  format %{ "movd    $dst,$src\n\t"
            "pshuflw $dst,$dst,0x00\t! replicate2S" %}
  ins_encode %{
    __ movdl($dst$$XMMRegister, $src$$Register);
    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
  %}
  ins_pipe( fpu_reg_reg );
%}

// Replicate char/short (2 byte) scalar immediate to be vector by loading from const table.
instruct Repl2S_imm(vecS dst, immI con) %{
  predicate(n->as_Vector()->length() == 2);
  match(Set dst (ReplicateS con));
  format %{ "movdl   $dst,[$constantaddress]\t! replicate2S($con)" %}
  ins_encode %{
    __ movdl($dst$$XMMRegister, $constantaddress(replicate4_imm($con$$constant, 2)));
  %}
  ins_pipe( fpu_reg_reg );
%}

instruct Repl4S_imm(vecD dst, immI con) %{
  predicate(n->as_Vector()->length() == 4);
  match(Set dst (ReplicateS con));
  format %{ "movq    $dst,[$constantaddress]\t! replicate4S($con)" %}
  ins_encode %{
    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
  %}
  ins_pipe( fpu_reg_reg );
%}

// Replicate char/short (2 byte) scalar zero to be vector
instruct Repl2S_zero(vecS dst, immI0 zero) %{
  predicate(n->as_Vector()->length() == 2);
  match(Set dst (ReplicateS zero));
  format %{ "pxor    $dst,$dst\t! replicate2S zero" %}
  ins_encode %{
    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( fpu_reg_reg );
%}

instruct Repl4S_zero(vecD dst, immI0 zero) %{
  predicate(n->as_Vector()->length() == 4);
  match(Set dst (ReplicateS zero));
  format %{ "pxor    $dst,$dst\t! replicate4S zero" %}
  ins_encode %{
    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( fpu_reg_reg );
%}

instruct Repl8S_zero(vecX dst, immI0 zero) %{
  predicate(n->as_Vector()->length() == 8);
  match(Set dst (ReplicateS zero));
  format %{ "pxor    $dst,$dst\t! replicate8S zero" %}
  ins_encode %{
    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( fpu_reg_reg );
%}

instruct Repl16S_zero(vecY dst, immI0 zero) %{
  predicate(n->as_Vector()->length() == 16);
  match(Set dst (ReplicateS zero));
  format %{ "vpxor   $dst,$dst,$dst\t! replicate16S zero" %}
  ins_encode %{
    // MacroAssembler::vpxor falls back to vxorpd on plain AVX, which lacks a 256-bit vpxor (AVX2 adds it).
    int vector_len = 1;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( fpu_reg_reg );
%}

// Replicate integer (4 byte) scalar to be vector
instruct Repl2I(vecD dst, rRegI src) %{
  predicate(n->as_Vector()->length() == 2);
  match(Set dst (ReplicateI src));
  format %{ "movd    $dst,$src\n\t"
            "pshufd  $dst,$dst,0x00\t! replicate2I" %}
  ins_encode %{
    __ movdl($dst$$XMMRegister, $src$$Register);
    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
  %}
  ins_pipe( fpu_reg_reg );
%}

// Integer values can be loaded into an xmm register directly from memory.
instruct Repl2I_mem(vecD dst, memory mem) %{
  predicate(n->as_Vector()->length() == 2);
  match(Set dst (ReplicateI (LoadI mem)));
  format %{ "movd    $dst,$mem\n\t"
            "pshufd  $dst,$dst,0x00\t! replicate2I" %}
  ins_encode %{
    __ movdl($dst$$XMMRegister, $mem$$Address);
    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
  %}
  ins_pipe( fpu_reg_reg );
%}

// Replicate integer (4 byte) scalar immediate to be vector by loading from const table.
instruct Repl2I_imm(vecD dst, immI con) %{
  predicate(n->as_Vector()->length() == 2);
  match(Set dst (ReplicateI con));
  format %{ "movq    $dst,[$constantaddress]\t! replicate2I($con)" %}
  ins_encode %{
    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
  %}
  ins_pipe( fpu_reg_reg );
%}

// Replicate integer (4 byte) scalar zero to be vector
instruct Repl2I_zero(vecD dst, immI0 zero) %{
  predicate(n->as_Vector()->length() == 2);
  match(Set dst (ReplicateI zero));
  format %{ "pxor    $dst,$dst\t! replicate2I zero" %}
  ins_encode %{
    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( fpu_reg_reg );
%}

instruct Repl4I_zero(vecX dst, immI0 zero) %{
  predicate(n->as_Vector()->length() == 4);
  match(Set dst (ReplicateI zero));
  format %{ "pxor    $dst,$dst\t! replicate4I zero" %}
  ins_encode %{
    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( fpu_reg_reg );
%}

instruct Repl8I_zero(vecY dst, immI0 zero) %{
  predicate(n->as_Vector()->length() == 8);
  match(Set dst (ReplicateI zero));
  format %{ "vpxor   $dst,$dst,$dst\t! replicate8I zero" %}
  ins_encode %{
    // MacroAssembler::vpxor falls back to vxorpd on plain AVX, which lacks a 256-bit vpxor (AVX2 adds it).
    int vector_len = 1;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( fpu_reg_reg );
%}

// Replicate long (8 byte) scalar to be vector
#ifdef _LP64
instruct Repl2L(vecX dst, rRegL src) %{
  predicate(n->as_Vector()->length() == 2);
  match(Set dst (ReplicateL src));
  format %{ "movdq   $dst,$src\n\t"
            "punpcklqdq $dst,$dst\t! replicate2L" %}
  ins_encode %{
    __ movdq($dst$$XMMRegister, $src$$Register);
    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
#else // _LP64
instruct Repl2L(vecX dst, eRegL src, regD tmp) %{
  predicate(n->as_Vector()->length() == 2);
  match(Set dst (ReplicateL src));
  effect(TEMP dst, USE src, TEMP tmp);
  format %{ "movdl   $dst,$src.lo\n\t"
            "movdl   $tmp,$src.hi\n\t"
            "punpckldq $dst,$tmp\n\t"
            "punpcklqdq $dst,$dst\t! replicate2L" %}
  ins_encode %{
    __ movdl($dst$$XMMRegister, $src$$Register);
    __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
    __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
#endif // _LP64

// Replicate long (8 byte) scalar immediate to be vector by loading from const table.
instruct Repl2L_imm(vecX dst, immL con) %{
  predicate(n->as_Vector()->length() == 2);
  match(Set dst (ReplicateL con));
  format %{ "movq    $dst,[$constantaddress]\n\t"
            "punpcklqdq $dst,$dst\t! replicate2L($con)" %}
  ins_encode %{
    __ movq($dst$$XMMRegister, $constantaddress($con));
    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// Replicate long (8 byte) scalar zero to be vector
instruct Repl2L_zero(vecX dst, immL0 zero) %{
  predicate(n->as_Vector()->length() == 2);
  match(Set dst (ReplicateL zero));
  format %{ "pxor    $dst,$dst\t! replicate2L zero" %}
  ins_encode %{
    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( fpu_reg_reg );
%}

instruct Repl4L_zero(vecY dst, immL0 zero) %{
  predicate(n->as_Vector()->length() == 4);
  match(Set dst (ReplicateL zero));
  format %{ "vpxor   $dst,$dst,$dst\t! replicate4L zero" %}
  ins_encode %{
    // MacroAssembler::vpxor falls back to vxorpd on plain AVX, which lacks a 256-bit vpxor (AVX2 adds it).
    int vector_len = 1;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( fpu_reg_reg );
%}

// Replicate float (4 byte) scalar to be vector
instruct Repl2F(vecD dst, regF src) %{
  predicate(n->as_Vector()->length() == 2);
  match(Set dst (ReplicateF src));
  format %{ "pshufd  $dst,$src,0x00\t! replicate2F" %}
  ins_encode %{
    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
  %}
  ins_pipe( fpu_reg_reg );
%}

instruct Repl4F(vecX dst, regF src) %{
  predicate(n->as_Vector()->length() == 4);
  match(Set dst (ReplicateF src));
  format %{ "pshufd  $dst,$src,0x00\t! replicate4F" %}
  ins_encode %{
    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
  %}
  ins_pipe( pipe_slow );
%}

// Replicate double (8 bytes) scalar to be vector
instruct Repl2D(vecX dst, regD src) %{
  predicate(n->as_Vector()->length() == 2);
  match(Set dst (ReplicateD src));
  format %{ "pshufd  $dst,$src,0x44\t! replicate2D" %}
  ins_encode %{
    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
  %}
  ins_pipe( pipe_slow );
%}

// ====================EVEX REPLICATE=============================================

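// The vector_len argument of the EVEX broadcasts selects the operand width:
// 0 = 128-bit, 1 = 256-bit, 2 = 512-bit (Assembler::AVX_128bit/_256bit/_512bit).
// AVX512VL gates the 128/256-bit EVEX forms, so the narrower patterns test
// supports_avx512vl()/supports_avx512vlbw(), while the 512-bit patterns need
// only the base AVX512F (plus BW for byte/word elements).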
instruct Repl4B_mem_evex(vecS dst, memory mem) %{
  predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vlbw());
  match(Set dst (ReplicateB (LoadB mem)));
  format %{ "vpbroadcastb  $dst,$mem\t! replicate4B" %}
  ins_encode %{
    int vector_len = 0;
    __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl8B_mem_evex(vecD dst, memory mem) %{
  predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vlbw());
  match(Set dst (ReplicateB (LoadB mem)));
  format %{ "vpbroadcastb  $dst,$mem\t! replicate8B" %}
  ins_encode %{
    int vector_len = 0;
    __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl16B_evex(vecX dst, rRegI src) %{
  predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
  match(Set dst (ReplicateB src));
  format %{ "vpbroadcastb $dst,$src\t! replicate16B" %}
  ins_encode %{
    int vector_len = 0;
    __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl16B_mem_evex(vecX dst, memory mem) %{
  predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
  match(Set dst (ReplicateB (LoadB mem)));
  format %{ "vpbroadcastb  $dst,$mem\t! replicate16B" %}
  ins_encode %{
    int vector_len = 0;
    __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl32B_evex(vecY dst, rRegI src) %{
  predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512vlbw());
  match(Set dst (ReplicateB src));
  format %{ "vpbroadcastb $dst,$src\t! replicate32B" %}
  ins_encode %{
    int vector_len = 1;
    __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl32B_mem_evex(vecY dst, memory mem) %{
  predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512vlbw());
  match(Set dst (ReplicateB (LoadB mem)));
  format %{ "vpbroadcastb  $dst,$mem\t! replicate32B" %}
  ins_encode %{
    int vector_len = 1;
    __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl64B_evex(vecZ dst, rRegI src) %{
  predicate(n->as_Vector()->length() == 64 && VM_Version::supports_avx512bw());
  match(Set dst (ReplicateB src));
  format %{ "vpbroadcastb $dst,$src\t! replicate64B" %}
  ins_encode %{
    int vector_len = 2;
    __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl64B_mem_evex(vecZ dst, memory mem) %{
  predicate(n->as_Vector()->length() == 64 && VM_Version::supports_avx512bw());
  match(Set dst (ReplicateB (LoadB mem)));
  format %{ "vpbroadcastb  $dst,$mem\t! replicate64B" %}
  ins_encode %{
    int vector_len = 2;
    __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl16B_imm_evex(vecX dst, immI con) %{
  predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
  match(Set dst (ReplicateB con));
  format %{ "movq    $dst,[$constantaddress]\n\t"
            "vpbroadcastb $dst,$dst\t! replicate16B" %}
  ins_encode %{
    int vector_len = 0;
    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
    __ evpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl32B_imm_evex(vecY dst, immI con) %{
  predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512vlbw());
  match(Set dst (ReplicateB con));
  format %{ "movq    $dst,[$constantaddress]\n\t"
            "vpbroadcastb $dst,$dst\t! replicate32B" %}
  ins_encode %{
    int vector_len = 1;
    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
    __ evpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl64B_imm_evex(vecZ dst, immI con) %{
  predicate(n->as_Vector()->length() == 64 && VM_Version::supports_avx512bw());
  match(Set dst (ReplicateB con));
  format %{ "movq    $dst,[$constantaddress]\n\t"
            "vpbroadcastb $dst,$dst\t! replicate64B" %}
  ins_encode %{
    int vector_len = 2;
    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
    __ evpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl64B_zero_evex(vecZ dst, immI0 zero) %{
  predicate(n->as_Vector()->length() == 64 && UseAVX > 2);
  match(Set dst (ReplicateB zero));
  format %{ "vpxor   $dst k0,$dst,$dst\t! replicate64B zero" %}
  ins_encode %{
    // 512-bit vpxor requires the EVEX encoding, guaranteed here by the UseAVX > 2 predicate.
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( fpu_reg_reg );
%}

instruct Repl4S_evex(vecD dst, rRegI src) %{
  predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vlbw());
  match(Set dst (ReplicateS src));
  format %{ "vpbroadcastw $dst,$src\t! replicate4S" %}
  ins_encode %{
    int vector_len = 0;
    __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl4S_mem_evex(vecD dst, memory mem) %{
  predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vlbw());
  match(Set dst (ReplicateS (LoadS mem)));
  format %{ "vpbroadcastw  $dst,$mem\t! replicate4S" %}
  ins_encode %{
    int vector_len = 0;
    __ evpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl8S_evex(vecX dst, rRegI src) %{
  predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vlbw());
  match(Set dst (ReplicateS src));
  format %{ "vpbroadcastw $dst,$src\t! replicate8S" %}
  ins_encode %{
    int vector_len = 0;
    __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl8S_mem_evex(vecX dst, memory mem) %{
  predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vlbw());
  match(Set dst (ReplicateS (LoadS mem)));
  format %{ "vpbroadcastw  $dst,$mem\t! replicate8S" %}
  ins_encode %{
    int vector_len = 0;
    __ evpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl16S_evex(vecY dst, rRegI src) %{
  predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
  match(Set dst (ReplicateS src));
  format %{ "vpbroadcastw $dst,$src\t! replicate16S" %}
  ins_encode %{
    int vector_len = 1;
    __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl16S_mem_evex(vecY dst, memory mem) %{
  predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
  match(Set dst (ReplicateS (LoadS mem)));
  format %{ "vpbroadcastw  $dst,$mem\t! replicate16S" %}
  ins_encode %{
    int vector_len = 1;
    __ evpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl32S_evex(vecZ dst, rRegI src) %{
  predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512bw());
  match(Set dst (ReplicateS src));
  format %{ "vpbroadcastw $dst,$src\t! replicate32S" %}
  ins_encode %{
    int vector_len = 2;
    __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl32S_mem_evex(vecZ dst, memory mem) %{
  predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512bw());
  match(Set dst (ReplicateS (LoadS mem)));
  format %{ "vpbroadcastw  $dst,$mem\t! replicate32S" %}
  ins_encode %{
    int vector_len = 2;
    __ evpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl8S_imm_evex(vecX dst, immI con) %{
  predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vlbw());
  match(Set dst (ReplicateS con));
  format %{ "movq    $dst,[$constantaddress]\n\t"
            "vpbroadcastw $dst,$dst\t! replicate8S" %}
  ins_encode %{
    int vector_len = 0;
    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
    __ evpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl16S_imm_evex(vecY dst, immI con) %{
  predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
  match(Set dst (ReplicateS con));
  format %{ "movq    $dst,[$constantaddress]\n\t"
            "vpbroadcastw $dst,$dst\t! replicate16S" %}
  ins_encode %{
    int vector_len = 1;
    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
    __ evpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl32S_imm_evex(vecZ dst, immI con) %{
  predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512bw());
  match(Set dst (ReplicateS con));
  format %{ "movq    $dst,[$constantaddress]\n\t"
            "vpbroadcastw $dst,$dst\t! replicate32S" %}
  ins_encode %{
    int vector_len = 2;
    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
    __ evpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl32S_zero_evex(vecZ dst, immI0 zero) %{
  predicate(n->as_Vector()->length() == 32 && UseAVX > 2);
  match(Set dst (ReplicateS zero));
  format %{ "vpxor   $dst k0,$dst,$dst\t! replicate32S zero" %}
  ins_encode %{
    // 512-bit vpxor requires the EVEX encoding, guaranteed here by the UseAVX > 2 predicate.
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( fpu_reg_reg );
%}

instruct Repl4I_evex(vecX dst, rRegI src) %{
  predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
  match(Set dst (ReplicateI src));
  format %{ "vpbroadcastd  $dst,$src\t! replicate4I" %}
  ins_encode %{
    int vector_len = 0;
    __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl4I_mem_evex(vecX dst, memory mem) %{
  predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
  match(Set dst (ReplicateI (LoadI mem)));
  format %{ "vpbroadcastd  $dst,$mem\t! replicate4I" %}
  ins_encode %{
    int vector_len = 0;
    __ evpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl8I_evex(vecY dst, rRegI src) %{
  predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
  match(Set dst (ReplicateI src));
  format %{ "vpbroadcastd  $dst,$src\t! replicate8I" %}
  ins_encode %{
    int vector_len = 1;
    __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl8I_mem_evex(vecY dst, memory mem) %{
  predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
  match(Set dst (ReplicateI (LoadI mem)));
  format %{ "vpbroadcastd  $dst,$mem\t! replicate8I" %}
  ins_encode %{
    int vector_len = 1;
    __ evpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl16I_evex(vecZ dst, rRegI src) %{
  predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
  match(Set dst (ReplicateI src));
  format %{ "vpbroadcastd  $dst,$src\t! replicate16I" %}
  ins_encode %{
    int vector_len = 2;
    __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl16I_mem_evex(vecZ dst, memory mem) %{
  predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
  match(Set dst (ReplicateI (LoadI mem)));
  format %{ "vpbroadcastd  $dst,$mem\t! replicate16I" %}
  ins_encode %{
    int vector_len = 2;
    __ evpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl4I_imm_evex(vecX dst, immI con) %{
  predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
  match(Set dst (ReplicateI con));
  format %{ "movq    $dst,[$constantaddress]\t! replicate4I($con)\n\t"
            "vpbroadcastd  $dst,$dst\t! replicate4I" %}
  ins_encode %{
    int vector_len = 0;
    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
    __ evpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl8I_imm_evex(vecY dst, immI con) %{
  predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
  match(Set dst (ReplicateI con));
  format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
            "vpbroadcastd  $dst,$dst\t! replicate8I" %}
  ins_encode %{
    int vector_len = 1;
    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
    __ evpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl16I_imm_evex(vecZ dst, immI con) %{
  predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
  match(Set dst (ReplicateI con));
  format %{ "movq    $dst,[$constantaddress]\t! replicate16I($con)\n\t"
            "vpbroadcastd  $dst,$dst\t! replicate16I" %}
  ins_encode %{
    int vector_len = 2;
    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
    __ evpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl16I_zero_evex(vecZ dst, immI0 zero) %{
  predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
  match(Set dst (ReplicateI zero));
  format %{ "vpxor   $dst k0,$dst,$dst\t! replicate16I zero" %}
  ins_encode %{
    // 512-bit vpxor requires the EVEX encoding, guaranteed here by the UseAVX > 2 predicate.
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( fpu_reg_reg );
%}

// Replicate long (8 byte) scalar to be vector
#ifdef _LP64
instruct Repl4L_evex(vecY dst, rRegL src) %{
  predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
  match(Set dst (ReplicateL src));
  format %{ "vpbroadcastq  $dst,$src\t! replicate4L" %}
  ins_encode %{
    int vector_len = 1;
    __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl8L_evex(vecZ dst, rRegL src) %{
  predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
  match(Set dst (ReplicateL src));
  format %{ "vpbroadcastq  $dst,$src\t! replicate8L" %}
  ins_encode %{
    int vector_len = 2;
    __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
#else // _LP64
instruct Repl4L_evex(vecY dst, eRegL src, regD tmp) %{
  predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
  match(Set dst (ReplicateL src));
  effect(TEMP dst, USE src, TEMP tmp);
  format %{ "movdl   $dst,$src.lo\n\t"
            "movdl   $tmp,$src.hi\n\t"
            "punpckldq $dst,$tmp\n\t"
            "vpbroadcastq  $dst,$dst\t! replicate4L" %}
  ins_encode %{
    int vector_len = 1;
    __ movdl($dst$$XMMRegister, $src$$Register);
    __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
    __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
    __ evpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl8L_evex(vecZ dst, eRegL src, regD tmp) %{
  predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
  match(Set dst (ReplicateL src));
  effect(TEMP dst, USE src, TEMP tmp);
  format %{ "movdl   $dst,$src.lo\n\t"
            "movdl   $tmp,$src.hi\n\t"
            "punpckldq $dst,$tmp\n\t"
            "vpbroadcastq  $dst,$dst\t! replicate8L" %}
  ins_encode %{
    int vector_len = 2;
    __ movdl($dst$$XMMRegister, $src$$Register);
    __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
    __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
    __ evpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
#endif // _LP64

instruct Repl4L_imm_evex(vecY dst, immL con) %{
  predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
  match(Set dst (ReplicateL con));
  format %{ "movq    $dst,[$constantaddress]\n\t"
            "vpbroadcastq  $dst,$dst\t! replicate4L" %}
  ins_encode %{
    int vector_len = 1;
    __ movq($dst$$XMMRegister, $constantaddress($con));
    __ evpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl8L_imm_evex(vecZ dst, immL con) %{
  predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
  match(Set dst (ReplicateL con));
  format %{ "movq    $dst,[$constantaddress]\n\t"
            "vpbroadcastq  $dst,$dst\t! replicate8L" %}
  ins_encode %{
    int vector_len = 2;
    __ movq($dst$$XMMRegister, $constantaddress($con));
    __ evpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl2L_mem_evex(vecX dst, memory mem) %{
  predicate(n->as_Vector()->length() == 2 && VM_Version::supports_avx512vl());
  match(Set dst (ReplicateL (LoadL mem)));
  format %{ "vpbroadcastq  $dst,$mem\t! replicate2L" %}
  ins_encode %{
    int vector_len = 0;
    __ evpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl4L_mem_evex(vecY dst, memory mem) %{
  predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
  match(Set dst (ReplicateL (LoadL mem)));
  format %{ "vpbroadcastq  $dst,$mem\t! replicate4L" %}
  ins_encode %{
    int vector_len = 1;
    __ evpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl8L_mem_evex(vecZ dst, memory mem) %{
  predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
  match(Set dst (ReplicateL (LoadL mem)));
  format %{ "vpbroadcastq  $dst,$mem\t! replicate8L" %}
  ins_encode %{
    int vector_len = 2;
    __ evpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl8L_zero_evex(vecZ dst, immL0 zero) %{
  predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
  match(Set dst (ReplicateL zero));
  format %{ "vpxor   $dst k0,$dst,$dst\t! replicate8L zero" %}
  ins_encode %{
    // 512-bit vpxor requires the EVEX encoding, guaranteed here by the UseAVX > 2 predicate.
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( fpu_reg_reg );
%}

instruct Repl8F_evex(vecY dst, regF src) %{
  predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
  match(Set dst (ReplicateF src));
  format %{ "vbroadcastss $dst,$src\t! replicate8F" %}
  ins_encode %{
    int vector_len = 1;
    __ evpbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl8F_mem_evex(vecY dst, memory mem) %{
  predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
  match(Set dst (ReplicateF (LoadF mem)));
  format %{ "vbroadcastss  $dst,$mem\t! replicate8F" %}
  ins_encode %{
    int vector_len = 1;
    __ evpbroadcastss($dst$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl16F_evex(vecZ dst, regF src) %{
  predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
  match(Set dst (ReplicateF src));
  format %{ "vbroadcastss $dst,$src\t! replicate16F" %}
  ins_encode %{
    int vector_len = 2;
    __ evpbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl16F_mem_evex(vecZ dst, memory mem) %{
  predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
  match(Set dst (ReplicateF (LoadF mem)));
  format %{ "vbroadcastss  $dst,$mem\t! replicate16F" %}
  ins_encode %{
    int vector_len = 2;
    __ evpbroadcastss($dst$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl2F_zero_evex(vecD dst, immF0 zero) %{
  predicate(n->as_Vector()->length() == 2 && UseAVX > 2);
  match(Set dst (ReplicateF zero));
  format %{ "vpxor  $dst k0,$dst,$dst\t! replicate2F zero" %}
  ins_encode %{
    // Use vpxor instead of vxorps: the EVEX encoding of vxorps requires AVX512DQ, and this zeroing is a 512-bit operation.
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( fpu_reg_reg );
%}

instruct Repl4F_zero_evex(vecX dst, immF0 zero) %{
  predicate(n->as_Vector()->length() == 4 && UseAVX > 2);
  match(Set dst (ReplicateF zero));
  format %{ "vpxor  $dst k0,$dst,$dst\t! replicate4F zero" %}
  ins_encode %{
    // Use vpxor instead of vxorps: the EVEX encoding of vxorps requires AVX512DQ, and this zeroing is a 512-bit operation.
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( fpu_reg_reg );
%}

instruct Repl8F_zero_evex(vecY dst, immF0 zero) %{
  predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
  match(Set dst (ReplicateF zero));
  format %{ "vpxor  $dst k0,$dst,$dst\t! replicate8F zero" %}
  ins_encode %{
    // Use vpxor instead of vxorps: the EVEX encoding of vxorps requires AVX512DQ, and this zeroing is a 512-bit operation.
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( fpu_reg_reg );
%}

instruct Repl16F_zero_evex(vecZ dst, immF0 zero) %{
  predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
  match(Set dst (ReplicateF zero));
  format %{ "vpxor  $dst k0,$dst,$dst\t! replicate16F zero" %}
  ins_encode %{
    // Use vpxor instead of vxorps: the EVEX encoding of vxorps requires AVX512DQ, and this zeroing is a 512-bit operation.
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( fpu_reg_reg );
%}

instruct Repl4D_evex(vecY dst, regD src) %{
  predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
  match(Set dst (ReplicateD src));
  format %{ "vbroadcastsd $dst,$src\t! replicate4D" %}
  ins_encode %{
    int vector_len = 1;
    __ evpbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl4D_mem_evex(vecY dst, memory mem) %{
  predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
  match(Set dst (ReplicateD (LoadD mem)));
  format %{ "vbroadcastsd  $dst,$mem\t! replicate4D" %}
  ins_encode %{
    int vector_len = 1;
    __ evpbroadcastsd($dst$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl8D_evex(vecZ dst, regD src) %{
  predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
  match(Set dst (ReplicateD src));
  format %{ "vbroadcastsd $dst,$src\t! replicate8D" %}
  ins_encode %{
    int vector_len = 2;
    __ evpbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl8D_mem_evex(vecZ dst, memory mem) %{
  predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
  match(Set dst (ReplicateD (LoadD mem)));
  format %{ "vbroadcastsd  $dst,$mem\t! replicate8D" %}
  ins_encode %{
    int vector_len = 2;
    __ evpbroadcastsd($dst$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl2D_zero_evex(vecX dst, immD0 zero) %{
  predicate(n->as_Vector()->length() == 2 && UseAVX > 2);
  match(Set dst (ReplicateD zero));
  format %{ "vpxor  $dst k0,$dst,$dst\t! replicate2D zero" %}
  ins_encode %{
    // Use vpxor instead of vxorpd: the EVEX encoding of vxorpd requires AVX512DQ, and this zeroing is a 512-bit operation.
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( fpu_reg_reg );
%}

instruct Repl4D_zero_evex(vecY dst, immD0 zero) %{
  predicate(n->as_Vector()->length() == 4 && UseAVX > 2);
  match(Set dst (ReplicateD zero));
  format %{ "vpxor  $dst k0,$dst,$dst\t! replicate4D zero" %}
  ins_encode %{
    // Use vpxor instead of vxorpd: the EVEX encoding of vxorpd requires AVX512DQ, and this zeroing is a 512-bit operation.
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( fpu_reg_reg );
%}

instruct Repl8D_zero_evex(vecZ dst, immD0 zero) %{
  predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
  match(Set dst (ReplicateD zero));
  format %{ "vpxor  $dst k0,$dst,$dst,vect512\t! replicate8D zero" %}
  ins_encode %{
    // Use vpxor in place of vxorpd since EVEX has a constraint on dq for vxorpd: this is a 512-bit operation
5168     int vector_len = 2;
5169     __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
5170   %}
5171   ins_pipe( fpu_reg_reg );
5172 %}
5173 
5174 // ====================VECTOR INSERT=======================================
5175 
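// Each insert rule copies $src into $dst (when the registers differ) and then
// overwrites a single element.  For vectors wider than 128 bits, the constant
// index is decomposed so that the affected 128-bit lane can be extracted,
// patched with pinsr*/insertps, and written back.  As a sketch, for the
// 64-byte (512-bit) byte-vector case below:
//
//   uint x_idx = idx & 0xF;        // byte position within a 128-bit lane
//   uint y_idx = (idx >> 4) & 1;   // which 128-bit lane of the 256-bit half
//   uint z_idx = (idx >> 5) & 1;   // which 256-bit half of the register
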
5176 instruct rvinsert8B(vecD dst, vecD src, rRegI val, immU3 idx) %{
5177   predicate(UseSSE > 3 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
5178   match(Set dst (VectorInsert (Binary src val) idx));
5179   effect(TEMP dst);
5180   format %{ "movdqu  $dst,$src\n\t"
5181             "pinsrb  $dst,$val\t! Insert 8B" %}
5182   ins_encode %{
5183     if ($dst$$XMMRegister != $src$$XMMRegister) {
5184       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
5185     }
5186     __ pinsrb($dst$$XMMRegister, $val$$Register, $idx$$constant);
5187   %}
5188   ins_pipe( pipe_slow );
5189 %}
5190 
5191 instruct rvinsert16B(vecX dst, vecX src, rRegI val, immU4 idx) %{
5192   predicate(UseSSE > 3 && UseAVX == 0 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
5193   match(Set dst (VectorInsert (Binary src val) idx));
5194   effect(TEMP dst);
5195   format %{ "movdqu  $dst,$src\n\t"
5196             "pinsrb  $dst,$val\t! Insert 16B" %}
5197   ins_encode %{
5198     if ($dst$$XMMRegister != $src$$XMMRegister) {
5199       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
5200     }
5201     __ pinsrb($dst$$XMMRegister, $val$$Register, $idx$$constant);
5202   %}
5203   ins_pipe( pipe_slow );
5204 %}
5205 
5206 instruct rvinsert16B_avx(vecX dst, vecX src, rRegI val, immU4 idx) %{
5207   predicate(UseAVX > 0 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
5208   match(Set dst (VectorInsert (Binary src val) idx));
5209   effect(TEMP dst);
5210   format %{ "vmovdqu  $dst,$src\n\t"
5211             "vpinsrb  $dst,$dst,$val\t! Insert 16B" %}
5212   ins_encode %{
5213     if ($dst$$XMMRegister != $src$$XMMRegister) {
5214       __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
5215     }
5216     __ vpinsrb($dst$$XMMRegister, $dst$$XMMRegister, $val$$Register, $idx$$constant);
5217   %}
5218   ins_pipe( pipe_slow );
5219 %}
5220 
5221 instruct rvinsert32B(vecY dst, vecY src, vecY tmp, rRegI val, immU5 idx) %{
5222   predicate(UseAVX > 0 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
5223   match(Set dst (VectorInsert (Binary src val) idx));
5224   effect(TEMP dst, TEMP tmp);
  format %{ "vmovdqu  $dst,$src\n\t"
5226            "vextracti128  $tmp,$src\n\t"
5227            "vpinsrb  $tmp,$tmp,$val\n\t"
           "vinserti128  $dst,$dst,$tmp\t! Insert 32B" %}
5229   ins_encode %{
5230     uint x_idx = $idx$$constant & right_n_bits(4);
5231     uint y_idx = ($idx$$constant >> 4) & 1;
5232 
5233     if ($dst$$XMMRegister != $src$$XMMRegister) {
5234       __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
5235     }
5236     __ vextracti128($tmp$$XMMRegister, $src$$XMMRegister, y_idx);
5237     __ vpinsrb($tmp$$XMMRegister, $tmp$$XMMRegister, $val$$Register, x_idx);
5238     __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, y_idx);
5239   %}
5240   ins_pipe( pipe_slow );
5241 %}
5242 
5243 instruct rvinsert64B(vecZ dst, vecZ src, vecZ tmp, vecX tmp1, rRegI val, immU6 idx) %{
5244   predicate(UseAVX > 2 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
5245   match(Set dst (VectorInsert (Binary src val) idx));
5246   effect(TEMP dst, TEMP tmp, TEMP tmp1);
5247   format %{ "evmovdquq  $dst,$src\n\t"
5248             "vextracti64x4  $tmp,$src\n\t"
5249             "vextracti128  $tmp1,$tmp\n\t"
5250             "vpinsrb  $tmp1,$tmp1,$val\n\t"
5251             "vinserti128  $tmp,$tmp,$tmp1\n\t"
5252             "vinserti64x4  $dst,$dst,$tmp\t! Insert 64B" %}
5253   ins_encode %{
5254     uint x_idx = $idx$$constant & right_n_bits(4);
5255     uint y_idx = ($idx$$constant >> 4) & 1;
5256     uint z_idx = ($idx$$constant >> 5) & 1;
5257 
5258     if ($dst$$XMMRegister != $src$$XMMRegister) {
5259       __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, 2);
5260     }
5261     __ vextracti64x4($tmp$$XMMRegister, $src$$XMMRegister, z_idx);
5262     __ vextracti128($tmp1$$XMMRegister, $tmp$$XMMRegister, y_idx);
5263     __ vpinsrb($tmp1$$XMMRegister, $tmp1$$XMMRegister, $val$$Register, x_idx);
5264     __ vinserti128($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, y_idx);
5265     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, z_idx);
5266   %}
5267   ins_pipe( pipe_slow );
5268 %}
5269 
5270 instruct rvinsert4S(vecD dst, vecD src, rRegI val, immU2 idx) %{
5271   predicate(UseSSE > 3 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
5272   match(Set dst (VectorInsert (Binary src val) idx));
5273   effect(TEMP dst);
5274   format %{ "movdqu  $dst,$src\n\t"
5275             "pinsrw  $dst,$val\t! Insert 4S" %}
5276   ins_encode %{
5277     if ($dst$$XMMRegister != $src$$XMMRegister) {
5278       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
5279     }
5280     __ pinsrw($dst$$XMMRegister, $val$$Register, $idx$$constant);
5281   %}
5282   ins_pipe( pipe_slow );
5283 %}
5284 
5285 instruct rvinsert8S(vecX dst, vecX src, rRegI val, immU3 idx) %{
5286   predicate(UseSSE > 3 && UseAVX == 0 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
5287   match(Set dst (VectorInsert (Binary src val) idx));
5288   effect(TEMP dst);
5289   format %{ "movdqu  $dst,$src\n\t"
5290             "pinsrw  $dst,$val\t! Insert 8S" %}
5291   ins_encode %{
5292     if ($dst$$XMMRegister != $src$$XMMRegister) {
5293       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
5294     }
5295     __ pinsrw($dst$$XMMRegister, $val$$Register, $idx$$constant);
5296   %}
5297   ins_pipe( pipe_slow );
5298 %}
5299 
5300 instruct rvinsert8S_avx(vecX dst, vecX src, rRegI val, immU3 idx) %{
5301   predicate(UseAVX > 0 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
5302   match(Set dst (VectorInsert (Binary src val) idx));
5303   effect(TEMP dst);
5304   format %{ "vmovdqu  $dst,$src\n\t"
5305             "vpinsrw  $dst,$dst,$val\t! Insert 8S" %}
5306   ins_encode %{
5307     if ($dst$$XMMRegister != $src$$XMMRegister) {
5308       __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
5309     }
5310     __ vpinsrw($dst$$XMMRegister, $dst$$XMMRegister, $val$$Register, $idx$$constant);
5311   %}
5312   ins_pipe( pipe_slow );
5313 %}
5314 
5315 
5316 instruct rvinsert16S(vecY dst, vecY src, vecX tmp, rRegI val, immU4 idx) %{
5317   predicate(UseAVX > 0 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
5318   match(Set dst (VectorInsert (Binary src val) idx));
5319   effect(TEMP dst, TEMP tmp);
5320   format %{ "vmovdqu  $dst,$src\n\t"
5321             "vextracti128  $tmp,$src\n\t"
5322             "vpinsrw  $tmp,$tmp,$val\n\t"
5323             "vinserti128  $dst,$dst,$tmp\t! Insert 16S" %}
5324   ins_encode %{
5325     uint x_idx = $idx$$constant & right_n_bits(3);
5326     uint y_idx = ($idx$$constant >> 3) & 1;
5327 
5328     if ($dst$$XMMRegister != $src$$XMMRegister) {
5329       __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
5330     }
5331     __ vextracti128($tmp$$XMMRegister, $src$$XMMRegister, y_idx);
5332     __ vpinsrw($tmp$$XMMRegister, $tmp$$XMMRegister, $val$$Register, x_idx);
5333     __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, y_idx);
5334   %}
5335   ins_pipe( pipe_slow );
5336 %}
5337 
5338 instruct rvinsert32S(vecZ dst, vecZ src, vecZ tmp, vecX tmp1, rRegI val, immU5 idx) %{
5339   predicate(UseAVX > 2 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
5340   match(Set dst (VectorInsert (Binary src val) idx));
5341   effect(TEMP dst, TEMP tmp, TEMP tmp1);
5342   format %{ "evmovdquq  $dst,$src\n\t"
5343             "vextracti64x4  $tmp,$src\n\t"
5344             "vextracti128  $tmp1,$tmp\n\t"
5345             "vpinsrw  $tmp1,$tmp1,$val\n\t"
5346             "vinserti128  $tmp,$tmp,$tmp1\n\t"
5347             "vinserti64x4  $dst,$dst,$tmp\t! Insert 32S" %}
5348   ins_encode %{
5349     uint x_idx = $idx$$constant & right_n_bits(3);
5350     uint y_idx = ($idx$$constant >> 3) & 1;
5351     uint z_idx = ($idx$$constant >> 4) & 1;
5352 
5353     if ($dst$$XMMRegister != $src$$XMMRegister) {
5354       __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, 2);
5355     }
5356     __ vextracti64x4($tmp$$XMMRegister, $src$$XMMRegister, z_idx);
5357     __ vextracti128($tmp1$$XMMRegister, $tmp$$XMMRegister, y_idx);
5358     __ vpinsrw($tmp1$$XMMRegister, $tmp1$$XMMRegister, $val$$Register, x_idx);
5359     __ vinserti128($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, y_idx);
5360     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, z_idx);
5361   %}
5362   ins_pipe( pipe_slow );
5363 %}
5364 
5365 instruct rvinsert2I(vecD dst, vecD src, rRegI val, immU1 idx) %{
5366   predicate(UseSSE > 3 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
5367   match(Set dst (VectorInsert (Binary src val) idx));
5368   effect(TEMP dst);
5369   format %{ "movdqu  $dst,$src\n\t"
5370             "pinsrd  $dst,$val\t! Insert 2I" %}
5371   ins_encode %{
5372     if ($dst$$XMMRegister != $src$$XMMRegister) {
5373       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
5374     }
5375     __ pinsrd($dst$$XMMRegister, $val$$Register, $idx$$constant);
5376   %}
5377   ins_pipe( pipe_slow );
5378 %}
5379 
5380 instruct rvinsert4I(vecX dst, vecX src, rRegI val, immU2 idx) %{
5381   predicate(UseSSE > 3 && UseAVX == 0 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
5382   match(Set dst (VectorInsert (Binary src val) idx));
5383   effect(TEMP dst);
5384   format %{ "movdqu  $dst,$src\n\t"
5385             "pinsrd  $dst,$val\t! Insert 4I" %}
5386   ins_encode %{
5387     if ($dst$$XMMRegister != $src$$XMMRegister) {
5388       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
5389     }
5390     __ pinsrd($dst$$XMMRegister, $val$$Register, $idx$$constant);
5391   %}
5392   ins_pipe( pipe_slow );
5393 %}
5394 
5395 instruct rvinsert4I_avx(vecX dst, vecX src, rRegI val, immU2 idx) %{
5396   predicate(UseAVX > 0 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
5397   match(Set dst (VectorInsert (Binary src val) idx));
5398   effect(TEMP dst);
5399   format %{ "vmovdqu  $dst,$src\n\t"
            "vpinsrd  $dst,$dst,$val\t! Insert 4I" %}
5401   ins_encode %{
5402     if ($dst$$XMMRegister != $src$$XMMRegister) {
5403       __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
5404     }
5405     __ vpinsrd($dst$$XMMRegister, $dst$$XMMRegister, $val$$Register, $idx$$constant);
5406   %}
5407   ins_pipe( pipe_slow );
5408 %}
5409 
5410 instruct rvinsert8I(vecY dst, vecY src, vecY tmp, rRegI val, immU3 idx) %{
5411   predicate(UseAVX > 0 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
5412   match(Set dst (VectorInsert (Binary src val) idx));
5413   effect(TEMP dst, TEMP tmp);
5414   format %{ "vmovdqu  $dst,$src\n\t"
5415             "vextracti128  $tmp,$src\n\t"
5416             "vpinsrd  $tmp,$tmp,$val\n\t"
5417             "vinserti128  $dst,$dst,$tmp\t! Insert 8I" %}
5418   ins_encode %{
5419     uint x_idx = $idx$$constant & right_n_bits(2);
5420     uint y_idx = ($idx$$constant >> 2) & 1;
5421 
5422     if ($dst$$XMMRegister != $src$$XMMRegister) {
5423       __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
5424     }
5425     __ vextracti128($tmp$$XMMRegister, $src$$XMMRegister, y_idx);
5426     __ vpinsrd($tmp$$XMMRegister, $tmp$$XMMRegister, $val$$Register, x_idx);
5427     __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, y_idx);
5428   %}
5429   ins_pipe( pipe_slow );
5430 %}
5431 
5432 instruct rvinsert16I(vecZ dst, vecZ src, vecZ tmp, vecX tmp1, rRegI val, immU4 idx) %{
5433   predicate(UseAVX > 2 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
5434   match(Set dst (VectorInsert (Binary src val) idx));
5435   effect(TEMP dst, TEMP tmp, TEMP tmp1);
5436   format %{ "evmovdquq  $dst,$src\n\t"
5437             "vextracti64x4  $tmp,$src\n\t"
            "vextracti128  $tmp1,$tmp\n\t"
            "vpinsrd  $tmp1,$tmp1,$val\n\t"
            "vinserti128  $tmp,$tmp,$tmp1\n\t"
5441             "vinserti64x4  $dst,$dst,$tmp\t! Insert 16I" %}
5442   ins_encode %{
5443     uint x_idx = $idx$$constant & right_n_bits(2);
5444     uint y_idx = ($idx$$constant >> 2) & 1;
5445     uint z_idx = ($idx$$constant >> 3) & 1;
5446 
5447     if ($dst$$XMMRegister != $src$$XMMRegister) {
5448       __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, 2);
5449     }
5450     __ vextracti64x4($tmp$$XMMRegister, $src$$XMMRegister, z_idx);
5451     __ vextracti128($tmp1$$XMMRegister, $tmp$$XMMRegister, y_idx);
5452     __ vpinsrd($tmp1$$XMMRegister, $tmp1$$XMMRegister, $val$$Register, x_idx);
5453     __ vinserti128($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, y_idx);
5454     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, z_idx);
5455   %}
5456   ins_pipe( pipe_slow );
5457 %}
5458 
5459 instruct rvinsert1L(vecD dst, vecD src, rRegL val, immI0 idx) %{
5460   predicate(UseSSE > 3 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
5461   match(Set dst (VectorInsert (Binary src val) idx));
5462   effect(TEMP dst);
5463   format %{ "movdqu  $dst,$src\n\t"
5464             "pinsrq  $dst,$val\t! Insert 1L" %}
5465   ins_encode %{
5466     if ($dst$$XMMRegister != $src$$XMMRegister) {
5467       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
5468     }
5469     __ pinsrq($dst$$XMMRegister, $val$$Register, 0);
5470   %}
5471   ins_pipe( pipe_slow );
5472 %}
5473 
5474 instruct rvinsert2L(vecX dst, vecX src, rRegL val, immU1 idx) %{
5475   predicate(UseSSE > 3 && UseAVX == 0 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
5476   match(Set dst (VectorInsert (Binary src val) idx));
5477   effect(TEMP dst);
5478   format %{ "movdqu  $dst,$src\n\t"
            "pinsrq  $dst,$val\t! Insert 2L" %}
5480   ins_encode %{
5481     if ($dst$$XMMRegister != $src$$XMMRegister) {
5482       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
5483     }
5484     __ pinsrq($dst$$XMMRegister, $val$$Register, $idx$$constant);
5485   %}
5486   ins_pipe( pipe_slow );
5487 %}
5488 
5489 instruct rvinsert2L_avx(vecX dst, vecX src, rRegL val, immU1 idx) %{
5490   predicate(UseAVX > 0 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
5491   match(Set dst (VectorInsert (Binary src val) idx));
5492   effect(TEMP dst);
5493   format %{ "vmovdqu  $dst,$src\n\t"
5494             "vpinsrq  $dst,$dst,$val\t! Insert 2L" %}
5495   ins_encode %{
5496     if ($dst$$XMMRegister != $src$$XMMRegister) {
5497       __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
5498     }
5499     __ vpinsrq($dst$$XMMRegister, $dst$$XMMRegister, $val$$Register, $idx$$constant);
5500   %}
5501   ins_pipe( pipe_slow );
5502 %}
5503 
5504 instruct rvinsert4L(vecY dst, vecY src, vecY tmp, rRegL val, immU2 idx) %{
5505   predicate(UseAVX > 0 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
5506   match(Set dst (VectorInsert (Binary src val) idx));
5507   effect(TEMP dst, TEMP tmp);
5508   format %{ "vmovdqu  $dst,$src\n\t"
5509             "vextracti128  $tmp,$src\n\t"
5510             "vpinsrq  $tmp,$tmp,$val\n\t"
5511             "vinserti128  $dst,$dst,$tmp\t! Insert 4L" %}
5512   ins_encode %{
5513     uint x_idx = $idx$$constant & 1;
5514     uint y_idx = ($idx$$constant >> 1) & 1;
5515 
5516     if ($dst$$XMMRegister != $src$$XMMRegister) {
5517       __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
5518     }
5519     __ vextracti128($tmp$$XMMRegister, $src$$XMMRegister, y_idx);
5520     __ vpinsrq($tmp$$XMMRegister, $tmp$$XMMRegister, $val$$Register, x_idx);
5521     __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, y_idx);
5522   %}
5523   ins_pipe( pipe_slow );
5524 %}
5525 
5526 instruct rvinsert8L(vecZ dst, vecZ src, vecZ tmp, vecX tmp1, rRegL val, immU3 idx) %{
5527   predicate(UseAVX > 2 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
5528   match(Set dst (VectorInsert (Binary src val) idx));
5529   effect(TEMP dst, TEMP tmp, TEMP tmp1);
5530   format %{ "evmovdquq  $dst,$src\n\t"
5531             "vextracti64x4  $tmp,$src\n\t"
            "vextracti128  $tmp1,$tmp\n\t"
            "vpinsrq  $tmp1,$tmp1,$val\n\t"
            "vinserti128  $tmp,$tmp,$tmp1\n\t"
5535             "vinserti64x4  $dst,$dst,$tmp\t! Insert 8L" %}
5536   ins_encode %{
5537     uint x_idx = $idx$$constant & 1;
5538     uint y_idx = ($idx$$constant >> 1) & 1;
5539     uint z_idx = ($idx$$constant >> 2) & 1;
5540 
5541     if ($dst$$XMMRegister != $src$$XMMRegister) {
5542       __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, 2);
5543     }
5544     __ vextracti64x4($tmp$$XMMRegister, $src$$XMMRegister, z_idx);
5545     __ vextracti128($tmp1$$XMMRegister, $tmp$$XMMRegister, y_idx);
5546     __ vpinsrq($tmp1$$XMMRegister, $tmp1$$XMMRegister, $val$$Register, x_idx);
5547     __ vinserti128($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, y_idx);
5548     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, z_idx);
5549   %}
5550   ins_pipe( pipe_slow );
5551 %}
5552 
5553 instruct rvinsert2F(vecD dst, vecD src, regF val, immU1 idx) %{
5554   predicate(UseSSE > 3 && UseAVX == 0 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
5555   match(Set dst (VectorInsert (Binary src val) idx));
5556   effect(TEMP dst);
5557   format %{ "movdqu  $dst,$src\n\t"
            "insertps  $dst,$val\t! Insert 2F" %}
5559   ins_encode %{
5560     if ($dst$$XMMRegister != $src$$XMMRegister) {
5561       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
5562     }
5563     __ insertps($dst$$XMMRegister, $val$$XMMRegister, $idx$$constant);
5564   %}
5565   ins_pipe( pipe_slow );
5566 %}
5567 
5568 instruct rvinsert2F_avx(vecD dst, vecD src, regF val, immU1 idx) %{
5569   predicate(UseAVX > 0 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
5570   match(Set dst (VectorInsert (Binary src val) idx));
5571   effect(TEMP dst);
5572   format %{ "movdqu  $dst,$src\n\t"
            "vinsertps  $dst,$dst,$val\t! Insert 2F" %}
5574   ins_encode %{
5575     if ($dst$$XMMRegister != $src$$XMMRegister) {
5576       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
5577     }
5578     __ vinsertps($dst$$XMMRegister, $dst$$XMMRegister, $val$$XMMRegister, $idx$$constant);
5579   %}
5580   ins_pipe( pipe_slow );
5581 %}
5582 
5583 instruct rvinsert4F(vecX dst, vecX src, regF val, immU2 idx) %{
5584   predicate(UseSSE > 3 && UseAVX == 0 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
5585   match(Set dst (VectorInsert (Binary src val) idx));
5586   effect(TEMP dst);
5587   format %{ "movdqu  $dst,$src\n\t"
            "insertps  $dst,$val\t! Insert 4F" %}
5589   ins_encode %{
5590     if ($dst$$XMMRegister != $src$$XMMRegister) {
5591       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
5592     }
5593     __ insertps($dst$$XMMRegister, $val$$XMMRegister, $idx$$constant);
5594   %}
5595   ins_pipe( pipe_slow );
5596 %}
5597 
5598 instruct rvinsert4F_avx(vecX dst, vecX src, regF val, immU2 idx) %{
5599   predicate(UseAVX > 0 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
5600   match(Set dst (VectorInsert (Binary src val) idx));
5601   effect(TEMP dst);
5602   format %{ "vmovdqu  $dst,$src\n\t"
5603             "vinsertps  $dst,$dst,$val\t! Insert 4F" %}
5604   ins_encode %{
5605     if ($dst$$XMMRegister != $src$$XMMRegister) {
5606       __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
5607     }
5608     __ vinsertps($dst$$XMMRegister, $dst$$XMMRegister, $val$$XMMRegister, $idx$$constant);
5609   %}
5610   ins_pipe( pipe_slow );
5611 %}
5612 
5613 instruct rvinsert8F(vecY dst, vecY src, vecY tmp, regF val, immU3 idx) %{
5614   predicate(UseAVX > 0 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
5615   match(Set dst (VectorInsert (Binary src val) idx));
5616   effect(TEMP dst, TEMP tmp);
5617   format %{ "vmovdqu  $dst,$src\n\t"
5618             "vextractf128  $tmp,$src\n\t"
5619             "vinsertps  $tmp,$tmp,$val\n\t"
5620             "vinsertf128  $dst,$dst,$tmp\t! Insert 8F" %}
5621   ins_encode %{
5622     uint x_idx = $idx$$constant & right_n_bits(2);
5623     uint y_idx = ($idx$$constant >> 2) & 1;
5624 
5625     if ($dst$$XMMRegister != $src$$XMMRegister) {
5626       __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
5627     }
5628     __ vextractf128($tmp$$XMMRegister, $src$$XMMRegister, y_idx);
5629     __ vinsertps($tmp$$XMMRegister, $tmp$$XMMRegister, $val$$XMMRegister, x_idx);
5630     __ vinsertf128($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, y_idx);
5631   %}
5632   ins_pipe( pipe_slow );
5633 %}
5634 
5635 instruct rvinsert16F(vecZ dst, vecZ src, vecZ tmp, vecX tmp1, regF val, immU4 idx) %{
5636   predicate(UseAVX > 2 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
5637   match(Set dst (VectorInsert (Binary src val) idx));
5638   effect(TEMP dst, TEMP tmp, TEMP tmp1);
5639   format %{ "evmovdquq  $dst,$src\n\t"
            "vextractf64x4  $tmp,$src\n\t"
            "vextractf128  $tmp1,$tmp\n\t"
            "vinsertps  $tmp1,$tmp1,$val\n\t"
            "vinsertf128  $tmp,$tmp,$tmp1\n\t"
            "vinsertf64x4  $dst,$dst,$tmp\t! Insert 16F" %}
5643   ins_encode %{
5644     uint x_idx = $idx$$constant & right_n_bits(2);
5645     uint y_idx = ($idx$$constant >> 2) & 1;
5646     uint z_idx = ($idx$$constant >> 3) & 1;
5647 
5648     if ($dst$$XMMRegister != $src$$XMMRegister) {
5649       __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, 2);
5650     }
5651     __ vextractf64x4($tmp$$XMMRegister, $src$$XMMRegister, z_idx);
5652     __ vextractf128($tmp1$$XMMRegister, $tmp$$XMMRegister, y_idx);
5653     __ vinsertps($tmp1$$XMMRegister, $tmp1$$XMMRegister, $val$$XMMRegister, x_idx);
5654     __ vinsertf128($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, y_idx);
5655     __ vinsertf64x4($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, z_idx);
5656   %}
5657   ins_pipe( pipe_slow );
5658 %}
5659 
5660 instruct rvinsert1D(vecD dst, vecD src, regD val, rRegL tmp, immI0 idx) %{
5661   predicate(UseSSE > 3 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
5662   match(Set dst (VectorInsert (Binary src val) idx));
5663   effect(TEMP dst, TEMP tmp);
5664   format %{ "movdqu  $dst,$src\n\t"
5665             "movq $tmp,$val\n\t"
5666             "pinsrq  $dst,$tmp\t! Insert 1D" %}
5667   ins_encode %{
5668     if ($dst$$XMMRegister != $src$$XMMRegister) {
5669       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
5670     }
5671     __ movq($tmp$$Register, $val$$XMMRegister);
5672     __ pinsrq($dst$$XMMRegister, $tmp$$Register, 0);
5673   %}
5674   ins_pipe( pipe_slow );
5675 %}
5676 
5677 instruct rvinsert2D(vecX dst, vecX src, regD val, rRegL tmp, immU1 idx) %{
5678   predicate(UseSSE > 3 && UseAVX == 0 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
5679   match(Set dst (VectorInsert (Binary src val) idx));
5680   effect(TEMP dst, TEMP tmp);
5681   format %{ "movdqu  $dst,$src\n\t"
            "movq  $tmp,$val\n\t"
            "pinsrq  $dst,$tmp\t! Insert 2D" %}
5684   ins_encode %{
5685     if ($dst$$XMMRegister != $src$$XMMRegister) {
5686       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
5687     }
5688     __ movq($tmp$$Register, $val$$XMMRegister);
5689     __ pinsrq($dst$$XMMRegister, $tmp$$Register, $idx$$constant);
5690   %}
5691   ins_pipe( pipe_slow );
5692 %}
5693 
5694 instruct rvinsert2D_avx(vecX dst, vecX src, regD val, rRegL tmp, immU1 idx) %{
5695   predicate(UseAVX > 0 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
5696   match(Set dst (VectorInsert (Binary src val) idx));
5697   effect(TEMP dst, TEMP tmp);
5698   format %{ "vmovdqu  $dst,$src\n\t"
5699             "movq  $tmp,$val\n\t"
5700             "vpinsrq  $dst,$dst,$tmp\t! Insert 2D" %}
5701   ins_encode %{
5702     if ($dst$$XMMRegister != $src$$XMMRegister) {
5703       __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
5704     }
5705     __ movq($tmp$$Register, $val$$XMMRegister);
5706     __ vpinsrq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$Register, $idx$$constant);
5707   %}
5708   ins_pipe( pipe_slow );
5709 %}
5710 
5711 instruct rvinsert4D(vecY dst, vecY src, vecY tmp, regD val, rRegL tmp1, immU2 idx) %{
5712   predicate(UseAVX > 0 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
5713   match(Set dst (VectorInsert (Binary src val) idx));
5714   effect(TEMP dst, TEMP tmp, TEMP tmp1);
5715   format %{ "vmovdqu  $dst,$src\n\t"
            "vextractf128  $tmp,$src\n\t"
5717             "movq $tmp1,$val\n\t"
5718             "vpinsrq  $tmp,$tmp,$tmp1\n\t"
            "vinsertf128  $dst,$dst,$tmp\t! Insert 4D" %}
5720   ins_encode %{
5721     uint x_idx = $idx$$constant & 1;
5722     uint y_idx = ($idx$$constant >> 1) & 1;
5723 
5724     if ($dst$$XMMRegister != $src$$XMMRegister) {
5725       __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
5726     }
5727     __ vextractf128($tmp$$XMMRegister, $src$$XMMRegister, y_idx);
5728     __ movq($tmp1$$Register, $val$$XMMRegister);
5729     __ vpinsrq($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$Register, x_idx);
5730     __ vinsertf128($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, y_idx);
5731   %}
5732   ins_pipe( pipe_slow );
5733 %}
5734 
5735 instruct rvinsert8D(vecZ dst, vecZ src, vecZ tmp, vecY tmp2, regD val, rRegL tmp1, immU3 idx) %{
5736   predicate(UseAVX > 2 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
5737   match(Set dst (VectorInsert (Binary src val) idx));
5738   effect(TEMP dst, TEMP tmp, TEMP tmp1, TEMP tmp2);
5739   format %{ "evmovdquq  $dst,$src\n\t"
            "vextractf64x4  $tmp2,$src\n\t"
            "vextractf128  $tmp,$tmp2\n\t"
            "movq  $tmp1,$val\n\t"
            "vpinsrq  $tmp,$tmp,$tmp1\n\t"
            "vinsertf128  $tmp2,$tmp2,$tmp\n\t"
            "vinsertf64x4  $dst,$dst,$tmp2\t! Insert 8D" %}
5746   ins_encode %{
5747     uint x_idx = $idx$$constant & 1;
5748     uint y_idx = ($idx$$constant >> 1) & 1;
5749     uint z_idx = ($idx$$constant >> 2) & 1;
5750 
5751     if ($dst$$XMMRegister != $src$$XMMRegister) {
5752       __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, 2);
5753     }
5754     __ vextractf64x4($tmp2$$XMMRegister, $src$$XMMRegister, z_idx);
5755     __ vextractf128($tmp$$XMMRegister, $tmp2$$XMMRegister, y_idx);
5756     __ movq($tmp1$$Register, $val$$XMMRegister);
5757     __ vpinsrq($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$Register, x_idx);
5758     __ vinsertf128($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, y_idx);
5759     __ vinsertf64x4($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister, z_idx);
5760   %}
5761   ins_pipe( pipe_slow );
5762 %}
5763 
5764 // ====================REDUCTION ARITHMETIC=======================================
5765 
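// The integer reductions below fold the vector in halves: extract the upper
// half, add it to the lower half with a vector add, and repeat (pshufd
// supplies the shuffles once the value fits in 128 bits) until a single
// element remains, which is then combined with the scalar input $src1.
// Byte and short variants finish with movsbl/movswl so the int result is
// correctly sign-extended.
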
5766 instruct rsadd8B_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, rRegI tmp2) %{
5767   predicate(UseSSE > 3 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
5768   match(Set dst (AddReductionVI src1 src2));
5769   effect(TEMP tmp, TEMP tmp2, TEMP dst);
5770   format %{
5771             "pshufd  $tmp,$src2,0x1\n\t"
5772             "paddb   $tmp,$src2\n\t"
5773             "movzbl  $dst,$src1\n\t"
5774             "pextrb  $tmp2,$tmp, 0x0\n\t"
5775             "addl    $dst,$tmp2\n\t"
5776             "pextrb  $tmp2,$tmp, 0x1\n\t"
5777             "addl    $dst,$tmp2\n\t"
5778             "pextrb  $tmp2,$tmp, 0x2\n\t"
5779             "addl    $dst,$tmp2\n\t"
5780             "pextrb  $tmp2,$tmp, 0x3\n\t"
5781             "addl    $dst,$tmp2\n\t"
5782             "movsbl  $dst,$dst\t! add reduction8B" %}
5783   ins_encode %{
5784     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x1);
5785     __ paddb($tmp$$XMMRegister, $src2$$XMMRegister);
5786     __ movzbl($dst$$Register, $src1$$Register);
5787     __ pextrb($tmp2$$Register, $tmp$$XMMRegister, 0x0);
5788     __ addl($dst$$Register, $tmp2$$Register);
5789     __ pextrb($tmp2$$Register, $tmp$$XMMRegister, 0x1);
5790     __ addl($dst$$Register, $tmp2$$Register);
5791     __ pextrb($tmp2$$Register, $tmp$$XMMRegister, 0x2);
5792     __ addl($dst$$Register, $tmp2$$Register);
5793     __ pextrb($tmp2$$Register, $tmp$$XMMRegister, 0x3);
5794     __ addl($dst$$Register, $tmp2$$Register);
5795     __ movsbl($dst$$Register, $dst$$Register);
5796   %}
5797   ins_pipe( pipe_slow );
5798 %}
5799 
5800 instruct rsadd16B_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2, rRegI tmp3) %{
5801   predicate(UseSSE > 3 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
5802   match(Set dst (AddReductionVI src1 src2));
5803   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP dst);
5804   format %{ "pshufd  $tmp,$src2,0xE\n\t"
5805             "paddb   $tmp,$src2\n\t"
5806             "pshufd  $tmp2,$tmp,0x1\n\t"
            "paddb   $tmp,$tmp2\n\t"
5808             "movzbl  $dst,$src1\n\t"
5809             "pextrb  $tmp3,$tmp, 0x0\n\t"
5810             "addl    $dst,$tmp3\n\t"
5811             "pextrb  $tmp3,$tmp, 0x1\n\t"
5812             "addl    $dst,$tmp3\n\t"
5813             "pextrb  $tmp3,$tmp, 0x2\n\t"
5814             "addl    $dst,$tmp3\n\t"
5815             "pextrb  $tmp3,$tmp, 0x3\n\t"
5816             "addl    $dst,$tmp3\n\t"
5817             "movsbl  $dst,$dst\t! add reduction16B" %}
5818   ins_encode %{
5819     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5820     __ paddb($tmp$$XMMRegister, $src2$$XMMRegister);
5821     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5822     __ paddb($tmp$$XMMRegister, $tmp2$$XMMRegister);
5823     __ movzbl($dst$$Register, $src1$$Register);
5824     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x0);
5825     __ addl($dst$$Register, $tmp3$$Register);
5826     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x1);
5827     __ addl($dst$$Register, $tmp3$$Register);
5828     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x2);
5829     __ addl($dst$$Register, $tmp3$$Register);
5830     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x3);
5831     __ addl($dst$$Register, $tmp3$$Register);
5832     __ movsbl($dst$$Register, $dst$$Register);
5833   %}
5834   ins_pipe( pipe_slow );
5835 %}
5836 
5837 instruct rvadd32B_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2, rRegI tmp3) %{
5838   predicate(UseAVX > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
5839   match(Set dst (AddReductionVI src1 src2));
5840   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP dst);
5841   format %{ "vextracti128_high  $tmp,$src2\n\t"
5842             "vpaddb  $tmp,$tmp,$src2\n\t"
5843             "pshufd  $tmp2,$tmp,0xE\n\t"
5844             "vpaddb  $tmp,$tmp,$tmp2\n\t"
5845             "pshufd  $tmp2,$tmp,0x1\n\t"
5846             "vpaddb  $tmp,$tmp,$tmp2\n\t"
5847             "movzbl  $dst,$src1\n\t"
5848             "pextrb  $tmp3,$tmp, 0x0\n\t"
5849             "addl    $dst,$tmp3\n\t"
5850             "pextrb  $tmp3,$tmp, 0x1\n\t"
5851             "addl    $dst,$tmp3\n\t"
5852             "pextrb  $tmp3,$tmp, 0x2\n\t"
5853             "addl    $dst,$tmp3\n\t"
5854             "pextrb  $tmp3,$tmp, 0x3\n\t"
5855             "addl    $dst,$tmp3\n\t"
5856             "movsbl  $dst,$dst\t! add reduction32B" %}
5857   ins_encode %{
5858     int vector_len = 0;
5859     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
5860     __ vpaddb($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
5861     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
5862     __ vpaddb($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5863     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5864     __ vpaddb($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5865     __ movzbl($dst$$Register, $src1$$Register);
5866     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x0);
5867     __ addl($dst$$Register, $tmp3$$Register);
5868     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x1);
5869     __ addl($dst$$Register, $tmp3$$Register);
5870     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x2);
5871     __ addl($dst$$Register, $tmp3$$Register);
5872     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x3);
5873     __ addl($dst$$Register, $tmp3$$Register);
5874     __ movsbl($dst$$Register, $dst$$Register);
5875   %}
5876   ins_pipe( pipe_slow );
5877 %}
5878 
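// For the 64-byte case the last four byte lanes are summed through a GPR:
// movdl copies the low doubleword out once and three 8-bit shifts expose the
// remaining bytes, in place of the four pextrb extractions used by the
// narrower variants above.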
5879 instruct rvadd64B_reduction_reg(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, rRegI tmp3) %{
5880   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
5881   match(Set dst (AddReductionVI src1 src2));
5882   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP dst);
5883   format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
5884             "vpaddb  $tmp2,$tmp2,$src2\n\t"
5885             "vextracti128_high  $tmp,$tmp2\n\t"
5886             "vpaddb  $tmp,$tmp,$tmp2\n\t"
5887             "pshufd  $tmp2,$tmp,0xE\n\t"
5888             "vpaddb  $tmp,$tmp,$tmp2\n\t"
5889             "pshufd  $tmp2,$tmp,0x1\n\t"
5890             "vpaddb  $tmp,$tmp,$tmp2\n\t"
5891             "movzbl  $dst,$src1\n\t"
5892             "movdl   $tmp3,$tmp\n\t"
5893             "addl    $dst,$tmp3\n\t"
5894             "shrl    $tmp3,0x8\n\t"
5895             "addl    $dst,$tmp3\n\t"
5896             "shrl    $tmp3,0x8\n\t"
5897             "addl    $dst,$tmp3\n\t"
5898             "shrl    $tmp3,0x8\n\t"
5899             "addl    $dst,$tmp3\n\t"
5900             "movsbl  $dst,$dst\t! add reduction64B" %}
5901   ins_encode %{
5902     int vector_len = 0;
5903     __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
5904     __ vpaddb($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
5905     __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
5906     __ vpaddb($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5907     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
5908     __ vpaddb($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5909     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5910     __ vpaddb($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5911     __ movzbl($dst$$Register, $src1$$Register);
5912     __ movdl($tmp3$$Register, $tmp$$XMMRegister);
5913     __ addl($dst$$Register, $tmp3$$Register);
5914     __ shrl($tmp3$$Register, 8);
5915     __ addl($dst$$Register, $tmp3$$Register);
5916     __ shrl($tmp3$$Register, 8);
5917     __ addl($dst$$Register, $tmp3$$Register);
5918     __ shrl($tmp3$$Register, 8);
5919     __ addl($dst$$Register, $tmp3$$Register);
5920     __ movsbl($dst$$Register, $dst$$Register);
5921   %}
5922   ins_pipe( pipe_slow );
5923 %}
5924 
5925 instruct rsadd4S_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, rRegI tmp2) %{
5926   predicate(UseSSE > 3 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
5927   match(Set dst (AddReductionVI src1 src2));
5928   effect(TEMP tmp, TEMP tmp2, TEMP dst);
5929   format %{
5930             "movdqu   $tmp,$src2\n\t"
5931             "phaddw   $tmp,$tmp\n\t"
5932             "phaddw   $tmp,$tmp\n\t"
5933             "movzwl   $dst,$src1\n\t"
5934             "pextrw   $tmp2,$tmp, 0x0\n\t"
5935             "addw     $dst,$tmp2\n\t"
5936             "movswl  $dst,$dst\t! add reduction4S" %}
5937   ins_encode %{
5938     __ movdqu($tmp$$XMMRegister, $src2$$XMMRegister);
5939     __ phaddw($tmp$$XMMRegister, $tmp$$XMMRegister);
5940     __ phaddw($tmp$$XMMRegister, $tmp$$XMMRegister);
5941     __ movzwl($dst$$Register, $src1$$Register);
5942     __ pextrw($tmp2$$Register, $tmp$$XMMRegister, 0x0);
5943     __ addw($dst$$Register, $tmp2$$Register);
5944     __ movswl($dst$$Register, $dst$$Register);
5945   %}
5946   ins_pipe( pipe_slow );
5947 %}
5948 
5949 instruct rvadd4S_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, rRegI tmp2) %{
5950   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
5951   match(Set dst (AddReductionVI src1 src2));
5952   effect(TEMP tmp, TEMP tmp2, TEMP dst);
5953   format %{ "vphaddw  $tmp,$src2,$src2\n\t"
5954             "vphaddw  $tmp,$tmp,$tmp\n\t"
5955             "movzwl   $dst,$src1\n\t"
5956             "pextrw   $tmp2,$tmp, 0x0\n\t"
5957             "addw     $dst,$tmp2\n\t"
5958             "movswl  $dst,$dst\t! add reduction4S" %}
5959   ins_encode %{
5960     int vector_len = 0;
5961     __ vphaddw($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
5962     __ vphaddw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
5963     __ movzwl($dst$$Register, $src1$$Register);
5964     __ pextrw($tmp2$$Register, $tmp$$XMMRegister, 0x0);
5965     __ addw($dst$$Register, $tmp2$$Register);
5966     __ movswl($dst$$Register, $dst$$Register);
5967   %}
5968   ins_pipe( pipe_slow );
5969 %}
5970 
5971 instruct rsadd8S_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, rRegI tmp2) %{
5972   predicate(UseSSE > 3 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
5973   match(Set dst (AddReductionVI src1 src2));
5974   effect(TEMP tmp, TEMP tmp2, TEMP dst);
5975   format %{
5976             "movdqu   $tmp,$src2\n\t"
            "phaddw   $tmp,$tmp\n\t"
            "phaddw   $tmp,$tmp\n\t"
            "phaddw   $tmp,$tmp\n\t"
5980             "movzwl   $dst,$src1\n\t"
5981             "pextrw   $tmp2,$tmp, 0x0\n\t"
5982             "addw     $dst,$tmp2\n\t"
5983             "movswl  $dst,$dst\t! add reduction8S" %}
5984   ins_encode %{
5985     __ movdqu($tmp$$XMMRegister, $src2$$XMMRegister);
5986     __ phaddw($tmp$$XMMRegister, $tmp$$XMMRegister);
5987     __ phaddw($tmp$$XMMRegister, $tmp$$XMMRegister);
5988     __ phaddw($tmp$$XMMRegister, $tmp$$XMMRegister);
5989     __ movzwl($dst$$Register, $src1$$Register);
5990     __ pextrw($tmp2$$Register, $tmp$$XMMRegister, 0x0);
5991     __ addw($dst$$Register, $tmp2$$Register);
5992     __ movswl($dst$$Register, $dst$$Register);
5993   %}
5994   ins_pipe( pipe_slow );
5995 %}
5996 
5997 instruct rvadd8S_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, rRegI tmp2) %{
5998   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
5999   match(Set dst (AddReductionVI src1 src2));
6000   effect(TEMP tmp, TEMP tmp2, TEMP dst);
6001   format %{ "vphaddw  $tmp,$src2,$src2\n\t"
6002             "vphaddw  $tmp,$tmp,$tmp\n\t"
6003             "vphaddw  $tmp,$tmp,$tmp\n\t"
6004             "movzwl   $dst,$src1\n\t"
6005             "pextrw   $tmp2,$tmp, 0x0\n\t"
6006             "addw     $dst,$tmp2\n\t"
6007             "movswl  $dst,$dst\t! add reduction8S" %}
6008   ins_encode %{
6009     int vector_len = 0;
6010     __ vphaddw($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
6011     __ vphaddw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
6012     __ vphaddw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
6013     __ movzwl($dst$$Register, $src1$$Register);
6014     __ pextrw($tmp2$$Register, $tmp$$XMMRegister, 0x0);
6015     __ addw($dst$$Register, $tmp2$$Register);
6016     __ movswl($dst$$Register, $dst$$Register);
6017   %}
6018   ins_pipe( pipe_slow );
6019 %}
6020 
6021 instruct rvadd16S_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, rRegI tmp2) %{
6022   predicate(UseAVX > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
6023   match(Set dst (AddReductionVI src1 src2));
6024   effect(TEMP tmp, TEMP tmp2, TEMP dst);
  format %{ "vphaddw  $tmp,$src2,$src2\n\t"
            "vpermq   $tmp,$tmp,0xD8\n\t"
6026             "vphaddw  $tmp,$tmp,$tmp\n\t"
6027             "vphaddw  $tmp,$tmp,$tmp\n\t"
6028             "vphaddw  $tmp,$tmp,$tmp\n\t"
6029             "movzwl   $dst,$src1\n\t"
6030             "pextrw   $tmp2,$tmp, 0x0\n\t"
6031             "addw     $dst,$tmp2\n\t"
6032             "movswl  $dst,$dst\t! add reduction16S" %}
6033   ins_encode %{
6034     int vector_len = 1;
6035     __ vphaddw($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
6036     __ vpermq($tmp$$XMMRegister, $tmp$$XMMRegister, 0xD8, vector_len);
6037     __ vphaddw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
6038     __ vphaddw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
6039     __ vphaddw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
6040     __ movzwl($dst$$Register, $src1$$Register);
6041     __ pextrw($tmp2$$Register, $tmp$$XMMRegister, 0x0);
6042     __ addw($dst$$Register, $tmp2$$Register);
6043     __ movswl($dst$$Register, $dst$$Register);
6044   %}
6045   ins_pipe( pipe_slow );
6046 %}
6047 
6048 instruct rvadd32S_reduction_reg_evex(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, rRegI tmp3) %{
6049   predicate(UseAVX > 2  && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
6050   match(Set dst (AddReductionVI src1 src2));
6051   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP dst);
6052   format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
6053             "vpaddw  $tmp2,$tmp2,$src2\n\t"
6054             "vextracti128_high  $tmp,$tmp2\n\t"
6055             "vpaddw  $tmp,$tmp,$tmp2\n\t"
6056             "pshufd  $tmp2,$tmp,0xE\n\t"
6057             "vpaddw  $tmp,$tmp,$tmp2\n\t"
6058             "pshufd  $tmp2,$tmp,0x1\n\t"
6059             "vpaddw  $tmp,$tmp,$tmp2\n\t"
            "movdl   $tmp3,$tmp\n\t"
            "movzwl  $dst,$src1\n\t"
            "addw    $dst,$tmp3\n\t"
            "shrl    $tmp3,0x10\n\t"
            "addw    $dst,$tmp3\n\t"
6064             "movswl  $dst,$dst\t! add reduction32S" %}
6065   ins_encode %{
6066     __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
6067     __ vpaddw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
6068     __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
6069     __ vpaddw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
6070     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
6071     __ vpaddw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
6072     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
6073     __ vpaddw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
6074     __ movdl($tmp3$$Register, $tmp$$XMMRegister);
6075     __ movzwl($dst$$Register, $src1$$Register);
6076     __ addw($dst$$Register, $tmp3$$Register);
6077     __ shrl($tmp3$$Register, 16);
6078     __ addw($dst$$Register, $tmp3$$Register);
6079     __ movswl($dst$$Register, $dst$$Register);
6080   %}
6081   ins_pipe( pipe_slow );
6082 %}
6083 
6084 instruct rsadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
6085   predicate(UseSSE > 2 && UseAVX == 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
6086   match(Set dst (AddReductionVI src1 src2));
6087   effect(TEMP tmp2, TEMP tmp);
6088   format %{ "movdqu  $tmp2,$src2\n\t"
6089             "phaddd  $tmp2,$tmp2\n\t"
6090             "movd    $tmp,$src1\n\t"
6091             "paddd   $tmp,$tmp2\n\t"
6092             "movd    $dst,$tmp\t! add reduction2I" %}
6093   ins_encode %{
6094     __ movdqu($tmp2$$XMMRegister, $src2$$XMMRegister);
6095     __ phaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister);
6096     __ movdl($tmp$$XMMRegister, $src1$$Register);
6097     __ paddd($tmp$$XMMRegister, $tmp2$$XMMRegister);
6098     __ movdl($dst$$Register, $tmp$$XMMRegister);
6099   %}
6100   ins_pipe( pipe_slow );
6101 %}
6102 
6103 instruct rvadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
6104   predicate(VM_Version::supports_avxonly()  && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
6105   match(Set dst (AddReductionVI src1 src2));
6106   effect(TEMP tmp, TEMP tmp2);
6107   format %{ "vphaddd  $tmp,$src2,$src2\n\t"
6108             "movd     $tmp2,$src1\n\t"
6109             "vpaddd   $tmp2,$tmp2,$tmp\n\t"
6110             "movd     $dst,$tmp2\t! add reduction2I" %}
6111   ins_encode %{
6112     int vector_len = 0;
6113     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
6114     __ movdl($tmp2$$XMMRegister, $src1$$Register);
6115     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
6116     __ movdl($dst$$Register, $tmp2$$XMMRegister);
6117   %}
6118   ins_pipe( pipe_slow );
6119 %}
6120 
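// Note: VM_Version::supports_avxonly() holds only on AVX1/AVX2 hardware
// without AVX-512, so the vphaddd-based rules and the UseAVX > 2 rules for
// the same vector size are mutually exclusive.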
6121 instruct rvadd2I_reduction_reg_evex(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
6122   predicate(UseAVX > 2  && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
6123   match(Set dst (AddReductionVI src1 src2));
6124   effect(TEMP tmp, TEMP tmp2);
6125   format %{ "pshufd  $tmp2,$src2,0x1\n\t"
6126             "vpaddd  $tmp,$src2,$tmp2\n\t"
6127             "movd    $tmp2,$src1\n\t"
6128             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
6129             "movd    $dst,$tmp2\t! add reduction2I" %}
6130   ins_encode %{
6131     int vector_len = 0;
6132     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
6133     __ vpaddd($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
6134     __ movdl($tmp2$$XMMRegister, $src1$$Register);
6135     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
6136     __ movdl($dst$$Register, $tmp2$$XMMRegister);
6137   %}
6138   ins_pipe( pipe_slow );
6139 %}
6140 
6141 instruct rsadd4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
6142   predicate(UseSSE > 2 && UseAVX == 0  && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
6143   match(Set dst (AddReductionVI src1 src2));
6144   effect(TEMP tmp, TEMP tmp2);
6145   format %{ "movdqu  $tmp,$src2\n\t"
6146             "phaddd  $tmp,$tmp\n\t"
6147             "phaddd  $tmp,$tmp\n\t"
6148             "movd    $tmp2,$src1\n\t"
6149             "paddd   $tmp2,$tmp\n\t"
6150             "movd    $dst,$tmp2\t! add reduction4I" %}
6151   ins_encode %{
6152     __ movdqu($tmp$$XMMRegister, $src2$$XMMRegister);
6153     __ phaddd($tmp$$XMMRegister, $tmp$$XMMRegister);
6154     __ phaddd($tmp$$XMMRegister, $tmp$$XMMRegister);
6155     __ movdl($tmp2$$XMMRegister, $src1$$Register);
6156     __ paddd($tmp2$$XMMRegister, $tmp$$XMMRegister);
6157     __ movdl($dst$$Register, $tmp2$$XMMRegister);
6158   %}
6159   ins_pipe( pipe_slow );
6160 %}
6161 
6162 instruct rvadd4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
6163   predicate(VM_Version::supports_avxonly() && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
6164   match(Set dst (AddReductionVI src1 src2));
6165   effect(TEMP tmp, TEMP tmp2);
6166   format %{ "vphaddd  $tmp,$src2,$src2\n\t"
6167             "vphaddd  $tmp,$tmp,$tmp\n\t"
6168             "movd     $tmp2,$src1\n\t"
6169             "vpaddd   $tmp2,$tmp2,$tmp\n\t"
6170             "movd     $dst,$tmp2\t! add reduction4I" %}
6171   ins_encode %{
6172     int vector_len = 0;
6173     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
6174     __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
6175     __ movdl($tmp2$$XMMRegister, $src1$$Register);
6176     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
6177     __ movdl($dst$$Register, $tmp2$$XMMRegister);
6178   %}
6179   ins_pipe( pipe_slow );
6180 %}
6181 
6182 instruct rvadd4I_reduction_reg_evex(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
6183   predicate(UseAVX > 2  && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
6184   match(Set dst (AddReductionVI src1 src2));
6185   effect(TEMP tmp, TEMP tmp2);
6186   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
6187             "vpaddd  $tmp,$src2,$tmp2\n\t"
6188             "pshufd  $tmp2,$tmp,0x1\n\t"
6189             "vpaddd  $tmp,$tmp,$tmp2\n\t"
6190             "movd    $tmp2,$src1\n\t"
6191             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
6192             "movd    $dst,$tmp2\t! add reduction4I" %}
6193   ins_encode %{
6194     int vector_len = 0;
6195     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
6196     __ vpaddd($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
6197     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
6198     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
6199     __ movdl($tmp2$$XMMRegister, $src1$$Register);
6200     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
6201     __ movdl($dst$$Register, $tmp2$$XMMRegister);
6202   %}
6203   ins_pipe( pipe_slow );
6204 %}
6205 
6206 instruct rvadd8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
6207   predicate(VM_Version::supports_avxonly()  && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
6208   match(Set dst (AddReductionVI src1 src2));
6209   effect(TEMP tmp, TEMP tmp2);
6210   format %{ "vphaddd  $tmp,$src2,$src2\n\t"
6211             "vphaddd  $tmp,$tmp,$tmp2\n\t"
6212             "vextracti128_high  $tmp2,$tmp\n\t"
6213             "vpaddd   $tmp,$tmp,$tmp2\n\t"
6214             "movd     $tmp2,$src1\n\t"
6215             "vpaddd   $tmp2,$tmp2,$tmp\n\t"
6216             "movd     $dst,$tmp2\t! add reduction8I" %}
6217   ins_encode %{
6218     int vector_len = 1;
6219     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
6220     __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
6221     __ vextracti128_high($tmp2$$XMMRegister, $tmp$$XMMRegister);
6222     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
6223     __ movdl($tmp2$$XMMRegister, $src1$$Register);
6224     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
6225     __ movdl($dst$$Register, $tmp2$$XMMRegister);
6226   %}
6227   ins_pipe( pipe_slow );
6228 %}
6229 
6230 instruct rvadd8I_reduction_reg_evex(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
6231   predicate(UseAVX > 2  && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
6232   match(Set dst (AddReductionVI src1 src2));
6233   effect(TEMP tmp, TEMP tmp2);
6234   format %{ "vextracti128_high  $tmp,$src2\n\t"
6235             "vpaddd  $tmp,$tmp,$src2\n\t"
6236             "pshufd  $tmp2,$tmp,0xE\n\t"
6237             "vpaddd  $tmp,$tmp,$tmp2\n\t"
6238             "pshufd  $tmp2,$tmp,0x1\n\t"
6239             "vpaddd  $tmp,$tmp,$tmp2\n\t"
6240             "movd    $tmp2,$src1\n\t"
6241             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
6242             "movd    $dst,$tmp2\t! add reduction8I" %}
6243   ins_encode %{
6244     int vector_len = 0;
6245     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
6246     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
6247     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
6248     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
6249     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
6250     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
6251     __ movdl($tmp2$$XMMRegister, $src1$$Register);
6252     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
6253     __ movdl($dst$$Register, $tmp2$$XMMRegister);
6254   %}
6255   ins_pipe( pipe_slow );
6256 %}
6257 
6258 instruct rvadd16I_reduction_reg_evex(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, regF tmp3) %{
6259   predicate(UseAVX > 2  && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
6260   match(Set dst (AddReductionVI src1 src2));
6261   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
6262   format %{ "vextracti64x4_high  $tmp3,$src2\n\t"
6263             "vpaddd  $tmp3,$tmp3,$src2\n\t"
6264             "vextracti128_high  $tmp,$tmp3\n\t"
6265             "vpaddd  $tmp,$tmp,$tmp3\n\t"
6266             "pshufd  $tmp2,$tmp,0xE\n\t"
6267             "vpaddd  $tmp,$tmp,$tmp2\n\t"
6268             "pshufd  $tmp2,$tmp,0x1\n\t"
6269             "vpaddd  $tmp,$tmp,$tmp2\n\t"
6270             "movd    $tmp2,$src1\n\t"
6271             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
            "movd    $dst,$tmp2\t! add reduction16I" %}
6273   ins_encode %{
6274     __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
6275     __ vpaddd($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
6276     __ vextracti128_high($tmp$$XMMRegister, $tmp3$$XMMRegister);
6277     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
6278     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
6279     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
6280     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
6281     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
6282     __ movdl($tmp2$$XMMRegister, $src1$$Register);
6283     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
6284     __ movdl($dst$$Register, $tmp2$$XMMRegister);
6285   %}
6286   ins_pipe( pipe_slow );
6287 %}
6288 
6289 #ifdef _LP64
6290 instruct rvadd2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, regF tmp, regF tmp2) %{
6291   predicate(UseAVX > 2);
6292   match(Set dst (AddReductionVL src1 src2));
6293   effect(TEMP tmp, TEMP tmp2);
6294   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
6295             "vpaddq  $tmp,$src2,$tmp2\n\t"
6296             "movdq   $tmp2,$src1\n\t"
6297             "vpaddq  $tmp2,$tmp,$tmp2\n\t"
6298             "movdq   $dst,$tmp2\t! add reduction2L" %}
6299   ins_encode %{
6300     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
6301     __ vpaddq($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, 0);
6302     __ movdq($tmp2$$XMMRegister, $src1$$Register);
6303     __ vpaddq($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
6304     __ movdq($dst$$Register, $tmp2$$XMMRegister);
6305   %}
6306   ins_pipe( pipe_slow );
6307 %}
6308 
6309 instruct rvadd4L_reduction_reg(rRegL dst, rRegL src1, vecY src2, regF tmp, regF tmp2) %{
6310   predicate(UseAVX > 2);
6311   match(Set dst (AddReductionVL src1 src2));
6312   effect(TEMP tmp, TEMP tmp2);
6313   format %{ "vextracti128_high  $tmp,$src2\n\t"
6314             "vpaddq  $tmp2,$tmp,$src2\n\t"
6315             "pshufd  $tmp,$tmp2,0xE\n\t"
6316             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
6317             "movdq   $tmp,$src1\n\t"
6318             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
6319             "movdq   $dst,$tmp2\t! add reduction4L" %}
6320   ins_encode %{
6321     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
6322     __ vpaddq($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
6323     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
6324     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
6325     __ movdq($tmp$$XMMRegister, $src1$$Register);
6326     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
6327     __ movdq($dst$$Register, $tmp2$$XMMRegister);
6328   %}
6329   ins_pipe( pipe_slow );
6330 %}
6331 
6332 instruct rvadd8L_reduction_reg(rRegL dst, rRegL src1, vecZ src2, regF tmp, regF tmp2) %{
6333   predicate(UseAVX > 2);
6334   match(Set dst (AddReductionVL src1 src2));
6335   effect(TEMP tmp, TEMP tmp2);
6336   format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
6337             "vpaddq  $tmp2,$tmp2,$src2\n\t"
6338             "vextracti128_high  $tmp,$tmp2\n\t"
6339             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
6340             "pshufd  $tmp,$tmp2,0xE\n\t"
6341             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
6342             "movdq   $tmp,$src1\n\t"
6343             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
6344             "movdq   $dst,$tmp2\t! add reduction8L" %}
6345   ins_encode %{
6346     __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
6347     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
6348     __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
6349     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
6350     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
6351     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
6352     __ movdq($tmp$$XMMRegister, $src1$$Register);
6353     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
6354     __ movdq($dst$$Register, $tmp2$$XMMRegister);
6355   %}
6356   ins_pipe( pipe_slow );
6357 %}
6358 #endif
6359 
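// Floating-point add reductions. Unlike the integer forms these keep
// strict element order, since FP addition is not associative: the running
// total stays in the scalar dst register and each element is shuffled
// into lane 0 (pshufd with 0x01/0x02/0x03 selects elements 1-3) before a
// scalar addss/vaddss. Wider vectors extract each 128-bit lane in turn
// and repeat the per-element chain.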
6360 instruct rsadd2F_reduction_reg(regF dst, vecD src2, regF tmp) %{
6361   predicate(UseSSE >= 1 && UseAVX == 0);
6362   match(Set dst (AddReductionVF dst src2));
6363   effect(TEMP dst, TEMP tmp);
6364   format %{ "addss   $dst,$src2\n\t"
6365             "pshufd  $tmp,$src2,0x01\n\t"
6366             "addss   $dst,$tmp\t! add reduction2F" %}
6367   ins_encode %{
6368     __ addss($dst$$XMMRegister, $src2$$XMMRegister);
6369     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
6370     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
6371   %}
6372   ins_pipe( pipe_slow );
6373 %}
6374 
6375 instruct rvadd2F_reduction_reg(regF dst, vecD src2, regF tmp) %{
6376   predicate(UseAVX > 0);
6377   match(Set dst (AddReductionVF dst src2));
6378   effect(TEMP dst, TEMP tmp);
6379   format %{ "vaddss  $dst,$dst,$src2\n\t"
6380             "pshufd  $tmp,$src2,0x01\n\t"
6381             "vaddss  $dst,$dst,$tmp\t! add reduction2F" %}
6382   ins_encode %{
6383     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
6384     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
6385     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6386   %}
6387   ins_pipe( pipe_slow );
6388 %}
6389 
6390 instruct rsadd4F_reduction_reg(regF dst, vecX src2, regF tmp) %{
6391   predicate(UseSSE >= 1 && UseAVX == 0);
6392   match(Set dst (AddReductionVF dst src2));
6393   effect(TEMP dst, TEMP tmp);
6394   format %{ "addss   $dst,$src2\n\t"
6395             "pshufd  $tmp,$src2,0x01\n\t"
6396             "addss   $dst,$tmp\n\t"
6397             "pshufd  $tmp,$src2,0x02\n\t"
6398             "addss   $dst,$tmp\n\t"
6399             "pshufd  $tmp,$src2,0x03\n\t"
6400             "addss   $dst,$tmp\t! add reduction4F" %}
6401   ins_encode %{
6402     __ addss($dst$$XMMRegister, $src2$$XMMRegister);
6403     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
6404     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
6405     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
6406     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
6407     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
6408     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
6409   %}
6410   ins_pipe( pipe_slow );
6411 %}
6412 
6413 instruct rvadd4F_reduction_reg(regF dst, vecX src2, regF tmp) %{
6414   predicate(UseAVX > 0);
6415   match(Set dst (AddReductionVF dst src2));
6416   effect(TEMP tmp, TEMP dst);
            "vaddss  $dst,$dst,$src2\n\t"
6418             "pshufd  $tmp,$src2,0x01\n\t"
6419             "vaddss  $dst,$dst,$tmp\n\t"
6420             "pshufd  $tmp,$src2,0x02\n\t"
6421             "vaddss  $dst,$dst,$tmp\n\t"
6422             "pshufd  $tmp,$src2,0x03\n\t"
6423             "vaddss  $dst,$dst,$tmp\t! add reduction4F" %}
6424   ins_encode %{
6425     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
6426     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
6427     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6428     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
6429     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6430     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
6431     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6432   %}
6433   ins_pipe( pipe_slow );
6434 %}
6435 
6436 instruct radd8F_reduction_reg(regF dst, vecY src2, regF tmp, regF tmp2) %{
6437   predicate(UseAVX > 0);
6438   match(Set dst (AddReductionVF dst src2));
6439   effect(TEMP tmp, TEMP dst, TEMP tmp2);
6440   format %{ "vaddss  $dst,$dst,$src2\n\t"
6441             "pshufd  $tmp,$src2,0x01\n\t"
6442             "vaddss  $dst,$dst,$tmp\n\t"
6443             "pshufd  $tmp,$src2,0x02\n\t"
6444             "vaddss  $dst,$dst,$tmp\n\t"
6445             "pshufd  $tmp,$src2,0x03\n\t"
6446             "vaddss  $dst,$dst,$tmp\n\t"
6447             "vextractf128_high  $tmp2,$src2\n\t"
6448             "vaddss  $dst,$dst,$tmp2\n\t"
6449             "pshufd  $tmp,$tmp2,0x01\n\t"
6450             "vaddss  $dst,$dst,$tmp\n\t"
6451             "pshufd  $tmp,$tmp2,0x02\n\t"
6452             "vaddss  $dst,$dst,$tmp\n\t"
6453             "pshufd  $tmp,$tmp2,0x03\n\t"
6454             "vaddss  $dst,$dst,$tmp\t! add reduction8F" %}
6455   ins_encode %{
6456     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
6457     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
6458     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6459     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
6460     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6461     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
6462     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6463     __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
6464     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
6465     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
6466     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6467     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
6468     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6469     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
6470     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6471   %}
6472   ins_pipe( pipe_slow );
6473 %}
6474 
6475 instruct radd16F_reduction_reg(regF dst, vecZ src2, regF tmp, regF tmp2) %{
6476   predicate(UseAVX > 2);
6477   match(Set dst (AddReductionVF dst src2));
6478   effect(TEMP tmp, TEMP dst, TEMP tmp2);
6479   format %{ "vaddss  $dst,$dst,$src2\n\t"
6480             "pshufd  $tmp,$src2,0x01\n\t"
6481             "vaddss  $dst,$dst,$tmp\n\t"
6482             "pshufd  $tmp,$src2,0x02\n\t"
6483             "vaddss  $dst,$dst,$tmp\n\t"
6484             "pshufd  $tmp,$src2,0x03\n\t"
6485             "vaddss  $dst,$dst,$tmp\n\t"
6486             "vextractf32x4  $tmp2,$src2,0x1\n\t"
6487             "vaddss  $dst,$dst,$tmp2\n\t"
6488             "pshufd  $tmp,$tmp2,0x01\n\t"
6489             "vaddss  $dst,$dst,$tmp\n\t"
6490             "pshufd  $tmp,$tmp2,0x02\n\t"
6491             "vaddss  $dst,$dst,$tmp\n\t"
6492             "pshufd  $tmp,$tmp2,0x03\n\t"
6493             "vaddss  $dst,$dst,$tmp\n\t"
6494             "vextractf32x4  $tmp2,$src2,0x2\n\t"
6495             "vaddss  $dst,$dst,$tmp2\n\t"
6496             "pshufd  $tmp,$tmp2,0x01\n\t"
6497             "vaddss  $dst,$dst,$tmp\n\t"
6498             "pshufd  $tmp,$tmp2,0x02\n\t"
6499             "vaddss  $dst,$dst,$tmp\n\t"
6500             "pshufd  $tmp,$tmp2,0x03\n\t"
6501             "vaddss  $dst,$dst,$tmp\n\t"
6502             "vextractf32x4  $tmp2,$src2,0x3\n\t"
6503             "vaddss  $dst,$dst,$tmp2\n\t"
6504             "pshufd  $tmp,$tmp2,0x01\n\t"
6505             "vaddss  $dst,$dst,$tmp\n\t"
6506             "pshufd  $tmp,$tmp2,0x02\n\t"
6507             "vaddss  $dst,$dst,$tmp\n\t"
6508             "pshufd  $tmp,$tmp2,0x03\n\t"
6509             "vaddss  $dst,$dst,$tmp\t! add reduction16F" %}
6510   ins_encode %{
6511     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
6512     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
6513     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6514     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
6515     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6516     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
6517     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6518     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
6519     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
6520     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
6521     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6522     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
6523     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6524     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
6525     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6526     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
6527     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
6528     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
6529     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6530     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
6531     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6532     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
6533     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6534     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
6535     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
6536     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
6537     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6538     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
6539     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6540     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
6541     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6542   %}
6543   ins_pipe( pipe_slow );
6544 %}
6545 
6546 instruct rsadd2D_reduction_reg(regD dst, vecX src2, regD tmp) %{
6547   predicate(UseSSE >= 1 && UseAVX == 0);
6548   match(Set dst (AddReductionVD dst src2));
6549   effect(TEMP tmp, TEMP dst);
6550   format %{ "addsd   $dst,$src2\n\t"
6551             "pshufd  $tmp,$src2,0xE\n\t"
6552             "addsd   $dst,$tmp\t! add reduction2D" %}
6553   ins_encode %{
6554     __ addsd($dst$$XMMRegister, $src2$$XMMRegister);
6555     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
6556     __ addsd($dst$$XMMRegister, $tmp$$XMMRegister);
6557   %}
6558   ins_pipe( pipe_slow );
6559 %}
6560 
6561 instruct rvadd2D_reduction_reg(regD dst, vecX src2, regD tmp) %{
6562   predicate(UseAVX > 0);
6563   match(Set dst (AddReductionVD dst src2));
6564   effect(TEMP tmp, TEMP dst);
6565   format %{ "vaddsd  $dst,$dst,$src2\n\t"
6566             "pshufd  $tmp,$src2,0xE\n\t"
6567             "vaddsd  $dst,$dst,$tmp\t! add reduction2D" %}
6568   ins_encode %{
6569     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
6570     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
6571     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6572   %}
6573   ins_pipe( pipe_slow );
6574 %}
6575 
6576 instruct rvadd4D_reduction_reg(regD dst, vecY src2, regD tmp, regD tmp2) %{
6577   predicate(UseAVX > 0);
6578   match(Set dst (AddReductionVD dst src2));
6579   effect(TEMP tmp, TEMP dst, TEMP tmp2);
6580   format %{ "vaddsd  $dst,$dst,$src2\n\t"
6581             "pshufd  $tmp,$src2,0xE\n\t"
6582             "vaddsd  $dst,$dst,$tmp\n\t"
            "vextractf128_high  $tmp2,$src2\n\t"
6584             "vaddsd  $dst,$dst,$tmp2\n\t"
6585             "pshufd  $tmp,$tmp2,0xE\n\t"
6586             "vaddsd  $dst,$dst,$tmp\t! add reduction4D" %}
6587   ins_encode %{
6588     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
6589     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
6590     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
6592     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
6593     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
6594     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6595   %}
6596   ins_pipe( pipe_slow );
6597 %}
6598 
6599 instruct rvadd8D_reduction_reg(regD dst, vecZ src2, regD tmp, regD tmp2) %{
6600   predicate(UseAVX > 2);
6601   match(Set dst (AddReductionVD dst src2));
6602   effect(TEMP tmp, TEMP dst, TEMP tmp2);
6603   format %{ "vaddsd  $dst,$dst,$src2\n\t"
6604             "pshufd  $tmp,$src2,0xE\n\t"
6605             "vaddsd  $dst,$dst,$tmp\n\t"
6606             "vextractf32x4  $tmp2,$src2,0x1\n\t"
6607             "vaddsd  $dst,$dst,$tmp2\n\t"
6608             "pshufd  $tmp,$tmp2,0xE\n\t"
6609             "vaddsd  $dst,$dst,$tmp\n\t"
6610             "vextractf32x4  $tmp2,$src2,0x2\n\t"
6611             "vaddsd  $dst,$dst,$tmp2\n\t"
6612             "pshufd  $tmp,$tmp2,0xE\n\t"
6613             "vaddsd  $dst,$dst,$tmp\n\t"
6614             "vextractf32x4  $tmp2,$src2,0x3\n\t"
6615             "vaddsd  $dst,$dst,$tmp2\n\t"
6616             "pshufd  $tmp,$tmp2,0xE\n\t"
6617             "vaddsd  $dst,$dst,$tmp\t! add reduction8D" %}
6618   ins_encode %{
6619     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
6620     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
6621     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6622     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
6623     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
6624     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
6625     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6626     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
6627     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
6628     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
6629     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6630     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
6631     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
6632     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
6633     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6634   %}
6635   ins_pipe( pipe_slow );
6636 %}
6637 
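// Floating-point subtract reductions (SubReductionVFP). Same shuffle
// pattern as the add reductions above, computing ((dst - e0) - e1) - ...
// in element order with scalar subss/vsubss or subsd/vsubsd. The
// element_basic_type() test in each predicate selects the float or
// double flavour of the rule.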
6638 instruct rssub2F_reduction_reg(regF dst, vecD src2, regF tmp) %{
6639   predicate(UseSSE >= 1 && UseAVX == 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
6640   match(Set dst (SubReductionVFP dst src2));
6641   effect(TEMP dst, TEMP tmp);
  format %{ "subss   $dst,$src2\n\t"
            "pshufd  $tmp,$src2,0x01\n\t"
            "subss   $dst,$tmp\t! sub reduction2F" %}
6645   ins_encode %{
6646     __ subss($dst$$XMMRegister, $src2$$XMMRegister);
6647     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
6648     __ subss($dst$$XMMRegister, $tmp$$XMMRegister);
6649   %}
6650   ins_pipe( pipe_slow );
6651 %}
6652 
6653 instruct rvsub2F_reduction_reg(regF dst, vecD src2, regF tmp) %{
6654   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
6655   match(Set dst (SubReductionVFP dst src2));
6656   effect(TEMP dst, TEMP tmp);
6657   format %{ "vsubss  $dst,$dst,$src2\n\t"
6658             "pshufd  $tmp,$src2,0x01\n\t"
6659             "vsubss  $dst,$dst,$tmp\t! sub reduction2F" %}
6660   ins_encode %{
6661     __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
6662     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
6663     __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6664   %}
6665   ins_pipe( pipe_slow );
6666 %}
6667 
6668 instruct rssub4F_reduction_reg(regF dst, vecX src2, regF tmp) %{
6669   predicate(UseSSE >= 1 && UseAVX == 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
6670   match(Set dst (SubReductionVFP dst src2));
6671   effect(TEMP dst, TEMP tmp);
6672   format %{ "subss   $dst,$src2\n\t"
6673             "pshufd  $tmp,$src2,0x01\n\t"
6674             "subss   $dst,$tmp\n\t"
6675             "pshufd  $tmp,$src2,0x02\n\t"
6676             "subss   $dst,$tmp\n\t"
6677             "pshufd  $tmp,$src2,0x03\n\t"
6678             "subss   $dst,$tmp\t! sub reduction4F" %}
6679   ins_encode %{
6680     __ subss($dst$$XMMRegister, $src2$$XMMRegister);
6681     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
6682     __ subss($dst$$XMMRegister, $tmp$$XMMRegister);
6683     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
6684     __ subss($dst$$XMMRegister, $tmp$$XMMRegister);
6685     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
6686     __ subss($dst$$XMMRegister, $tmp$$XMMRegister);
6687   %}
6688   ins_pipe( pipe_slow );
6689 %}
6690 
6691 instruct rvsub4F_reduction_reg(regF dst, vecX src2, regF tmp) %{
6692   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
6693   match(Set dst (SubReductionVFP dst src2));
6694   effect(TEMP tmp, TEMP dst);
  format %{ "vsubss  $dst,$dst,$src2\n\t"
6696             "pshufd  $tmp,$src2,0x01\n\t"
6697             "vsubss  $dst,$dst,$tmp\n\t"
6698             "pshufd  $tmp,$src2,0x02\n\t"
6699             "vsubss  $dst,$dst,$tmp\n\t"
6700             "pshufd  $tmp,$src2,0x03\n\t"
6701             "vsubss  $dst,$dst,$tmp\t! sub reduction4F" %}
6702   ins_encode %{
6703     __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
6704     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
6705     __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6706     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
6707     __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6708     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
6709     __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6710   %}
6711   ins_pipe( pipe_slow );
6712 %}
6713 
6714 instruct rsub8F_reduction_reg(regF dst, vecY src2, regF tmp, regF tmp2) %{
6715   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
6716   match(Set dst (SubReductionVFP dst src2));
6717   effect(TEMP tmp, TEMP dst, TEMP tmp2);
6718   format %{ "vsubss  $dst,$dst,$src2\n\t"
6719             "pshufd  $tmp,$src2,0x01\n\t"
6720             "vsubss  $dst,$dst,$tmp\n\t"
6721             "pshufd  $tmp,$src2,0x02\n\t"
6722             "vsubss  $dst,$dst,$tmp\n\t"
6723             "pshufd  $tmp,$src2,0x03\n\t"
6724             "vsubss  $dst,$dst,$tmp\n\t"
6725             "vextractf128_high  $tmp2,$src2\n\t"
6726             "vsubss  $dst,$dst,$tmp2\n\t"
6727             "pshufd  $tmp,$tmp2,0x01\n\t"
6728             "vsubss  $dst,$dst,$tmp\n\t"
6729             "pshufd  $tmp,$tmp2,0x02\n\t"
6730             "vsubss  $dst,$dst,$tmp\n\t"
6731             "pshufd  $tmp,$tmp2,0x03\n\t"
6732             "vsubss  $dst,$dst,$tmp\t! sub reduction8F" %}
6733   ins_encode %{
6734     __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
6735     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
6736     __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6737     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
6738     __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6739     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
6740     __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6741     __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
6742     __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
6743     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
6744     __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6745     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
6746     __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6747     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
6748     __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6749   %}
6750   ins_pipe( pipe_slow );
6751 %}
6752 
6753 instruct rsub16F_reduction_reg(regF dst, vecZ src2, regF tmp, regF tmp2) %{
6754   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
6755   match(Set dst (SubReductionVFP dst src2));
6756   effect(TEMP tmp, TEMP dst, TEMP tmp2);
6757   format %{ "vsubss  $dst,$dst,$src2\n\t"
6758             "pshufd  $tmp,$src2,0x01\n\t"
6759             "vsubss  $dst,$dst,$tmp\n\t"
6760             "pshufd  $tmp,$src2,0x02\n\t"
6761             "vsubss  $dst,$dst,$tmp\n\t"
6762             "pshufd  $tmp,$src2,0x03\n\t"
6763             "vsubss  $dst,$dst,$tmp\n\t"
6764             "vextractf32x4  $tmp2,$src2,0x1\n\t"
6765             "vsubss  $dst,$dst,$tmp2\n\t"
6766             "pshufd  $tmp,$tmp2,0x01\n\t"
6767             "vsubss  $dst,$dst,$tmp\n\t"
6768             "pshufd  $tmp,$tmp2,0x02\n\t"
6769             "vsubss  $dst,$dst,$tmp\n\t"
6770             "pshufd  $tmp,$tmp2,0x03\n\t"
6771             "vsubss  $dst,$dst,$tmp\n\t"
6772             "vextractf32x4  $tmp2,$src2,0x2\n\t"
6773             "vsubss  $dst,$dst,$tmp2\n\t"
6774             "pshufd  $tmp,$tmp2,0x01\n\t"
6775             "vsubss  $dst,$dst,$tmp\n\t"
6776             "pshufd  $tmp,$tmp2,0x02\n\t"
6777             "vsubss  $dst,$dst,$tmp\n\t"
6778             "pshufd  $tmp,$tmp2,0x03\n\t"
6779             "vsubss  $dst,$dst,$tmp\n\t"
6780             "vextractf32x4  $tmp2,$src2,0x3\n\t"
6781             "vsubss  $dst,$dst,$tmp2\n\t"
6782             "pshufd  $tmp,$tmp2,0x01\n\t"
6783             "vsubss  $dst,$dst,$tmp\n\t"
6784             "pshufd  $tmp,$tmp2,0x02\n\t"
6785             "vsubss  $dst,$dst,$tmp\n\t"
6786             "pshufd  $tmp,$tmp2,0x03\n\t"
6787             "vsubss  $dst,$dst,$tmp\t! sub reduction16F" %}
6788   ins_encode %{
6789     __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
6790     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
6791     __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6792     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
6793     __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6794     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
6795     __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6796     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
6797     __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
6798     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
6799     __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6800     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
6801     __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6802     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
6803     __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6804     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
6805     __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
6806     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
6807     __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6808     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
6809     __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6810     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
6811     __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6812     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
6813     __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
6814     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
6815     __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6816     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
6817     __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6818     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
6819     __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6820   %}
6821   ins_pipe( pipe_slow );
6822 %}
6823 
6824 instruct rssub2D_reduction_reg(regD dst, vecX src2, regD tmp) %{
6825   predicate(UseSSE >= 1 && UseAVX == 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
6826   match(Set dst (SubReductionVFP dst src2));
6827   effect(TEMP tmp, TEMP dst);
6828   format %{ "subsd   $dst,$src2\n\t"
6829             "pshufd  $tmp,$src2,0xE\n\t"
6830             "subsd   $dst,$tmp\t! sub reduction2D" %}
6831   ins_encode %{
6832     __ subsd($dst$$XMMRegister, $src2$$XMMRegister);
6833     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
6834     __ subsd($dst$$XMMRegister, $tmp$$XMMRegister);
6835   %}
6836   ins_pipe( pipe_slow );
6837 %}
6838 
6839 instruct rvsub2D_reduction_reg(regD dst, vecX src2, regD tmp) %{
6840   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
6841   match(Set dst (SubReductionVFP dst src2));
6842   effect(TEMP tmp, TEMP dst);
6843   format %{ "vsubsd  $dst,$dst,$src2\n\t"
6844             "pshufd  $tmp,$src2,0xE\n\t"
6845             "vsubsd  $dst,$dst,$tmp\t! sub reduction2D" %}
6846   ins_encode %{
6847     __ vsubsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
6848     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
6849     __ vsubsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6850   %}
6851   ins_pipe( pipe_slow );
6852 %}
6853 
6854 instruct rvsub4D_reduction_reg(regD dst, vecY src2, regD tmp, regD tmp2) %{
6855   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
6856   match(Set dst (SubReductionVFP dst src2));
6857   effect(TEMP tmp, TEMP dst, TEMP tmp2);
6858   format %{ "vsubsd  $dst,$dst,$src2\n\t"
6859             "pshufd  $tmp,$src2,0xE\n\t"
6860             "vsubsd  $dst,$dst,$tmp\n\t"
            "vextractf128_high  $tmp2,$src2\n\t"
6862             "vsubsd  $dst,$dst,$tmp2\n\t"
6863             "pshufd  $tmp,$tmp2,0xE\n\t"
6864             "vsubsd  $dst,$dst,$tmp\t! sub reduction4D" %}
6865   ins_encode %{
6866     __ vsubsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
6867     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
6868     __ vsubsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
6870     __ vsubsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
6871     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
6872     __ vsubsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6873   %}
6874   ins_pipe( pipe_slow );
6875 %}
6876 
6877 instruct rvsub8D_reduction_reg(regD dst, vecZ src2, regD tmp, regD tmp2) %{
6878   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
6879   match(Set dst (SubReductionVFP dst src2));
6880   effect(TEMP tmp, TEMP dst, TEMP tmp2);
6881   format %{ "vsubsd  $dst,$dst,$src2\n\t"
6882             "pshufd  $tmp,$src2,0xE\n\t"
6883             "vsubsd  $dst,$dst,$tmp\n\t"
6884             "vextractf32x4  $tmp2,$src2,0x1\n\t"
6885             "vsubsd  $dst,$dst,$tmp2\n\t"
6886             "pshufd  $tmp,$tmp2,0xE\n\t"
6887             "vsubsd  $dst,$dst,$tmp\n\t"
6888             "vextractf32x4  $tmp2,$src2,0x2\n\t"
6889             "vsubsd  $dst,$dst,$tmp2\n\t"
6890             "pshufd  $tmp,$tmp2,0xE\n\t"
6891             "vsubsd  $dst,$dst,$tmp\n\t"
6892             "vextractf32x4  $tmp2,$src2,0x3\n\t"
6893             "vsubsd  $dst,$dst,$tmp2\n\t"
6894             "pshufd  $tmp,$tmp2,0xE\n\t"
6895             "vsubsd  $dst,$dst,$tmp\t! sub reduction8D" %}
6896   ins_encode %{
6897     __ vsubsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
6898     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
6899     __ vsubsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6900     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
6901     __ vsubsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
6902     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
6903     __ vsubsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6904     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
6905     __ vsubsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
6906     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
6907     __ vsubsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6908     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
6909     __ vsubsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
6910     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
6911     __ vsubsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6912   %}
6913   ins_pipe( pipe_slow );
6914 %}
6915 
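// Byte multiply reductions. Bytes are sign-extended to 16-bit words
// (pmovsxbw/vpmovsxbw) so that pmullw can be used, and the vector is then
// folded in halves until two word lanes remain. Those are extracted with
// pextrw, multiplied in GPRs together with src1 via imull, and movsbl
// sign-extends the final product back to byte range.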
6916 instruct rsmul8B_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp1, rRegI tmp2, rRegI tmp3) %{
6917   predicate(UseSSE > 3 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
6918   match(Set dst (MulReductionVI src1 src2));
6919   effect(TEMP dst, TEMP tmp, TEMP tmp1, TEMP tmp2, TEMP tmp3);
6920   format %{ "pmovsxbw $tmp,$src2\n\t"
6921             "pshufd   $tmp1,$tmp,0xE\n\t"
6922             "pmullw   $tmp,$tmp1\n\t"
6923             "pshufd   $tmp1,$tmp,0x1\n\t"
6924             "pmullw   $tmp,$tmp1\n\t"
6925             "pextrw   $tmp2,$tmp, 0x1\n\t"
6926             "pextrw   $tmp3,$tmp, 0x0\n\t"
            "imull    $tmp2,$tmp3\n\t"
6928             "movsbl   $dst,$src1\n\t"
6929             "imull    $dst,$tmp2\n\t"
6930             "movsbl   $dst,$dst\t! mul reduction8B" %}
6931   ins_encode %{
6932     __ pmovsxbw($tmp$$XMMRegister, $src2$$XMMRegister);
6933     __ pshufd($tmp1$$XMMRegister, $tmp$$XMMRegister, 0xE);
6934     __ pmullw($tmp$$XMMRegister, $tmp1$$XMMRegister);
6935     __ pshufd($tmp1$$XMMRegister, $tmp$$XMMRegister, 0x1);
6936     __ pmullw($tmp$$XMMRegister, $tmp1$$XMMRegister);
6937     __ pextrw($tmp2$$Register, $tmp$$XMMRegister, 0x1);
6938     __ pextrw($tmp3$$Register, $tmp$$XMMRegister, 0x0);
6939     __ imull($tmp2$$Register, $tmp3$$Register);
6940     __ movsbl($dst$$Register, $src1$$Register);
6941     __ imull($dst$$Register, $tmp2$$Register);
6942     __ movsbl($dst$$Register, $dst$$Register);
6943   %}
6944   ins_pipe( pipe_slow );
6945 %}
6946 
6947 instruct rsmul16B_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp1, rRegI tmp2, rRegI tmp3) %{
6948   predicate(UseSSE > 3 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
6949   match(Set dst (MulReductionVI src1 src2));
6950   effect(TEMP dst, TEMP tmp, TEMP tmp1, TEMP tmp2, TEMP tmp3);
6951   format %{ "pmovsxbw $tmp,$src2\n\t"
6952             "pshufd   $tmp1,$src2,0xEE\n\t"
6953             "pmovsxbw $tmp1,$tmp1\n\t"
6954             "pmullw   $tmp,$tmp1\n\t"
6955             "pshufd   $tmp1,$tmp,0xE\n\t"
6956             "pmullw   $tmp,$tmp1\n\t"
6957             "pshufd   $tmp1,$tmp,0x1\n\t"
6958             "pmullw   $tmp,$tmp1\n\t"
6959             "pextrw   $tmp2,$tmp, 0x1\n\t"
6960             "pextrw   $tmp3,$tmp, 0x0\n\t"
6961             "imull    $tmp2,$tmp3 \n\t"
6962             "movsbl   $dst,$src1\n\t"
6963             "imull    $dst,$tmp2\n\t"
6964             "movsbl   $dst,$dst\t! mul reduction16B" %}
6965   ins_encode %{
6967     __ pmovsxbw($tmp$$XMMRegister, $src2$$XMMRegister);
6968     __ pshufd($tmp1$$XMMRegister, $src2$$XMMRegister, 0xEE);
6969     __ pmovsxbw($tmp1$$XMMRegister, $tmp1$$XMMRegister);
6970     __ pmullw($tmp$$XMMRegister, $tmp1$$XMMRegister);
6971     __ pshufd($tmp1$$XMMRegister, $tmp$$XMMRegister, 0xE);
6972     __ pmullw($tmp$$XMMRegister, $tmp1$$XMMRegister);
6973     __ pshufd($tmp1$$XMMRegister, $tmp$$XMMRegister, 0x1);
6974     __ pmullw($tmp$$XMMRegister, $tmp1$$XMMRegister);
6975     __ pextrw($tmp2$$Register, $tmp$$XMMRegister, 0x1);
6976     __ pextrw($tmp3$$Register, $tmp$$XMMRegister, 0x0);
6977     __ imull($tmp2$$Register, $tmp3$$Register);
6978     __ movsbl($dst$$Register, $src1$$Register);
6979     __ imull($dst$$Register, $tmp2$$Register);
6980     __ movsbl($dst$$Register, $dst$$Register);
6981   %}
6982   ins_pipe( pipe_slow );
6983 %}
6984 
6985 instruct rvmul32B_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp1, rRegI tmp2, rRegI tmp3) %{
6986   predicate(UseAVX > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
6987   match(Set dst (MulReductionVI src1 src2));
6988   effect(TEMP dst, TEMP tmp, TEMP tmp1, TEMP tmp2, TEMP tmp3);
6989   format %{ "vextracti128_high  $tmp,$src2\n\t"
            "vpmovsxbw $tmp,$tmp\n\t"
            "vpmovsxbw $tmp1,$src2\n\t"
6992             "vpmullw  $tmp,$tmp,$tmp1\n\t"
6993             "vextracti128_high  $tmp1,$tmp\n\t"
6994             "vpmullw  $tmp,$tmp,$tmp1\n\t"
6995             "pshufd   $tmp1,$tmp,0xE\n\t"
6996             "vpmullw  $tmp,$tmp,$tmp1\n\t"
6997             "pshufd   $tmp1,$tmp,0x1\n\t"
6998             "vpmullw  $tmp,$tmp,$tmp1\n\t"
6999             "pextrw   $tmp2,$tmp, 0x1\n\t"
7000             "pextrw   $tmp3,$tmp, 0x0\n\t"
7001             "imull    $tmp2,$tmp3 \n\t"
7002             "movsbl   $dst,$src1\n\t"
7003             "imull    $dst,$tmp2\n\t"
7004             "movsbl   $dst,$dst\t! mul reduction32B" %}
7005   ins_encode %{
7006     int vector_len = 1;
7007     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
7008     __ vpmovsxbw($tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
7009     __ vpmovsxbw($tmp1$$XMMRegister, $src2$$XMMRegister, vector_len);
7010     __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, vector_len);
7011     __ vextracti128_high($tmp1$$XMMRegister, $tmp$$XMMRegister);
7012     __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, 0);
7013     __ pshufd($tmp1$$XMMRegister, $tmp$$XMMRegister, 0xE);
7014     __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, 0);
7015     __ pshufd($tmp1$$XMMRegister, $tmp$$XMMRegister, 0x1);
7016     __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, 0);
7017     __ pextrw($tmp2$$Register, $tmp$$XMMRegister, 0x1);
7018     __ pextrw($tmp3$$Register, $tmp$$XMMRegister, 0x0);
7019     __ imull($tmp2$$Register, $tmp3$$Register);
7020     __ movsbl($dst$$Register, $src1$$Register);
7021     __ imull($dst$$Register, $tmp2$$Register);
7022     __ movsbl($dst$$Register, $dst$$Register);
7023   %}
7024   ins_pipe( pipe_slow );
7025 %}
7026 
7027 instruct rvmul64B_reduction_reg(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp1, rRegI tmp2, rRegI tmp3) %{
7028   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
7029   match(Set dst (MulReductionVI src1 src2));
7030   effect(TEMP dst, TEMP tmp, TEMP tmp1, TEMP tmp2, TEMP tmp3);
7031   format %{ "vextracti64x4_high  $tmp,$src2\n\t"
7032             "vpmovsxbw $tmp,$tmp\n\t"
7033             "vpmovsxbw $tmp1,$src2\n\t"
7034             "vpmullw  $tmp,$tmp,$tmp1\n\t"
7035             "vextracti64x4_high  $tmp1,$tmp\n\t"
7036             "vpmullw  $tmp,$tmp,$tmp1\n\t"
7037             "vextracti128_high  $tmp1,$tmp\n\t"
7038             "vpmullw  $tmp,$tmp,$tmp1\n\t"
7039             "pshufd   $tmp1,$tmp,0xE\n\t"
7040             "vpmullw  $tmp,$tmp,$tmp1\n\t"
7041             "pshufd   $tmp1,$tmp,0x1\n\t"
7042             "vpmullw  $tmp,$tmp,$tmp1\n\t"
7043             "pextrw   $tmp2,$tmp, 0x1\n\t"
7044             "pextrw   $tmp3,$tmp, 0x0\n\t"
7045             "imull    $tmp2,$tmp3 \n\t"
7046             "movsbl   $dst,$src1\n\t"
7047             "imull    $dst,$tmp2\n\t"
7048             "movsbl   $dst,$dst\t! mul reduction64B" %}
7049   ins_encode %{
7050     int vector_len = 2;
7051     __ vextracti64x4_high($tmp$$XMMRegister, $src2$$XMMRegister);
7052     __ vpmovsxbw($tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
7053     __ vpmovsxbw($tmp1$$XMMRegister, $src2$$XMMRegister, vector_len);
7054     __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, vector_len);
7055     __ vextracti64x4_high($tmp1$$XMMRegister, $tmp$$XMMRegister);
7056     __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, 1);
7057     __ vextracti128_high($tmp1$$XMMRegister, $tmp$$XMMRegister);
7058     __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, 0);
7059     __ pshufd($tmp1$$XMMRegister, $tmp$$XMMRegister, 0xE);
7060     __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, 0);
7061     __ pshufd($tmp1$$XMMRegister, $tmp$$XMMRegister, 0x1);
7062     __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, 0);
7063     __ pextrw($tmp2$$Register, $tmp$$XMMRegister, 0x1);
7064     __ pextrw($tmp3$$Register, $tmp$$XMMRegister, 0x0);
7065     __ imull($tmp2$$Register, $tmp3$$Register);
7066     __ movsbl($dst$$Register, $src1$$Register);
7067     __ imull($dst$$Register, $tmp2$$Register);
7068     __ movsbl($dst$$Register, $dst$$Register);
7069   %}
7070   ins_pipe( pipe_slow );
7071 %}
7072 
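// Short multiply reductions. Same halving scheme on 16-bit lanes with
// pmullw/vpmullw; the last two word lanes are combined through
// pextrw/imull, src1 is mixed in via a GPR, and movswl sign-extends the
// result back to short range.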
7073 instruct rsmul4S_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, rRegI tmp2, rRegI tmp3) %{
7074   predicate(UseSSE > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
7075   match(Set dst (MulReductionVI src1 src2));
7076   effect(TEMP dst, TEMP tmp, TEMP tmp2, TEMP tmp3);
7077   format %{ "pshufd  $tmp,$src2,0x1\n\t"
7078             "pmullw  $tmp,$src2\n\t"
7079             "pextrw  $tmp2,$tmp, 0x1\n\t"
7080             "pextrw  $tmp3,$tmp, 0x0\n\t"
7081             "imull    $tmp2,$tmp3 \n\t"
7082             "movswl   $dst,$src1\n\t"
7083             "imull    $dst,$tmp2\n\t"
7084             "movswl   $dst,$dst\t! mul reduction4S" %}
7085   ins_encode %{
7086     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x1);
7087     __ pmullw($tmp$$XMMRegister, $src2$$XMMRegister);
7088     __ pextrw($tmp2$$Register, $tmp$$XMMRegister, 0x1);
7089     __ pextrw($tmp3$$Register, $tmp$$XMMRegister, 0x0);
7090     __ imull($tmp2$$Register, $tmp3$$Register);
7091     __ movswl($dst$$Register, $src1$$Register);
7092     __ imull($dst$$Register, $tmp2$$Register);
7093     __ movswl($dst$$Register, $dst$$Register);
7094   %}
7095   ins_pipe( pipe_slow );
7096 %}
7097 
7098 instruct rsmul8S_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp1, rRegI tmp2, rRegI tmp3) %{
7099   predicate(UseSSE > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
7100   match(Set dst (MulReductionVI src1 src2));
7101   effect(TEMP dst, TEMP tmp, TEMP tmp1, TEMP tmp2, TEMP tmp3);
7102   format %{ "pshufd  $tmp,$src2,0xE\n\t"
7103             "pmullw  $tmp,$src2\n\t"
7104             "pshufd  $tmp1,$tmp,0x1\n\t"
7105             "pmullw  $tmp,$tmp1\n\t"
7106             "pextrw  $tmp2,$tmp, 0x1\n\t"
7107             "pextrw  $tmp3,$tmp, 0x0\n\t"
            "imull   $tmp2,$tmp3\n\t"
7109             "movswl   $dst,$src1\n\t"
7110             "imull    $dst,$tmp2\n\t"
7111             "movswl   $dst,$dst\t! mul reduction8S" %}
7112   ins_encode %{
7113     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
7114     __ pmullw($tmp$$XMMRegister, $src2$$XMMRegister);
7115     __ pshufd($tmp1$$XMMRegister, $tmp$$XMMRegister, 0x1);
7116     __ pmullw($tmp$$XMMRegister, $tmp1$$XMMRegister);
7117     __ pextrw($tmp2$$Register, $tmp$$XMMRegister, 0x1);
7118     __ pextrw($tmp3$$Register, $tmp$$XMMRegister, 0x0);
7119     __ imull($tmp2$$Register, $tmp3$$Register);
7120     __ movswl($dst$$Register, $src1$$Register);
7121     __ imull($dst$$Register, $tmp2$$Register);
7122     __ movswl($dst$$Register, $dst$$Register);
7123   %}
7124   ins_pipe( pipe_slow );
7125 %}
7126 
7127 instruct rvmul16S_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp1, rRegI tmp2, rRegI tmp3) %{
7128   predicate(UseAVX > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
7129   match(Set dst (MulReductionVI src1 src2));
7130   effect(TEMP dst, TEMP tmp, TEMP tmp1, TEMP tmp2, TEMP tmp3);
7131   format %{ "vextracti128_high  $tmp,$src2\n\t"
7132             "vpmullw  $tmp,$tmp,$src2\n\t"
            "pshufd  $tmp1,$tmp,0xE\n\t"
            "vpmullw $tmp,$tmp,$tmp1\n\t"
            "pshufd  $tmp1,$tmp,0x1\n\t"
            "vpmullw $tmp,$tmp,$tmp1\n\t"
            "pextrw  $tmp2,$tmp,0x1\n\t"
            "pextrw  $tmp3,$tmp,0x0\n\t"
            "imull   $tmp2,$tmp3\n\t"
7140             "movswl   $dst,$src1\n\t"
7141             "imull    $dst,$tmp2\n\t"
7142             "movswl   $dst,$dst\t! mul reduction16S" %}
7143   ins_encode %{
7144     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
7145     __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 1);
7146     __ pshufd($tmp1$$XMMRegister, $tmp$$XMMRegister, 0xE);
7147     __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, 0);
7148     __ pshufd($tmp1$$XMMRegister, $tmp$$XMMRegister, 0x1);
7149     __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, 0);
7150     __ pextrw($tmp2$$Register, $tmp$$XMMRegister, 0x1);
7151     __ pextrw($tmp3$$Register, $tmp$$XMMRegister, 0x0);
7152     __ imull($tmp2$$Register, $tmp3$$Register);
7153     __ movswl($dst$$Register, $src1$$Register);
7154     __ imull($dst$$Register, $tmp2$$Register);
7155     __ movswl($dst$$Register, $dst$$Register);
7156   %}
7157   ins_pipe( pipe_slow );
7158 %}
7159 
7160 instruct rvmul32S_reduction_reg(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp1, rRegI tmp2, rRegI tmp3) %{
7161   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
7162   match(Set dst (MulReductionVI src1 src2));
7163   effect(TEMP dst, TEMP tmp, TEMP tmp1, TEMP tmp2, TEMP tmp3);
7164   format %{ "vextracti64x4_high  $tmp1,$src2\n\t"
7165             "vpmullw  $tmp1,$tmp1,$src2\n\t"
7166             "vextracti128_high  $tmp,$tmp1\n\t"
7167             "vpmullw  $tmp,$tmp,$tmp1\n\t"
            "pshufd  $tmp1,$tmp,0xE\n\t"
            "vpmullw $tmp,$tmp,$tmp1\n\t"
            "pshufd  $tmp1,$tmp,0x1\n\t"
            "vpmullw $tmp,$tmp,$tmp1\n\t"
            "pextrw  $tmp2,$tmp,0x1\n\t"
            "pextrw  $tmp3,$tmp,0x0\n\t"
            "imull   $tmp2,$tmp3\n\t"
7175             "movswl   $dst,$src1\n\t"
7176             "imull    $dst,$tmp2\n\t"
7177             "movswl   $dst,$dst\t! mul reduction32S" %}
7178   ins_encode %{
7179     int vector_len = 0;
7180     __ vextracti64x4_high($tmp1$$XMMRegister, $src2$$XMMRegister);
7181     __ vpmullw($tmp1$$XMMRegister, $tmp1$$XMMRegister, $src2$$XMMRegister, 1);
7182     __ vextracti128_high($tmp$$XMMRegister, $tmp1$$XMMRegister);
7183     __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, vector_len);
7184     __ pshufd($tmp1$$XMMRegister, $tmp$$XMMRegister, 0xE);
7185     __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, vector_len);
7186     __ pshufd($tmp1$$XMMRegister, $tmp$$XMMRegister, 0x1);
7187     __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, vector_len);
7188     __ pextrw($tmp2$$Register, $tmp$$XMMRegister, 0x1);
7189     __ pextrw($tmp3$$Register, $tmp$$XMMRegister, 0x0);
7190     __ imull($tmp2$$Register, $tmp3$$Register);
7191     __ movswl($dst$$Register, $src1$$Register);
7192     __ imull($dst$$Register, $tmp2$$Register);
7193     __ movswl($dst$$Register, $dst$$Register);
7194   %}
7195   ins_pipe( pipe_slow );
7196 %}
7197 
7198 
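// Int multiply reductions. pmulld/vpmulld folds the vector in halves
// (the 256- and 512-bit forms extract the high lane first); the scalar
// src1 enters via movdl and the final lane-0 product is moved back to a
// GPR.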
7199 instruct rsmul2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
7200   predicate(UseSSE > 3 && UseAVX == 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
7201   match(Set dst (MulReductionVI src1 src2));
7202   effect(TEMP tmp, TEMP tmp2);
7203   format %{ "pshufd  $tmp2,$src2,0x1\n\t"
7204             "pmulld  $tmp2,$src2\n\t"
7205             "movd    $tmp,$src1\n\t"
7206             "pmulld  $tmp2,$tmp\n\t"
7207             "movd    $dst,$tmp2\t! mul reduction2I" %}
7208   ins_encode %{
7209     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
7210     __ pmulld($tmp2$$XMMRegister, $src2$$XMMRegister);
7211     __ movdl($tmp$$XMMRegister, $src1$$Register);
7212     __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
7213     __ movdl($dst$$Register, $tmp2$$XMMRegister);
7214   %}
7215   ins_pipe( pipe_slow );
7216 %}
7217 
7218 instruct rvmul2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
7219   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
7220   match(Set dst (MulReductionVI src1 src2));
7221   effect(TEMP tmp, TEMP tmp2);
7222   format %{ "pshufd   $tmp2,$src2,0x1\n\t"
7223             "vpmulld  $tmp,$src2,$tmp2\n\t"
7224             "movd     $tmp2,$src1\n\t"
7225             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
7226             "movd     $dst,$tmp2\t! mul reduction2I" %}
7227   ins_encode %{
7228     int vector_len = 0;
7229     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
7230     __ vpmulld($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
7231     __ movdl($tmp2$$XMMRegister, $src1$$Register);
7232     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
7233     __ movdl($dst$$Register, $tmp2$$XMMRegister);
7234   %}
7235   ins_pipe( pipe_slow );
7236 %}
7237 
7238 instruct rsmul4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
  predicate(UseSSE > 3 && UseAVX == 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
7240   match(Set dst (MulReductionVI src1 src2));
7241   effect(TEMP tmp, TEMP tmp2);
7242   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
7243             "pmulld  $tmp2,$src2\n\t"
7244             "pshufd  $tmp,$tmp2,0x1\n\t"
7245             "pmulld  $tmp2,$tmp\n\t"
7246             "movd    $tmp,$src1\n\t"
7247             "pmulld  $tmp2,$tmp\n\t"
7248             "movd    $dst,$tmp2\t! mul reduction4I" %}
7249   ins_encode %{
7250     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
7251     __ pmulld($tmp2$$XMMRegister, $src2$$XMMRegister);
7252     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x1);
7253     __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
7254     __ movdl($tmp$$XMMRegister, $src1$$Register);
7255     __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
7256     __ movdl($dst$$Register, $tmp2$$XMMRegister);
7257   %}
7258   ins_pipe( pipe_slow );
7259 %}
7260 
7261 instruct rvmul4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
7262   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
7263   match(Set dst (MulReductionVI src1 src2));
7264   effect(TEMP tmp, TEMP tmp2);
7265   format %{ "pshufd   $tmp2,$src2,0xE\n\t"
7266             "vpmulld  $tmp,$src2,$tmp2\n\t"
7267             "pshufd   $tmp2,$tmp,0x1\n\t"
7268             "vpmulld  $tmp,$tmp,$tmp2\n\t"
7269             "movd     $tmp2,$src1\n\t"
7270             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
7271             "movd     $dst,$tmp2\t! mul reduction4I" %}
7272   ins_encode %{
7273     int vector_len = 0;
7274     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
7275     __ vpmulld($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
7276     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
7277     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
7278     __ movdl($tmp2$$XMMRegister, $src1$$Register);
7279     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
7280     __ movdl($dst$$Register, $tmp2$$XMMRegister);
7281   %}
7282   ins_pipe( pipe_slow );
7283 %}
7284 
7285 instruct rvmul8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
7286   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
7287   match(Set dst (MulReductionVI src1 src2));
7288   effect(TEMP tmp, TEMP tmp2);
7289   format %{ "vextracti128_high  $tmp,$src2\n\t"
7290             "vpmulld  $tmp,$tmp,$src2\n\t"
7291             "pshufd   $tmp2,$tmp,0xE\n\t"
7292             "vpmulld  $tmp,$tmp,$tmp2\n\t"
7293             "pshufd   $tmp2,$tmp,0x1\n\t"
7294             "vpmulld  $tmp,$tmp,$tmp2\n\t"
7295             "movd     $tmp2,$src1\n\t"
7296             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
7297             "movd     $dst,$tmp2\t! mul reduction8I" %}
7298   ins_encode %{
7299     int vector_len = 0;
7300     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
7301     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
7302     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
7303     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
7304     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
7305     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
7306     __ movdl($tmp2$$XMMRegister, $src1$$Register);
7307     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
7308     __ movdl($dst$$Register, $tmp2$$XMMRegister);
7309   %}
7310   ins_pipe( pipe_slow );
7311 %}
7312 
7313 instruct rvmul16I_reduction_reg(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, regF tmp3) %{
7314   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
7315   match(Set dst (MulReductionVI src1 src2));
7316   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
7317   format %{ "vextracti64x4_high  $tmp3,$src2\n\t"
7318             "vpmulld  $tmp3,$tmp3,$src2\n\t"
7319             "vextracti128_high  $tmp,$tmp3\n\t"
            "vpmulld  $tmp,$tmp,$tmp3\n\t"
7321             "pshufd   $tmp2,$tmp,0xE\n\t"
7322             "vpmulld  $tmp,$tmp,$tmp2\n\t"
7323             "pshufd   $tmp2,$tmp,0x1\n\t"
7324             "vpmulld  $tmp,$tmp,$tmp2\n\t"
7325             "movd     $tmp2,$src1\n\t"
7326             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
7327             "movd     $dst,$tmp2\t! mul reduction16I" %}
7328   ins_encode %{
7329     __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
7330     __ vpmulld($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
7331     __ vextracti128_high($tmp$$XMMRegister, $tmp3$$XMMRegister);
7332     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
7333     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
7334     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
7335     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
7336     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
7337     __ movdl($tmp2$$XMMRegister, $src1$$Register);
7338     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
7339     __ movdl($dst$$Register, $tmp2$$XMMRegister);
7340   %}
7341   ins_pipe( pipe_slow );
7342 %}
7343 
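// Long multiply reductions. These rely on vpmullq, which is only
// available with AVX-512DQ, hence the supports_avx512dq() checks in the
// predicates below.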
7344 #ifdef _LP64
7345 instruct rvmul2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, regF tmp, regF tmp2) %{
7346   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
7347   match(Set dst (MulReductionVL src1 src2));
7348   effect(TEMP tmp, TEMP tmp2);
7349   format %{ "pshufd   $tmp2,$src2,0xE\n\t"
7350             "vpmullq  $tmp,$src2,$tmp2\n\t"
7351             "movdq    $tmp2,$src1\n\t"
7352             "vpmullq  $tmp2,$tmp,$tmp2\n\t"
7353             "movdq    $dst,$tmp2\t! mul reduction2L" %}
7354   ins_encode %{
7355     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
7356     __ vpmullq($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, 0);
7357     __ movdq($tmp2$$XMMRegister, $src1$$Register);
7358     __ vpmullq($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
7359     __ movdq($dst$$Register, $tmp2$$XMMRegister);
7360   %}
7361   ins_pipe( pipe_slow );
7362 %}
7363 
7364 instruct rvmul4L_reduction_reg(rRegL dst, rRegL src1, vecY src2, regF tmp, regF tmp2) %{
7365   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
7366   match(Set dst (MulReductionVL src1 src2));
7367   effect(TEMP tmp, TEMP tmp2);
7368   format %{ "vextracti128_high  $tmp,$src2\n\t"
7369             "vpmullq  $tmp2,$tmp,$src2\n\t"
7370             "pshufd   $tmp,$tmp2,0xE\n\t"
7371             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
7372             "movdq    $tmp,$src1\n\t"
7373             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
7374             "movdq    $dst,$tmp2\t! mul reduction4L" %}
7375   ins_encode %{
7376     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
7377     __ vpmullq($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
7378     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
7379     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
7380     __ movdq($tmp$$XMMRegister, $src1$$Register);
7381     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
7382     __ movdq($dst$$Register, $tmp2$$XMMRegister);
7383   %}
7384   ins_pipe( pipe_slow );
7385 %}
7386 
7387 instruct rvmul8L_reduction_reg(rRegL dst, rRegL src1, vecZ src2, regF tmp, regF tmp2) %{
7388   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
7389   match(Set dst (MulReductionVL src1 src2));
7390   effect(TEMP tmp, TEMP tmp2);
7391   format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
7392             "vpmullq  $tmp2,$tmp2,$src2\n\t"
7393             "vextracti128_high  $tmp,$tmp2\n\t"
7394             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
7395             "pshufd   $tmp,$tmp2,0xE\n\t"
7396             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
7397             "movdq    $tmp,$src1\n\t"
7398             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
7399             "movdq    $dst,$tmp2\t! mul reduction8L" %}
7400   ins_encode %{
7401     __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
7402     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
7403     __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
7404     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
7405     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
7406     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
7407     __ movdq($tmp$$XMMRegister, $src1$$Register);
7408     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
7409     __ movdq($dst$$Register, $tmp2$$XMMRegister);
7410   %}
7411   ins_pipe( pipe_slow );
7412 %}
7413 #endif
7414 
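// Floating-point multiply reductions. As with the FP add reductions,
// strict element order is preserved: mulss/vmulss accumulates into the
// scalar dst while pshufd rotates each element into lane 0.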
7415 instruct rsmul2F_reduction(regF dst, vecD src2, regF tmp) %{
7416   predicate(UseSSE >= 1 && UseAVX == 0);
7417   match(Set dst (MulReductionVF dst src2));
7418   effect(TEMP dst, TEMP tmp);
7419   format %{ "mulss   $dst,$src2\n\t"
7420             "pshufd  $tmp,$src2,0x01\n\t"
7421             "mulss   $dst,$tmp\t! mul reduction2F" %}
7422   ins_encode %{
7423     __ mulss($dst$$XMMRegister, $src2$$XMMRegister);
7424     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
7425     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
7426   %}
7427   ins_pipe( pipe_slow );
7428 %}
7429 
7430 instruct rvmul2F_reduction_reg(regF dst, vecD src2, regF tmp) %{
7431   predicate(UseAVX > 0);
7432   match(Set dst (MulReductionVF dst src2));
7433   effect(TEMP tmp, TEMP dst);
7434   format %{ "vmulss  $dst,$dst,$src2\n\t"
7435             "pshufd  $tmp,$src2,0x01\n\t"
7436             "vmulss  $dst,$dst,$tmp\t! mul reduction2F" %}
7437   ins_encode %{
7438     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
7439     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
7440     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7441   %}
7442   ins_pipe( pipe_slow );
7443 %}
7444 
7445 instruct rsmul4F_reduction_reg(regF dst, vecX src2, regF tmp) %{
7446   predicate(UseSSE >= 1 && UseAVX == 0);
7447   match(Set dst (MulReductionVF dst src2));
7448   effect(TEMP dst, TEMP tmp);
7449   format %{ "mulss   $dst,$src2\n\t"
7450             "pshufd  $tmp,$src2,0x01\n\t"
7451             "mulss   $dst,$tmp\n\t"
7452             "pshufd  $tmp,$src2,0x02\n\t"
7453             "mulss   $dst,$tmp\n\t"
7454             "pshufd  $tmp,$src2,0x03\n\t"
7455             "mulss   $dst,$tmp\t! mul reduction4F" %}
7456   ins_encode %{
7457     __ mulss($dst$$XMMRegister, $src2$$XMMRegister);
7458     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
7459     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
7460     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
7461     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
7462     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
7463     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
7464   %}
7465   ins_pipe( pipe_slow );
7466 %}
7467 
7468 instruct rvmul4F_reduction_reg(regF dst, vecX src2, regF tmp) %{
7469   predicate(UseAVX > 0);
7470   match(Set dst (MulReductionVF dst src2));
7471   effect(TEMP tmp, TEMP dst);
7472   format %{ "vmulss  $dst,$dst,$src2\n\t"
7473             "pshufd  $tmp,$src2,0x01\n\t"
7474             "vmulss  $dst,$dst,$tmp\n\t"
7475             "pshufd  $tmp,$src2,0x02\n\t"
7476             "vmulss  $dst,$dst,$tmp\n\t"
7477             "pshufd  $tmp,$src2,0x03\n\t"
7478             "vmulss  $dst,$dst,$tmp\t! mul reduction4F" %}
7479   ins_encode %{
7480     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
7481     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
7482     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7483     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
7484     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7485     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
7486     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7487   %}
7488   ins_pipe( pipe_slow );
7489 %}
7490 
7491 instruct rvmul8F_reduction_reg(regF dst, vecY src2, regF tmp, regF tmp2) %{
7492   predicate(UseAVX > 0);
7493   match(Set dst (MulReductionVF dst src2));
7494   effect(TEMP tmp, TEMP dst, TEMP tmp2);
7495   format %{ "vmulss  $dst,$dst,$src2\n\t"
7496             "pshufd  $tmp,$src2,0x01\n\t"
7497             "vmulss  $dst,$dst,$tmp\n\t"
7498             "pshufd  $tmp,$src2,0x02\n\t"
7499             "vmulss  $dst,$dst,$tmp\n\t"
7500             "pshufd  $tmp,$src2,0x03\n\t"
7501             "vmulss  $dst,$dst,$tmp\n\t"
7502             "vextractf128_high  $tmp2,$src2\n\t"
7503             "vmulss  $dst,$dst,$tmp2\n\t"
7504             "pshufd  $tmp,$tmp2,0x01\n\t"
7505             "vmulss  $dst,$dst,$tmp\n\t"
7506             "pshufd  $tmp,$tmp2,0x02\n\t"
7507             "vmulss  $dst,$dst,$tmp\n\t"
7508             "pshufd  $tmp,$tmp2,0x03\n\t"
7509             "vmulss  $dst,$dst,$tmp\t! mul reduction8F" %}
7510   ins_encode %{
7511     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
7512     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
7513     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7514     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
7515     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7516     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
7517     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7518     __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
7519     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
7520     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
7521     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7522     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
7523     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7524     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
7525     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7526   %}
7527   ins_pipe( pipe_slow );
7528 %}
7529 
7530 instruct rvmul16F_reduction_reg(regF dst, vecZ src2, regF tmp, regF tmp2) %{
7531   predicate(UseAVX > 2);
7532   match(Set dst (MulReductionVF dst src2));
7533   effect(TEMP tmp, TEMP dst, TEMP tmp2);
7534   format %{ "vmulss  $dst,$dst,$src2\n\t"
7535             "pshufd  $tmp,$src2,0x01\n\t"
7536             "vmulss  $dst,$dst,$tmp\n\t"
7537             "pshufd  $tmp,$src2,0x02\n\t"
7538             "vmulss  $dst,$dst,$tmp\n\t"
7539             "pshufd  $tmp,$src2,0x03\n\t"
7540             "vmulss  $dst,$dst,$tmp\n\t"
7541             "vextractf32x4  $tmp2,$src2,0x1\n\t"
7542             "vmulss  $dst,$dst,$tmp2\n\t"
7543             "pshufd  $tmp,$tmp2,0x01\n\t"
7544             "vmulss  $dst,$dst,$tmp\n\t"
7545             "pshufd  $tmp,$tmp2,0x02\n\t"
7546             "vmulss  $dst,$dst,$tmp\n\t"
7547             "pshufd  $tmp,$tmp2,0x03\n\t"
7548             "vmulss  $dst,$dst,$tmp\n\t"
7549             "vextractf32x4  $tmp2,$src2,0x2\n\t"
7550             "vmulss  $dst,$dst,$tmp2\n\t"
7551             "pshufd  $tmp,$tmp2,0x01\n\t"
7552             "vmulss  $dst,$dst,$tmp\n\t"
7553             "pshufd  $tmp,$tmp2,0x02\n\t"
7554             "vmulss  $dst,$dst,$tmp\n\t"
7555             "pshufd  $tmp,$tmp2,0x03\n\t"
7556             "vmulss  $dst,$dst,$tmp\n\t"
7557             "vextractf32x4  $tmp2,$src2,0x3\n\t"
7558             "vmulss  $dst,$dst,$tmp2\n\t"
7559             "pshufd  $tmp,$tmp2,0x01\n\t"
7560             "vmulss  $dst,$dst,$tmp\n\t"
7561             "pshufd  $tmp,$tmp2,0x02\n\t"
7562             "vmulss  $dst,$dst,$tmp\n\t"
7563             "pshufd  $tmp,$tmp2,0x03\n\t"
7564             "vmulss  $dst,$dst,$tmp\t! mul reduction16F" %}
7565   ins_encode %{
7566     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
7567     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
7568     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7569     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
7570     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7571     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
7572     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7573     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
7574     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
7575     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
7576     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7577     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
7578     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7579     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
7580     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7581     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
7582     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
7583     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
7584     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7585     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
7586     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7587     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
7588     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7589     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
7590     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
7591     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
7592     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7593     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
7594     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7595     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
7596     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7597   %}
7598   ins_pipe( pipe_slow );
7599 %}
7600 
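// Double Mul Reduction
// Same shape as the float reductions above: pshufd 0xE brings the upper
// double of a 128-bit lane down to element 0, and mulsd/vmulsd folds it
// into $dst.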
7601 instruct rsmul2D_reduction_reg(regD dst, vecX src2, regD tmp) %{
7602   predicate(UseSSE >= 1 && UseAVX == 0);
7603   match(Set dst (MulReductionVD dst src2));
7604   effect(TEMP dst, TEMP tmp);
7605   format %{ "mulsd   $dst,$src2\n\t"
7606             "pshufd  $tmp,$src2,0xE\n\t"
7607             "mulsd   $dst,$tmp\t! mul reduction2D" %}
7608   ins_encode %{
7609     __ mulsd($dst$$XMMRegister, $src2$$XMMRegister);
7610     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
7611     __ mulsd($dst$$XMMRegister, $tmp$$XMMRegister);
7612   %}
7613   ins_pipe( pipe_slow );
7614 %}
7615 
7616 instruct rvmul2D_reduction_reg(regD dst, vecX src2, regD tmp) %{
7617   predicate(UseAVX > 0);
7618   match(Set dst (MulReductionVD dst src2));
7619   effect(TEMP tmp, TEMP dst);
7620   format %{ "vmulsd  $dst,$dst,$src2\n\t"
7621             "pshufd  $tmp,$src2,0xE\n\t"
7622             "vmulsd  $dst,$dst,$tmp\t! mul reduction2D" %}
7623   ins_encode %{
7624     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
7625     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
7626     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7627   %}
7628   ins_pipe( pipe_slow );
7629 %}
7630 
7631 instruct rvmul4D_reduction_reg(regD dst, vecY src2, regD tmp, regD tmp2) %{
7632   predicate(UseAVX > 0);
7633   match(Set dst (MulReductionVD dst src2));
7634   effect(TEMP tmp, TEMP dst, TEMP tmp2);
7635   format %{ "vmulsd  $dst,$dst,$src2\n\t"
7636             "pshufd  $tmp,$src2,0xE\n\t"
7637             "vmulsd  $dst,$dst,$tmp\n\t"
7638             "vextractf128_high  $tmp2,$src2\n\t"
7639             "vmulsd  $dst,$dst,$tmp2\n\t"
7640             "pshufd  $tmp,$tmp2,0xE\n\t"
7641             "vmulsd  $dst,$dst,$tmp\t! mul reduction4D" %}
7642   ins_encode %{
7643     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
7644     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
7645     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7646     __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
7647     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
7648     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
7649     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7650   %}
7651   ins_pipe( pipe_slow );
7652 %}
7653 
7654 instruct rvmul8D_reduction_reg(regD dst, vecZ src2, regD tmp, regD tmp2) %{
7655   predicate(UseAVX > 2);
7656   match(Set dst (MulReductionVD dst src2));
7657   effect(TEMP tmp, TEMP dst, TEMP tmp2);
7658   format %{ "vmulsd  $dst,$dst,$src2\n\t"
7659             "pshufd  $tmp,$src2,0xE\n\t"
7660             "vmulsd  $dst,$dst,$tmp\n\t"
7661             "vextractf32x4  $tmp2,$src2,0x1\n\t"
7662             "vmulsd  $dst,$dst,$tmp2\n\t"
            "pshufd  $tmp,$tmp2,0xE\n\t"
7664             "vmulsd  $dst,$dst,$tmp\n\t"
7665             "vextractf32x4  $tmp2,$src2,0x2\n\t"
7666             "vmulsd  $dst,$dst,$tmp2\n\t"
7667             "pshufd  $tmp,$tmp2,0xE\n\t"
7668             "vmulsd  $dst,$dst,$tmp\n\t"
7669             "vextractf32x4  $tmp2,$src2,0x3\n\t"
7670             "vmulsd  $dst,$dst,$tmp2\n\t"
7671             "pshufd  $tmp,$tmp2,0xE\n\t"
7672             "vmulsd  $dst,$dst,$tmp\t! mul reduction8D" %}
7673   ins_encode %{
7674     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
7675     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
7676     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7677     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
7678     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
7679     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
7680     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7681     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
7682     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
7683     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
7684     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7685     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
7686     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
7687     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
7688     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7689   %}
7690   ins_pipe( pipe_slow );
7691 %}
7692 
7693 //--------------------Min Reduction --------------------
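// Byte min: the vector is repeatedly halved with pminsb until the candidate
// bytes sit in $tmp; those bytes and the scalar input $src1 are then folded
// with a cmp/cmov tournament and the winner is sign-extended into $dst.
// Each scalar step computes min(a,b) as:
//   cmpl   a,b
//   cmovl  b,a      // b = (a < b) ? a : b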
7694 instruct rsmin8B_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, rRegI tmp2, rRegI tmp3, regF tmp4) %{
7695   predicate(UseSSE > 3 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
7696   match(Set dst (MinReductionV src1 src2));
7697   effect(TEMP dst, TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP tmp4);
  format %{ "pshufd  $tmp,$src2,0x1\n\t"
            "pminsb  $tmp,$src2\n\t"
            "pextrb  $tmp2,$tmp,0x1\n\t"
            "movsbl  $tmp2,$tmp2\n\t"
            "pextrb  $tmp3,$tmp,0x0\n\t"
            "movsbl  $tmp3,$tmp3\n\t"
            "cmpl  $tmp2,$tmp3\n\t"
            "cmovl  $tmp3,$tmp2\n\t"
            "cmpl  $src1,$tmp3\n\t"
            "cmovl  $tmp3,$src1\n\t"
            "movl  $dst,$tmp3\n\t"
            "pextrb  $tmp2,$tmp,0x3\n\t"
            "movsbl  $tmp2,$tmp2\n\t"
            "pextrb  $tmp3,$tmp,0x2\n\t"
            "movsbl  $tmp3,$tmp3\n\t"
            "cmpl  $tmp2,$tmp3\n\t"
            "cmovl  $tmp3,$tmp2\n\t"
            "cmpl  $tmp3,$dst\n\t"
            "cmovl  $dst,$tmp3\n\t"
            "movsbl  $dst,$dst\t! min reduction8B" %}
7717   ins_encode %{
7718     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister,0x1);
7719     __ pminsb($tmp$$XMMRegister, $src2$$XMMRegister);
7720     __ pextrb($tmp2$$Register, $tmp$$XMMRegister,0x1);
7721     __ movsbl($tmp2$$Register, $tmp2$$Register);
7722     __ pextrb($tmp3$$Register, $tmp$$XMMRegister,0x0);
7723     __ movsbl($tmp3$$Register, $tmp3$$Register);
7724     __ cmpl($tmp2$$Register, $tmp3$$Register);
7725     __ cmovl(Assembler::less, $tmp3$$Register, $tmp2$$Register);
7726     __ cmpl($src1$$Register, $tmp3$$Register);
7727     __ cmovl(Assembler::less, $tmp3$$Register, $src1$$Register);
7728     __ movl($dst$$Register, $tmp3$$Register);
7729     __ pextrb($tmp2$$Register, $tmp$$XMMRegister,0x3);
7730     __ movsbl($tmp2$$Register, $tmp2$$Register);
7731     __ pextrb($tmp3$$Register, $tmp$$XMMRegister,0x2);
7732     __ movsbl($tmp3$$Register, $tmp3$$Register);
7733     __ cmpl($tmp2$$Register, $tmp3$$Register);
7734     __ cmovl(Assembler::less, $tmp3$$Register, $tmp2$$Register);
7735     __ cmpl($tmp3$$Register, $dst$$Register);
7736     __ cmovl(Assembler::less, $dst$$Register, $tmp3$$Register);
7737     __ movsbl($dst$$Register, $dst$$Register);
7738   %}
7739   ins_pipe( pipe_slow );
7740 %}
7741 
7742 instruct rsmin16B_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, rRegI tmp2, rRegI tmp3, regF tmp4) %{
7743   predicate(UseSSE > 3 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
7744   match(Set dst (MinReductionV src1 src2));
7745   effect(TEMP dst, TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP tmp4);
  format %{ "pshufd  $tmp4,$src2,0xE\n\t"
            "pminsb  $tmp4,$src2\n\t"
            "pshufd  $tmp,$tmp4,0x1\n\t"
            "pminsb  $tmp,$tmp4\n\t"
            "pextrb  $tmp2,$tmp,0x1\n\t"
            "movsbl  $tmp2,$tmp2\n\t"
            "pextrb  $tmp3,$tmp,0x0\n\t"
            "movsbl  $tmp3,$tmp3\n\t"
            "cmpl  $tmp2,$tmp3\n\t"
            "cmovl  $tmp3,$tmp2\n\t"
            "cmpl  $src1,$tmp3\n\t"
            "cmovl  $tmp3,$src1\n\t"
            "movl  $dst,$tmp3\n\t"
            "pextrb  $tmp2,$tmp,0x3\n\t"
            "movsbl  $tmp2,$tmp2\n\t"
            "pextrb  $tmp3,$tmp,0x2\n\t"
            "movsbl  $tmp3,$tmp3\n\t"
            "cmpl  $tmp2,$tmp3\n\t"
            "cmovl  $tmp3,$tmp2\n\t"
            "cmpl  $tmp3,$dst\n\t"
            "cmovl  $dst,$tmp3\n\t"
            "movsbl  $dst,$dst\t! min reduction16B" %}
7767   ins_encode %{
7768     __ pshufd($tmp4$$XMMRegister, $src2$$XMMRegister, 0xE);
7769     __ pminsb($tmp4$$XMMRegister, $src2$$XMMRegister);
7770     __ pshufd($tmp$$XMMRegister, $tmp4$$XMMRegister,0x1);
7771     __ pminsb($tmp$$XMMRegister, $tmp4$$XMMRegister);
7772     __ pextrb($tmp2$$Register, $tmp$$XMMRegister,0x1);
7773     __ movsbl($tmp2$$Register, $tmp2$$Register);
7774     __ pextrb($tmp3$$Register, $tmp$$XMMRegister,0x0);
7775     __ movsbl($tmp3$$Register, $tmp3$$Register);
7776     __ cmpl($tmp2$$Register, $tmp3$$Register);
7777     __ cmovl(Assembler::less, $tmp3$$Register, $tmp2$$Register);
7778     __ cmpl($src1$$Register, $tmp3$$Register);
7779     __ cmovl(Assembler::less, $tmp3$$Register, $src1$$Register);
7780     __ movl($dst$$Register, $tmp3$$Register);
7781     __ pextrb($tmp2$$Register, $tmp$$XMMRegister,0x3);
7782     __ movsbl($tmp2$$Register, $tmp2$$Register);
7783     __ pextrb($tmp3$$Register, $tmp$$XMMRegister,0x2);
7784     __ movsbl($tmp3$$Register, $tmp3$$Register);
7785     __ cmpl($tmp2$$Register, $tmp3$$Register);
7786     __ cmovl(Assembler::less, $tmp3$$Register, $tmp2$$Register);
7787     __ cmpl($tmp3$$Register, $dst$$Register);
7788     __ cmovl(Assembler::less, $dst$$Register, $tmp3$$Register);
7789     __ movsbl($dst$$Register, $dst$$Register);
7790   %}
7791   ins_pipe( pipe_slow );
7792 %}
7793 
7794 instruct rvmin16B_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, rRegI tmp2, rRegI tmp3, regF tmp4) %{
7795   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
7796   match(Set dst (MinReductionV src1 src2));
7797   effect(TEMP dst, TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP tmp4);
  format %{ "pshufd  $tmp4,$src2,0xE\n\t"
            "vpminsb  $tmp4,$tmp4,$src2\n\t"
            "pshufd  $tmp,$tmp4,0x1\n\t"
            "vpminsb  $tmp,$tmp,$tmp4\n\t"
            "pextrb  $tmp2,$tmp,0x1\n\t"
            "movsbl  $tmp2,$tmp2\n\t"
            "pextrb  $tmp3,$tmp,0x0\n\t"
            "movsbl  $tmp3,$tmp3\n\t"
            "cmpl  $tmp2,$tmp3\n\t"
            "cmovl  $tmp3,$tmp2\n\t"
            "cmpl  $src1,$tmp3\n\t"
            "cmovl  $tmp3,$src1\n\t"
            "movl  $dst,$tmp3\n\t"
            "pextrb  $tmp2,$tmp,0x3\n\t"
            "movsbl  $tmp2,$tmp2\n\t"
            "pextrb  $tmp3,$tmp,0x2\n\t"
            "movsbl  $tmp3,$tmp3\n\t"
            "cmpl  $tmp2,$tmp3\n\t"
            "cmovl  $tmp3,$tmp2\n\t"
            "cmpl  $tmp3,$dst\n\t"
            "cmovl  $dst,$tmp3\n\t"
            "movsbl  $dst,$dst\t! min reduction16B" %}
7819   ins_encode %{
    int vector_len = 0;
    __ pshufd($tmp4$$XMMRegister, $src2$$XMMRegister, 0xE);
    __ vpminsb($tmp4$$XMMRegister, $tmp4$$XMMRegister, $src2$$XMMRegister, vector_len);
    __ pshufd($tmp$$XMMRegister, $tmp4$$XMMRegister, 0x1);
    __ vpminsb($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp4$$XMMRegister, vector_len);
7825     __ pextrb($tmp2$$Register, $tmp$$XMMRegister,0x1);
7826     __ movsbl($tmp2$$Register, $tmp2$$Register);
7827     __ pextrb($tmp3$$Register, $tmp$$XMMRegister,0x0);
7828     __ movsbl($tmp3$$Register, $tmp3$$Register);
7829     __ cmpl($tmp2$$Register, $tmp3$$Register);
7830     __ cmovl(Assembler::less, $tmp3$$Register, $tmp2$$Register);
7831     __ cmpl($src1$$Register, $tmp3$$Register);
7832     __ cmovl(Assembler::less, $tmp3$$Register, $src1$$Register);
7833     __ movl($dst$$Register, $tmp3$$Register);
7834     __ pextrb($tmp2$$Register, $tmp$$XMMRegister,0x3);
7835     __ movsbl($tmp2$$Register, $tmp2$$Register);
7836     __ pextrb($tmp3$$Register, $tmp$$XMMRegister,0x2);
7837     __ movsbl($tmp3$$Register, $tmp3$$Register);
7838     __ cmpl($tmp2$$Register, $tmp3$$Register);
7839     __ cmovl(Assembler::less, $tmp3$$Register, $tmp2$$Register);
7840     __ cmpl($tmp3$$Register, $dst$$Register);
7841     __ cmovl(Assembler::less, $dst$$Register, $tmp3$$Register);
7842     __ movsbl($dst$$Register, $dst$$Register);
7843   %}
7844   ins_pipe( pipe_slow );
7845 %}
7846 
7847 instruct rvmin32B_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, rRegI tmp2, rRegI tmp3, regF tmp4) %{
7848   predicate(UseAVX > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
7849   match(Set dst (MinReductionV src1 src2));
7850   effect(TEMP dst, TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP tmp4);
  format %{ "vextracti128_high  $tmp,$src2\n\t"
            "vpminsb  $tmp,$tmp,$src2\n\t"
            "pshufd  $tmp4,$tmp,0xE\n\t"
            "vpminsb  $tmp4,$tmp4,$tmp\n\t"
            "pshufd  $tmp,$tmp4,0x1\n\t"
            "vpminsb  $tmp,$tmp,$tmp4\n\t"
            "pextrb  $tmp2,$tmp,0x1\n\t"
            "movsbl  $tmp2,$tmp2\n\t"
            "pextrb  $tmp3,$tmp,0x0\n\t"
            "movsbl  $tmp3,$tmp3\n\t"
            "cmpl  $tmp2,$tmp3\n\t"
            "cmovl  $tmp3,$tmp2\n\t"
            "cmpl  $src1,$tmp3\n\t"
            "cmovl  $tmp3,$src1\n\t"
            "movl  $dst,$tmp3\n\t"
            "pextrb  $tmp2,$tmp,0x3\n\t"
            "movsbl  $tmp2,$tmp2\n\t"
            "pextrb  $tmp3,$tmp,0x2\n\t"
            "movsbl  $tmp3,$tmp3\n\t"
            "cmpl  $tmp2,$tmp3\n\t"
            "cmovl  $tmp3,$tmp2\n\t"
            "cmpl  $tmp3,$dst\n\t"
            "cmovl  $dst,$tmp3\n\t"
            "movsbl  $dst,$dst\t! min reduction32B" %}
7874   ins_encode %{
7875     int vector_len = 1;
7876     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
7877     __ vpminsb($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
7878     __ pshufd($tmp4$$XMMRegister, $tmp$$XMMRegister, 0xE);
7879     __ vpminsb($tmp4$$XMMRegister, $tmp4$$XMMRegister, $tmp$$XMMRegister, 0);
7880     __ pshufd($tmp$$XMMRegister, $tmp4$$XMMRegister,0x1);
7881     __ vpminsb($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp4$$XMMRegister, 0);
7882     __ pextrb($tmp2$$Register, $tmp$$XMMRegister,0x1);
7883     __ movsbl($tmp2$$Register, $tmp2$$Register);
7884     __ pextrb($tmp3$$Register, $tmp$$XMMRegister,0x0);
7885     __ movsbl($tmp3$$Register, $tmp3$$Register);
7886     __ cmpl($tmp2$$Register, $tmp3$$Register);
7887     __ cmovl(Assembler::less, $tmp3$$Register, $tmp2$$Register);
7888     __ cmpl($src1$$Register, $tmp3$$Register);
7889     __ cmovl(Assembler::less, $tmp3$$Register, $src1$$Register);
7890     __ movl($dst$$Register, $tmp3$$Register);
7891     __ pextrb($tmp2$$Register, $tmp$$XMMRegister,0x3);
7892     __ movsbl($tmp2$$Register, $tmp2$$Register);
7893     __ pextrb($tmp3$$Register, $tmp$$XMMRegister,0x2);
7894     __ movsbl($tmp3$$Register, $tmp3$$Register);
7895     __ cmpl($tmp2$$Register, $tmp3$$Register);
7896     __ cmovl(Assembler::less, $tmp3$$Register, $tmp2$$Register);
7897     __ cmpl($tmp3$$Register, $dst$$Register);
7898     __ cmovl(Assembler::less, $dst$$Register, $tmp3$$Register);
7899     __ movsbl($dst$$Register, $dst$$Register);
7900   %}
7901   ins_pipe( pipe_slow );
7902 %}
7903 
7904 instruct rvmin64B_reduction_reg(rRegI dst, rRegI src1, vecZ src2, regF tmp, rRegI tmp2, rRegI tmp3, regF tmp4) %{
7905   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
7906   match(Set dst (MinReductionV src1 src2));
7907   effect(TEMP dst, TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP tmp4);
  format %{ "vextracti64x4_high  $tmp4,$src2\n\t"
            "vpminsb  $tmp4,$tmp4,$src2\n\t"
            "vextracti128_high  $tmp,$tmp4\n\t"
            "vpminsb  $tmp,$tmp,$tmp4\n\t"
            "pshufd  $tmp4,$tmp,0xE\n\t"
            "vpminsb  $tmp4,$tmp4,$tmp\n\t"
            "pshufd  $tmp,$tmp4,0x1\n\t"
            "vpminsb  $tmp,$tmp,$tmp4\n\t"
            "pextrb  $tmp2,$tmp,0x1\n\t"
            "movsbl  $tmp2,$tmp2\n\t"
            "pextrb  $tmp3,$tmp,0x0\n\t"
            "movsbl  $tmp3,$tmp3\n\t"
            "cmpl  $tmp2,$tmp3\n\t"
            "cmovl  $tmp3,$tmp2\n\t"
            "cmpl  $src1,$tmp3\n\t"
            "cmovl  $tmp3,$src1\n\t"
            "movl  $dst,$tmp3\n\t"
            "pextrb  $tmp2,$tmp,0x3\n\t"
            "movsbl  $tmp2,$tmp2\n\t"
            "pextrb  $tmp3,$tmp,0x2\n\t"
            "movsbl  $tmp3,$tmp3\n\t"
            "cmpl  $tmp2,$tmp3\n\t"
            "cmovl  $tmp3,$tmp2\n\t"
            "cmpl  $tmp3,$dst\n\t"
            "cmovl  $dst,$tmp3\n\t"
            "movsbl  $dst,$dst\t! min reduction64B" %}
7933   ins_encode %{
7934     __ vextracti64x4_high($tmp4$$XMMRegister, $src2$$XMMRegister);
7935     __ vpminsb($tmp4$$XMMRegister, $tmp4$$XMMRegister, $src2$$XMMRegister, 2);
7936     __ vextracti128_high($tmp$$XMMRegister, $tmp4$$XMMRegister);
7937     __ vpminsb($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp4$$XMMRegister, 1);
7938     __ pshufd($tmp4$$XMMRegister, $tmp$$XMMRegister, 0xE);
7939     __ vpminsb($tmp4$$XMMRegister, $tmp4$$XMMRegister, $tmp$$XMMRegister, 0);
7940     __ pshufd($tmp$$XMMRegister, $tmp4$$XMMRegister,0x1);
7941     __ vpminsb($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp4$$XMMRegister, 0);
7942     __ pextrb($tmp2$$Register, $tmp$$XMMRegister,0x1);
7943     __ movsbl($tmp2$$Register, $tmp2$$Register);
7944     __ pextrb($tmp3$$Register, $tmp$$XMMRegister,0x0);
7945     __ movsbl($tmp3$$Register, $tmp3$$Register);
7946     __ cmpl($tmp2$$Register, $tmp3$$Register);
7947     __ cmovl(Assembler::less, $tmp3$$Register, $tmp2$$Register);
7948     __ cmpl($src1$$Register, $tmp3$$Register);
7949     __ cmovl(Assembler::less, $tmp3$$Register, $src1$$Register);
7950     __ movl($dst$$Register, $tmp3$$Register);
7951     __ pextrb($tmp2$$Register, $tmp$$XMMRegister,0x3);
7952     __ movsbl($tmp2$$Register, $tmp2$$Register);
7953     __ pextrb($tmp3$$Register, $tmp$$XMMRegister,0x2);
7954     __ movsbl($tmp3$$Register, $tmp3$$Register);
7955     __ cmpl($tmp2$$Register, $tmp3$$Register);
7956     __ cmovl(Assembler::less, $tmp3$$Register, $tmp2$$Register);
7957     __ cmpl($tmp3$$Register, $dst$$Register);
7958     __ cmovl(Assembler::less, $dst$$Register, $tmp3$$Register);
7959     __ movsbl($dst$$Register, $dst$$Register);
7960   %}
7961   ins_pipe( pipe_slow );
7962 %}
7963 
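// Short min: same halving scheme with pminsw; pextrw/movswl feed the scalar
// cmp/cmov tournament against $src1.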
7964 instruct rsmin4S_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, rRegI tmp2, rRegI tmp3) %{
7965   predicate(UseSSE > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
7966   match(Set dst (MinReductionV src1 src2));
7967   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
  format %{ "pshufd  $tmp,$src2,0x1\n\t"
            "pminsw  $tmp,$src2\n\t"
            "pextrw  $tmp2,$tmp,0x1\n\t"
            "movswl  $tmp2,$tmp2\n\t"
            "pextrw  $tmp3,$tmp,0x0\n\t"
            "movswl  $tmp3,$tmp3\n\t"
            "cmpl  $tmp2,$tmp3\n\t"
            "cmovl  $tmp3,$tmp2\n\t"
            "cmpl  $src1,$tmp3\n\t"
            "cmovl  $tmp3,$src1\n\t"
            "movl  $dst,$tmp3\t! min reduction4S" %}
7979   ins_encode %{
7980     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister,0x1);
7981     __ pminsw($tmp$$XMMRegister, $src2$$XMMRegister);
7982     __ pextrw($tmp2$$Register, $tmp$$XMMRegister,0x1);
7983     __ movswl($tmp2$$Register, $tmp2$$Register);
7984     __ pextrw($tmp3$$Register, $tmp$$XMMRegister,0x0);
7985     __ movswl($tmp3$$Register, $tmp3$$Register);
7986     __ cmpl($tmp2$$Register, $tmp3$$Register);
7987     __ cmovl(Assembler::less, $tmp3$$Register, $tmp2$$Register);
7988     __ cmpl($src1$$Register, $tmp3$$Register);
7989     __ cmovl(Assembler::less, $tmp3$$Register, $src1$$Register);
7990     __ movl($dst$$Register, $tmp3$$Register);
7991   %}
7992   ins_pipe( pipe_slow );
7993 %}
7994 
7995 instruct rsmin8S_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2, rRegI tmp3, rRegI tmp4) %{
7996   predicate(UseSSE > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
7997   match(Set dst (MinReductionV src1 src2));
7998   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP tmp4);
  format %{ "pshufd  $tmp2,$src2,0xE\n\t"
            "pminsw  $tmp2,$src2\n\t"
            "pshufd  $tmp,$tmp2,0x1\n\t"
            "pminsw  $tmp,$tmp2\n\t"
            "pextrw  $tmp4,$tmp,0x1\n\t"
            "movswl  $tmp4,$tmp4\n\t"
            "pextrw  $tmp3,$tmp,0x0\n\t"
            "movswl  $tmp3,$tmp3\n\t"
            "cmpl  $tmp4,$tmp3\n\t"
            "cmovl  $tmp3,$tmp4\n\t"
            "cmpl  $src1,$tmp3\n\t"
            "cmovl  $tmp3,$src1\n\t"
            "movl  $dst,$tmp3\t! min reduction8S" %}
8012   ins_encode %{
8013     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister,0xE);
8014     __ pminsw($tmp2$$XMMRegister, $src2$$XMMRegister);
8015     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister,0x1);
8016     __ pminsw($tmp$$XMMRegister, $tmp2$$XMMRegister);
8017     __ pextrw($tmp4$$Register, $tmp$$XMMRegister,0x1);
8018     __ movswl($tmp4$$Register, $tmp4$$Register);
8019     __ pextrw($tmp3$$Register, $tmp$$XMMRegister,0x0);
8020     __ movswl($tmp3$$Register, $tmp3$$Register);
8021     __ cmpl($tmp4$$Register, $tmp3$$Register);
8022     __ cmovl(Assembler::less, $tmp3$$Register, $tmp4$$Register);
8023     __ cmpl($src1$$Register, $tmp3$$Register);
8024     __ cmovl(Assembler::less, $tmp3$$Register, $src1$$Register);
8025     __ movl($dst$$Register, $tmp3$$Register);
8026   %}
8027   ins_pipe( pipe_slow );
8028 %}
8029 
8030 instruct rvmin8S_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2, rRegI tmp3, rRegI tmp4) %{
8031   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
8032   match(Set dst (MinReductionV src1 src2));
8033   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP tmp4);
  format %{ "pshufd   $tmp,$src2,0xE\n\t"
            "vpminsw  $tmp,$tmp,$src2\n\t"
            "pshufd   $tmp2,$tmp,0x1\n\t"
            "vpminsw  $tmp,$tmp,$tmp2\n\t"
            "pextrw   $tmp4,$tmp,0x1\n\t"
            "movswl   $tmp4,$tmp4\n\t"
            "pextrw   $tmp3,$tmp,0x0\n\t"
            "movswl   $tmp3,$tmp3\n\t"
            "cmpl     $tmp4,$tmp3\n\t"
            "cmovl    $tmp3,$tmp4\n\t"
            "cmpl     $src1,$tmp3\n\t"
            "cmovl    $tmp3,$src1\n\t"
            "movl     $dst,$tmp3\t! min reduction8S" %}
8044   ins_encode %{
8045     int vector_len = 0;
8046     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
8047     __ vpminsw($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
8048     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister,0x1);
8049     __ vpminsw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
8050     __ pextrw($tmp4$$Register, $tmp$$XMMRegister,0x1);
8051     __ movswl($tmp4$$Register, $tmp4$$Register);
8052     __ pextrw($tmp3$$Register, $tmp$$XMMRegister,0x0);
8053     __ movswl($tmp3$$Register, $tmp3$$Register);
8054     __ cmpl($tmp4$$Register, $tmp3$$Register);
8055     __ cmovl(Assembler::less, $tmp3$$Register, $tmp4$$Register);
8056     __ cmpl($src1$$Register, $tmp3$$Register);
8057     __ cmovl(Assembler::less, $tmp3$$Register, $src1$$Register);
8058     __ movl($dst$$Register, $tmp3$$Register);
8059   %}
8060   ins_pipe( pipe_slow );
8061 %}
8062 
8063 instruct rvmin16S_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2, rRegI tmp3, rRegI tmp4) %{
8064   predicate(UseAVX > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
8065   match(Set dst (MinReductionV src1 src2));
8066   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP tmp4);
  format %{ "vextracti128_high  $tmp,$src2\n\t"
            "vpminsw  $tmp,$tmp,$src2\n\t"
            "pshufd  $tmp2,$tmp,0xE\n\t"
            "vpminsw  $tmp,$tmp,$tmp2\n\t"
            "pshufd  $tmp2,$tmp,0x1\n\t"
            "vpminsw  $tmp,$tmp,$tmp2\n\t"
            "pextrw  $tmp4,$tmp,0x1\n\t"
            "movswl  $tmp4,$tmp4\n\t"
            "pextrw  $tmp3,$tmp,0x0\n\t"
            "movswl  $tmp3,$tmp3\n\t"
            "cmpl  $tmp4,$tmp3\n\t"
            "cmovl  $tmp3,$tmp4\n\t"
            "cmpl  $src1,$tmp3\n\t"
            "cmovl  $tmp3,$src1\n\t"
            "movl  $dst,$tmp3\t! min reduction16S" %}
8082   ins_encode %{
8083     int vector_len = 1;
8084     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
8085     __ vpminsw($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
8086     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
8087     __ vpminsw($tmp$$XMMRegister,$tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
8088     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister,0x1);
8089     __ vpminsw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
8090     __ pextrw($tmp4$$Register, $tmp$$XMMRegister,0x1);
8091     __ movswl($tmp4$$Register, $tmp4$$Register);
8092     __ pextrw($tmp3$$Register, $tmp$$XMMRegister,0x0);
8093     __ movswl($tmp3$$Register, $tmp3$$Register);
8094     __ cmpl($tmp4$$Register, $tmp3$$Register);
8095     __ cmovl(Assembler::less, $tmp3$$Register, $tmp4$$Register);
8096     __ cmpl($src1$$Register, $tmp3$$Register);
8097     __ cmovl(Assembler::less, $tmp3$$Register, $src1$$Register);
8098     __ movl($dst$$Register, $tmp3$$Register);
8099   %}
8100   ins_pipe( pipe_slow );
8101 %}
8102 
8103 instruct rvmin32S_reduction_reg(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, rRegI tmp3, rRegI tmp4) %{
8104   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
8105   match(Set dst (MinReductionV src1 src2));
8106   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP tmp4);
  format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
            "vpminsw  $tmp2,$tmp2,$src2\n\t"
            "vextracti128_high  $tmp,$tmp2\n\t"
            "vpminsw  $tmp,$tmp,$tmp2\n\t"
            "pshufd  $tmp2,$tmp,0xE\n\t"
            "vpminsw  $tmp,$tmp,$tmp2\n\t"
            "pshufd  $tmp2,$tmp,0x1\n\t"
            "vpminsw  $tmp,$tmp,$tmp2\n\t"
            "pextrw  $tmp4,$tmp,0x1\n\t"
            "movswl  $tmp4,$tmp4\n\t"
            "pextrw  $tmp3,$tmp,0x0\n\t"
            "movswl  $tmp3,$tmp3\n\t"
            "cmpl  $tmp4,$tmp3\n\t"
            "cmovl  $tmp3,$tmp4\n\t"
            "cmpl  $src1,$tmp3\n\t"
            "cmovl  $tmp3,$src1\n\t"
            "movl  $dst,$tmp3\t! min reduction32S" %}
8124   ins_encode %{
8125     int vector_len = 2;
8126     __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
8127     __ vpminsw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, vector_len);
8128     __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
8129     __ vpminsw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
8130     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
8131     __ vpminsw($tmp$$XMMRegister,$tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
8132     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister,0x1);
8133     __ vpminsw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
8134     __ pextrw($tmp4$$Register, $tmp$$XMMRegister,0x1);
8135     __ movswl($tmp4$$Register, $tmp4$$Register);
8136     __ pextrw($tmp3$$Register, $tmp$$XMMRegister,0x0);
8137     __ movswl($tmp3$$Register, $tmp3$$Register);
8138     __ cmpl($tmp4$$Register, $tmp3$$Register);
8139     __ cmovl(Assembler::less, $tmp3$$Register, $tmp4$$Register);
8140     __ cmpl($src1$$Register, $tmp3$$Register);
8141     __ cmovl(Assembler::less, $tmp3$$Register, $src1$$Register);
8142     __ movl($dst$$Register, $tmp3$$Register);
8143   %}
8144   ins_pipe( pipe_slow );
8145 %}
8146 
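// Int min: pminsd halves the vector; the scalar input is moved into an XMM
// register so the final fold is one more pminsd, e.g.:
//   movdl   tmp2,src1
//   pminsd  tmp2,tmp      // tmp2[0] = min(src1, vector min)
//   movdl   dst,tmp2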
8147 instruct rsmin2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
8148   predicate(UseSSE > 3 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
8149   match(Set dst (MinReductionV src1 src2));
8150   effect(TEMP tmp, TEMP tmp2);
8151   format %{ "pshufd  $tmp,$src2,0x1\n\t"
8152             "pminsd  $tmp,$src2\n\t"
8153             "movd    $tmp2,$src1\n\t"
8154             "pminsd  $tmp2,$tmp\n\t"
8155             "movd    $dst,$tmp2\t! min reduction2I" %}
8156   ins_encode %{
8157     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x1);
8158     __ pminsd($tmp$$XMMRegister, $src2$$XMMRegister);
8159     __ movdl($tmp2$$XMMRegister, $src1$$Register);
8160     __ pminsd($tmp2$$XMMRegister, $tmp$$XMMRegister);
8161     __ movdl($dst$$Register, $tmp2$$XMMRegister);
8162   %}
8163   ins_pipe( pipe_slow );
8164 %}
8165 
8166 instruct rvmin2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
8167   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
8168   match(Set dst (MinReductionV src1 src2));
8169   effect(TEMP tmp, TEMP tmp2);
  format %{ "pshufd   $tmp,$src2,0x1\n\t"
            "vpminsd  $tmp,$tmp,$src2\n\t"
            "movd     $tmp2,$src1\n\t"
            "vpminsd  $tmp2,$tmp,$tmp2\n\t"
            "movd     $dst,$tmp2\t! min reduction2I" %}
8175   ins_encode %{
8176     int vector_len = 0;
8177     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x1);
8178     __ vpminsd($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
8179     __ movdl($tmp2$$XMMRegister, $src1$$Register);
8180     __ vpminsd($tmp2$$XMMRegister,$tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
8181     __ movdl($dst$$Register, $tmp2$$XMMRegister);
8182   %}
8183   ins_pipe( pipe_slow );
8184 %}
8185 
8186 instruct rsmin4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
8187   predicate(UseSSE > 3 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
8188   match(Set dst (MinReductionV src1 src2));
8189   effect(TEMP tmp, TEMP tmp2);
8190   format %{ "pshufd  $tmp,$src2,0xE\n\t"
8191             "pminsd  $tmp,$src2\n\t"
8192             "pshufd  $tmp2,$tmp,0x1\n\t"
8193             "pminsd  $tmp2,$tmp\n\t"
8194             "movd    $tmp,$src1\n\t"
8195             "pminsd  $tmp2,$tmp\n\t"
8196             "movd    $dst,$tmp2\t! min reduction4I" %}
8197   ins_encode %{
8198     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
8199     __ pminsd($tmp$$XMMRegister, $src2$$XMMRegister);
8200     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
8201     __ pminsd($tmp2$$XMMRegister,$tmp$$XMMRegister);
8202     __ movdl($tmp$$XMMRegister, $src1$$Register);
8203     __ pminsd($tmp2$$XMMRegister,$tmp$$XMMRegister);
8204     __ movdl($dst$$Register, $tmp2$$XMMRegister);
8205   %}
8206   ins_pipe( pipe_slow );
8207 %}
8208 
8209 instruct rvmin4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
8210   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
8211   match(Set dst (MinReductionV src1 src2));
8212   effect(TEMP tmp, TEMP tmp2);
8213   format %{ "pshufd   $tmp,$src2,0xE\n\t"
8214             "vpminsd  $tmp2,$tmp,$src2\n\t"
8215             "pshufd   $tmp,$tmp2,0x1\n\t"
8216             "vpminsd  $tmp2,$tmp2,$tmp\n\t"
8217             "movd     $tmp,$src1\n\t"
8218             "vpminsd  $tmp2,$tmp,$tmp2\n\t"
8219             "movd     $dst,$tmp2\t! min reduction4I" %}
8220   ins_encode %{
8221     int vector_len = 0;
8222     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
8223     __ vpminsd($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
8224     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister,0x1);
8225     __ vpminsd($tmp2$$XMMRegister,$tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
8226     __ movdl($tmp$$XMMRegister, $src1$$Register);
8227     __ vpminsd($tmp2$$XMMRegister,$tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
8228     __ movdl($dst$$Register, $tmp2$$XMMRegister);
8229   %}
8230   ins_pipe( pipe_slow );
8231 %}
8232 
8233 instruct rvmin4I_reduction_evex(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
8234   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
8235   match(Set dst (MinReductionV src1 src2));
8236   effect(TEMP tmp, TEMP tmp2);
8237   format %{ "pshufd   $tmp,$src2,0xE\n\t"
8238             "vpminsd  $tmp2,$tmp,$src2\n\t"
8239             "pshufd   $tmp,$tmp2,0x1\n\t"
8240             "vpminsd  $tmp2,$tmp2,$tmp\n\t"
8241             "movd     $tmp,$src1\n\t"
8242             "vpminsd  $tmp2,$tmp,$tmp2\n\t"
8243             "movd     $dst,$tmp2\t! min reduction4I" %}
8244   ins_encode %{
8245     int vector_len = 0;
8246     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
8247     __ vpminsd($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
8248     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister,0x1);
8249     __ vpminsd($tmp2$$XMMRegister,$tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
8250     __ movdl($tmp$$XMMRegister, $src1$$Register);
8251     __ vpminsd($tmp2$$XMMRegister,$tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
8252     __ movdl($dst$$Register, $tmp2$$XMMRegister);
8253   %}
8254   ins_pipe( pipe_slow );
8255 %}
8256 
8257 instruct rvmin8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
8258   predicate(UseAVX > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
8259   match(Set dst (MinReductionV src1 src2));
8260   effect(TEMP tmp, TEMP tmp2);
8261   format %{ "vextracti128_high   $tmp,$src2\n\t"
8262             "vpminsd  $tmp,$tmp,$src2\n\t"
8263             "pshufd   $tmp2,$tmp,0xE\n\t"
8264             "vpminsd  $tmp2,$tmp,$tmp2\n\t"
8265             "pshufd   $tmp,$tmp2,0x1\n\t"
8266             "vpminsd  $tmp2,$tmp2,$tmp\n\t"
8267             "movd     $tmp,$src1\n\t"
8268             "vpminsd  $tmp2,$tmp,$tmp2\n\t"
8269             "movd     $dst,$tmp2\t! min reduction8I" %}
8270   ins_encode %{
8271     int vector_len = 1;
8272     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
8273     __ vpminsd($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
8274     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
8275     __ vpminsd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
8276     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister,0x1);
8277     __ vpminsd($tmp2$$XMMRegister,$tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
8278     __ movdl($tmp$$XMMRegister, $src1$$Register);
8279     __ vpminsd($tmp2$$XMMRegister,$tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
8280     __ movdl($dst$$Register, $tmp2$$XMMRegister);
8281   %}
8282   ins_pipe( pipe_slow );
8283 %}
8284 
8285 instruct rvmin8I_reduction_evex(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
8286   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
8287   match(Set dst (MinReductionV src1 src2));
8288   effect(TEMP tmp, TEMP tmp2);
8289   format %{ "vextracti128_high   $tmp,$src2\n\t"
8290             "vpminsd  $tmp,$tmp,$src2\n\t"
8291             "pshufd   $tmp2,$tmp,0xE\n\t"
8292             "vpminsd  $tmp2,$tmp,$tmp2\n\t"
8293             "pshufd   $tmp,$tmp2,0x1\n\t"
8294             "vpminsd  $tmp2,$tmp2,$tmp\n\t"
8295             "movd     $tmp,$src1\n\t"
8296             "vpminsd  $tmp2,$tmp,$tmp2\n\t"
8297             "movd     $dst,$tmp2\t! min reduction8I" %}
8298   ins_encode %{
8299     int vector_len = 1;
8300     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
8301     __ vpminsd($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
8302     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
8303     __ vpminsd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
8304     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister,0x1);
8305     __ vpminsd($tmp2$$XMMRegister,$tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
8306     __ movdl($tmp$$XMMRegister, $src1$$Register);
8307     __ vpminsd($tmp2$$XMMRegister,$tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
8308     __ movdl($dst$$Register, $tmp2$$XMMRegister);
8309   %}
8310   ins_pipe( pipe_slow );
8311 %}
8312 
8313 instruct rvmin16I_reduction_evex(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, regF tmp3) %{
8314   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
8315   match(Set dst (MinReductionV src1 src2));
8316   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
8317   format %{ "vextracti64x4_high  $tmp3,$src2\n\t"
8318             "vpminsd  $tmp3,$tmp3,$src2\n\t"
8319             "vextracti128_high   $tmp,$tmp3\n\t"
8320             "vpminsd  $tmp,$tmp,$tmp3\n\t"
8321             "pshufd   $tmp2,$tmp,0xE\n\t"
8322             "vpminsd  $tmp2,$tmp,$tmp2\n\t"
8323             "pshufd   $tmp,$tmp2,0x1\n\t"
8324             "vpminsd  $tmp2,$tmp2,$tmp\n\t"
8325             "movd     $tmp,$src1\n\t"
8326             "vpminsd  $tmp2,$tmp,$tmp2\n\t"
8327             "movd     $dst,$tmp2\t! min reduction16I" %}
8328   ins_encode %{
8329     int vector_len = 2;
8330     __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
8331     __ vpminsd($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, vector_len);
8332     __ vextracti128_high($tmp$$XMMRegister, $tmp3$$XMMRegister);
8333     __ vpminsd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, vector_len);
8334     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
8335     __ vpminsd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
8336     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister,0x1);
8337     __ vpminsd($tmp2$$XMMRegister,$tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
8338     __ movdl($tmp$$XMMRegister, $src1$$Register);
8339     __ vpminsd($tmp2$$XMMRegister,$tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
8340     __ movdl($dst$$Register, $tmp2$$XMMRegister);
8341   %}
8342   ins_pipe( pipe_slow );
8343 %}
8344 
8345 // Long Min Reduction
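// There is no packed signed 64-bit min before AVX-512 (vpminsq), so these
// rules synthesize it: pcmpgtq builds a per-lane mask where the first
// operand is greater, and blendvpd/vblendvpd picks the smaller lane:
//   pcmpgtq   a,b         // a = (a > b) ? -1 : 0, per lane (mask)
//   blendvpd  x,y         // x = mask ? y : x  (xmm0 is the implicit mask)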
8346 instruct rsmin1L_reduction_reg(rRegL dst, rRegL src1, vecD src2, rxmm0 tmp, regF tmp2) %{
8347   predicate(UseSSE > 3 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
8348   match(Set dst (MinReductionV src1 src2));
8349   effect(TEMP tmp, TEMP tmp2);
  format %{ "movdq     $tmp,$src1\n\t"
            "movdq     $tmp2,$src1\n\t"
            "pcmpgtq   $tmp,$src2\n\t"
            "blendvpd  $tmp2,$src2\n\t"
            "movdq     $dst,$tmp2\t! min reduction1L" %}
8355   ins_encode %{
8356     __ movdq($tmp$$XMMRegister,$src1$$Register);
8357     __ movdq($tmp2$$XMMRegister,$src1$$Register);
8358     __ pcmpgtq($tmp$$XMMRegister, $src2$$XMMRegister);
8359     __ blendvpd($tmp2$$XMMRegister,$src2$$XMMRegister);
8360     __ movdq($dst$$Register, $tmp2$$XMMRegister);
8361   %}
8362   ins_pipe( pipe_slow );
8363 %}
8364 
8365 instruct rsmin2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, rxmm0 xmm_0, regF tmp2, regF tmp3) %{
8366   predicate(UseSSE > 3 && UseAVX == 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
8367   match(Set dst (MinReductionV src1 src2));
8368   effect(TEMP xmm_0, TEMP tmp2, TEMP tmp3);
8369   format %{ "pshufd  $tmp3,$src2,0xE\n\t"
8370             "movdqu  $xmm_0,$src2\n\t"
8371             "movdqu  $tmp2,$src2\n\t"
8372             "pcmpgtq  $xmm_0,$tmp3\n\t"
8373             "blendvpd  $tmp2,$tmp3\n\t"
8374             "movdqu  $xmm_0,$tmp2\n\t"
8375             "movdq  $tmp3,$src1\n\t"
8376             "pcmpgtq  $xmm_0,$tmp3\n\t"
8377             "blendvpd  $tmp2,$tmp3\n\t"
8378             "movq  $dst,$tmp2\t! min reduction2L" %}
8379   ins_encode %{
8380     __ pshufd($tmp3$$XMMRegister, $src2$$XMMRegister, 0xE);
8381     __ movdqu($xmm_0$$XMMRegister, $src2$$XMMRegister);
8382     __ movdqu($tmp2$$XMMRegister, $src2$$XMMRegister);
8383     __ pcmpgtq($xmm_0$$XMMRegister, $tmp3$$XMMRegister);
8384     __ blendvpd($tmp2$$XMMRegister, $tmp3$$XMMRegister);
8385     __ movdqu($xmm_0$$XMMRegister, $tmp2$$XMMRegister);
8386     __ movdq($tmp3$$XMMRegister, $src1$$Register);
8387     __ pcmpgtq($xmm_0$$XMMRegister, $tmp3$$XMMRegister);
8388     __ blendvpd($tmp2$$XMMRegister,$tmp3$$XMMRegister);
8389     __ movdq($dst$$Register, $tmp2$$XMMRegister);
8390   %}
8391   ins_pipe( pipe_slow );
8392 %}
8393 
8394 instruct rvmin2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, regF tmp, regF tmp2, regF tmp3) %{
8395   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
8396   match(Set dst (MinReductionV src1 src2));
8397   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
  format %{ "pshufd     $tmp2,$src2,0xE\n\t"
            "vpcmpgtq   $tmp,$tmp2,$src2\n\t"
            "vblendvpd  $tmp2,$tmp2,$src2,$tmp\n\t"
            "movdq      $tmp,$src1\n\t"
            "vpcmpgtq   $tmp3,$tmp2,$tmp\n\t"
            "vblendvpd  $tmp2,$tmp2,$tmp,$tmp3\n\t"
            "movdq      $dst,$tmp2\t! min reduction2L" %}
8405   ins_encode %{
8406     int vector_len = 0;
8407     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
8408     __ vpcmpgtq($tmp$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, vector_len);
8409     __ vblendvpd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister, vector_len);
8410     __ movdq($tmp$$XMMRegister,$src1$$Register);
8411     __ vpcmpgtq($tmp3$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
    __ vblendvpd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, vector_len);
8413     __ movdq($dst$$Register, $tmp2$$XMMRegister);
8414   %}
8415   ins_pipe( pipe_slow );
8416 %}
8417 
8418 instruct rvmin4L_reduction_reg(rRegL dst, rRegL src1, vecY src2, regF tmp, regF tmp2, regF tmp3) %{
8419   predicate(UseAVX > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
8420   match(Set dst (MinReductionV src1 src2));
8421   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
  format %{ "vextracti128_high  $tmp2,$src2\n\t"
            "vpcmpgtq   $tmp,$tmp2,$src2\n\t"
            "vblendvpd  $tmp2,$tmp2,$src2,$tmp\n\t"
            "pshufd     $tmp3,$tmp2,0xE\n\t"
            "vpcmpgtq   $tmp,$tmp3,$tmp2\n\t"
            "vblendvpd  $tmp3,$tmp3,$tmp2,$tmp\n\t"
            "movdq      $tmp,$src1\n\t"
            "vpcmpgtq   $tmp2,$tmp3,$tmp\n\t"
            "vblendvpd  $tmp2,$tmp3,$tmp,$tmp2\n\t"
            "movdq      $dst,$tmp2\t! min reduction4L" %}
8432   ins_encode %{
8433     int vector_len = 1;
8434     __ vextracti128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
8435     __ vpcmpgtq($tmp$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, vector_len);
8436     __ vblendvpd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister, vector_len);
8437     __ pshufd($tmp3$$XMMRegister, $tmp2$$XMMRegister, 0xE);
8438     __ vpcmpgtq($tmp$$XMMRegister, $tmp3$$XMMRegister, $tmp2$$XMMRegister, vector_len);
8439     __ vblendvpd($tmp3$$XMMRegister,$tmp3$$XMMRegister,$tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
8440     __ movdq($tmp$$XMMRegister,$src1$$Register);
8441     __ vpcmpgtq($tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp$$XMMRegister, vector_len);
8442     __ vblendvpd($tmp2$$XMMRegister, $tmp3$$XMMRegister,$tmp$$XMMRegister,$tmp2$$XMMRegister, vector_len);
8443     __ movdq($dst$$Register, $tmp2$$XMMRegister);
8444   %}
8445   ins_pipe( pipe_slow );
8446 %}
8447 
8448 instruct rvmin8L_reduction_reg(rRegL dst, rRegL src1, vecZ src2, regF tmp, regF tmp2, regF tmp3) %{
8449   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
8450   match(Set dst (MinReductionV src1 src2));
8451   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
  format %{ "vextracti64x4_high  $tmp3,$src2\n\t"
            "vpcmpgtq   $tmp,$tmp3,$src2\n\t"
            "vblendvpd  $tmp3,$tmp3,$src2,$tmp\n\t"
            "vextracti128_high  $tmp2,$tmp3\n\t"
            "vpcmpgtq   $tmp,$tmp2,$tmp3\n\t"
            "vblendvpd  $tmp2,$tmp2,$tmp3,$tmp\n\t"
            "pshufd     $tmp3,$tmp2,0xE\n\t"
            "vpcmpgtq   $tmp,$tmp3,$tmp2\n\t"
            "vblendvpd  $tmp3,$tmp3,$tmp2,$tmp\n\t"
            "movdq      $tmp2,$src1\n\t"
            "vpcmpgtq   $tmp,$tmp2,$tmp3\n\t"
            "vblendvpd  $tmp2,$tmp2,$tmp3,$tmp\n\t"
            "movdq      $dst,$tmp2\t! min reduction8L" %}
8465   ins_encode %{
8466     __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
8467     __ vpcmpgtq($tmp$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
8468     __ vblendvpd($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister, 1);
8469     __ vextracti128_high($tmp2$$XMMRegister, $tmp3$$XMMRegister);
8470     __ vpcmpgtq($tmp$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, 1);
8471     __ vblendvpd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp$$XMMRegister, 1);
8472     __ pshufd($tmp3$$XMMRegister, $tmp2$$XMMRegister, 0xE);
8473     __ vpcmpgtq($tmp$$XMMRegister, $tmp3$$XMMRegister, $tmp2$$XMMRegister, 1);
8474     __ vblendvpd($tmp3$$XMMRegister,$tmp3$$XMMRegister,$tmp2$$XMMRegister, $tmp$$XMMRegister, 1);
8475     __ movdq($tmp2$$XMMRegister, $src1$$Register);
8476     __ vpcmpgtq($tmp$$XMMRegister,$tmp2$$XMMRegister, $tmp3$$XMMRegister, 1);
8477     __ vblendvpd($tmp2$$XMMRegister,$tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp$$XMMRegister, 1);
8478     __ movdq($dst$$Register, $tmp2$$XMMRegister);
8479   %}
8480   ins_pipe( pipe_slow );
8481 %}
8482 
8483 // Float Min Reduction
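// Float lanes are folded with (v)minps after pshufd rotations, mirroring the
// mul reductions above. Note that minps returns its second operand when the
// inputs are unordered, so NaN handling depends on lane order here.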
8484 instruct rsmin2F_reduction_reg(regF dst, vecD src2, regF tmp) %{
8485   predicate(UseSSE > 0 && UseAVX == 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
8486   match(Set dst (MinReductionV dst src2));
8487   effect(TEMP dst, TEMP tmp);
  format %{ "minps   $dst,$src2\n\t"
8489             "pshufd  $tmp,$src2,0x1\n\t"
8490             "minps   $dst,$tmp\t! min reduction2F" %}
8491   ins_encode %{
8492     __ minps($dst$$XMMRegister, $src2$$XMMRegister);
8493     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x1);
8494     __ minps($dst$$XMMRegister, $tmp$$XMMRegister);
8495   %}
8496   ins_pipe( pipe_slow );
8497 %}
8498 
8499 instruct rvmin2F_reduction_reg(regF dst, vecD src2, regF tmp) %{
8500   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
8501   match(Set dst (MinReductionV dst src2));
8502   effect(TEMP dst, TEMP tmp);
8503   format %{ "vminps  $dst,$dst,$src2\n\t"
8504             "pshufd  $tmp,$src2,0x1\n\t"
8505             "vminps  $dst,$dst,$tmp\t! min reduction2F" %}
8506   ins_encode %{
8507     int vector_len = 0;
8508     __ vminps($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister, vector_len);
8509     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x1);
8510     __ vminps($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
8511   %}
8512   ins_pipe( pipe_slow );
8513 %}
8514 
8515 instruct rsmin4F_reduction_reg(regF dst, vecX src2, regF tmp) %{
8516   predicate(UseSSE > 0 && UseAVX == 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
8517   match(Set dst (MinReductionV dst src2));
8518   effect(TEMP tmp, TEMP dst);
  format %{ "minps   $dst,$src2\n\t"
            "pshufd  $tmp,$src2,0x1\n\t"
            "minps   $dst,$tmp\n\t"
            "pshufd  $tmp,$src2,0x2\n\t"
            "minps   $dst,$tmp\n\t"
            "pshufd  $tmp,$src2,0x3\n\t"
            "minps   $dst,$tmp\t! min reduction4F" %}
8526   ins_encode %{
8528     __ minps($dst$$XMMRegister, $src2$$XMMRegister);
8529     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x1);
8530     __ minps($dst$$XMMRegister, $tmp$$XMMRegister);
8531     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x2);
8532     __ minps($dst$$XMMRegister, $tmp$$XMMRegister);
8533     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x3);
8534     __ minps($dst$$XMMRegister, $tmp$$XMMRegister);
8535   %}
8536   ins_pipe( pipe_slow );
8537 %}
8538 
8539 instruct rvmin4F_reduction_reg(regF dst, vecX src2, regF tmp) %{
8540   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
8541   match(Set dst (MinReductionV dst src2));
8542   effect(TEMP tmp, TEMP dst);
  format %{ "vminps  $dst,$dst,$src2\n\t"
            "pshufd  $tmp,$src2,0x1\n\t"
            "vminps  $dst,$dst,$tmp\n\t"
            "pshufd  $tmp,$src2,0x2\n\t"
            "vminps  $dst,$dst,$tmp\n\t"
            "pshufd  $tmp,$src2,0x3\n\t"
            "vminps  $dst,$dst,$tmp\t! min reduction4F" %}
8550   ins_encode %{
8551     int vector_len = 0;
8552     __ vminps($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister, vector_len);
8553     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x1);
8554     __ vminps($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
8555     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x2);
8556     __ vminps($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
8557     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x3);
8558     __ vminps($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
8559   %}
8560   ins_pipe( pipe_slow );
8561 %}
8562 
8563 instruct rvmin8F_reduction_reg(regF dst, vecY src2, regF tmp, regF tmp2) %{
8564   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
8565   match(Set dst (MinReductionV dst src2));
8566   effect(TEMP tmp, TEMP dst, TEMP tmp2);
8567   format %{ "vminps  $dst,$dst,$src2\n\t"
8568             "pshufd  $tmp,$src2,0x01\n\t"
8569             "vminps  $dst,$dst,$tmp\n\t"
8570             "pshufd  $tmp,$src2,0x02\n\t"
8571             "vminps  $dst,$dst,$tmp\n\t"
8572             "pshufd  $tmp,$src2,0x03\n\t"
8573             "vminps  $dst,$dst,$tmp\n\t"
8574             "vextractf128_high  $tmp2,$src2\n\t"
8575             "vminps  $dst,$dst,$tmp2\n\t"
8576             "pshufd  $tmp,$tmp2,0x01\n\t"
8577             "vminps  $dst,$dst,$tmp\n\t"
8578             "pshufd  $tmp,$tmp2,0x02\n\t"
8579             "vminps  $dst,$dst,$tmp\n\t"
8580             "pshufd  $tmp,$tmp2,0x03\n\t"
            "vminps  $dst,$dst,$tmp\t! min reduction8F" %}
8582   ins_encode %{
8583     int vector_len = 1;
8584     __ vminps($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister, vector_len);
8585     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
8586     __ vminps($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
8587     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
8588     __ vminps($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
8589     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
8590     __ vminps($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
8591     __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
8592     __ vminps($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister, vector_len);
8593     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
8594     __ vminps($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
8595     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
8596     __ vminps($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
8597     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
8598     __ vminps($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
8599   %}
8600   ins_pipe( pipe_slow );
8601 %}
8602 
8603 instruct rvmin16F_reduction_reg(regF dst, vecZ src2, regF tmp, regF tmp2) %{
8604   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
8605   match(Set dst (MinReductionV dst src2));
8606   effect(TEMP tmp, TEMP dst, TEMP tmp2);
8607   format %{ "vminps  $dst,$dst,$src2\n\t"
8608             "pshufd  $tmp,$src2,0x01\n\t"
8609             "vminps  $dst,$dst,$tmp\n\t"
8610             "pshufd  $tmp,$src2,0x02\n\t"
8611             "vminps  $dst,$dst,$tmp\n\t"
8612             "pshufd  $tmp,$src2,0x03\n\t"
8613             "vminps  $dst,$dst,$tmp\n\t"
8614             "vextractf32x4  $tmp2,$src2,0x1\n\t"
8615             "vminps  $dst,$dst,$tmp2\n\t"
8616             "pshufd  $tmp,$tmp2,0x01\n\t"
8617             "vminps  $dst,$dst,$tmp\n\t"
8618             "pshufd  $tmp,$tmp2,0x02\n\t"
8619             "vminps  $dst,$dst,$tmp\n\t"
8620             "pshufd  $tmp,$tmp2,0x03\n\t"
8621             "vminps  $dst,$dst,$tmp\n\t"
8622             "vextractf32x4  $tmp2,$src2,0x2\n\t"
8623             "vminps  $dst,$dst,$tmp2\n\t"
8624             "pshufd  $tmp,$tmp2,0x01\n\t"
8625             "vminps  $dst,$dst,$tmp\n\t"
8626             "pshufd  $tmp,$tmp2,0x02\n\t"
8627             "vminps  $dst,$dst,$tmp\n\t"
8628             "pshufd  $tmp,$tmp2,0x03\n\t"
8629             "vminps  $dst,$dst,$tmp\n\t"
8630             "vextractf32x4  $tmp2,$src2,0x3\n\t"
8631             "vminps  $dst,$dst,$tmp2\n\t"
8632             "pshufd  $tmp,$tmp2,0x01\n\t"
8633             "vminps  $dst,$dst,$tmp\n\t"
8634             "pshufd  $tmp,$tmp2,0x02\n\t"
8635             "vminps  $dst,$dst,$tmp\n\t"
8636             "pshufd  $tmp,$tmp2,0x03\n\t"
            "vminps  $dst,$dst,$tmp\t! min reduction16F" %}
8638   ins_encode %{
8639     int vector_len = 2;
8640     __ vminps($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister, vector_len);
8641     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
8642     __ vminps($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
8643     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
8644     __ vminps($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
8645     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
8646     __ vminps($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
8647     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
8648     __ vminps($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister, vector_len);
8649     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
8650     __ vminps($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
8651     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
8652     __ vminps($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
8653     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
8654     __ vminps($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
8655     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
8656     __ vminps($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister, vector_len);
8657     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
8658     __ vminps($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
8659     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
8660     __ vminps($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
8661     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
8662     __ vminps($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
8663     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
8664     __ vminps($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister, vector_len);
8665     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
8666     __ vminps($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
8667     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
8668     __ vminps($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
8669     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
8670     __ vminps($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
8671   %}
8672   ins_pipe( pipe_slow );
8673 %}
8674 
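// Double Min Reduction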
8675 instruct rsmin2D_reduction_reg(regD dst, vecX src2, regD tmp) %{
  predicate(UseSSE > 1 && UseAVX == 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
8677   match(Set dst (MinReductionV dst src2));
8678   effect(TEMP tmp, TEMP dst);
8679   format %{ "minpd   $dst,$src2\n\t"
8680             "pshufd  $tmp,$src2,0xE\n\t"
8681             "minpd   $dst,$tmp\t! min reduction2D" %}
8682   ins_encode %{
8683     __ minpd($dst$$XMMRegister, $src2$$XMMRegister);
8684     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
8685     __ minpd($dst$$XMMRegister, $tmp$$XMMRegister);
8686   %}
8687   ins_pipe( pipe_slow );
8688 %}
8689 
8690 instruct rvmin2D_reduction_reg(regD dst, vecX src2, regD tmp) %{
8691   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
8692   match(Set dst (MinReductionV dst src2));
8693   effect(TEMP tmp, TEMP dst);
8694   format %{ "vminpd  $dst,$dst,$src2\n\t"
8695             "pshufd  $tmp,$src2,0xE\n\t"
8696             "vminpd  $dst,$dst,$tmp\t! min reduction2D" %}
8697   ins_encode %{
8698     int vector_len = 0;
8699     __ vminpd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister, vector_len);
8700     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
8701     __ vminpd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
8702   %}
8703   ins_pipe( pipe_slow );
8704 %}
8705 
8706 instruct rvmin4D_reduction_reg(regD dst, vecY src2, regD tmp, regD tmp2) %{
8707   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
8708   match(Set dst (MinReductionV dst src2));
8709   effect(TEMP tmp, TEMP dst, TEMP tmp2);
8710   format %{ "vminpd  $dst,$dst,$src2\n\t"
8711             "pshufd  $tmp,$src2,0xE\n\t"
8712             "vminpd  $dst,$dst,$tmp\n\t"
            "vextractf128_high  $tmp2,$src2\n\t"
8714             "vminpd  $dst,$dst,$tmp2\n\t"
8715             "pshufd  $tmp,$tmp2,0xE\n\t"
8716             "vminpd  $dst,$dst,$tmp\t! min reduction4D" %}
8717   ins_encode %{
8718     int vector_len = 1;
8719     __ vminpd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister, vector_len);
8720     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
8721     __ vminpd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
    __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
8723     __ vminpd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister, vector_len);
8724     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
8725     __ vminpd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
8726   %}
8727   ins_pipe( pipe_slow );
8728 %}
8729 
8730 instruct rvmin8D_reduction_reg(regD dst, vecZ src2, regD tmp, regD tmp2) %{
8731   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
8732   match(Set dst (MinReductionV dst src2));
8733   effect(TEMP tmp, TEMP dst, TEMP tmp2);
8734   format %{ "vminpd  $dst,$dst,$src2\n\t"
8735             "pshufd  $tmp,$src2,0xE\n\t"
8736             "vminpd  $dst,$dst,$tmp\n\t"
8737             "vextractf32x4  $tmp2,$src2,0x1\n\t"
8738             "vminpd  $dst,$dst,$tmp2\n\t"
8739             "pshufd  $tmp,$tmp2,0xE\n\t"
8740             "vminpd  $dst,$dst,$tmp\n\t"
8741             "vextractf32x4  $tmp2,$src2,0x2\n\t"
8742             "vminpd  $dst,$dst,$tmp2\n\t"
8743             "pshufd  $tmp,$tmp2,0xE\n\t"
8744             "vminpd  $dst,$dst,$tmp\n\t"
8745             "vextractf32x4  $tmp2,$src2,0x3\n\t"
8746             "vminpd  $dst,$dst,$tmp2\n\t"
8747             "pshufd  $tmp,$tmp2,0xE\n\t"
8748             "vminpd  $dst,$dst,$tmp\t! min reduction8D" %}
8749   ins_encode %{
8750     int vector_len = 2;
8751     __ vminpd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister, vector_len);
8752     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
8753     __ vminpd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
8754     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
8755     __ vminpd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister, vector_len);
8756     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
8757     __ vminpd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
8758     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
8759     __ vminpd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister, vector_len);
8760     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
8761     __ vminpd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
8762     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
8763     __ vminpd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister, vector_len);
8764     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
8765     __ vminpd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
8766   %}
8767   ins_pipe( pipe_slow );
8768 %}
8769 
8770 // ------- Max Reduction ------------
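//
// The integer max reductions below all share one shape: fold the upper lanes
// of $src2 onto the lower lanes with extract/shuffle plus a packed signed max
// (pmaxsb/pmaxsw/pmaxsd or their VEX forms), pull the last surviving lanes
// out with pextr*, sign-extend them, and finally merge them and the incoming
// scalar $src1 using cmpl + cmovl(Assembler::greater, ...) so the largest
// value lands in $dst.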
8771 
8772 instruct rsmax8B_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, rRegI tmp2, rRegI tmp3, regF tmp4) %{
8773   predicate(UseSSE > 3 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
8774   match(Set dst (MaxReductionV src1 src2));
8775   effect(TEMP dst, TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP tmp4);
8776   format %{ "pshufd  $tmp,$src2,0x1\n\t"
            "pmaxsb  $tmp,$src2\n\t"
8778             "pextrb  $tmp2,$tmp, 0x1\n\t"
8779             "movsbl  $tmp2,$tmp2\n\t"
8780             "pextrb  $tmp3,$tmp,0x0\n\t"
8781             "movsbl  $tmp3,$tmp3\n\t"
8782             "cmpl  $tmp2,$tmp3\n\t"
8783             "cmovl  $tmp3,$tmp2\n\t"
8784             "cmpl  $src1,$tmp3\n\t"
            "cmovl  $tmp3,$src1\n\t"
            "movl  $dst,$tmp3\n\t"
            "pextrb  $tmp2,$tmp,0x3\n\t"
            "movsbl  $tmp2,$tmp2\n\t"
            "pextrb  $tmp3,$tmp,0x2\n\t"
            "movsbl  $tmp3,$tmp3\n\t"
            "cmpl  $tmp2,$tmp3\n\t"
            "cmovl  $tmp3,$tmp2\n\t"
            "cmpl  $tmp3,$dst\n\t"
            "cmovl  $dst,$tmp3\n\t"
            "movsbl  $dst,$dst\t! max reduction8B" %}
8795   ins_encode %{
8796     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister,0x1);
8797     __ pmaxsb($tmp$$XMMRegister, $src2$$XMMRegister);
8798     __ pextrb($tmp2$$Register, $tmp$$XMMRegister,0x1);
8799     __ movsbl($tmp2$$Register, $tmp2$$Register);
8800     __ pextrb($tmp3$$Register, $tmp$$XMMRegister,0x0);
8801     __ movsbl($tmp3$$Register, $tmp3$$Register);
8802     __ cmpl($tmp2$$Register, $tmp3$$Register);
8803     __ cmovl(Assembler::greater, $tmp3$$Register, $tmp2$$Register);
8804     __ cmpl($src1$$Register, $tmp3$$Register);
8805     __ cmovl(Assembler::greater, $tmp3$$Register, $src1$$Register);
8806     __ movl($dst$$Register, $tmp3$$Register);
8807     __ pextrb($tmp2$$Register, $tmp$$XMMRegister,0x3);
8808     __ movsbl($tmp2$$Register, $tmp2$$Register);
8809     __ pextrb($tmp3$$Register, $tmp$$XMMRegister,0x2);
8810     __ movsbl($tmp3$$Register, $tmp3$$Register);
8811     __ cmpl($tmp2$$Register, $tmp3$$Register);
8812     __ cmovl(Assembler::greater, $tmp3$$Register, $tmp2$$Register);
8813     __ cmpl($tmp3$$Register, $dst$$Register);
8814     __ cmovl(Assembler::greater, $dst$$Register, $tmp3$$Register);
8815     __ movsbl($dst$$Register, $dst$$Register);
8816   %}
8817   ins_pipe( pipe_slow );
8818 %}
8819 
8820 instruct rsmax16B_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, rRegI tmp2, rRegI tmp3, regF tmp4) %{
8821   predicate(UseSSE > 3 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
8822   match(Set dst (MaxReductionV src1 src2));
8823   effect(TEMP dst, TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP tmp4);
8824   format %{ "pshufd  $tmp4,$src2,0xE\n\t"
8825             "pmaxsb  $tmp4,$src2\n\t"
8826             "pshufd  $tmp,$tmp4,0x1\n\t"
8827             "pmaxsb  $tmp,$tmp4\n\t"
8828             "pextrb  $tmp2,$tmp, 0x1\n\t"
8829             "movsbl  $tmp2,$tmp2\n\t"
8830             "pextrb  $tmp3,$tmp,0x0\n\t"
8831             "movsbl  $tmp3,$tmp3\n\t"
8832             "cmpl  $tmp2,$tmp3\n\t"
8833             "cmovl  $tmp3,$tmp2\n\t"
8834             "cmpl  $src1,$tmp3\n\t"
            "cmovl  $tmp3,$src1\n\t"
            "movl  $dst,$tmp3\n\t"
            "pextrb  $tmp2,$tmp,0x3\n\t"
            "movsbl  $tmp2,$tmp2\n\t"
            "pextrb  $tmp3,$tmp,0x2\n\t"
            "movsbl  $tmp3,$tmp3\n\t"
            "cmpl  $tmp2,$tmp3\n\t"
            "cmovl  $tmp3,$tmp2\n\t"
            "cmpl  $tmp3,$dst\n\t"
            "cmovl  $dst,$tmp3\n\t"
            "movsbl  $dst,$dst\t! max reduction16B" %}
8845   ins_encode %{
8846     __ pshufd($tmp4$$XMMRegister, $src2$$XMMRegister, 0xE);
8847     __ pmaxsb($tmp4$$XMMRegister, $src2$$XMMRegister);
8848     __ pshufd($tmp$$XMMRegister, $tmp4$$XMMRegister,0x1);
8849     __ pmaxsb($tmp$$XMMRegister, $tmp4$$XMMRegister);
8850     __ pextrb($tmp2$$Register, $tmp$$XMMRegister,0x1);
8851     __ movsbl($tmp2$$Register, $tmp2$$Register);
8852     __ pextrb($tmp3$$Register, $tmp$$XMMRegister,0x0);
8853     __ movsbl($tmp3$$Register, $tmp3$$Register);
8854     __ cmpl($tmp2$$Register, $tmp3$$Register);
8855     __ cmovl(Assembler::greater, $tmp3$$Register, $tmp2$$Register);
8856     __ cmpl($src1$$Register, $tmp3$$Register);
8857     __ cmovl(Assembler::greater, $tmp3$$Register, $src1$$Register);
8858     __ movl($dst$$Register, $tmp3$$Register);
8859     __ pextrb($tmp2$$Register, $tmp$$XMMRegister,0x3);
8860     __ movsbl($tmp2$$Register, $tmp2$$Register);
8861     __ pextrb($tmp3$$Register, $tmp$$XMMRegister,0x2);
8862     __ movsbl($tmp3$$Register, $tmp3$$Register);
8863     __ cmpl($tmp2$$Register, $tmp3$$Register);
8864     __ cmovl(Assembler::greater, $tmp3$$Register, $tmp2$$Register);
8865     __ cmpl($tmp3$$Register, $dst$$Register);
8866     __ cmovl(Assembler::greater, $dst$$Register, $tmp3$$Register);
8867     __ movsbl($dst$$Register, $dst$$Register);
8868   %}
8869   ins_pipe( pipe_slow );
8870 %}
8871 
8872 instruct rvmax16B_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, rRegI tmp2, rRegI tmp3, regF tmp4) %{
8873   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
8874   match(Set dst (MaxReductionV src1 src2));
8875   effect(TEMP dst, TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP tmp4);
8876   format %{ "pshufd  $tmp4,$src2,0xE\n\t"
            "vpmaxsb  $tmp4,$tmp4,$src2\n\t"
8878             "pshufd  $tmp,$tmp4,0x1\n\t"
8879             "vpmaxsb  $tmp,$tmp,$tmp4\n\t"
8880             "pextrb  $tmp2,$tmp, 0x1\n\t"
8881             "movsbl  $tmp2,$tmp2\n\t"
8882             "pextrb  $tmp3,$tmp,0x0\n\t"
8883             "movsbl  $tmp3,$tmp3\n\t"
8884             "cmpl  $tmp2,$tmp3\n\t"
8885             "cmovl  $tmp3,$tmp2\n\t"
8886             "cmpl  $src1,$tmp3\n\t"
            "cmovl  $tmp3,$src1\n\t"
            "movl  $dst,$tmp3\n\t"
            "pextrb  $tmp2,$tmp,0x3\n\t"
            "movsbl  $tmp2,$tmp2\n\t"
            "pextrb  $tmp3,$tmp,0x2\n\t"
            "movsbl  $tmp3,$tmp3\n\t"
            "cmpl  $tmp2,$tmp3\n\t"
            "cmovl  $tmp3,$tmp2\n\t"
            "cmpl  $tmp3,$dst\n\t"
            "cmovl  $dst,$tmp3\n\t"
            "movsbl  $dst,$dst\t! max reduction16B" %}
8897   ins_encode %{
8899     __ pshufd($tmp4$$XMMRegister, $src2$$XMMRegister, 0xE);
8900     __ vpmaxsb($tmp4$$XMMRegister, $tmp4$$XMMRegister, $src2$$XMMRegister, 0);
8901     __ pshufd($tmp$$XMMRegister, $tmp4$$XMMRegister,0x1);
8902     __ vpmaxsb($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp4$$XMMRegister, 0);
8903     __ pextrb($tmp2$$Register, $tmp$$XMMRegister,0x1);
8904     __ movsbl($tmp2$$Register, $tmp2$$Register);
8905     __ pextrb($tmp3$$Register, $tmp$$XMMRegister,0x0);
8906     __ movsbl($tmp3$$Register, $tmp3$$Register);
8907     __ cmpl($tmp2$$Register, $tmp3$$Register);
8908     __ cmovl(Assembler::greater, $tmp3$$Register, $tmp2$$Register);
8909     __ cmpl($src1$$Register, $tmp3$$Register);
8910     __ cmovl(Assembler::greater, $tmp3$$Register, $src1$$Register);
8911     __ movl($dst$$Register, $tmp3$$Register);
8912     __ pextrb($tmp2$$Register, $tmp$$XMMRegister,0x3);
8913     __ movsbl($tmp2$$Register, $tmp2$$Register);
8914     __ pextrb($tmp3$$Register, $tmp$$XMMRegister,0x2);
8915     __ movsbl($tmp3$$Register, $tmp3$$Register);
8916     __ cmpl($tmp2$$Register, $tmp3$$Register);
8917     __ cmovl(Assembler::greater, $tmp3$$Register, $tmp2$$Register);
8918     __ cmpl($tmp3$$Register, $dst$$Register);
8919     __ cmovl(Assembler::greater, $dst$$Register, $tmp3$$Register);
8920     __ movsbl($dst$$Register, $dst$$Register);
8921   %}
8922   ins_pipe( pipe_slow );
8923 %}
8924 
8925 instruct rvmax32B_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, rRegI tmp2, rRegI tmp3, regF tmp4) %{
8926   predicate(UseAVX > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
8927   match(Set dst (MaxReductionV src1 src2));
8928   effect(TEMP dst, TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP tmp4);
8929   format %{ "vextracti128_high  $tmp,$src2\n\t"
8930             "vpmaxsb  $tmp,$tmp,$src2\n\t"
8931             "pshufd  $tmp4,$tmp,0xE\n\t"
8932             "vpmaxsb  $tmp4,$tmp4,$tmp\n\t"
8933             "pshufd  $tmp,$tmp4,0x1\n\t"
8934             "vpmaxsb  $tmp,$tmp,$tmp4\n\t"
8935             "pextrb  $tmp2,$tmp, 0x1\n\t"
8936             "movsbl  $tmp2,$tmp2\n\t"
8937             "pextrb  $tmp3,$tmp,0x0\n\t"
8938             "movsbl  $tmp3,$tmp3\n\t"
8939             "cmpl  $tmp2,$tmp3\n\t"
8940             "cmovl  $tmp3,$tmp2\n\t"
8941             "cmpl  $src1,$tmp3\n\t"
            "cmovl  $tmp3,$src1\n\t"
            "movl  $dst,$tmp3\n\t"
            "pextrb  $tmp2,$tmp,0x3\n\t"
            "movsbl  $tmp2,$tmp2\n\t"
            "pextrb  $tmp3,$tmp,0x2\n\t"
            "movsbl  $tmp3,$tmp3\n\t"
            "cmpl  $tmp2,$tmp3\n\t"
            "cmovl  $tmp3,$tmp2\n\t"
            "cmpl  $tmp3,$dst\n\t"
            "cmovl  $dst,$tmp3\n\t"
            "movsbl  $dst,$dst\t! max reduction32B" %}
8952   ins_encode %{
8953     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
8954     __ vpmaxsb($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
8955     __ pshufd($tmp4$$XMMRegister, $tmp$$XMMRegister, 0xE);
8956     __ vpmaxsb($tmp4$$XMMRegister, $tmp4$$XMMRegister, $tmp$$XMMRegister, 0);
8957     __ pshufd($tmp$$XMMRegister, $tmp4$$XMMRegister,0x1);
8958     __ vpmaxsb($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp4$$XMMRegister, 0);
8959     __ pextrb($tmp2$$Register, $tmp$$XMMRegister,0x1);
8960     __ movsbl($tmp2$$Register, $tmp2$$Register);
8961     __ pextrb($tmp3$$Register, $tmp$$XMMRegister,0x0);
8962     __ movsbl($tmp3$$Register, $tmp3$$Register);
8963     __ cmpl($tmp2$$Register, $tmp3$$Register);
8964     __ cmovl(Assembler::greater, $tmp3$$Register, $tmp2$$Register);
8965     __ cmpl($src1$$Register, $tmp3$$Register);
8966     __ cmovl(Assembler::greater, $tmp3$$Register, $src1$$Register);
8967     __ movl($dst$$Register, $tmp3$$Register);
8968     __ pextrb($tmp2$$Register, $tmp$$XMMRegister,0x3);
8969     __ movsbl($tmp2$$Register, $tmp2$$Register);
8970     __ pextrb($tmp3$$Register, $tmp$$XMMRegister,0x2);
8971     __ movsbl($tmp3$$Register, $tmp3$$Register);
8972     __ cmpl($tmp2$$Register, $tmp3$$Register);
8973     __ cmovl(Assembler::greater, $tmp3$$Register, $tmp2$$Register);
8974     __ cmpl($tmp3$$Register, $dst$$Register);
8975     __ cmovl(Assembler::greater, $dst$$Register, $tmp3$$Register);
8976     __ movsbl($dst$$Register, $dst$$Register);
8977   %}
8978   ins_pipe( pipe_slow );
8979 %}
8980 
8981 instruct rvmax64B_reduction_reg(rRegI dst, rRegI src1, vecZ src2, regF tmp, rRegI tmp2, rRegI tmp3, regF tmp4) %{
8982   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
8983   match(Set dst (MaxReductionV src1 src2));
8984   effect(TEMP dst, TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP tmp4);
8985   format %{ "vextracti64x4_high  $tmp4,$src2\n\t"
8986             "vpmaxsb  $tmp4,$tmp4,$src2\n\t"
8987             "vextracti128_high  $tmp,$tmp4\n\t"
8988             "vpmaxsb  $tmp,$tmp,$tmp4\n\t"
8989             "pshufd  $tmp4,$tmp,0xE\n\t"
            "vpmaxsb  $tmp4,$tmp4,$tmp\n\t"
8993             "pshufd  $tmp,$tmp4,0x1\n\t"
8994             "vpmaxsb  $tmp,$tmp,$tmp4\n\t"
8995             "pextrb  $tmp2,$tmp, 0x1\n\t"
8996             "movsbl  $tmp2,$tmp2\n\t"
8997             "pextrb  $tmp3,$tmp,0x0\n\t"
8998             "movsbl  $tmp3,$tmp3\n\t"
8999             "cmpl  $tmp2,$tmp3\n\t"
9000             "cmovl  $tmp3,$tmp2\n\t"
9001             "cmpl  $src1,$tmp3\n\t"
            "cmovl  $tmp3,$src1\n\t"
            "movl  $dst,$tmp3\n\t"
            "pextrb  $tmp2,$tmp,0x3\n\t"
            "movsbl  $tmp2,$tmp2\n\t"
            "pextrb  $tmp3,$tmp,0x2\n\t"
            "movsbl  $tmp3,$tmp3\n\t"
            "cmpl  $tmp2,$tmp3\n\t"
            "cmovl  $tmp3,$tmp2\n\t"
            "cmpl  $tmp3,$dst\n\t"
            "cmovl  $dst,$tmp3\n\t"
            "movsbl  $dst,$dst\t! max reduction64B" %}
9012   ins_encode %{
9013     __ vextracti64x4_high($tmp4$$XMMRegister, $src2$$XMMRegister);
9014     __ vpmaxsb($tmp4$$XMMRegister, $tmp4$$XMMRegister, $src2$$XMMRegister, 1);
9015     __ vextracti128_high($tmp$$XMMRegister, $tmp4$$XMMRegister);
9016     __ vpmaxsb($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp4$$XMMRegister, 0);
9017     __ pshufd($tmp4$$XMMRegister, $tmp$$XMMRegister, 0xE);
9018     __ vpmaxsb($tmp4$$XMMRegister, $tmp4$$XMMRegister, $tmp$$XMMRegister, 0);
9019     __ pshufd($tmp$$XMMRegister, $tmp4$$XMMRegister,0x1);
9020     __ vpmaxsb($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp4$$XMMRegister, 0);
9021     __ pextrb($tmp2$$Register, $tmp$$XMMRegister,0x1);
9022     __ movsbl($tmp2$$Register, $tmp2$$Register);
9023     __ pextrb($tmp3$$Register, $tmp$$XMMRegister,0x0);
9024     __ movsbl($tmp3$$Register, $tmp3$$Register);
9025     __ cmpl($tmp2$$Register, $tmp3$$Register);
9026     __ cmovl(Assembler::greater, $tmp3$$Register, $tmp2$$Register);
9027     __ cmpl($src1$$Register, $tmp3$$Register);
9028     __ cmovl(Assembler::greater, $tmp3$$Register, $src1$$Register);
9029     __ movl($dst$$Register, $tmp3$$Register);
9030     __ pextrb($tmp2$$Register, $tmp$$XMMRegister,0x3);
9031     __ movsbl($tmp2$$Register, $tmp2$$Register);
9032     __ pextrb($tmp3$$Register, $tmp$$XMMRegister,0x2);
9033     __ movsbl($tmp3$$Register, $tmp3$$Register);
9034     __ cmpl($tmp2$$Register, $tmp3$$Register);
9035     __ cmovl(Assembler::greater, $tmp3$$Register, $tmp2$$Register);
9036     __ cmpl($tmp3$$Register, $dst$$Register);
9037     __ cmovl(Assembler::greater, $dst$$Register, $tmp3$$Register);
9038     __ movsbl($dst$$Register, $dst$$Register);
9039   %}
9040   ins_pipe( pipe_slow );
9041 %}
9042 
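// Short Max Reduction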
9043 instruct rsmax4S_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, rRegI tmp2, rRegI tmp3) %{
9044   predicate(UseSSE > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
9045   match(Set dst (MaxReductionV src1 src2));
9046   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
  format %{ "pshufd  $tmp,$src2,0x1\n\t"
            "pmaxsw  $tmp,$src2\n\t"
            "pextrw  $tmp2,$tmp,0x1\n\t"
            "movswl  $tmp2,$tmp2\n\t"
            "pextrw  $tmp3,$tmp,0x0\n\t"
            "movswl  $tmp3,$tmp3\n\t"
            "cmpl  $tmp2,$tmp3\n\t"
            "cmovl  $tmp3,$tmp2\n\t"
            "cmpl  $src1,$tmp3\n\t"
            "cmovl  $tmp3,$src1\n\t"
            "movl  $dst,$tmp3\t! max reduction4S" %}
9054   ins_encode %{
9055     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister,0x1);
9056     __ pmaxsw($tmp$$XMMRegister, $src2$$XMMRegister);
9057     __ pextrw($tmp2$$Register, $tmp$$XMMRegister,0x1);
9058     __ movswl($tmp2$$Register, $tmp2$$Register);
9059     __ pextrw($tmp3$$Register, $tmp$$XMMRegister,0x0);
9060     __ movswl($tmp3$$Register, $tmp3$$Register);
9061     __ cmpl($tmp2$$Register, $tmp3$$Register);
9062     __ cmovl(Assembler::greater, $tmp3$$Register, $tmp2$$Register);
9063     __ cmpl($src1$$Register, $tmp3$$Register);
9064     __ cmovl(Assembler::greater, $tmp3$$Register, $src1$$Register);
9065     __ movl($dst$$Register, $tmp3$$Register);
9066   %}
9067   ins_pipe( pipe_slow );
9068 %}
9069 
9070 instruct rvmax4S_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, rRegI tmp2, rRegI tmp3) %{
9071   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
9072   match(Set dst (MaxReductionV src1 src2));
9073   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
  format %{ "pshufd   $tmp,$src2,0x1\n\t"
            "vpmaxsw  $tmp,$tmp,$src2\n\t"
            "pextrw   $tmp2,$tmp,0x1\n\t"
            "movswl   $tmp2,$tmp2\n\t"
            "pextrw   $tmp3,$tmp,0x0\n\t"
            "movswl   $tmp3,$tmp3\n\t"
            "cmpl     $tmp2,$tmp3\n\t"
            "cmovl    $tmp3,$tmp2\n\t"
            "cmpl     $src1,$tmp3\n\t"
            "cmovl    $tmp3,$src1\n\t"
            "movl     $dst,$tmp3\t! max reduction4S" %}
9081   ins_encode %{
9082     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister,0x1);
9083     __ vpmaxsw($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
9084     __ pextrw($tmp2$$Register, $tmp$$XMMRegister,0x1);
9085     __ movswl($tmp2$$Register, $tmp2$$Register);
9086     __ pextrw($tmp3$$Register, $tmp$$XMMRegister,0x0);
9087     __ movswl($tmp3$$Register, $tmp3$$Register);
9088     __ cmpl($tmp2$$Register, $tmp3$$Register);
9089     __ cmovl(Assembler::greater, $tmp3$$Register, $tmp2$$Register);
9090     __ cmpl($src1$$Register, $tmp3$$Register);
9091     __ cmovl(Assembler::greater, $tmp3$$Register, $src1$$Register);
9092     __ movl($dst$$Register, $tmp3$$Register);
9093   %}
9094   ins_pipe( pipe_slow );
9095 %}
9096 
9097 instruct rsmax8S_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2, rRegI tmp3, rRegI tmp4) %{
9098   predicate(UseSSE > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
9099   match(Set dst (MaxReductionV src1 src2));
9100   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP tmp4);
9101   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
9102             "pmaxsw  $tmp2,$src2\n\t"
9103             "pshufd  $tmp,$tmp2,0x1\n\t"
9104             "pmaxsw  $tmp,$tmp2\n\t"
            "pextrw  $tmp4,$tmp,0x1\n\t"
            "movswl  $tmp4,$tmp4\n\t"
            "pextrw  $tmp3,$tmp,0x0\n\t"
            "movswl  $tmp3,$tmp3\n\t"
            "cmpl  $tmp4,$tmp3\n\t"
            "cmovl  $tmp3,$tmp4\n\t"
            "cmpl  $src1,$tmp3\n\t"
            "cmovl  $tmp3,$src1\n\t"
            "movl  $dst,$tmp3\t! max reduction8S" %}
9114   ins_encode %{
9115     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister,0xE);
9116     __ pmaxsw($tmp2$$XMMRegister, $src2$$XMMRegister);
9117     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister,0x1);
9118     __ pmaxsw($tmp$$XMMRegister, $tmp2$$XMMRegister);
9119     __ pextrw($tmp4$$Register, $tmp$$XMMRegister,0x1);
9120     __ movswl($tmp4$$Register, $tmp4$$Register);
9121     __ pextrw($tmp3$$Register, $tmp$$XMMRegister,0x0);
9122     __ movswl($tmp3$$Register, $tmp3$$Register);
9123     __ cmpl($tmp4$$Register, $tmp3$$Register);
9124     __ cmovl(Assembler::greater, $tmp3$$Register, $tmp4$$Register);
9125     __ cmpl($src1$$Register, $tmp3$$Register);
9126     __ cmovl(Assembler::greater, $tmp3$$Register, $src1$$Register);
9127     __ movl($dst$$Register, $tmp3$$Register);
9128   %}
9129   ins_pipe( pipe_slow );
9130 %}
9131 
9132 instruct rvmax8S_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2, rRegI tmp3, rRegI tmp4) %{
9133   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
9134   match(Set dst (MaxReductionV src1 src2));
9135   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP tmp4);
  format %{ "pshufd   $tmp,$src2,0xE\n\t"
            "vpmaxsw  $tmp,$tmp,$src2\n\t"
            "pshufd   $tmp2,$tmp,0x1\n\t"
            "vpmaxsw  $tmp,$tmp,$tmp2\n\t"
            "pextrw   $tmp4,$tmp,0x1\n\t"
            "movswl   $tmp4,$tmp4\n\t"
            "pextrw   $tmp3,$tmp,0x0\n\t"
            "movswl   $tmp3,$tmp3\n\t"
            "cmpl     $tmp4,$tmp3\n\t"
            "cmovl    $tmp3,$tmp4\n\t"
            "cmpl     $src1,$tmp3\n\t"
            "cmovl    $tmp3,$src1\n\t"
            "movl     $dst,$tmp3\t! max reduction8S" %}
9146   ins_encode %{
9147     int vector_len = 0;
9148     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
9149     __ vpmaxsw($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
9150     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister,0x1);
9151     __ vpmaxsw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
9152     __ pextrw($tmp4$$Register, $tmp$$XMMRegister,0x1);
9153     __ movswl($tmp4$$Register, $tmp4$$Register);
9154     __ pextrw($tmp3$$Register, $tmp$$XMMRegister,0x0);
9155     __ movswl($tmp3$$Register, $tmp3$$Register);
9156     __ cmpl($tmp4$$Register, $tmp3$$Register);
9157     __ cmovl(Assembler::greater, $tmp3$$Register, $tmp4$$Register);
9158     __ cmpl($src1$$Register, $tmp3$$Register);
9159     __ cmovl(Assembler::greater, $tmp3$$Register, $src1$$Register);
9160     __ movl($dst$$Register, $tmp3$$Register);
9161   %}
9162   ins_pipe( pipe_slow );
9163 %}
9164 
9165 instruct rvmax16S_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2, rRegI tmp3, rRegI tmp4) %{
9166   predicate(UseAVX > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
9167   match(Set dst (MaxReductionV src1 src2));
9168   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP tmp4);
9169   format %{ "vextracti128_high  $tmp,$src2\n\t"
9170             "vpmaxsw  $tmp,$tmp,$src2\n\t"
9171             "pshufd  $tmp2,$tmp,0xE\n\t"
9172             "vpmaxsw  $tmp,$tmp,$tmp2\n\t"
9173             "pshufd  $tmp2,$tmp,0x1\n\t"
9174             "vpmaxsw  $tmp,$tmp,$tmp2\n\t"
            "pextrw  $tmp4,$tmp,0x1\n\t"
            "movswl  $tmp4,$tmp4\n\t"
            "pextrw  $tmp3,$tmp,0x0\n\t"
            "movswl  $tmp3,$tmp3\n\t"
            "cmpl  $tmp4,$tmp3\n\t"
            "cmovl  $tmp3,$tmp4\n\t"
            "cmpl  $src1,$tmp3\n\t"
            "cmovl  $tmp3,$src1\n\t"
            "movl  $dst,$tmp3\t! max reduction16S" %}
9184   ins_encode %{
9185     int vector_len = 1;
9186     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
9187     __ vpmaxsw($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
9188     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
9189     __ vpmaxsw($tmp$$XMMRegister,$tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
9190     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister,0x1);
9191     __ vpmaxsw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
9192     __ pextrw($tmp4$$Register, $tmp$$XMMRegister,0x1);
9193     __ movswl($tmp4$$Register, $tmp4$$Register);
9194     __ pextrw($tmp3$$Register, $tmp$$XMMRegister,0x0);
9195     __ movswl($tmp3$$Register, $tmp3$$Register);
9196     __ cmpl($tmp4$$Register, $tmp3$$Register);
9197     __ cmovl(Assembler::greater, $tmp3$$Register, $tmp4$$Register);
9198     __ cmpl($src1$$Register, $tmp3$$Register);
9199     __ cmovl(Assembler::greater, $tmp3$$Register, $src1$$Register);
9200     __ movl($dst$$Register, $tmp3$$Register);
9201   %}
9202   ins_pipe( pipe_slow );
9203 %}
9204 
9205 instruct rvmax32S_reduction_reg(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, rRegI tmp3, rRegI tmp4) %{
9206   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
9207   match(Set dst (MaxReductionV src1 src2));
9208   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP tmp4);
9209   format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
9210             "vpmaxsw  $tmp2,$tmp2,$src2\n\t"
9211             "vextracti128_high  $tmp,$tmp2\n\t"
9212             "vpmaxsw  $tmp,$tmp,$tmp2\n\t"
9213             "pshufd  $tmp2,$tmp,0xE\n\t"
9214             "vpmaxsw  $tmp,$tmp,$tmp2\n\t"
9215             "pshufd  $tmp2,$tmp,0x1\n\t"
9216             "vpmaxsw  $tmp,$tmp,$tmp2\n\t"
            "pextrw  $tmp4,$tmp,0x1\n\t"
            "movswl  $tmp4,$tmp4\n\t"
            "pextrw  $tmp3,$tmp,0x0\n\t"
            "movswl  $tmp3,$tmp3\n\t"
            "cmpl  $tmp4,$tmp3\n\t"
            "cmovl  $tmp3,$tmp4\n\t"
            "cmpl  $src1,$tmp3\n\t"
            "cmovl  $tmp3,$src1\n\t"
            "movl  $dst,$tmp3\t! max reduction32S" %}
9226   ins_encode %{
9227     int vector_len = 2;
9228     __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
9229     __ vpmaxsw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, vector_len);
9230     __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
9231     __ vpmaxsw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
9232     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
9233     __ vpmaxsw($tmp$$XMMRegister,$tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
9234     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister,0x1);
9235     __ vpmaxsw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
9236     __ pextrw($tmp4$$Register, $tmp$$XMMRegister,0x1);
9237     __ movswl($tmp4$$Register, $tmp4$$Register);
9238     __ pextrw($tmp3$$Register, $tmp$$XMMRegister,0x0);
9239     __ movswl($tmp3$$Register, $tmp3$$Register);
9240     __ cmpl($tmp4$$Register, $tmp3$$Register);
9241     __ cmovl(Assembler::greater, $tmp3$$Register, $tmp4$$Register);
9242     __ cmpl($src1$$Register, $tmp3$$Register);
9243     __ cmovl(Assembler::greater, $tmp3$$Register, $src1$$Register);
9244     __ movl($dst$$Register, $tmp3$$Register);
9245   %}
9246   ins_pipe( pipe_slow );
9247 %}
9248 
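// Int Max Reduction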
9249 instruct rsmax2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
9250   predicate(UseSSE > 3 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
9251   match(Set dst (MaxReductionV src1 src2));
9252   effect(TEMP tmp, TEMP tmp2);
9253   format %{ "pshufd  $tmp,$src2,0x1\n\t"
9254             "pmaxsd  $tmp,$src2\n\t"
9255             "movd    $tmp2,$src1\n\t"
9256             "pmaxsd  $tmp2,$tmp\n\t"
9257             "movd    $dst,$tmp2\t! max reduction2I" %}
9258   ins_encode %{
9259     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x1);
9260     __ pmaxsd($tmp$$XMMRegister, $src2$$XMMRegister);
9261     __ movdl($tmp2$$XMMRegister, $src1$$Register);
9262     __ pmaxsd($tmp2$$XMMRegister, $tmp$$XMMRegister);
9263     __ movdl($dst$$Register, $tmp2$$XMMRegister);
9264   %}
9265   ins_pipe( pipe_slow );
9266 %}
9267 
9268 instruct rvmax2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
9269   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
9270   match(Set dst (MaxReductionV src1 src2));
9271   effect(TEMP tmp, TEMP tmp2);
9272   format %{ "pshufd   $tmp,$src2,0x1\n\t"
9273             "vpmaxsd  $tmp2,$tmp,$src2\n\t"
9274             "movd     $tmp,$src1\n\t"
9275             "vpmaxsd  $tmp2,$tmp,$tmp2\n\t"
9276             "movd     $dst,$tmp2\t! max reduction2I" %}
9277   ins_encode %{
9278     int vector_len = 0;
9279     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x1);
9280     __ vpmaxsd($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
9281     __ movdl($tmp$$XMMRegister, $src1$$Register);
9282     __ vpmaxsd($tmp2$$XMMRegister,$tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
9283     __ movdl($dst$$Register, $tmp2$$XMMRegister);
9284   %}
9285   ins_pipe( pipe_slow );
9286 %}
9287 
9288 instruct rsmax4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
9289   predicate(UseSSE > 3 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
9290   match(Set dst (MaxReductionV src1 src2));
9291   effect(TEMP tmp, TEMP tmp2);
9292   format %{ "pshufd  $tmp,$src2,0xE\n\t"
9293             "pmaxsd  $tmp,$src2\n\t"
9294             "pshufd  $tmp2,$tmp,0x1\n\t"
9295             "pmaxsd  $tmp2,$tmp\n\t"
9296             "movd    $tmp,$src1\n\t"
9297             "pmaxsd  $tmp2,$tmp\n\t"
9298             "movd    $dst,$tmp2\t! max reduction4I" %}
9299   ins_encode %{
9300     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
9301     __ pmaxsd($tmp$$XMMRegister, $src2$$XMMRegister);
9302     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
9303     __ pmaxsd($tmp2$$XMMRegister,$tmp$$XMMRegister);
9304     __ movdl($tmp$$XMMRegister, $src1$$Register);
9305     __ pmaxsd($tmp2$$XMMRegister,$tmp$$XMMRegister);
9306     __ movdl($dst$$Register, $tmp2$$XMMRegister);
9307   %}
9308   ins_pipe( pipe_slow );
9309 %}
9310 
9311 instruct rvmax4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
9312   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
9313   match(Set dst (MaxReductionV src1 src2));
9314   effect(TEMP tmp, TEMP tmp2);
9315   format %{ "pshufd   $tmp,$src2,0xE\n\t"
9316             "vpmaxsd  $tmp2,$tmp,$src2\n\t"
9317             "pshufd   $tmp,$tmp2,0x1\n\t"
9318             "vpmaxsd  $tmp2,$tmp2,$tmp\n\t"
9319             "movd     $tmp,$src1\n\t"
9320             "vpmaxsd  $tmp2,$tmp,$tmp2\n\t"
9321             "movd     $dst,$tmp2\t! max reduction4I" %}
9322   ins_encode %{
9323     int vector_len = 0;
9324     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
9325     __ vpmaxsd($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
9326     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister,0x1);
9327     __ vpmaxsd($tmp2$$XMMRegister,$tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
9328     __ movdl($tmp$$XMMRegister, $src1$$Register);
9329     __ vpmaxsd($tmp2$$XMMRegister,$tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
9330     __ movdl($dst$$Register, $tmp2$$XMMRegister);
9331   %}
9332   ins_pipe( pipe_slow );
9333 %}
9334 
9335 instruct rvmax4I_reduction_evex(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
9336   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
9337   match(Set dst (MaxReductionV src1 src2));
9338   effect(TEMP tmp, TEMP tmp2);
9339   format %{ "pshufd   $tmp,$src2,0xE\n\t"
9340             "vpmaxsd  $tmp2,$tmp,$src2\n\t"
9341             "pshufd   $tmp,$tmp2,0x1\n\t"
9342             "vpmaxsd  $tmp2,$tmp2,$tmp\n\t"
9343             "movd     $tmp,$src1\n\t"
9344             "vpmaxsd  $tmp2,$tmp,$tmp2\n\t"
9345             "movd     $dst,$tmp2\t! max reduction4I" %}
9346   ins_encode %{
9347     int vector_len = 0;
9348     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
9349     __ vpmaxsd($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
9350     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister,0x1);
9351     __ vpmaxsd($tmp2$$XMMRegister,$tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
9352     __ movdl($tmp$$XMMRegister, $src1$$Register);
9353     __ vpmaxsd($tmp2$$XMMRegister,$tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
9354     __ movdl($dst$$Register, $tmp2$$XMMRegister);
9355   %}
9356   ins_pipe( pipe_slow );
9357 %}
9358 
9359 instruct rvmax8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
9360   predicate(UseAVX > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
9361   match(Set dst (MaxReductionV src1 src2));
9362   effect(TEMP tmp, TEMP tmp2);
9363   format %{ "vextracti128_high   $tmp,$src2\n\t"
9364             "vpmaxsd  $tmp,$tmp,$src2\n\t"
9365             "pshufd   $tmp2,$tmp,0xE\n\t"
9366             "vpmaxsd  $tmp2,$tmp,$tmp2\n\t"
9367             "pshufd   $tmp,$tmp2,0x1\n\t"
9368             "vpmaxsd  $tmp2,$tmp2,$tmp\n\t"
9369             "movd     $tmp,$src1\n\t"
9370             "vpmaxsd  $tmp2,$tmp,$tmp2\n\t"
9371             "movd     $dst,$tmp2\t! max reduction8I" %}
9372   ins_encode %{
9373     int vector_len = 1;
9374     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
9375     __ vpmaxsd($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
9376     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
9377     __ vpmaxsd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
9378     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister,0x1);
9379     __ vpmaxsd($tmp2$$XMMRegister,$tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
9380     __ movdl($tmp$$XMMRegister, $src1$$Register);
9381     __ vpmaxsd($tmp2$$XMMRegister,$tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
9382     __ movdl($dst$$Register, $tmp2$$XMMRegister);
9383   %}
9384   ins_pipe( pipe_slow );
9385 %}
9386 
9387 instruct rvmax8I_reduction_evex(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
9388   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
9389   match(Set dst (MaxReductionV src1 src2));
9390   effect(TEMP tmp, TEMP tmp2);
9391   format %{ "vextracti128_high   $tmp,$src2\n\t"
9392             "vpmaxsd  $tmp,$tmp,$src2\n\t"
9393             "pshufd   $tmp2,$tmp,0xE\n\t"
9394             "vpmaxsd  $tmp2,$tmp,$tmp2\n\t"
9395             "pshufd   $tmp,$tmp2,0x1\n\t"
9396             "vpmaxsd  $tmp2,$tmp2,$tmp\n\t"
9397             "movd     $tmp,$src1\n\t"
9398             "vpmaxsd  $tmp2,$tmp,$tmp2\n\t"
9399             "movd     $dst,$tmp2\t! max reduction8I" %}
9400   ins_encode %{
9401     int vector_len = 1;
9402     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
9403     __ vpmaxsd($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
9404     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
9405     __ vpmaxsd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
9406     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister,0x1);
9407     __ vpmaxsd($tmp2$$XMMRegister,$tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
9408     __ movdl($tmp$$XMMRegister, $src1$$Register);
9409     __ vpmaxsd($tmp2$$XMMRegister,$tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
9410     __ movdl($dst$$Register, $tmp2$$XMMRegister);
9411   %}
9412   ins_pipe( pipe_slow );
9413 %}
9414 
9415 instruct rvmax16I_reduction_evex(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, regF tmp3) %{
9416   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
9417   match(Set dst (MaxReductionV src1 src2));
9418   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
9419   format %{ "vextracti64x4_high  $tmp3,$src2\n\t"
9420             "vpmaxsd  $tmp3,$tmp3,$src2\n\t"
9421             "vextracti128_high   $tmp,$tmp3\n\t"
9422             "vpmaxsd  $tmp,$tmp,$tmp3\n\t"
9423             "pshufd   $tmp2,$tmp,0xE\n\t"
9424             "vpmaxsd  $tmp2,$tmp,$tmp2\n\t"
9425             "pshufd   $tmp,$tmp2,0x1\n\t"
9426             "vpmaxsd  $tmp2,$tmp2,$tmp\n\t"
9427             "movd     $tmp,$src1\n\t"
9428             "vpmaxsd  $tmp2,$tmp,$tmp2\n\t"
9429             "movd     $dst,$tmp2\t! max reduction16I" %}
9430   ins_encode %{
9431     int vector_len = 2;
9432     __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
9433     __ vpmaxsd($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, vector_len);
9434     __ vextracti128_high($tmp$$XMMRegister, $tmp3$$XMMRegister);
9435     __ vpmaxsd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, vector_len);
9436     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
9437     __ vpmaxsd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
9438     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister,0x1);
9439     __ vpmaxsd($tmp2$$XMMRegister,$tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
9440     __ movdl($tmp$$XMMRegister, $src1$$Register);
9441     __ vpmaxsd($tmp2$$XMMRegister,$tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
9442     __ movdl($dst$$Register, $tmp2$$XMMRegister);
9443   %}
9444   ins_pipe( pipe_slow );
9445 %}
9446 
9447 // Long Max Reduction
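// x86 has no packed signed-long max below AVX-512 (vpmaxsq), so the instructs
// below synthesize one: pcmpgtq/vpcmpgtq builds a per-lane mask of (a > b),
// and blendvpd/vblendvpd then routes the larger lane through that mask. The
// SSE4.1 blendvpd form reads its mask implicitly from xmm0, which is why the
// SSE flavors pin an rxmm0 temp.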
9448 instruct rsmax1L_reduction_reg(rRegL dst, rRegL src1, vecD src2, rxmm0 xmm_0, regF tmp2, regF tmp3) %{
9449   predicate(UseSSE > 3 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
9450   match(Set dst (MaxReductionV src1 src2));
9451   effect(TEMP xmm_0, TEMP tmp2, TEMP tmp3);
  format %{ "movdq     $xmm_0,$src1\n\t"
            "movdq     $tmp2,$src1\n\t"
            "movdqu    $tmp3,$src2\n\t"
            "pcmpgtq   $xmm_0,$tmp3\n\t"
            "blendvpd  $tmp3,$tmp2\n\t"
            "movdq     $dst,$tmp3\t! max reduction1L" %}
9457   ins_encode %{
9458     __ movdq($xmm_0$$XMMRegister,$src1$$Register);
9459     __ movdq($tmp2$$XMMRegister,$src1$$Register);
    __ movdqu($tmp3$$XMMRegister, $src2$$XMMRegister);
9461     __ pcmpgtq($xmm_0$$XMMRegister, $tmp3$$XMMRegister);
9462     __ blendvpd($tmp3$$XMMRegister,$tmp2$$XMMRegister);
9463     __ movdq($dst$$Register, $tmp3$$XMMRegister);
9464   %}
9465   ins_pipe( pipe_slow );
9466 %}
9467 
9468 instruct rsmax2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, rxmm0 xmm_0, regF tmp2, regF tmp3) %{
9469   predicate(UseSSE > 3 && UseAVX == 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
9470   match(Set dst (MaxReductionV src1 src2));
9471   effect(TEMP xmm_0, TEMP tmp2, TEMP tmp3);
9472   format %{ "pshufd   $tmp3,$src2,0xE\n\t"
9473             "movdqu  $xmm_0,$src2\n\t"
9474             "pcmpgtq  $xmm_0,$tmp3\n\t"
9475             "blendvpd  $tmp3,$src2\n\t"
9476             "movdqu  $xmm_0,$tmp3\n\t"
9477             "movdq  $tmp2,$src1\n\t"
9478             "pcmpgtq  $xmm_0,$tmp2\n\t"
9479             "blendvpd  $tmp2,$tmp3\n\t"
9480             "movq     $dst,$tmp2\t! max reduction2L" %}
9481   ins_encode %{
9482     __ pshufd($tmp3$$XMMRegister, $src2$$XMMRegister, 0xE);
9483     __ movdqu($xmm_0$$XMMRegister, $src2$$XMMRegister);
9484     __ pcmpgtq($xmm_0$$XMMRegister, $tmp3$$XMMRegister);
9485     __ blendvpd($tmp3$$XMMRegister, $src2$$XMMRegister);
9486     __ movdqu($xmm_0$$XMMRegister, $tmp3$$XMMRegister);
9487     __ movdq($tmp2$$XMMRegister, $src1$$Register);
9488     __ pcmpgtq($xmm_0$$XMMRegister, $tmp2$$XMMRegister);
9489     __ blendvpd($tmp2$$XMMRegister,$tmp3$$XMMRegister);
9490     __ movdq($dst$$Register, $tmp2$$XMMRegister);
9491   %}
9492   ins_pipe( pipe_slow );
9493 %}
9494 
9495 instruct rvmax2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, regF tmp, regF tmp2, regF tmp3) %{
9496   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
9497   match(Set dst (MaxReductionV src1 src2));
9498   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
  format %{ "pshufd     $tmp2,$src2,0xE\n\t"
            "vpcmpgtq   $tmp,$tmp2,$src2\n\t"
            "vblendvpd  $tmp2,$src2,$tmp2,$tmp\n\t"
            "movdq      $tmp,$src1\n\t"
            "vpcmpgtq   $tmp3,$tmp2,$tmp\n\t"
            "vblendvpd  $tmp2,$tmp,$tmp2,$tmp3\n\t"
            "movdq      $dst,$tmp2\t! max reduction2L" %}
9506   ins_encode %{
9507     int vector_len = 0;
9508     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
9509     __ vpcmpgtq($tmp$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, vector_len);
9510     __ vblendvpd($tmp2$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
9511     __ movdq($tmp$$XMMRegister,$src1$$Register);
9512     __ vpcmpgtq($tmp3$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
9513     __ vblendvpd($tmp2$$XMMRegister, $tmp$$XMMRegister,$tmp2$$XMMRegister,$tmp3$$XMMRegister, vector_len);
9514     __ movdq($dst$$Register, $tmp2$$XMMRegister);
9515   %}
9516   ins_pipe( pipe_slow );
9517 %}
9518 
9519 instruct rvmax4L_reduction_reg(rRegL dst, rRegL src1, vecY src2, regF tmp, regF tmp2, regF tmp3) %{
9520   predicate(UseAVX > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
9521   match(Set dst (MaxReductionV src1 src2));
9522   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
  format %{ "vextracti128_high  $tmp2,$src2\n\t"
            "vpcmpgtq   $tmp,$tmp2,$src2\n\t"
            "vblendvpd  $tmp2,$src2,$tmp2,$tmp\n\t"
            "pshufd     $tmp3,$tmp2,0xE\n\t"
            "vpcmpgtq   $tmp,$tmp3,$tmp2\n\t"
            "vblendvpd  $tmp3,$tmp2,$tmp3,$tmp\n\t"
            "movdq      $tmp,$src1\n\t"
            "vpcmpgtq   $tmp2,$tmp3,$tmp\n\t"
            "vblendvpd  $tmp2,$tmp,$tmp3,$tmp2\n\t"
            "movdq      $dst,$tmp2\t! max reduction4L" %}
9533   ins_encode %{
9534     int vector_len = 1;
9535     __ vextracti128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
9536     __ vpcmpgtq($tmp$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, vector_len);
9537     __ vblendvpd($tmp2$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
9538     __ pshufd($tmp3$$XMMRegister, $tmp2$$XMMRegister, 0xE);
9539     __ vpcmpgtq($tmp$$XMMRegister, $tmp3$$XMMRegister, $tmp2$$XMMRegister, vector_len);
9540     __ vblendvpd($tmp3$$XMMRegister,$tmp2$$XMMRegister,$tmp3$$XMMRegister, $tmp$$XMMRegister, vector_len);
9541     __ movdq($tmp$$XMMRegister,$src1$$Register);
9542     __ vpcmpgtq($tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp$$XMMRegister, vector_len);
9543     __ vblendvpd($tmp2$$XMMRegister, $tmp$$XMMRegister,$tmp3$$XMMRegister,$tmp2$$XMMRegister, vector_len);
9544     __ movdq($dst$$Register, $tmp2$$XMMRegister);
9545   %}
9546   ins_pipe( pipe_slow );
9547 %}
9548 
9549 instruct rvmax8L_reduction_reg(rRegL dst, rRegL src1, vecZ src2, regF tmp, regF tmp2, regF tmp3) %{
9550   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
9551   match(Set dst (MaxReductionV src1 src2));
9552   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
  format %{ "vextracti64x4_high  $tmp3,$src2\n\t"
            "vpcmpgtq   $tmp,$tmp3,$src2\n\t"
            "vblendvpd  $tmp3,$src2,$tmp3,$tmp\n\t"
            "vextracti128_high  $tmp2,$tmp3\n\t"
            "vpcmpgtq   $tmp,$tmp2,$tmp3\n\t"
            "vblendvpd  $tmp2,$tmp3,$tmp2,$tmp\n\t"
            "pshufd     $tmp3,$tmp2,0xE\n\t"
            "vpcmpgtq   $tmp,$tmp3,$tmp2\n\t"
            "vblendvpd  $tmp3,$tmp2,$tmp3,$tmp\n\t"
            "movdq      $tmp2,$src1\n\t"
            "vpcmpgtq   $tmp,$tmp2,$tmp3\n\t"
            "vblendvpd  $tmp2,$tmp3,$tmp2,$tmp\n\t"
            "movdq      $dst,$tmp2\t! max reduction8L" %}
9566   ins_encode %{
9567     int vector_len = 1;
9568     __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
9569     __ vpcmpgtq($tmp$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, vector_len);
9570     __ vblendvpd($tmp3$$XMMRegister, $src2$$XMMRegister, $tmp3$$XMMRegister, $tmp$$XMMRegister, vector_len);
9571     __ vextracti128_high($tmp2$$XMMRegister, $tmp3$$XMMRegister);
9572     __ vpcmpgtq($tmp$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, vector_len);
9573     __ vblendvpd($tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
9574     __ pshufd($tmp3$$XMMRegister, $tmp2$$XMMRegister, 0xE);
9575     __ vpcmpgtq($tmp$$XMMRegister, $tmp3$$XMMRegister, $tmp2$$XMMRegister, vector_len);
9576     __ vblendvpd($tmp3$$XMMRegister,$tmp2$$XMMRegister,$tmp3$$XMMRegister, $tmp$$XMMRegister, vector_len);
9577     __ movdq($tmp2$$XMMRegister, $src1$$Register);
9578     __ vpcmpgtq($tmp$$XMMRegister,$tmp2$$XMMRegister, $tmp3$$XMMRegister, vector_len);
9579     __ vblendvpd($tmp2$$XMMRegister,$tmp3$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
9580     __ movdq($dst$$Register, $tmp2$$XMMRegister);
9581   %}
9582   ins_pipe( pipe_slow );
9583 %}
9584 
9585 // Float max Reduction
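// The float max reductions accumulate in place: the rule is
// (Set dst (MaxReductionV dst src2)), so $dst carries the running scalar and
// each step folds another shuffled lane of $src2 into it with maxps/vmaxps.
// Note that maxps/vmaxps simply return the second operand when an input is
// NaN and do not distinguish -0.0f from 0.0f, which differs from Math.max;
// these patterns assume they are only selected where that is acceptable.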
9586 instruct rsmax2F_reduction_reg(regF dst, vecD src2, regF tmp) %{
9587   predicate(UseSSE > 0 && UseAVX == 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
9588   match(Set dst (MaxReductionV dst src2));
9589   effect(TEMP dst, TEMP tmp);
  format %{ "maxps   $dst,$src2\n\t"
9591             "pshufd  $tmp,$src2,0x1\n\t"
9592             "maxps   $dst,$tmp\t! max reduction2F" %}
9593   ins_encode %{
9594     __ maxps($dst$$XMMRegister, $src2$$XMMRegister);
9595     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x1);
9596     __ maxps($dst$$XMMRegister, $tmp$$XMMRegister);
9597   %}
9598   ins_pipe( pipe_slow );
9599 %}
9600 
9601 instruct rvmax2F_reduction_reg(regF dst, vecD src2, regF tmp) %{
9602   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
9603   match(Set dst (MaxReductionV dst src2));
9604   effect(TEMP dst, TEMP tmp);
9605   format %{ "vmaxps  $dst,$dst,$src2\n\t"
9606             "pshufd  $tmp,$src2,0x1\n\t"
9607             "vmaxps  $dst,$dst,$tmp\t! max reduction2F" %}
9608   ins_encode %{
9609     int vector_len = 0;
9610     __ vmaxps($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister, vector_len);
9611     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x1);
9612     __ vmaxps($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
9613   %}
9614   ins_pipe( pipe_slow );
9615 %}
9616 
9617 instruct rsmax4F_reduction_reg(regF dst, vecX src2, regF tmp) %{
9618   predicate(UseSSE > 0 && UseAVX == 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
9619   match(Set dst (MaxReductionV dst src2));
9620   effect(TEMP tmp, TEMP dst);
9621   format %{ "maxps  $dst,$src2\n\t"
9622             "pshufd  $tmp,$src2,0x1\n\t"
            "maxps  $dst,$tmp\n\t"
            "pshufd  $tmp,$src2,0x2\n\t"
            "maxps  $dst,$tmp\n\t"
9626             "pshufd  $tmp,$src2,0x3\n\t"
9627             "maxps  $dst,$tmp\t! max reduction4F" %}
9628   ins_encode %{
9629     __ maxps($dst$$XMMRegister, $src2$$XMMRegister);
9630     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x1);
9631     __ maxps($dst$$XMMRegister, $tmp$$XMMRegister);
9632     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x2);
9633     __ maxps($dst$$XMMRegister, $tmp$$XMMRegister);
9634     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x3);
9635     __ maxps($dst$$XMMRegister, $tmp$$XMMRegister);
9636   %}
9637   ins_pipe( pipe_slow );
9638 %}
9639 
9640 instruct rvmax4F_reduction_reg(regF dst, vecX src2, regF tmp) %{
9641   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
9642   match(Set dst (MaxReductionV dst src2));
9643   effect(TEMP tmp, TEMP dst);
9644   format %{ "vmaxps  $dst,$dst,$src2\n\t"
9645             "pshufd  $tmp,$src2,0x1\n\t"
9646             "vmaxps  $dst,$dst,$tmp\n\t"
9647             "pshufd  $tmp,$src2,0x2\n\t"
9648             "vmaxps  $dst,$dst,$tmp\n\t"
9649             "pshufd  $tmp,$src2,0x3\n\t"
9650             "vmaxps  $dst,$dst,$tmp\t! max reduction4F" %}
9651   ins_encode %{
9652     int vector_len = 0;
9653     __ vmaxps($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister, vector_len);
9654     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x1);
9655     __ vmaxps($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
9656     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x2);
9657     __ vmaxps($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
9658     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x3);
9659     __ vmaxps($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
9660   %}
9661   ins_pipe( pipe_slow );
9662 %}
9663 
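// The upper 128-bit lanes of 256/512-bit vectors are not reachable with
// pshufd alone, so the wider forms below use vextractf128_high or
// vextractf32x4 to bring each upper lane down and then repeat the same
// shuffle-and-max pattern on it.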
9664 instruct rvmax8F_reduction_reg(regF dst, vecY src2, regF tmp, regF tmp2) %{
9665   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
9666   match(Set dst (MaxReductionV dst src2));
9667   effect(TEMP tmp, TEMP dst, TEMP tmp2);
9668   format %{ "vmaxps  $dst,$dst,$src2\n\t"
9669             "pshufd  $tmp,$src2,0x01\n\t"
9670             "vmaxps  $dst,$dst,$tmp\n\t"
9671             "pshufd  $tmp,$src2,0x02\n\t"
9672             "vmaxps  $dst,$dst,$tmp\n\t"
9673             "pshufd  $tmp,$src2,0x03\n\t"
9674             "vmaxps  $dst,$dst,$tmp\n\t"
9675             "vextractf128_high  $tmp2,$src2\n\t"
9676             "vmaxps  $dst,$dst,$tmp2\n\t"
9677             "pshufd  $tmp,$tmp2,0x01\n\t"
9678             "vmaxps  $dst,$dst,$tmp\n\t"
9679             "pshufd  $tmp,$tmp2,0x02\n\t"
9680             "vmaxps  $dst,$dst,$tmp\n\t"
9681             "pshufd  $tmp,$tmp2,0x03\n\t"
9682             "vmaxps  $dst,$dst,$tmp\t! max reduction8F" %}
9683   ins_encode %{
9684     int vector_len = 1;
9685     __ vmaxps($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister, vector_len);
9686     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
9687     __ vmaxps($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
9688     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
9689     __ vmaxps($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
9690     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
9691     __ vmaxps($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
9692     __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
9693     __ vmaxps($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister, vector_len);
9694     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
9695     __ vmaxps($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
9696     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
9697     __ vmaxps($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
9698     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
9699     __ vmaxps($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
9700   %}
9701   ins_pipe( pipe_slow );
9702 %}
9703 
9704 instruct rvmax16F_reduction_reg(regF dst, vecZ src2, regF tmp, regF tmp2) %{
9705   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
9706   match(Set dst (MaxReductionV dst src2));
9707   effect(TEMP tmp, TEMP dst, TEMP tmp2);
9708   format %{ "vmaxps  $dst,$dst,$src2\n\t"
9709             "pshufd  $tmp,$src2,0x01\n\t"
9710             "vmaxps  $dst,$dst,$tmp\n\t"
9711             "pshufd  $tmp,$src2,0x02\n\t"
9712             "vmaxps  $dst,$dst,$tmp\n\t"
9713             "pshufd  $tmp,$src2,0x03\n\t"
9714             "vmaxps  $dst,$dst,$tmp\n\t"
9715             "vextractf32x4  $tmp2,$src2,0x1\n\t"
9716             "vmaxps  $dst,$dst,$tmp2\n\t"
9717             "pshufd  $tmp,$tmp2,0x01\n\t"
9718             "vmaxps  $dst,$dst,$tmp\n\t"
9719             "pshufd  $tmp,$tmp2,0x02\n\t"
9720             "vmaxps  $dst,$dst,$tmp\n\t"
9721             "pshufd  $tmp,$tmp2,0x03\n\t"
9722             "vmaxps  $dst,$dst,$tmp\n\t"
9723             "vextractf32x4  $tmp2,$src2,0x2\n\t"
9724             "vmaxps  $dst,$dst,$tmp2\n\t"
9725             "pshufd  $tmp,$tmp2,0x01\n\t"
9726             "vmaxps  $dst,$dst,$tmp\n\t"
9727             "pshufd  $tmp,$tmp2,0x02\n\t"
9728             "vmaxps  $dst,$dst,$tmp\n\t"
9729             "pshufd  $tmp,$tmp2,0x03\n\t"
9730             "vmaxps  $dst,$dst,$tmp\n\t"
9731             "vextractf32x4  $tmp2,$src2,0x3\n\t"
9732             "vmaxps  $dst,$dst,$tmp2\n\t"
9733             "pshufd  $tmp,$tmp2,0x01\n\t"
9734             "vmaxps  $dst,$dst,$tmp\n\t"
9735             "pshufd  $tmp,$tmp2,0x02\n\t"
9736             "vmaxps  $dst,$dst,$tmp\n\t"
9737             "pshufd  $tmp,$tmp2,0x03\n\t"
9738             "vmaxps  $dst,$dst,$tmp\t! max reduction16F" %}
9739   ins_encode %{
9740     int vector_len = 2;
9741     __ vmaxps($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister, vector_len);
9742     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
9743     __ vmaxps($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
9744     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
9745     __ vmaxps($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
9746     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
9747     __ vmaxps($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
9748     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
9749     __ vmaxps($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister, vector_len);
9750     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
9751     __ vmaxps($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
9752     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
9753     __ vmaxps($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
9754     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
9755     __ vmaxps($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
9756     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
9757     __ vmaxps($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister, vector_len);
9758     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
9759     __ vmaxps($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
9760     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
9761     __ vmaxps($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
9762     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
9763     __ vmaxps($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
9764     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
9765     __ vmaxps($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister, vector_len);
9766     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
9767     __ vmaxps($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
9768     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
9769     __ vmaxps($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
9770     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
9771     __ vmaxps($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
9772   %}
9773   ins_pipe( pipe_slow );
9774 %}
9775 
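// Double max Reduction
// pshufd with shuffle control 0xE moves the high 64-bit lane of the source
// into the low lane, so one maxpd/vmaxpd combines the two doubles.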
9776 instruct rsmax2D_reduction_reg(regD dst, vecX src2, regD tmp) %{
9777   predicate(UseSSE >= 2 && UseAVX == 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
9778   match(Set dst (MaxReductionV dst src2));
9779   effect(TEMP tmp, TEMP dst);
9780   format %{ "maxpd   $dst,$src2\n\t"
9781             "pshufd  $tmp,$src2,0xE\n\t"
9782             "maxpd   $dst,$tmp\t! max reduction2D" %}
9783   ins_encode %{
9784     __ maxpd($dst$$XMMRegister, $src2$$XMMRegister);
9785     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
9786     __ maxpd($dst$$XMMRegister, $tmp$$XMMRegister);
9787   %}
9788   ins_pipe( pipe_slow );
9789 %}
9790 
9791 instruct rvmax2D_reduction_reg(regD dst, vecX src2, regD tmp) %{
9792   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
9793   match(Set dst (MaxReductionV dst src2));
9794   effect(TEMP tmp, TEMP dst);
9795   format %{ "vmaxpd  $dst,$dst,$src2\n\t"
9796             "pshufd  $tmp,$src2,0xE\n\t"
9797             "vmaxpd  $dst,$dst,$tmp\t! max reduction2D" %}
9798   ins_encode %{
9799     int vector_len = 0;
9800     __ vmaxpd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister, vector_len);
9801     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
9802     __ vmaxpd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
9803   %}
9804   ins_pipe( pipe_slow );
9805 %}
9806 
9807 instruct rvmax4D_reduction_reg(regD dst, vecY src2, regD tmp, regD tmp2) %{
9808   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
9809   match(Set dst (MaxReductionV dst src2));
9810   effect(TEMP tmp, TEMP dst, TEMP tmp2);
9811   format %{ "vmaxpd  $dst,$dst,$src2\n\t"
9812             "pshufd  $tmp,$src2,0xE\n\t"
9813             "vmaxpd  $dst,$dst,$tmp\n\t"
9814             "vextractf32x4  $tmp2,$src2,0x1\n\t"
9815             "vmaxpd  $dst,$dst,$tmp2\n\t"
9816             "pshufd  $tmp,$tmp2,0xE\n\t"
9817             "vmaxpd  $dst,$dst,$tmp\t! max reduction4D" %}
9818   ins_encode %{
9819     int vector_len = 1;
9820     __ vmaxpd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister, vector_len);
9821     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
9822     __ vmaxpd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
9823     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
9824     __ vmaxpd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister, vector_len);
9825     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
9826     __ vmaxpd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
9827   %}
9828   ins_pipe( pipe_slow );
9829 %}
9830 
9831 instruct rvmax8D_reduction_reg(regD dst, vecZ src2, regD tmp, regD tmp2) %{
9832   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
9833   match(Set dst (MaxReductionV dst src2));
9834   effect(TEMP tmp, TEMP dst, TEMP tmp2);
9835   format %{ "vmaxpd  $dst,$dst,$src2\n\t"
9836             "pshufd  $tmp,$src2,0xE\n\t"
9837             "vmaxpd  $dst,$dst,$tmp\n\t"
9838             "vextractf32x4  $tmp2,$src2,0x1\n\t"
9839             "vmaxpd  $dst,$dst,$tmp2\n\t"
9840             "pshufd  $tmp,$tmp2,0xE\n\t"
9841             "vmaxpd  $dst,$dst,$tmp\n\t"
9842             "vextractf32x4  $tmp2,$src2,0x2\n\t"
9843             "vmaxpd  $dst,$dst,$tmp2\n\t"
9844             "pshufd  $tmp,$tmp2,0xE\n\t"
9845             "vmaxpd  $dst,$dst,$tmp\n\t"
9846             "vextractf32x4  $tmp2,$src2,0x3\n\t"
9847             "vmaxpd  $dst,$dst,$tmp2\n\t"
9848             "pshufd  $tmp,$tmp2,0xE\n\t"
9849             "vmaxpd  $dst,$dst,$tmp\t! max reduction8D" %}
9850   ins_encode %{
9851     int vector_len = 2;
9852     __ vmaxpd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister, vector_len);
9853     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
9854     __ vmaxpd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
9855     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
9856     __ vmaxpd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister, vector_len);
9857     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
9858     __ vmaxpd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
9859     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
9860     __ vmaxpd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister, vector_len);
9861     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
9862     __ vmaxpd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
9863     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
9864     __ vmaxpd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister, vector_len);
9865     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
9866     __ vmaxpd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
9867   %}
9868   ins_pipe( pipe_slow );
9869 %}
9870 
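// Integer AND reduction
// All forms below compute, in effect (a sketch of the semantics, not the
// emitted code):
//   acc = src1;
//   for (int i = 0; i < n; i++) acc &= src2[i];
//   dst = acc;
// The vector is folded in halves (pshufd/pand, plus vextract*/vpand for the
// wider forms) until the remaining elements sit in the low dword, which is
// combined with the scalar input in general-purpose registers; the byte and
// short forms finish by sign-extending back to the element type
// (movsbl/movswl).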
9871 instruct rsand8B_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, rRegI tmp2) %{
9872   predicate(UseSSE > 3 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
9873   match(Set dst (AndReductionV src1 src2));
9874   effect(TEMP tmp, TEMP tmp2, TEMP dst);
9875   format %{
9876             "pshufd  $tmp,$src2,0x1\n\t"
9877             "pand    $tmp,$src2\n\t"
9878             "movzbl  $dst,$src1\n\t"
9879             "pextrb  $tmp2,$tmp, 0x0\n\t"
9880             "andl    $dst,$tmp2\n\t"
9881             "pextrb  $tmp2,$tmp, 0x1\n\t"
9882             "andl    $dst,$tmp2\n\t"
9883             "pextrb  $tmp2,$tmp, 0x2\n\t"
9884             "andl    $dst,$tmp2\n\t"
9885             "pextrb  $tmp2,$tmp, 0x3\n\t"
9886             "andl    $dst,$tmp2\n\t"
9887             "movsbl  $dst,$dst\t! and reduction8B" %}
9888   ins_encode %{
9889     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x1);
9890     __ pand($tmp$$XMMRegister, $src2$$XMMRegister);
9891     __ movzbl($dst$$Register, $src1$$Register);
9892     __ pextrb($tmp2$$Register, $tmp$$XMMRegister, 0x0);
9893     __ andl($dst$$Register, $tmp2$$Register);
9894     __ pextrb($tmp2$$Register, $tmp$$XMMRegister, 0x1);
9895     __ andl($dst$$Register, $tmp2$$Register);
9896     __ pextrb($tmp2$$Register, $tmp$$XMMRegister, 0x2);
9897     __ andl($dst$$Register, $tmp2$$Register);
9898     __ pextrb($tmp2$$Register, $tmp$$XMMRegister, 0x3);
9899     __ andl($dst$$Register, $tmp2$$Register);
9900     __ movsbl($dst$$Register, $dst$$Register);
9901   %}
9902   ins_pipe( pipe_slow );
9903 %}
9904 
9905 instruct rsand16B_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2, rRegI tmp3) %{
9906   predicate(UseSSE > 3 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
9907   match(Set dst (AndReductionV src1 src2));
9908   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP dst);
9909   format %{ "pshufd  $tmp,$src2,0xE\n\t"
9910             "pand    $tmp,$src2\n\t"
9911             "pshufd  $tmp2,$tmp,0x1\n\t"
9912             "pand    $tmp,$tmp2\n\t"
9913             "movzbl  $dst,$src1\n\t"
9914             "pextrb  $tmp3,$tmp, 0x0\n\t"
9915             "andl    $dst,$tmp3\n\t"
9916             "pextrb  $tmp3,$tmp, 0x1\n\t"
9917             "andl    $dst,$tmp3\n\t"
9918             "pextrb  $tmp3,$tmp, 0x2\n\t"
9919             "andl    $dst,$tmp3\n\t"
9920             "pextrb  $tmp3,$tmp, 0x3\n\t"
9921             "andl    $dst,$tmp3\n\t"
9922             "movsbl  $dst,$dst\t! and reduction16B" %}
9923   ins_encode %{
9924     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
9925     __ pand($tmp$$XMMRegister, $src2$$XMMRegister);
9926     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
9927     __ pand($tmp$$XMMRegister, $tmp2$$XMMRegister);
9928     __ movzbl($dst$$Register, $src1$$Register);
9929     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x0);
9930     __ andl($dst$$Register, $tmp3$$Register);
9931     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x1);
9932     __ andl($dst$$Register, $tmp3$$Register);
9933     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x2);
9934     __ andl($dst$$Register, $tmp3$$Register);
9935     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x3);
9936     __ andl($dst$$Register, $tmp3$$Register);
9937     __ movsbl($dst$$Register, $dst$$Register);
9938   %}
9939   ins_pipe( pipe_slow );
9940 %}
9941 
9942 instruct rvand32B_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2, rRegI tmp3) %{
9943   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
9944   match(Set dst (AndReductionV src1 src2));
9945   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP dst);
9946   format %{ "vextracti128_high  $tmp,$src2\n\t"
9947             "vpand   $tmp,$tmp,$src2\n\t"
9948             "pshufd  $tmp2,$tmp,0xE\n\t"
9949             "vpand   $tmp,$tmp,$tmp2\n\t"
9950             "pshufd  $tmp2,$tmp,0x1\n\t"
9951             "vpand   $tmp,$tmp,$tmp2\n\t"
9952             "movzbl  $dst,$src1\n\t"
9953             "pextrb  $tmp3,$tmp, 0x0\n\t"
9954             "andl    $dst,$tmp3\n\t"
9955             "pextrb  $tmp3,$tmp, 0x1\n\t"
9956             "andl    $dst,$tmp3\n\t"
9957             "pextrb  $tmp3,$tmp, 0x2\n\t"
9958             "andl    $dst,$tmp3\n\t"
9959             "pextrb  $tmp3,$tmp, 0x3\n\t"
9960             "andl    $dst,$tmp3\n\t"
9961             "movsbl  $dst,$dst\t! and reduction32B" %}
9962   ins_encode %{
9963     int vector_len = 0;
9964     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
9965     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
9966     __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE, vector_len);
9967     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
9968     __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1, vector_len);
9969     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
9970     __ movzbl($dst$$Register, $src1$$Register);
9971     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x0);
9972     __ andl($dst$$Register, $tmp3$$Register);
9973     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x1);
9974     __ andl($dst$$Register, $tmp3$$Register);
9975     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x2);
9976     __ andl($dst$$Register, $tmp3$$Register);
9977     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x3);
9978     __ andl($dst$$Register, $tmp3$$Register);
9979     __ movsbl($dst$$Register, $dst$$Register);
9980   %}
9981   ins_pipe( pipe_slow );
9982 %}
9983 
9984 instruct rvand64B_reduction_reg(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, rRegI tmp3) %{
9985   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
9986   match(Set dst (AndReductionV src1 src2));
9987   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP dst);
9988   format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
9989             "vpand   $tmp2,$tmp2,$src2\n\t"
9990             "vextracti128_high  $tmp,$tmp2\n\t"
9991             "vpand   $tmp,$tmp,$tmp2\n\t"
9992             "pshufd  $tmp2,$tmp,0xE\n\t"
9993             "vpand   $tmp,$tmp,$tmp2\n\t"
9994             "pshufd  $tmp2,$tmp,0x1\n\t"
9995             "vpand   $tmp,$tmp,$tmp2\n\t"
9996             "movzbl  $dst,$src1\n\t"
9997             "movdl   $tmp3,$tmp\n\t"
9998             "andl    $dst,$tmp3\n\t"
9999             "shrl    $tmp3,0x8\n\t"
10000             "andl    $dst,$tmp3\n\t"
10001             "shrl    $tmp3,0x8\n\t"
10002             "andl    $dst,$tmp3\n\t"
10003             "shrl    $tmp3,0x8\n\t"
10004             "andl    $dst,$tmp3\n\t"
10005             "movsbl  $dst,$dst\t! and reduction64B" %}
10006   ins_encode %{
10007     int vector_len = 0;
10008     __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
10009     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
10010     __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
10011     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
10012     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
10013     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
10014     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
10015     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
10016     __ movzbl($dst$$Register, $src1$$Register);
10017     __ movdl($tmp3$$Register, $tmp$$XMMRegister);
10018     __ andl($dst$$Register, $tmp3$$Register);
10019     __ shrl($tmp3$$Register, 8);
10020     __ andl($dst$$Register, $tmp3$$Register);
10021     __ shrl($tmp3$$Register, 8);
10022     __ andl($dst$$Register, $tmp3$$Register);
10023     __ shrl($tmp3$$Register, 8);
10024     __ andl($dst$$Register, $tmp3$$Register);
10025     __ movsbl($dst$$Register, $dst$$Register);
10026   %}
10027   ins_pipe( pipe_slow );
10028 %}
10029 
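// Note: the 64B form above folds the vector down to one dword and extracts
// the last four bytes with a single movdl plus shrl shifts instead of four
// pextrb instructions.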
10030 instruct rsand4S_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, rRegI tmp2) %{
10031   predicate(UseSSE > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
10032   match(Set dst (AndReductionV src1 src2));
10033   effect(TEMP tmp, TEMP tmp2, TEMP dst);
10034   format %{
10035             "pshufd  $tmp,$src2,0x1\n\t"
10036             "pand    $tmp,$src2\n\t"
10037             "movzwl  $dst,$src1\n\t"
10038             "pextrw  $tmp2,$tmp, 0x0\n\t"
10039             "andw    $dst,$tmp2\n\t"
10040             "pextrw  $tmp2,$tmp, 0x1\n\t"
10041             "andw    $dst,$tmp2\n\t"
10042             "movswl  $dst,$dst\t! and reduction4S" %}
10043   ins_encode %{
10044     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x1);
10045     __ pand($tmp$$XMMRegister, $src2$$XMMRegister);
10046     __ movzwl($dst$$Register, $src1$$Register);
10047     __ pextrw($tmp2$$Register, $tmp$$XMMRegister, 0x0);
10048     __ andw($dst$$Register, $tmp2$$Register);
10049     __ pextrw($tmp2$$Register, $tmp$$XMMRegister, 0x1);
10050     __ andw($dst$$Register, $tmp2$$Register);
10051     __ movswl($dst$$Register, $dst$$Register);
10052   %}
10053   ins_pipe( pipe_slow );
10054 %}
10055 
10056 instruct rsand8S_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2, rRegI tmp3) %{
10057   predicate(UseSSE > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
10058   match(Set dst (AndReductionV src1 src2));
10059   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP dst);
10060   format %{ "pshufd  $tmp,$src2,0xE\n\t"
10061             "pand    $tmp,$src2\n\t"
10062             "pshufd  $tmp2,$tmp,0x1\n\t"
10063             "pand    $tmp,$tmp2\n\t"
10064             "movzwl  $dst,$src1\n\t"
10065             "pextrw  $tmp3,$tmp, 0x0\n\t"
10066             "andw    $dst,$tmp3\n\t"
10067             "pextrw  $tmp3,$tmp, 0x1\n\t"
10068             "andw    $dst,$tmp3\n\t"
10069             "movswl  $dst,$dst\t! and reduction8S" %}
10070   ins_encode %{
10071     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
10072     __ pand($tmp$$XMMRegister, $src2$$XMMRegister);
10073     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
10074     __ pand($tmp$$XMMRegister, $tmp2$$XMMRegister);
10075     __ movzwl($dst$$Register, $src1$$Register);
10076     __ pextrw($tmp3$$Register, $tmp$$XMMRegister, 0x0);
10077     __ andw($dst$$Register, $tmp3$$Register);
10078     __ pextrw($tmp3$$Register, $tmp$$XMMRegister, 0x1);
10079     __ andw($dst$$Register, $tmp3$$Register);
10080     __ movswl($dst$$Register, $dst$$Register);
10081   %}
10082   ins_pipe( pipe_slow );
10083 %}
10084 
10085 instruct rvand16S_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2, rRegI tmp3) %{
10086   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
10087   match(Set dst (AndReductionV src1 src2));
10088   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP dst);
10089   format %{ "vextracti128_high  $tmp,$src2\n\t"
10090             "vpand   $tmp,$tmp,$src2\n\t"
10091             "pshufd  $tmp2,$tmp,0xE\n\t"
10092             "vpand   $tmp,$tmp,$tmp2\n\t"
10093             "pshufd  $tmp2,$tmp,0x1\n\t"
10094             "vpand   $tmp,$tmp,$tmp2\n\t"
10095             "movzwl  $dst,$src1\n\t"
10096             "pextrw  $tmp3,$tmp, 0x0\n\t"
10097             "andw    $dst,$tmp3\n\t"
10098             "pextrw  $tmp3,$tmp, 0x1\n\t"
10099             "andw    $dst,$tmp3\n\t"
10100             "movswl  $dst,$dst\t! and reduction16S" %}
10101   ins_encode %{
10102     int vector_len = 0;
10103     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
10104     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
10105     __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE, vector_len);
10106     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
10107     __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1, vector_len);
10108     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
10109     __ movzwl($dst$$Register, $src1$$Register);
10110     __ pextrw($tmp3$$Register, $tmp$$XMMRegister, 0x0);
10111     __ andw($dst$$Register, $tmp3$$Register);
10112     __ pextrw($tmp3$$Register, $tmp$$XMMRegister, 0x1);
10113     __ andw($dst$$Register, $tmp3$$Register);
10114     __ movswl($dst$$Register, $dst$$Register);
10115   %}
10116   ins_pipe( pipe_slow );
10117 %}
10118 
10119 instruct rvand32S_reduction_reg(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, rRegI tmp3) %{
10120   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
10121   match(Set dst (AndReductionV src1 src2));
10122   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP dst);
10123   format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
10124             "vpand   $tmp2,$tmp2,$src2\n\t"
10125             "vextracti128_high  $tmp,$tmp2\n\t"
10126             "vpand   $tmp,$tmp,$tmp2\n\t"
10127             "pshufd  $tmp2,$tmp,0xE\n\t"
10128             "vpand   $tmp,$tmp,$tmp2\n\t"
10129             "pshufd  $tmp2,$tmp,0x1\n\t"
10130             "vpand   $tmp,$tmp,$tmp2\n\t"
10131             "movzwl  $dst,$src1\n\t"
10132             "movdl   $tmp3,$tmp\n\t"
10133             "andw    $dst,$tmp3\n\t"
10134             "shrl    $tmp3,0x10\n\t"
10135             "andw    $dst,$tmp3\n\t"
10136             "movswl  $dst,$dst\t! and reduction32S" %}
10137   ins_encode %{
10138     int vector_len = 0;
10139     __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
10140     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
10141     __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
10142     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
10143     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
10144     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
10145     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
10146     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
10147     __ movzwl($dst$$Register, $src1$$Register);
10148     __ movdl($tmp3$$Register, $tmp$$XMMRegister);
10149     __ andw($dst$$Register, $tmp3$$Register);
10150     __ shrl($tmp3$$Register, 16);
10151     __ andw($dst$$Register, $tmp3$$Register);
10152     __ movswl($dst$$Register, $dst$$Register);
10153   %}
10154   ins_pipe( pipe_slow );
10155 %}
10156 
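// For T_INT and T_LONG the scalar input is moved into an XMM register and
// combined there, so the reduction stays in the vector domain until the
// final movd/movq into the destination GPR.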
10157 instruct rsand2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
10158   predicate(UseSSE > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
10159   match(Set dst (AndReductionV src1 src2));
10160   effect(TEMP tmp, TEMP tmp2);
10161   format %{ "pshufd  $tmp2,$src2,0x1\n\t"
10162             "pand    $tmp2,$src2\n\t"
10163             "movd    $tmp,$src1\n\t"
10164             "pand    $tmp2,$tmp\n\t"
10165             "movd    $dst,$tmp2\t! and reduction2I" %}
10166   ins_encode %{
10167     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
10168     __ pand($tmp2$$XMMRegister, $src2$$XMMRegister);
10169     __ movdl($tmp$$XMMRegister, $src1$$Register);
10170     __ pand($tmp2$$XMMRegister, $tmp$$XMMRegister);
10171     __ movdl($dst$$Register, $tmp2$$XMMRegister);
10172   %}
10173   ins_pipe( pipe_slow );
10174 %}
10175 
10176 instruct rsand4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
10177   predicate(UseSSE > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
10178   match(Set dst (AndReductionV src1 src2));
10179   effect(TEMP tmp, TEMP tmp2);
10180   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
10181             "pand    $tmp2,$src2\n\t"
10182             "pshufd  $tmp,$tmp2,0x1\n\t"
10183             "pand    $tmp2,$tmp\n\t"
10184             "movd    $tmp,$src1\n\t"
10185             "pand    $tmp2,$tmp\n\t"
10186             "movd    $dst,$tmp2\t! and reduction4I" %}
10187   ins_encode %{
10188     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
10189     __ pand($tmp2$$XMMRegister, $src2$$XMMRegister);
10190     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x1);
10191     __ pand($tmp2$$XMMRegister, $tmp$$XMMRegister);
10192     __ movdl($tmp$$XMMRegister, $src1$$Register);
10193     __ pand($tmp2$$XMMRegister, $tmp$$XMMRegister);
10194     __ movdl($dst$$Register, $tmp2$$XMMRegister);
10195   %}
10196   ins_pipe( pipe_slow );
10197 %}
10198 
10199 instruct rvand8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
10200   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
10201   match(Set dst (AndReductionV src1 src2));
10202   effect(TEMP tmp, TEMP tmp2);
10203   format %{ "vextracti128_high  $tmp,$src2\n\t"
10204             "vpand    $tmp,$tmp,$src2\n\t"
10205             "vpshufd   $tmp2,$tmp,0xE\n\t"
10206             "vpand    $tmp,$tmp,$tmp2\n\t"
10207             "vpshufd   $tmp2,$tmp,0x1\n\t"
10208             "vpand    $tmp,$tmp,$tmp2\n\t"
10209             "movd     $tmp2,$src1\n\t"
10210             "vpand    $tmp2,$tmp,$tmp2\n\t"
10211             "movd     $dst,$tmp2\t! and reduction8I" %}
10212   ins_encode %{
10213     int vector_len = 0;
10214     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
10215     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
10216     __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE, vector_len);
10217     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
10218     __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1, vector_len);
10219     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
10220     __ movdl($tmp2$$XMMRegister, $src1$$Register);
10221     __ vpand($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
10222     __ movdl($dst$$Register, $tmp2$$XMMRegister);
10223   %}
10224   ins_pipe( pipe_slow );
10225 %}
10226 
10227 instruct rvand16I_reduction_reg(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, regF tmp3) %{
10228   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
10229   match(Set dst (AndReductionV src1 src2));
10230   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
10231   format %{ "vextracti64x4_high  $tmp3,$src2\n\t"
10232             "vpand  $tmp3,$tmp3,$src2\n\t"
10233             "vextracti128_high  $tmp,$tmp3\n\t"
10234             "vpand    $tmp,$tmp,$tmp3\n\t"
10235             "vpshufd   $tmp2,$tmp,0xE\n\t"
10236             "vpand    $tmp,$tmp,$tmp2\n\t"
10237             "vpshufd   $tmp2,$tmp,0x1\n\t"
10238             "vpand    $tmp,$tmp,$tmp2\n\t"
10239             "movd     $tmp2,$src1\n\t"
10240             "vpand    $tmp2,$tmp,$tmp2\n\t"
10241             "movd     $dst,$tmp2\t! and reduction16I" %}
10242   ins_encode %{
10243     __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
10244     __ vpand($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
10245     __ vextracti128_high($tmp$$XMMRegister, $tmp3$$XMMRegister);
10246     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
10247     __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE, 0);
10248     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
10249     __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1, 0);
10250     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
10251     __ movdl($tmp2$$XMMRegister, $src1$$Register);
10252     __ vpand($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
10253     __ movdl($dst$$Register, $tmp2$$XMMRegister);
10254   %}
10255   ins_pipe( pipe_slow );
10256 %}
10257 
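// The long forms move 64-bit values between GPRs and XMM registers
// (movdq/movq), so they are limited to 64-bit builds.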
10258 #ifdef _LP64
10259 instruct rsand2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, regF tmp, regF tmp2) %{
10260   predicate(UseSSE >= 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
10261   match(Set dst (AndReductionV src1 src2));
10262   effect(TEMP tmp, TEMP tmp2);
10263   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
10264             "pand    $tmp2,$src2\n\t"
10265             "movdq   $tmp,$src1\n\t"
10266             "pand    $tmp2,$tmp\n\t"
10267             "movq   $dst,$tmp2\t! and reduction2L" %}
10268   ins_encode %{
10269     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
10270     __ pand($tmp2$$XMMRegister, $src2$$XMMRegister);
10271     __ movdq($tmp$$XMMRegister, $src1$$Register);
10272     __ pand($tmp2$$XMMRegister, $tmp$$XMMRegister);
10273     __ movq($dst$$Register, $tmp2$$XMMRegister);
10274   %}
10275   ins_pipe( pipe_slow );
10276 %}
10277 
10278 instruct rvand4L_reduction_reg_avx(rRegL dst, rRegL src1, vecY src2, regF tmp, regF tmp2) %{
10279   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
10280   match(Set dst (AndReductionV src1 src2));
10281   effect(TEMP tmp, TEMP tmp2);
10282   format %{ "vextracti128_high  $tmp,$src2\n\t"
10283             "vpand  $tmp2,$tmp,$src2\n\t"
10284             "vpshufd  $tmp,$tmp2,0xE\n\t"
10285             "vpand  $tmp2,$tmp2,$tmp\n\t"
10286             "movq   $tmp,$src1\n\t"
10287             "vpand  $tmp2,$tmp2,$tmp\n\t"
10288             "movq   $dst,$tmp2\t! and reduction4L" %}
10289   ins_encode %{
10290     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
10291     __ vpand($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
10292     __ vpshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE, 0);
10293     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
10294     __ movq($tmp$$XMMRegister, $src1$$Register);
10295     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
10296     __ movq($dst$$Register, $tmp2$$XMMRegister);
10297   %}
10298   ins_pipe( pipe_slow );
10299 %}
10300 
10301 instruct rvand8L_reduction_reg(rRegL dst, rRegL src1, vecZ src2, regF tmp, regF tmp2) %{
10302   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
10303   match(Set dst (AndReductionV src1 src2));
10304   effect(TEMP tmp, TEMP tmp2);
10305   format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
10306             "vpandq  $tmp2,$tmp2,$src2\n\t"
10307             "vextracti128_high  $tmp,$tmp2\n\t"
10308             "vpandq  $tmp2,$tmp2,$tmp\n\t"
10309             "vpshufd  $tmp,$tmp2,0xE\n\t"
10310             "vpandq  $tmp2,$tmp2,$tmp\n\t"
10311             "movdq   $tmp,$src1\n\t"
10312             "vpandq  $tmp2,$tmp2,$tmp\n\t"
10313             "movdq   $dst,$tmp2\t! and reduction8L" %}
10314   ins_encode %{
10315     __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
10316     __ vpandq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
10317     __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
10318     __ vpandq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
10319     __ vpshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE, 0);
10320     __ vpandq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
10321     __ movdq($tmp$$XMMRegister, $src1$$Register);
10322     __ vpandq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
10323     __ movdq($dst$$Register, $tmp2$$XMMRegister);
10324   %}
10325   ins_pipe( pipe_slow );
10326 %}
10327 #endif
10328 
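// Integer OR reduction
// Same halving pattern as the AND reductions above, with por/vpor/vporq in
// place of the AND instructions.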
10329 instruct rsor8B_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, rRegI tmp2) %{
10330   predicate(UseSSE > 3 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
10331   match(Set dst (OrReductionV src1 src2));
10332   effect(TEMP tmp, TEMP tmp2, TEMP dst);
10333   format %{
10334             "pshufd  $tmp,$src2,0x1\n\t"
10335             "por    $tmp,$src2\n\t"
10336             "movzbl  $dst,$src1\n\t"
10337             "pextrb  $tmp2,$tmp, 0x0\n\t"
10338             "orl    $dst,$tmp2\n\t"
10339             "pextrb  $tmp2,$tmp, 0x1\n\t"
10340             "orl    $dst,$tmp2\n\t"
10341             "pextrb  $tmp2,$tmp, 0x2\n\t"
10342             "orl    $dst,$tmp2\n\t"
10343             "pextrb  $tmp2,$tmp, 0x3\n\t"
10344             "orl    $dst,$tmp2\n\t"
10345             "movsbl  $dst,$dst\t! or reduction8B" %}
10346   ins_encode %{
10347     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x1);
10348     __ por($tmp$$XMMRegister, $src2$$XMMRegister);
10349     __ movzbl($dst$$Register, $src1$$Register);
10350     __ pextrb($tmp2$$Register, $tmp$$XMMRegister, 0x0);
10351     __ orl($dst$$Register, $tmp2$$Register);
10352     __ pextrb($tmp2$$Register, $tmp$$XMMRegister, 0x1);
10353     __ orl($dst$$Register, $tmp2$$Register);
10354     __ pextrb($tmp2$$Register, $tmp$$XMMRegister, 0x2);
10355     __ orl($dst$$Register, $tmp2$$Register);
10356     __ pextrb($tmp2$$Register, $tmp$$XMMRegister, 0x3);
10357     __ orl($dst$$Register, $tmp2$$Register);
10358     __ movsbl($dst$$Register, $dst$$Register);
10359   %}
10360   ins_pipe( pipe_slow );
10361 %}
10362 
10363 instruct rsor16B_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2, rRegI tmp3) %{
10364   predicate(UseSSE > 3 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
10365   match(Set dst (OrReductionV src1 src2));
10366   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP dst);
10367   format %{ "pshufd  $tmp,$src2,0xE\n\t"
10368             "por    $tmp,$src2\n\t"
10369             "pshufd  $tmp2,$tmp,0x1\n\t"
10370             "por    $tmp,$tmp2\n\t"
10371             "movzbl  $dst,$src1\n\t"
10372             "pextrb  $tmp3,$tmp, 0x0\n\t"
10373             "orl    $dst,$tmp3\n\t"
10374             "pextrb  $tmp3,$tmp, 0x1\n\t"
10375             "orl    $dst,$tmp3\n\t"
10376             "pextrb  $tmp3,$tmp, 0x2\n\t"
10377             "orl    $dst,$tmp3\n\t"
10378             "pextrb  $tmp3,$tmp, 0x3\n\t"
10379             "orl    $dst,$tmp3\n\t"
10380             "movsbl  $dst,$dst\t! or reduction16B" %}
10381   ins_encode %{
10382     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
10383     __ por($tmp$$XMMRegister, $src2$$XMMRegister);
10384     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
10385     __ por($tmp$$XMMRegister, $tmp2$$XMMRegister);
10386     __ movzbl($dst$$Register, $src1$$Register);
10387     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x0);
10388     __ orl($dst$$Register, $tmp3$$Register);
10389     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x1);
10390     __ orl($dst$$Register, $tmp3$$Register);
10391     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x2);
10392     __ orl($dst$$Register, $tmp3$$Register);
10393     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x3);
10394     __ orl($dst$$Register, $tmp3$$Register);
10395     __ movsbl($dst$$Register, $dst$$Register);
10396   %}
10397   ins_pipe( pipe_slow );
10398 %}
10399 
10400 instruct rvor32B_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2, rRegI tmp3) %{
10401   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
10402   match(Set dst (OrReductionV src1 src2));
10403   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP dst);
10404   format %{ "vextracti128_high  $tmp,$src2\n\t"
10405             "vpor   $tmp,$tmp,$src2\n\t"
10406             "pshufd  $tmp2,$tmp,0xE\n\t"
10407             "vpor   $tmp,$tmp,$tmp2\n\t"
10408             "pshufd  $tmp2,$tmp,0x1\n\t"
10409             "vpor   $tmp,$tmp,$tmp2\n\t"
10410             "movzbl  $dst,$src1\n\t"
10411             "pextrb  $tmp3,$tmp, 0x0\n\t"
10412             "orl    $dst,$tmp3\n\t"
10413             "pextrb  $tmp3,$tmp, 0x1\n\t"
10414             "orl    $dst,$tmp3\n\t"
10415             "pextrb  $tmp3,$tmp, 0x2\n\t"
10416             "orl    $dst,$tmp3\n\t"
10417             "pextrb  $tmp3,$tmp, 0x3\n\t"
10418             "orl    $dst,$tmp3\n\t"
10419             "movsbl  $dst,$dst\t! or reduction32B" %}
10420   ins_encode %{
10421     int vector_len = 0;
10422     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
10423     __ vpor($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
10424     __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE, vector_len);
10425     __ vpor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
10426     __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1, vector_len);
10427     __ vpor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
10428     __ movzbl($dst$$Register, $src1$$Register);
10429     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x0);
10430     __ orl($dst$$Register, $tmp3$$Register);
10431     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x1);
10432     __ orl($dst$$Register, $tmp3$$Register);
10433     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x2);
10434     __ orl($dst$$Register, $tmp3$$Register);
10435     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x3);
10436     __ orl($dst$$Register, $tmp3$$Register);
10437     __ movsbl($dst$$Register, $dst$$Register);
10438   %}
10439   ins_pipe( pipe_slow );
10440 %}
10441 
10442 instruct rvor64B_reduction_reg(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, rRegI tmp3) %{
10443   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
10444   match(Set dst (OrReductionV src1 src2));
10445   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP dst);
10446   format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
10447             "vpor   $tmp2,$tmp2,$src2\n\t"
10448             "vextracti128_high  $tmp,$tmp2\n\t"
10449             "vpor   $tmp,$tmp,$tmp2\n\t"
10450             "pshufd  $tmp2,$tmp,0xE\n\t"
10451             "vpor   $tmp,$tmp,$tmp2\n\t"
10452             "pshufd  $tmp2,$tmp,0x1\n\t"
10453             "vpor   $tmp,$tmp,$tmp2\n\t"
10454             "movzbl  $dst,$src1\n\t"
10455             "movdl   $tmp3,$tmp\n\t"
10456             "orl    $dst,$tmp3\n\t"
10457             "shrl    $tmp3,0x8\n\t"
10458             "orl    $dst,$tmp3\n\t"
10459             "shrl    $tmp3,0x8\n\t"
10460             "orl    $dst,$tmp3\n\t"
10461             "shrl    $tmp3,0x8\n\t"
10462             "orl    $dst,$tmp3\n\t"
10463             "movsbl  $dst,$dst\t! or reduction64B" %}
10464   ins_encode %{
10465     int vector_len = 0;
10466     __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
10467     __ vpor($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
10468     __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
10469     __ vpor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
10470     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
10471     __ vpor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
10472     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
10473     __ vpor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
10474     __ movzbl($dst$$Register, $src1$$Register);
10475     __ movdl($tmp3$$Register, $tmp$$XMMRegister);
10476     __ orl($dst$$Register, $tmp3$$Register);
10477     __ shrl($tmp3$$Register, 8);
10478     __ orl($dst$$Register, $tmp3$$Register);
10479     __ shrl($tmp3$$Register, 8);
10480     __ orl($dst$$Register, $tmp3$$Register);
10481     __ shrl($tmp3$$Register, 8);
10482     __ orl($dst$$Register, $tmp3$$Register);
10483     __ movsbl($dst$$Register, $dst$$Register);
10484   %}
10485   ins_pipe( pipe_slow );
10486 %}
10487 
10488 instruct rsor4S_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, rRegI tmp2) %{
10489   predicate(UseSSE > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
10490   match(Set dst (OrReductionV src1 src2));
10491   effect(TEMP tmp, TEMP tmp2, TEMP dst);
10492   format %{
10493             "pshufd  $tmp,$src2,0x1\n\t"
10494             "por    $tmp,$src2\n\t"
10495             "movzwl  $dst,$src1\n\t"
10496             "pextrw  $tmp2,$tmp, 0x0\n\t"
10497             "orw    $dst,$tmp2\n\t"
10498             "pextrw  $tmp2,$tmp, 0x1\n\t"
10499             "orw    $dst,$tmp2\n\t"
10500             "movswl  $dst,$dst\t! or reduction4S" %}
10501   ins_encode %{
10502     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x1);
10503     __ por($tmp$$XMMRegister, $src2$$XMMRegister);
10504     __ movzwl($dst$$Register, $src1$$Register);
10505     __ pextrw($tmp2$$Register, $tmp$$XMMRegister, 0x0);
10506     __ orw($dst$$Register, $tmp2$$Register);
10507     __ pextrw($tmp2$$Register, $tmp$$XMMRegister, 0x1);
10508     __ orw($dst$$Register, $tmp2$$Register);
10509     __ movswl($dst$$Register, $dst$$Register);
10510   %}
10511   ins_pipe( pipe_slow );
10512 %}
10513 
10514 instruct rsor8S_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2, rRegI tmp3) %{
10515   predicate(UseSSE > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
10516   match(Set dst (OrReductionV src1 src2));
10517   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP dst);
10518   format %{ "pshufd  $tmp,$src2,0xE\n\t"
10519             "por    $tmp,$src2\n\t"
10520             "pshufd  $tmp2,$tmp,0x1\n\t"
10521             "por    $tmp,$tmp2\n\t"
10522             "movzwl  $dst,$src1\n\t"
10523             "pextrw  $tmp3,$tmp, 0x0\n\t"
10524             "orw    $dst,$tmp3\n\t"
10525             "pextrw  $tmp3,$tmp, 0x1\n\t"
10526             "orw    $dst,$tmp3\n\t"
10527             "movswl  $dst,$dst\t! or reduction8S" %}
10528   ins_encode %{
10529     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
10530     __ por($tmp$$XMMRegister, $src2$$XMMRegister);
10531     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
10532     __ por($tmp$$XMMRegister, $tmp2$$XMMRegister);
10533     __ movzwl($dst$$Register, $src1$$Register);
10534     __ pextrw($tmp3$$Register, $tmp$$XMMRegister, 0x0);
10535     __ orw($dst$$Register, $tmp3$$Register);
10536     __ pextrw($tmp3$$Register, $tmp$$XMMRegister, 0x1);
10537     __ orw($dst$$Register, $tmp3$$Register);
10538     __ movswl($dst$$Register, $dst$$Register);
10539   %}
10540   ins_pipe( pipe_slow );
10541 %}
10542 
10543 instruct rvor16S_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2, rRegI tmp3) %{
10544   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
10545   match(Set dst (OrReductionV src1 src2));
10546   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP dst);
10547   format %{ "vextracti128_high  $tmp,$src2\n\t"
10548             "vpor   $tmp,$tmp,$src2\n\t"
10549             "pshufd  $tmp2,$tmp,0xE\n\t"
10550             "vpor   $tmp,$tmp,$tmp2\n\t"
10551             "pshufd  $tmp2,$tmp,0x1\n\t"
10552             "vpor   $tmp,$tmp,$tmp2\n\t"
10553             "movzwl  $dst,$src1\n\t"
10554             "pextrw  $tmp3,$tmp, 0x0\n\t"
10555             "orw    $dst,$tmp3\n\t"
10556             "pextrw  $tmp3,$tmp, 0x1\n\t"
10557             "orw    $dst,$tmp3\n\t"
10558             "movswl  $dst,$dst\t! or reduction16S" %}
10559   ins_encode %{
10560     int vector_len = 0;
10561     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
10562     __ vpor($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
10563     __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE, vector_len);
10564     __ vpor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
10565     __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1, vector_len);
10566     __ vpor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
10567     __ movzwl($dst$$Register, $src1$$Register);
10568     __ pextrw($tmp3$$Register, $tmp$$XMMRegister, 0x0);
10569     __ orw($dst$$Register, $tmp3$$Register);
10570     __ pextrw($tmp3$$Register, $tmp$$XMMRegister, 0x1);
10571     __ orw($dst$$Register, $tmp3$$Register);
10572     __ movswl($dst$$Register, $dst$$Register);
10573   %}
10574   ins_pipe( pipe_slow );
10575 %}
10576 
10577 instruct rvor32S_reduction_reg(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, rRegI tmp3) %{
10578   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
10579   match(Set dst (OrReductionV src1 src2));
10580   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP dst);
10581   format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
10582             "vpor   $tmp2,$tmp2,$src2\n\t"
10583             "vextracti128_high  $tmp,$tmp2\n\t"
10584             "vpor   $tmp,$tmp,$tmp2\n\t"
10585             "pshufd  $tmp2,$tmp,0xE\n\t"
10586             "vpor   $tmp,$tmp,$tmp2\n\t"
10587             "pshufd  $tmp2,$tmp,0x1\n\t"
10588             "vpor   $tmp,$tmp,$tmp2\n\t"
10589             "movzwl  $dst,$src1\n\t"
10590             "movdl   $tmp3,$tmp\n\t"
10591             "orw    $dst,$tmp3\n\t"
10592             "shrl    $tmp3,0x10\n\t"
10593             "orw    $dst,$tmp3\n\t"
10594             "movswl  $dst,$dst\t! or reduction32S" %}
10595   ins_encode %{
10596     int vector_len = 0;
10597     __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
10598     __ vpor($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
10599     __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
10600     __ vpor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
10601     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
10602     __ vpor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
10603     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
10604     __ vpor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
10605     __ movzwl($dst$$Register, $src1$$Register);
10606     __ movdl($tmp3$$Register, $tmp$$XMMRegister);
10607     __ orw($dst$$Register, $tmp3$$Register);
10608     __ shrl($tmp3$$Register, 16);
10609     __ orw($dst$$Register, $tmp3$$Register);
10610     __ movswl($dst$$Register, $dst$$Register);
10611   %}
10612   ins_pipe( pipe_slow );
10613 %}
10614 
10615 instruct rsor2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
10616   predicate(UseSSE > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
10617   match(Set dst (OrReductionV src1 src2));
10618   effect(TEMP tmp, TEMP tmp2);
10619   format %{ "pshufd  $tmp2,$src2,0x1\n\t"
10620             "por    $tmp2,$src2\n\t"
10621             "movd    $tmp,$src1\n\t"
10622             "por    $tmp2,$tmp\n\t"
10623             "movd    $dst,$tmp2\t! or reduction2I" %}
10624   ins_encode %{
10625     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
10626     __ por($tmp2$$XMMRegister, $src2$$XMMRegister);
10627     __ movdl($tmp$$XMMRegister, $src1$$Register);
10628     __ por($tmp2$$XMMRegister, $tmp$$XMMRegister);
10629     __ movdl($dst$$Register, $tmp2$$XMMRegister);
10630   %}
10631   ins_pipe( pipe_slow );
10632 %}
10633 
10634 instruct rsor4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
10635   predicate(UseSSE > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
10636   match(Set dst (OrReductionV src1 src2));
10637   effect(TEMP tmp, TEMP tmp2);
10638   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
10639             "por    $tmp2,$src2\n\t"
10640             "pshufd  $tmp,$tmp2,0x1\n\t"
10641             "por    $tmp2,$tmp\n\t"
10642             "movd    $tmp,$src1\n\t"
10643             "por    $tmp2,$tmp\n\t"
10644             "movd    $dst,$tmp2\t! or reduction4I" %}
10645   ins_encode %{
10646     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
10647     __ por($tmp2$$XMMRegister, $src2$$XMMRegister);
10648     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x1);
10649     __ por($tmp2$$XMMRegister, $tmp$$XMMRegister);
10650     __ movdl($tmp$$XMMRegister, $src1$$Register);
10651     __ por($tmp2$$XMMRegister, $tmp$$XMMRegister);
10652     __ movdl($dst$$Register, $tmp2$$XMMRegister);
10653   %}
10654   ins_pipe( pipe_slow );
10655 %}
10656 
10657 instruct rvor8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
10658   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
10659   match(Set dst (OrReductionV src1 src2));
10660   effect(TEMP tmp, TEMP tmp2);
10661   format %{ "vextracti128_high  $tmp,$src2\n\t"
10662             "vpor    $tmp,$tmp,$src2\n\t"
10663             "vpshufd   $tmp2,$tmp,0xE\n\t"
10664             "vpor    $tmp,$tmp,$tmp2\n\t"
10665             "vpshufd   $tmp2,$tmp,0x1\n\t"
10666             "vpor    $tmp,$tmp,$tmp2\n\t"
10667             "movd     $tmp2,$src1\n\t"
10668             "vpor    $tmp2,$tmp,$tmp2\n\t"
10669             "movd     $dst,$tmp2\t! or reduction8I" %}
10670   ins_encode %{
10671     int vector_len = 0;
10672     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
10673     __ vpor($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
10674     __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE, vector_len);
10675     __ vpor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
10676     __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1, vector_len);
10677     __ vpor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
10678     __ movdl($tmp2$$XMMRegister, $src1$$Register);
10679     __ vpor($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
10680     __ movdl($dst$$Register, $tmp2$$XMMRegister);
10681   %}
10682   ins_pipe( pipe_slow );
10683 %}
10684 
10685 instruct rvor16I_reduction_reg(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, regF tmp3) %{
10686   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
10687   match(Set dst (OrReductionV src1 src2));
10688   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
10689   format %{ "vextracti64x4_high  $tmp3,$src2\n\t"
10690             "vpor  $tmp3,$tmp3,$src2\n\t"
10691             "vextracti128_high  $tmp,$tmp3\n\t"
10692             "vpor    $tmp,$tmp,$tmp3\n\t"
10693             "vpshufd   $tmp2,$tmp,0xE\n\t"
10694             "vpor    $tmp,$tmp,$tmp2\n\t"
10695             "vpshufd   $tmp2,$tmp,0x1\n\t"
10696             "vpor    $tmp,$tmp,$tmp2\n\t"
10697             "movd     $tmp2,$src1\n\t"
10698             "vpor    $tmp2,$tmp,$tmp2\n\t"
10699             "movd     $dst,$tmp2\t! or reduction16I" %}
10700   ins_encode %{
10701     int vector_len = 0;
10702     __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
10703     __ vpor($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
10704     __ vextracti128_high($tmp$$XMMRegister, $tmp3$$XMMRegister);
10705     __ vpor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, vector_len);
10706     __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE, vector_len);
10707     __ vpor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
10708     __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1, vector_len);
10709     __ vpor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
10710     __ movdl($tmp2$$XMMRegister, $src1$$Register);
10711     __ vpor($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
10712     __ movdl($dst$$Register, $tmp2$$XMMRegister);
10713   %}
10714   ins_pipe( pipe_slow );
10715 %}
10716 
#ifdef _LP64
10717 instruct rsor2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, regF tmp, regF tmp2) %{
10718   predicate(UseSSE >= 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
10719   match(Set dst (OrReductionV src1 src2));
10720   effect(TEMP tmp, TEMP tmp2);
10721   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
10722             "por    $tmp2,$src2\n\t"
10723             "movdq   $tmp,$src1\n\t"
10724             "por    $tmp2,$tmp\n\t"
10725             "movq   $dst,$tmp2\t! or reduction2L" %}
10726   ins_encode %{
10727     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
10728     __ por($tmp2$$XMMRegister, $src2$$XMMRegister);
10729     __ movdq($tmp$$XMMRegister, $src1$$Register);
10730     __ por($tmp2$$XMMRegister, $tmp$$XMMRegister);
10731     __ movq($dst$$Register, $tmp2$$XMMRegister);
10732   %}
10733   ins_pipe( pipe_slow );
10734 %}
10735 
10736 instruct rvor4L_reduction_reg_avx(rRegL dst, rRegL src1, vecY src2, regF tmp, regF tmp2) %{
10737   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
10738   match(Set dst (OrReductionV src1 src2));
10739   effect(TEMP tmp, TEMP tmp2);
10740   format %{ "vextracti128_high  $tmp,$src2\n\t"
10741             "vpor  $tmp2,$tmp,$src2\n\t"
10742             "vpshufd  $tmp,$tmp2,0xE\t"
10743             "vpor  $tmp2,$tmp2,$tmp\n\t"
10744             "movq   $tmp,$src1\n\t"
10745             "vpor  $tmp2,$tmp2,$tmp\n\t"
10746             "movq   $dst,$tmp2\t! or reduction4L" %}
10747   ins_encode %{
10748     int vector_len = 0;
10749     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
10750     __ vpor($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
10751     __ vpshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE, vector_len);
10752     __ vpor($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
10753     __ movq($tmp$$XMMRegister, $src1$$Register);
10754     __ vpor($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
10755     __ movq($dst$$Register, $tmp2$$XMMRegister);
10756   %}
10757   ins_pipe( pipe_slow );
10758 %}
10759 
10760 #ifdef _LP64
10761 instruct rvor8L_reduction_reg(rRegL dst, rRegL src1, vecZ src2, regF tmp, regF tmp2) %{
10762   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
10763   match(Set dst (OrReductionV src1 src2));
10764   effect(TEMP tmp, TEMP tmp2);
10765   format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
10766             "vporq  $tmp2,$tmp2,$src2\n\t"
10767             "vextracti128_high  $tmp,$tmp2\n\t"
10768             "vporq  $tmp2,$tmp2,$tmp\n\t"
10769             "vpshufd  $tmp,$tmp2,0xE\t"
10770             "vporq  $tmp2,$tmp2,$tmp\n\t"
10771             "movdq   $tmp,$src1\n\t"
10772             "vporq  $tmp2,$tmp2,$tmp\n\t"
10773             "movdq   $dst,$tmp2\t! or reduction8L" %}
10774   ins_encode %{
10775     int vector_len = 0;
10776     __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
10777     __ vporq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
10778     __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
10779     __ vporq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
10780     __ vpshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE, vector_len);
10781     __ vporq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
10782     __ movdq($tmp$$XMMRegister, $src1$$Register);
10783     __ vporq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
10784     __ movdq($dst$$Register, $tmp2$$XMMRegister);
10785   %}
10786   ins_pipe( pipe_slow );
10787 %}
10788 #endif
10789 
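// Byte XOR reductions first fold the vector down to four bytes with
// shuffles and pxor, then pull each remaining byte out with pextrb and XOR
// it into the zero-extended scalar input; movsbl sign-extends the final
// byte-sized result. In scalar terms:
//   dst = (byte)(src1 ^ src2[0] ^ ... ^ src2[n-1])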
10790 instruct rsxor8B_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, rRegI tmp2) %{
10791   predicate(UseSSE > 3 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
10792   match(Set dst (XorReductionV src1 src2));
10793   effect(TEMP tmp, TEMP tmp2, TEMP dst);
  format %{ "pshufd  $tmp,$src2,0x1\n\t"
10796             "pxor    $tmp,$src2\n\t"
10797             "movzbl  $dst,$src1\n\t"
10798             "pextrb  $tmp2,$tmp, 0x0\n\t"
10799             "xorl    $dst,$tmp2\n\t"
10800             "pextrb  $tmp2,$tmp, 0x1\n\t"
10801             "xorl    $dst,$tmp2\n\t"
10802             "pextrb  $tmp2,$tmp, 0x2\n\t"
10803             "xorl    $dst,$tmp2\n\t"
10804             "pextrb  $tmp2,$tmp, 0x3\n\t"
10805             "xorl    $dst,$tmp2\n\t"
10806             "movsbl  $dst,$dst\t! xor reduction8B" %}
10807   ins_encode %{
10808     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x1);
10809     __ pxor($tmp$$XMMRegister, $src2$$XMMRegister);
10810     __ movzbl($dst$$Register, $src1$$Register);
10811     __ pextrb($tmp2$$Register, $tmp$$XMMRegister, 0x0);
10812     __ xorl($dst$$Register, $tmp2$$Register);
10813     __ pextrb($tmp2$$Register, $tmp$$XMMRegister, 0x1);
10814     __ xorl($dst$$Register, $tmp2$$Register);
10815     __ pextrb($tmp2$$Register, $tmp$$XMMRegister, 0x2);
10816     __ xorl($dst$$Register, $tmp2$$Register);
10817     __ pextrb($tmp2$$Register, $tmp$$XMMRegister, 0x3);
10818     __ xorl($dst$$Register, $tmp2$$Register);
10819     __ movsbl($dst$$Register, $dst$$Register);
10820   %}
10821   ins_pipe( pipe_slow );
10822 %}
10823 
10824 instruct rsxor16B_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2, rRegI tmp3) %{
10825   predicate(UseSSE > 3 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
10826   match(Set dst (XorReductionV src1 src2));
10827   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP dst);
10828   format %{ "pshufd  $tmp,$src2,0xE\n\t"
10829             "pxor    $tmp,$src2\n\t"
10830             "pshufd  $tmp2,$tmp,0x1\n\t"
10831             "pxor    $tmp,$tmp,$tmp2\n\t"
10832             "movzbl  $dst,$src1\n\t"
10833             "pextrb  $tmp3,$tmp, 0x0\n\t"
10834             "xorl    $dst,$tmp3\n\t"
10835             "pextrb  $tmp3,$tmp, 0x1\n\t"
10836             "xorl    $dst,$tmp3\n\t"
10837             "pextrb  $tmp3,$tmp, 0x2\n\t"
10838             "xorl    $dst,$tmp3\n\t"
10839             "pextrb  $tmp3,$tmp, 0x3\n\t"
10840             "xorl    $dst,$tmp3\n\t"
10841             "movsbl  $dst,$dst\t! xor reduction16B" %}
10842   ins_encode %{
10843     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
10844     __ pxor($tmp$$XMMRegister, $src2$$XMMRegister);
10845     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
10846     __ pxor($tmp$$XMMRegister, $tmp2$$XMMRegister);
10847     __ movzbl($dst$$Register, $src1$$Register);
10848     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x0);
10849     __ xorl($dst$$Register, $tmp3$$Register);
10850     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x1);
10851     __ xorl($dst$$Register, $tmp3$$Register);
10852     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x2);
10853     __ xorl($dst$$Register, $tmp3$$Register);
10854     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x3);
10855     __ xorl($dst$$Register, $tmp3$$Register);
10856     __ movsbl($dst$$Register, $dst$$Register);
10857   %}
10858   ins_pipe( pipe_slow );
10859 %}
10860 
10861 instruct rvxor32B_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2, rRegI tmp3) %{
10862   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
10863   match(Set dst (XorReductionV src1 src2));
10864   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP dst);
10865    format %{ "vextracti128_high  $tmp,$src2\n\t"
10866             "vpxor   $tmp,$tmp,$src2\n\t"
10867             "pshufd  $tmp2,$tmp,0xE\n\t"
10868             "vpxor   $tmp,$tmp,$tmp2\n\t"
10869             "pshufd  $tmp2,$tmp,0x1\n\t"
10870             "vpxor   $tmp,$tmp,$tmp2\n\t"
10871             "movzbl  $dst,$src1\n\t"
10872             "pextrb  $tmp3,$tmp, 0x0\n\t"
10873             "xorl    $dst,$tmp3\n\t"
10874             "pextrb  $tmp3,$tmp, 0x1\n\t"
10875             "xorl    $dst,$tmp3\n\t"
10876             "pextrb  $tmp3,$tmp, 0x2\n\t"
10877             "xorl    $dst,$tmp3\n\t"
10878             "pextrb  $tmp3,$tmp, 0x3\n\t"
10879             "xorl    $dst,$tmp3\n\t"
10880             "movsbl  $dst,$dst\t! xor reduction32B" %}
10881   ins_encode %{
10882     int vector_len = 0;
10883     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
10884     __ vpxor($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
10885     __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE, vector_len);
10886     __ vpxor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
10887     __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1, vector_len);
10888     __ vpxor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
10889     __ movzbl($dst$$Register, $src1$$Register);
10890     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x0);
10891     __ xorl($dst$$Register, $tmp3$$Register);
10892     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x1);
10893     __ xorl($dst$$Register, $tmp3$$Register);
10894     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x2);
10895     __ xorl($dst$$Register, $tmp3$$Register);
10896     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x3);
10897     __ xorl($dst$$Register, $tmp3$$Register);
10898     __ movsbl($dst$$Register, $dst$$Register);
10899   %}
10900   ins_pipe( pipe_slow );
10901 %}
10902 
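// For 64 bytes the last four lanes are combined with a single movdl plus
// three 8-bit shifts instead of four pextrb extractions.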
10903 instruct rvxor64B_reduction_reg(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, rRegI tmp3) %{
10904   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
10905   match(Set dst (XorReductionV src1 src2));
10906   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP dst);
10907   format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
10908             "vpxor   $tmp2,$tmp2,$src2\n\t"
10909             "vextracti128_high  $tmp,$tmp2\n\t"
10910             "vpxor   $tmp,$tmp,$tmp2\n\t"
10911             "pshufd  $tmp2,$tmp,0xE\n\t"
10912             "vpxor   $tmp,$tmp,$tmp2\n\t"
10913             "pshufd  $tmp2,$tmp,0x1\n\t"
10914             "vpxor   $tmp,$tmp,$tmp2\n\t"
10915             "movzbl  $dst,$src1\n\t"
10916             "movdl   $tmp3,$tmp\n\t"
10917             "xorl    $dst,$tmp3\n\t"
10918             "shrl    $tmp3,0x8\n\t"
10919             "xorl    $dst,$tmp3\n\t"
10920             "shrl    $tmp3,0x8\n\t"
10921             "xorl    $dst,$tmp3\n\t"
10922             "shrl    $tmp3,0x8\n\t"
10923             "xorl    $dst,$tmp3\n\t"
10924             "movsbl  $dst,$dst\t! xor reduction64B" %}
10925   ins_encode %{
10926     int vector_len = 0;
10927     __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
10928     __ vpxor($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
10929     __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
10930     __ vpxor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
10931     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
10932     __ vpxor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
10933     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
10934     __ vpxor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
10935     __ movzbl($dst$$Register, $src1$$Register);
10936     __ movdl($tmp3$$Register, $tmp$$XMMRegister);
10937     __ xorl($dst$$Register, $tmp3$$Register);
10938     __ shrl($tmp3$$Register, 8);
10939     __ xorl($dst$$Register, $tmp3$$Register);
10940     __ shrl($tmp3$$Register, 8);
10941     __ xorl($dst$$Register, $tmp3$$Register);
10942     __ shrl($tmp3$$Register, 8);
10943     __ xorl($dst$$Register, $tmp3$$Register);
10944     __ movsbl($dst$$Register, $dst$$Register);
10945   %}
10946   ins_pipe( pipe_slow );
10947 %}
10948 
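// Short XOR reductions follow the byte pattern, but extract 16-bit lanes
// with pextrw and sign-extend the result with movswl.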
10949 instruct rsxor4S_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, rRegI tmp2) %{
10950   predicate(UseSSE > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
10951   match(Set dst (XorReductionV src1 src2));
10952   effect(TEMP tmp, TEMP tmp2, TEMP dst);
  format %{ "pshufd  $tmp,$src2,0x1\n\t"
10955             "pxor    $tmp,$src2\n\t"
10956             "movzwl  $dst,$src1\n\t"
10957             "pextrw  $tmp2,$tmp, 0x0\n\t"
10958             "xorw    $dst,$tmp2\n\t"
10959             "pextrw  $tmp2,$tmp, 0x1\n\t"
10960             "xorw    $dst,$tmp2\n\t"
10961             "movswl  $dst,$dst\t! xor reduction4S" %}
10962   ins_encode %{
10963     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x1);
10964     __ pxor($tmp$$XMMRegister, $src2$$XMMRegister);
10965     __ movzwl($dst$$Register, $src1$$Register);
10966     __ pextrw($tmp2$$Register, $tmp$$XMMRegister, 0x0);
10967     __ xorw($dst$$Register, $tmp2$$Register);
10968     __ pextrw($tmp2$$Register, $tmp$$XMMRegister, 0x1);
10969     __ xorw($dst$$Register, $tmp2$$Register);
10970     __ movswl($dst$$Register, $dst$$Register);
10971   %}
10972   ins_pipe( pipe_slow );
10973 %}
10974 
10975 instruct rsxor8S_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2, rRegI tmp3) %{
10976   predicate(UseSSE > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
10977   match(Set dst (XorReductionV src1 src2));
10978   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP dst);
10979   format %{ "pshufd  $tmp,$src2,0xE\n\t"
10980             "pxor    $tmp,$src2\n\t"
10981             "pshufd  $tmp2,$tmp,0x1\n\t"
10982             "pxor    $tmp,$tmp,$tmp2\n\t"
10983             "movzwl  $dst,$src1\n\t"
10984             "pextrw  $tmp3,$tmp, 0x0\n\t"
10985             "xorw    $dst,$tmp3\n\t"
10986             "pextrw  $tmp3,$tmp, 0x1\n\t"
10987             "xorw    $dst,$tmp3\n\t"
10988             "movswl  $dst,$dst\t! xor reduction8S" %}
10989   ins_encode %{
10990     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
10991     __ pxor($tmp$$XMMRegister, $src2$$XMMRegister);
10992     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
10993     __ pxor($tmp$$XMMRegister, $tmp2$$XMMRegister);
10994     __ movzwl($dst$$Register, $src1$$Register);
10995     __ pextrw($tmp3$$Register, $tmp$$XMMRegister, 0x0);
10996     __ xorw($dst$$Register, $tmp3$$Register);
10997     __ pextrw($tmp3$$Register, $tmp$$XMMRegister, 0x1);
10998     __ xorw($dst$$Register, $tmp3$$Register);
10999     __ movswl($dst$$Register, $dst$$Register);
11000   %}
11001   ins_pipe( pipe_slow );
11002 %}
11003 
11004 instruct rvxor16S_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2, rRegI tmp3) %{
11005   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
11006   match(Set dst (XorReductionV src1 src2));
11007   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP dst);
11008    format %{ "vextracti128_high  $tmp,$src2\n\t"
11009             "vpxor   $tmp,$tmp,$src2\n\t"
11010             "pshufd  $tmp2,$tmp,0xE\n\t"
11011             "vpxor   $tmp,$tmp,$tmp2\n\t"
11012             "pshufd  $tmp2,$tmp,0x1\n\t"
11013             "vpxor   $tmp,$tmp,$tmp2\n\t"
11014             "movzwl  $dst,$src1\n\t"
11015             "pextrw  $tmp3,$tmp, 0x0\n\t"
11016             "xorw    $dst,$tmp3\n\t"
11017             "pextrw  $tmp3,$tmp, 0x1\n\t"
11018             "xorw    $dst,$tmp3\n\t"
11019             "movswl  $dst,$dst\t! xor reduction16S" %}
11020   ins_encode %{
11021     int vector_len = 0;
11022     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
11023     __ vpxor($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
11024     __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE, vector_len);
11025     __ vpxor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
11026     __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1, vector_len);
11027     __ vpxor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
11028     __ movzwl($dst$$Register, $src1$$Register);
11029     __ pextrw($tmp3$$Register, $tmp$$XMMRegister, 0x0);
11030     __ xorw($dst$$Register, $tmp3$$Register);
11031     __ pextrw($tmp3$$Register, $tmp$$XMMRegister, 0x1);
11032     __ xorw($dst$$Register, $tmp3$$Register);
11033     __ movswl($dst$$Register, $dst$$Register);
11034   %}
11035   ins_pipe( pipe_slow );
11036 %}
11037 
11038 instruct rvxor32S_reduction_reg(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, rRegI tmp3) %{
11039   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
11040   match(Set dst (XorReductionV src1 src2));
11041   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP dst);
11042   format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
11043             "vpxor   $tmp2,$tmp2,$src2\n\t"
11044             "vextracti128_high  $tmp,$tmp2\n\t"
11045             "vpxor   $tmp,$tmp,$tmp2\n\t"
11046             "pshufd  $tmp2,$tmp,0xE\n\t"
11047             "vpxor   $tmp,$tmp,$tmp2\n\t"
11048             "pshufd  $tmp2,$tmp,0x1\n\t"
11049             "vpxor   $tmp,$tmp,$tmp2\n\t"
11050             "movzwl  $dst,$src1\n\t"
11051             "movdl   $tmp3,$tmp\n\t"
11052             "xorw    $dst,$tmp3\n\t"
11053             "shrl    $tmp3,0x16\n\t"
11054             "xorw    $dst,$tmp3\n\t"
11055             "movswl  $dst,$dst\t! xor reduction32S" %}
11056   ins_encode %{
11057     int vector_len = 0;
11058     __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
11059     __ vpxor($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
11060     __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
11061     __ vpxor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
11062     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
11063     __ vpxor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
11064     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
11065     __ vpxor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
11066     __ movzwl($dst$$Register, $src1$$Register);
11067     __ movdl($tmp3$$Register, $tmp$$XMMRegister);
11068     __ xorw($dst$$Register, $tmp3$$Register);
11069     __ shrl($tmp3$$Register, 16);
11070     __ xorw($dst$$Register, $tmp3$$Register);
11071     __ movswl($dst$$Register, $dst$$Register);
11072   %}
11073   ins_pipe( pipe_slow );
11074 %}
11075 
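// Int XOR reductions stay in the XMM domain: the vector is folded with
// shuffles and pxor, the scalar input is brought in with movdl and XOR'ed,
// and the result is moved back out, so no per-lane extraction is needed.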
11076 instruct rsxor2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
11077   predicate(UseSSE > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
11078   match(Set dst (XorReductionV src1 src2));
11079   effect(TEMP tmp, TEMP tmp2);
11080   format %{ "pshufd  $tmp2,$src2,0x1\n\t"
11081             "pxor    $tmp2,$src2\n\t"
11082             "movd    $tmp,$src1\n\t"
11083             "pxor    $tmp2,$tmp\n\t"
11084             "movd    $dst,$tmp2\t! xor reduction2I" %}
11085   ins_encode %{
11086     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
11087     __ pxor($tmp2$$XMMRegister, $src2$$XMMRegister);
11088     __ movdl($tmp$$XMMRegister, $src1$$Register);
11089     __ pxor($tmp2$$XMMRegister, $tmp$$XMMRegister);
11090     __ movdl($dst$$Register, $tmp2$$XMMRegister);
11091   %}
11092   ins_pipe( pipe_slow );
11093 %}
11094 
11095 instruct rsxor4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
11096   predicate(UseSSE > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
11097   match(Set dst (XorReductionV src1 src2));
11098   effect(TEMP tmp, TEMP tmp2);
11099   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
11100             "pxor    $tmp2,$src2\n\t"
11101             "pshufd  $tmp,$tmp2,0x1\n\t"
11102             "pxor    $tmp2,$tmp\n\t"
11103             "movd    $tmp,$src1\n\t"
11104             "pxor    $tmp2,$tmp\n\t"
11105             "movd    $dst,$tmp2\t! xor reduction4I" %}
11106   ins_encode %{
11107     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
11108     __ pxor($tmp2$$XMMRegister, $src2$$XMMRegister);
11109     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x1);
11110     __ pxor($tmp2$$XMMRegister, $tmp$$XMMRegister);
11111     __ movdl($tmp$$XMMRegister, $src1$$Register);
11112     __ pxor($tmp2$$XMMRegister, $tmp$$XMMRegister);
11113     __ movdl($dst$$Register, $tmp2$$XMMRegister);
11114   %}
11115   ins_pipe( pipe_slow );
11116 %}
11117 
11118 instruct rvxor8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
11119   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
11120   match(Set dst (XorReductionV src1 src2));
11121   effect(TEMP tmp, TEMP tmp2);
11122   format %{ "vextracti128_high  $tmp,$src2\n\t"
11123             "vpxor    $tmp,$tmp,$src2\n\t"
11124             "vpshufd   $tmp2,$tmp,0xE\t"
11125             "vpxor    $tmp,$tmp,$tmp2\n\t"
11126             "vpshufd   $tmp2,$tmp,0x1\t"
11127             "vpxor    $tmp,$tmp,$tmp2\n\t"
11128             "movd     $tmp2,$src1\n\t"
11129             "vpxor    $tmp2,$tmp,$tmp2\n\t"
11130             "movd     $dst,$tmp2\t! xor reduction8I" %}
11131   ins_encode %{
11132     int vector_len = 0;
11133     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
11134     __ vpxor($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
11135     __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE, vector_len);
11136     __ vpxor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
11137     __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1, vector_len);
11138     __ vpxor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
11139     __ movdl($tmp2$$XMMRegister, $src1$$Register);
11140     __ vpxor($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
11141     __ movdl($dst$$Register, $tmp2$$XMMRegister);
11142   %}
11143   ins_pipe( pipe_slow );
11144 %}
11145 
11146 instruct rvxor16I_reduction_reg(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, regF tmp3) %{
11147   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
11148   match(Set dst (XorReductionV src1 src2));
11149   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
11150   format %{ "vextracti64x4_high  $tmp3,$src2\n\t"
11151             "vpxor  $tmp3,$tmp3,$src2\n\t"
11152             "vextracti128_high  $tmp,$tmp3\n\t"
11153             "vpxor    $tmp,$tmp,$src2\n\t"
11154             "vpshufd   $tmp2,$tmp,0xE\t"
11155             "vpxor    $tmp,$tmp,$tmp2\n\t"
11156             "vpshufd   $tmp2,$tmp,0x1\t"
11157             "vpxor    $tmp,$tmp,$tmp2\n\t"
11158             "movd     $tmp2,$src1\n\t"
11159             "vpxor    $tmp2,$tmp,$tmp2\n\t"
11160             "movd     $dst,$tmp2\t! xor reduction16I" %}
11161   ins_encode %{
11162     int vector_len = 0;
11163     __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
11164     __ vpxor($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
11165     __ vextracti128_high($tmp$$XMMRegister, $tmp3$$XMMRegister);
11166     __ vpxor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, vector_len);
11167     __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE, vector_len);
11168     __ vpxor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
11169     __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1, vector_len);
11170     __ vpxor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
11171     __ movdl($tmp2$$XMMRegister, $src1$$Register);
11172     __ vpxor($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
11173     __ movdl($dst$$Register, $tmp2$$XMMRegister);
11174   %}
11175   ins_pipe( pipe_slow );
11176 %}
11177 
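// Long XOR reductions mirror the OR variants above, with movdq/movq moving
// the 64-bit scalar in and out of the XMM temporaries.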
11178 instruct rsxor2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, regF tmp, regF tmp2) %{
11179   predicate(UseSSE >= 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
11180   match(Set dst (XorReductionV src1 src2));
11181   effect(TEMP tmp, TEMP tmp2);
11182   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
11183             "pxor    $tmp2,$src2\n\t"
11184             "movdq   $tmp,$src1\n\t"
11185             "pxor    $tmp2,$tmp\n\t"
11186             "movq   $dst,$tmp2\t! xor reduction2L" %}
11187   ins_encode %{
11188     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
11189     __ pxor($tmp2$$XMMRegister, $src2$$XMMRegister);
11190     __ movdq($tmp$$XMMRegister, $src1$$Register);
11191     __ pxor($tmp2$$XMMRegister, $tmp$$XMMRegister);
11192     __ movq($dst$$Register, $tmp2$$XMMRegister);
11193   %}
11194   ins_pipe( pipe_slow );
11195 %}
11196 
11197 instruct rvxor4L_reduction_reg_avx(rRegL dst, rRegL src1, vecY src2, regF tmp, regF tmp2) %{
11198   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
11199   match(Set dst (XorReductionV src1 src2));
11200   effect(TEMP tmp, TEMP tmp2);
11201   format %{ "vextracti128_high  $tmp,$src2\n\t"
11202             "vpxor  $tmp2,$tmp,$src2\n\t"
11203             "vpshufd  $tmp,$tmp2,0xE\t"
11204             "vpxor  $tmp2,$tmp2,$tmp\n\t"
11205             "movq   $tmp,$src1\n\t"
11206             "vpxor  $tmp2,$tmp2,$tmp\n\t"
11207             "movq   $dst,$tmp2\t! xor reduction4L" %}
11208   ins_encode %{
11209     int vector_len = 0;
11210     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
11211     __ vpxor($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
11212     __ vpshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE, vector_len);
11213     __ vpxor($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
11214     __ movq($tmp$$XMMRegister, $src1$$Register);
11215     __ vpxor($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
11216     __ movq($dst$$Register, $tmp2$$XMMRegister);
11217   %}
11218   ins_pipe( pipe_slow );
11219 %}
11220 
11221 #ifdef _LP64
11222 instruct rvxor8L_reduction_reg(rRegL dst, rRegL src1, vecZ src2, regF tmp, regF tmp2) %{
11223   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
11224   match(Set dst (XorReductionV src1 src2));
11225   effect(TEMP tmp, TEMP tmp2);
11226   format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
11227             "vpxorq  $tmp2,$tmp2,$src2\n\t"
11228             "vextracti128_high  $tmp,$tmp2\n\t"
11229             "vpxorq  $tmp2,$tmp2,$tmp\n\t"
11230             "vpshufd  $tmp,$tmp2,0xE\t"
11231             "vpxorq  $tmp2,$tmp2,$tmp\n\t"
11232             "movdq   $tmp,$src1\n\t"
11233             "vpxorq  $tmp2,$tmp2,$tmp\n\t"
11234             "movdq   $dst,$tmp2\t! xor reduction8L" %}
11235   ins_encode %{
11236     int vector_len = 0;
11237     __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
11238     __ vpxorq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
11239     __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
11240     __ vpxorq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
11241     __ vpshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE, vector_len);
11242     __ vpxorq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
11243     __ movdq($tmp$$XMMRegister, $src1$$Register);
11244     __ vpxorq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
11245     __ movdq($dst$$Register, $tmp2$$XMMRegister);
11246   %}
11247   ins_pipe( pipe_slow );
11248 %}
11249 #endif
11250 
11251 // ====================VECTOR ARITHMETIC=======================================
11252 
11253 // --------------------------------- ADD --------------------------------------
11254 
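// Each vector add below comes in several flavors selected by the predicate:
// a two-operand SSE form (UseAVX == 0, dst += src), a three-operand form
// for AVX-only targets, an EVEX form when AVX512BW is available, and a
// "_special" EVEX form for AVX-512 targets without the BW extension, which
// matches dst as the first input and declares the remaining input TEMP.
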
11255 // Bytes vector add
11256 instruct vadd4B(vecS dst, vecS src) %{
11257   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
11258   match(Set dst (AddVB dst src));
11259   format %{ "paddb   $dst,$src\t! add packed4B" %}
11260   ins_encode %{
11261     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
11262   %}
11263   ins_pipe( pipe_slow );
11264 %}
11265 
11266 instruct vadd4B_reg_avx(vecS dst, vecS src1, vecS src2) %{
11267   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
11268   match(Set dst (AddVB src1 src2));
11269   format %{ "vpaddb  $dst,$src1,$src2\t! add packed4B" %}
11270   ins_encode %{
11271     int vector_len = 0;
11272     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
11273   %}
11274   ins_pipe( pipe_slow );
11275 %}
11276 
11277 instruct vadd4B_reg_evex(vecS dst, vecS src1, vecS src2) %{
11278   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
11279   match(Set dst (AddVB src1 src2));
11280   format %{ "vpaddb  $dst,$src1,$src2\t! add packed4B" %}
11281   ins_encode %{
11282     int vector_len = 0;
11283     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
11284   %}
11285   ins_pipe( pipe_slow );
11286 %}
11287 
11288 instruct vadd4B_reg_evex_special(vecS dst, vecS src1, vecS src2) %{
11289   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
11290   match(Set dst (AddVB dst src2));
11291   effect(TEMP src1);
11292   format %{ "vpaddb  $dst,$dst,$src2\t! add packed4B" %}
11293   ins_encode %{
11294     int vector_len = 0;
11295     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
11296   %}
11297   ins_pipe( pipe_slow );
11298 %}
11299 
11300 instruct vadd4B_mem_avx(vecS dst, vecS src, memory mem) %{
11301   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
11302   match(Set dst (AddVB src (LoadVector mem)));
11303   format %{ "vpaddb  $dst,$src,$mem\t! add packed4B" %}
11304   ins_encode %{
11305     int vector_len = 0;
11306     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
11307   %}
11308   ins_pipe( pipe_slow );
11309 %}
11310 
11311 instruct vadd4B_mem_evex(vecS dst, vecS src, memory mem) %{
11312   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
11313   match(Set dst (AddVB src (LoadVector mem)));
11314   format %{ "vpaddb  $dst,$src,$mem\t! add packed4B" %}
11315   ins_encode %{
11316     int vector_len = 0;
11317     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
11318   %}
11319   ins_pipe( pipe_slow );
11320 %}
11321 
11322 instruct vadd4B_mem_evex_special(vecS dst, vecS src, memory mem) %{
11323   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
11324   match(Set dst (AddVB dst (LoadVector mem)));
11325   effect(TEMP src);
11326   format %{ "vpaddb  $dst,$src,$mem\t! add packed4B" %}
11327   ins_encode %{
11328     int vector_len = 0;
11329     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
11330   %}
11331   ins_pipe( pipe_slow );
11332 %}
11333 
11334 instruct vadd8B(vecD dst, vecD src) %{
11335   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
11336   match(Set dst (AddVB dst src));
11337   format %{ "paddb   $dst,$src\t! add packed8B" %}
11338   ins_encode %{
11339     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
11340   %}
11341   ins_pipe( pipe_slow );
11342 %}
11343 
11344 instruct vadd8B_reg_avx(vecD dst, vecD src1, vecD src2) %{
11345   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
11346   match(Set dst (AddVB src1 src2));
11347   format %{ "vpaddb  $dst,$src1,$src2\t! add packed8B" %}
11348   ins_encode %{
11349     int vector_len = 0;
11350     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
11351   %}
11352   ins_pipe( pipe_slow );
11353 %}
11354 
11355 instruct vadd8B_reg_evex(vecD dst, vecD src1, vecD src2) %{
11356   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
11357   match(Set dst (AddVB src1 src2));
11358   format %{ "vpaddb  $dst,$src1,$src2\t! add packed8B" %}
11359   ins_encode %{
11360     int vector_len = 0;
11361     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
11362   %}
11363   ins_pipe( pipe_slow );
11364 %}
11365 
11366 instruct vadd8B_reg_evex_special(vecD dst, vecD src1, vecD src2) %{
11367   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
11368   match(Set dst (AddVB dst src2));
11369   effect(TEMP src1);
11370   format %{ "vpaddb  $dst,$dst,$src2\t! add packed8B" %}
11371   ins_encode %{
11372     int vector_len = 0;
11373     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
11374   %}
11375   ins_pipe( pipe_slow );
11376 %}
11377 
11378 instruct vadd8B_mem_avx(vecD dst, vecD src, memory mem) %{
11379   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
11380   match(Set dst (AddVB src (LoadVector mem)));
11381   format %{ "vpaddb  $dst,$src,$mem\t! add packed8B" %}
11382   ins_encode %{
11383     int vector_len = 0;
11384     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
11385   %}
11386   ins_pipe( pipe_slow );
11387 %}
11388 
11389 instruct vadd8B_mem_evex(vecD dst, vecD src, memory mem) %{
11390   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
11391   match(Set dst (AddVB src (LoadVector mem)));
11392   format %{ "vpaddb  $dst,$src,$mem\t! add packed8B" %}
11393   ins_encode %{
11394     int vector_len = 0;
11395     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
11396   %}
11397   ins_pipe( pipe_slow );
11398 %}
11399 
11400 instruct vadd8B_mem_evex_special(vecD dst, vecD src, memory mem) %{
11401   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
11402   match(Set dst (AddVB dst (LoadVector mem)));
11403   effect(TEMP src);
11404   format %{ "vpaddb  $dst,$src,$mem\t! add packed8B" %}
11405   ins_encode %{
11406     int vector_len = 0;
11407     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
11408   %}
11409   ins_pipe( pipe_slow );
11410 %}
11411 
11412 instruct vadd16B(vecX dst, vecX src) %{
11413   predicate(UseAVX == 0 && n->as_Vector()->length() == 16);
11414   match(Set dst (AddVB dst src));
11415   format %{ "paddb   $dst,$src\t! add packed16B" %}
11416   ins_encode %{
11417     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
11418   %}
11419   ins_pipe( pipe_slow );
11420 %}
11421 
11422 instruct vadd16B_reg_avx(vecX dst, vecX src1, vecX src2) %{
11423   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 16);
11424   match(Set dst (AddVB src1 src2));
11425   format %{ "vpaddb  $dst,$src1,$src2\t! add packed16B" %}
11426   ins_encode %{
11427     int vector_len = 0;
11428     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
11429   %}
11430   ins_pipe( pipe_slow );
11431 %}
11432 
11433 instruct vadd16B_reg_evex(vecX dst, vecX src1, vecX src2) %{
11434   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
11435   match(Set dst (AddVB src1 src2));
11436   format %{ "vpaddb  $dst,$src1,$src2\t! add packed16B" %}
11437   ins_encode %{
11438     int vector_len = 0;
11439     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
11440   %}
11441   ins_pipe( pipe_slow );
11442 %}
11443 
11444 instruct vadd16B_reg_evex_special(vecX dst, vecX src1, vecX src2) %{
11445   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
11446   match(Set dst (AddVB dst src2));
11447   effect(TEMP src1);
11448   format %{ "vpaddb  $dst,$dst,$src2\t! add packed16B" %}
11449   ins_encode %{
11450     int vector_len = 0;
11451     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
11452   %}
11453   ins_pipe( pipe_slow );
11454 %}
11455 
11456 instruct vadd16B_mem_avx(vecX dst, vecX src, memory mem) %{
11457   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 16);
11458   match(Set dst (AddVB src (LoadVector mem)));
11459   format %{ "vpaddb  $dst,$src,$mem\t! add packed16B" %}
11460   ins_encode %{
11461     int vector_len = 0;
11462     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
11463   %}
11464   ins_pipe( pipe_slow );
11465 %}
11466 
11467 instruct vadd16B_mem_evex(vecX dst, vecX src, memory mem) %{
11468   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
11469   match(Set dst (AddVB src (LoadVector mem)));
11470   format %{ "vpaddb  $dst,$src,$mem\t! add packed16B" %}
11471   ins_encode %{
11472     int vector_len = 0;
11473     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
11474   %}
11475   ins_pipe( pipe_slow );
11476 %}
11477 
11478 instruct vadd16B_mem_evex_special(vecX dst, vecX src, memory mem) %{
11479   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
11480   match(Set dst (AddVB dst (LoadVector mem)));
11481   effect(TEMP src);
11482   format %{ "vpaddb  $dst,$src,$mem\t! add packed16B" %}
11483   ins_encode %{
11484     int vector_len = 0;
11485     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
11486   %}
11487   ins_pipe( pipe_slow );
11488 %}
11489 
11490 instruct vadd32B_reg_avx(vecY dst, vecY src1, vecY src2) %{
11491   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 32);
11492   match(Set dst (AddVB src1 src2));
11493   format %{ "vpaddb  $dst,$src1,$src2\t! add packed32B" %}
11494   ins_encode %{
11495     int vector_len = 1;
11496     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
11497   %}
11498   ins_pipe( pipe_slow );
11499 %}
11500 
11501 instruct vadd32B_reg_evex(vecY dst, vecY src1, vecY src2) %{
11502   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
11503   match(Set dst (AddVB src1 src2));
11504   format %{ "vpaddb  $dst,$src1,$src2\t! add packed32B" %}
11505   ins_encode %{
11506     int vector_len = 1;
11507     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
11508   %}
11509   ins_pipe( pipe_slow );
11510 %}
11511 
11512 instruct vadd32B_reg_evex_special(vecY dst, vecY src1, vecY src2) %{
11513   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 32);
11514   match(Set dst (AddVB dst src2));
11515   effect(TEMP src1);
11516   format %{ "vpaddb  $dst,$dst,$src2\t! add packed32B" %}
11517   ins_encode %{
11518     int vector_len = 1;
11519     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
11520   %}
11521   ins_pipe( pipe_slow );
11522 %}
11523 
11524 instruct vadd32B_mem_avx(vecY dst, vecY src, memory mem) %{
11525   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 32);
11526   match(Set dst (AddVB src (LoadVector mem)));
11527   format %{ "vpaddb  $dst,$src,$mem\t! add packed32B" %}
11528   ins_encode %{
11529     int vector_len = 1;
11530     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
11531   %}
11532   ins_pipe( pipe_slow );
11533 %}
11534 
11535 instruct vadd32B_mem_evex(vecY dst, vecY src, memory mem) %{
11536   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
11537   match(Set dst (AddVB src (LoadVector mem)));
11538   format %{ "vpaddb  $dst,$src,$mem\t! add packed32B" %}
11539   ins_encode %{
11540     int vector_len = 1;
11541     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
11542   %}
11543   ins_pipe( pipe_slow );
11544 %}
11545 
11546 instruct vadd32B_mem_evex_special(vecY dst, vecY src, memory mem) %{
11547   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 32);
11548   match(Set dst (AddVB dst (LoadVector mem)));
11549   effect(TEMP src);
11550   format %{ "vpaddb  $dst,$src,$mem\t! add packed32B" %}
11551   ins_encode %{
11552     int vector_len = 1;
11553     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
11554   %}
11555   ins_pipe( pipe_slow );
11556 %}
11557 
11558 instruct vadd64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
11559   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
11560   match(Set dst (AddVB src1 src2));
11561   format %{ "vpaddb  $dst,$src1,$src2\t! add packed64B" %}
11562   ins_encode %{
11563     int vector_len = 2;
11564     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
11565   %}
11566   ins_pipe( pipe_slow );
11567 %}
11568 
11569 instruct vadd64B_mem(vecZ dst, vecZ src, memory mem) %{
11570   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
11571   match(Set dst (AddVB src (LoadVector mem)));
11572   format %{ "vpaddb  $dst,$src,$mem\t! add packed64B" %}
11573   ins_encode %{
11574     int vector_len = 2;
11575     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
11576   %}
11577   ins_pipe( pipe_slow );
11578 %}
11579 
11580 // Shorts/Chars vector add
11581 instruct vadd2S(vecS dst, vecS src) %{
11582   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
11583   match(Set dst (AddVS dst src));
11584   format %{ "paddw   $dst,$src\t! add packed2S" %}
11585   ins_encode %{
11586     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
11587   %}
11588   ins_pipe( pipe_slow );
11589 %}
11590 
11591 instruct vadd2S_reg_avx(vecS dst, vecS src1, vecS src2) %{
11592   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
11593   match(Set dst (AddVS src1 src2));
11594   format %{ "vpaddw  $dst,$src1,$src2\t! add packed2S" %}
11595   ins_encode %{
11596     int vector_len = 0;
11597     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
11598   %}
11599   ins_pipe( pipe_slow );
11600 %}
11601 
11602 instruct vadd2S_reg_evex(vecS dst, vecS src1, vecS src2) %{
11603   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
11604   match(Set dst (AddVS src1 src2));
11605   format %{ "vpaddw  $dst,$src1,$src2\t! add packed2S" %}
11606   ins_encode %{
11607     int vector_len = 0;
11608     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
11609   %}
11610   ins_pipe( pipe_slow );
11611 %}
11612 
11613 instruct vadd2S_reg_evex_special(vecS dst, vecS src1, vecS src2) %{
11614   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
11615   match(Set dst (AddVS dst src2));
11616   effect(TEMP src1);
11617   format %{ "vpaddw  $dst,$dst,$src2\t! add packed2S" %}
11618   ins_encode %{
11619     int vector_len = 0;
11620     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
11621   %}
11622   ins_pipe( pipe_slow );
11623 %}
11624 
11625 instruct vadd2S_mem_avx(vecS dst, vecS src, memory mem) %{
11626   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
11627   match(Set dst (AddVS src (LoadVector mem)));
11628   format %{ "vpaddw  $dst,$src,$mem\t! add packed2S" %}
11629   ins_encode %{
11630     int vector_len = 0;
11631     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
11632   %}
11633   ins_pipe( pipe_slow );
11634 %}
11635 
11636 instruct vadd2S_mem_evex(vecS dst, vecS src, memory mem) %{
11637   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
11638   match(Set dst (AddVS src (LoadVector mem)));
11639   format %{ "vpaddw  $dst,$src,$mem\t! add packed2S" %}
11640   ins_encode %{
11641     int vector_len = 0;
11642     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
11643   %}
11644   ins_pipe( pipe_slow );
11645 %}
11646 
11647 instruct vadd2S_mem_evex_special(vecS dst, vecS src, memory mem) %{
11648   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
11649   match(Set dst (AddVS dst (LoadVector mem)));
11650   effect(TEMP src);
11651   format %{ "vpaddw  $dst,$src,$mem\t! add packed2S" %}
11652   ins_encode %{
11653     int vector_len = 0;
11654     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
11655   %}
11656   ins_pipe( pipe_slow );
11657 %}
11658 
11659 instruct vadd4S(vecD dst, vecD src) %{
11660   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
11661   match(Set dst (AddVS dst src));
11662   format %{ "paddw   $dst,$src\t! add packed4S" %}
11663   ins_encode %{
11664     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
11665   %}
11666   ins_pipe( pipe_slow );
11667 %}
11668 
11669 instruct vadd4S_reg_avx(vecD dst, vecD src1, vecD src2) %{
11670   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
11671   match(Set dst (AddVS src1 src2));
11672   format %{ "vpaddw  $dst,$src1,$src2\t! add packed4S" %}
11673   ins_encode %{
11674     int vector_len = 0;
11675     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
11676   %}
11677   ins_pipe( pipe_slow );
11678 %}
11679 
11680 instruct vadd4S_reg_evex(vecD dst, vecD src1, vecD src2) %{
11681   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
11682   match(Set dst (AddVS src1 src2));
11683   format %{ "vpaddw  $dst,$src1,$src2\t! add packed4S" %}
11684   ins_encode %{
11685     int vector_len = 0;
11686     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
11687   %}
11688   ins_pipe( pipe_slow );
11689 %}
11690 
11691 instruct vadd4S_reg_evex_special(vecD dst, vecD src1, vecD src2) %{
11692   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
11693   match(Set dst (AddVS dst src2));
11694   effect(TEMP src1);
11695   format %{ "vpaddw  $dst,$dst,$src2\t! add packed4S" %}
11696   ins_encode %{
11697     int vector_len = 0;
11698     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
11699   %}
11700   ins_pipe( pipe_slow );
11701 %}
11702 
11703 instruct vadd4S_mem_avx(vecD dst, vecD src, memory mem) %{
11704   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
11705   match(Set dst (AddVS src (LoadVector mem)));
11706   format %{ "vpaddw  $dst,$src,$mem\t! add packed4S" %}
11707   ins_encode %{
11708     int vector_len = 0;
11709     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
11710   %}
11711   ins_pipe( pipe_slow );
11712 %}
11713 
11714 instruct vadd4S_mem_evex(vecD dst, vecD src, memory mem) %{
11715   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
11716   match(Set dst (AddVS src (LoadVector mem)));
11717   format %{ "vpaddw  $dst,$src,$mem\t! add packed4S" %}
11718   ins_encode %{
11719     int vector_len = 0;
11720     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
11721   %}
11722   ins_pipe( pipe_slow );
11723 %}
11724 
11725 instruct vadd4S_mem_evex_special(vecD dst, vecD src, memory mem) %{
11726   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
11727   match(Set dst (AddVS dst (LoadVector mem)));
11728   effect(TEMP src);
11729   format %{ "vpaddw  $dst,$src,$mem\t! add packed4S" %}
11730   ins_encode %{
11731     int vector_len = 0;
11732     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
11733   %}
11734   ins_pipe( pipe_slow );
11735 %}
11736 
11737 instruct vadd8S(vecX dst, vecX src) %{
11738   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
11739   match(Set dst (AddVS dst src));
11740   format %{ "paddw   $dst,$src\t! add packed8S" %}
11741   ins_encode %{
11742     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
11743   %}
11744   ins_pipe( pipe_slow );
11745 %}
11746 
11747 instruct vadd8S_reg_avx(vecX dst, vecX src1, vecX src2) %{
11748   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
11749   match(Set dst (AddVS src1 src2));
11750   format %{ "vpaddw  $dst,$src1,$src2\t! add packed8S" %}
11751   ins_encode %{
11752     int vector_len = 0;
11753     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
11754   %}
11755   ins_pipe( pipe_slow );
11756 %}
11757 
11758 instruct vadd8S_reg_evex(vecX dst, vecX src1, vecX src2) %{
11759   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
11760   match(Set dst (AddVS src1 src2));
11761   format %{ "vpaddw  $dst,$src1,$src2\t! add packed8S" %}
11762   ins_encode %{
11763     int vector_len = 0;
11764     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
11765   %}
11766   ins_pipe( pipe_slow );
11767 %}
11768 
11769 instruct vadd8S_reg_evex_special(vecX dst, vecX src1, vecX src2) %{
11770   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
11771   match(Set dst (AddVS dst src2));
11772   effect(TEMP src1);
11773   format %{ "vpaddw  $dst,$dst,$src2\t! add packed8S" %}
11774   ins_encode %{
11775     int vector_len = 0;
11776     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
11777   %}
11778   ins_pipe( pipe_slow );
11779 %}
11780 
11781 instruct vadd8S_mem_avx(vecX dst, vecX src, memory mem) %{
11782   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
11783   match(Set dst (AddVS src (LoadVector mem)));
11784   format %{ "vpaddw  $dst,$src,$mem\t! add packed8S" %}
11785   ins_encode %{
11786     int vector_len = 0;
11787     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
11788   %}
11789   ins_pipe( pipe_slow );
11790 %}
11791 
11792 instruct vadd8S_mem_evex(vecX dst, vecX src, memory mem) %{
11793   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
11794   match(Set dst (AddVS src (LoadVector mem)));
11795   format %{ "vpaddw  $dst,$src,$mem\t! add packed8S" %}
11796   ins_encode %{
11797     int vector_len = 0;
11798     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
11799   %}
11800   ins_pipe( pipe_slow );
11801 %}
11802 
11803 instruct vadd8S_mem_evex_special(vecX dst, vecX src, memory mem) %{
11804   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
11805   match(Set dst (AddVS dst (LoadVector mem)));
11806   effect(TEMP src);
11807   format %{ "vpaddw  $dst,$src,$mem\t! add packed8S" %}
11808   ins_encode %{
11809     int vector_len = 0;
11810     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
11811   %}
11812   ins_pipe( pipe_slow );
11813 %}
11814 
11815 instruct vadd16S_reg_avx(vecY dst, vecY src1, vecY src2) %{
11816   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
11817   match(Set dst (AddVS src1 src2));
11818   format %{ "vpaddw  $dst,$src1,$src2\t! add packed16S" %}
11819   ins_encode %{
11820     int vector_len = 1;
11821     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
11822   %}
11823   ins_pipe( pipe_slow );
11824 %}
11825 
11826 instruct vadd16S_reg_evex(vecY dst, vecY src1, vecY src2) %{
11827   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
11828   match(Set dst (AddVS src1 src2));
11829   format %{ "vpaddw  $dst,$src1,$src2\t! add packed16S" %}
11830   ins_encode %{
11831     int vector_len = 1;
11832     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
11833   %}
11834   ins_pipe( pipe_slow );
11835 %}
11836 
11837 instruct vadd16S_reg_evex_special(vecY dst, vecY src1, vecY src2) %{
11838   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
11839   match(Set dst (AddVS dst src2));
11840   effect(TEMP src1);
11841   format %{ "vpaddw  $dst,$dst,$src2\t! add packed16S" %}
11842   ins_encode %{
11843     int vector_len = 1;
11844     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
11845   %}
11846   ins_pipe( pipe_slow );
11847 %}
11848 
11849 instruct vadd16S_mem_avx(vecY dst, vecY src, memory mem) %{
11850   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
11851   match(Set dst (AddVS src (LoadVector mem)));
11852   format %{ "vpaddw  $dst,$src,$mem\t! add packed16S" %}
11853   ins_encode %{
11854     int vector_len = 1;
11855     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
11856   %}
11857   ins_pipe( pipe_slow );
11858 %}
11859 
11860 instruct vadd16S_mem_evex(vecY dst, vecY src, memory mem) %{
11861   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
11862   match(Set dst (AddVS src (LoadVector mem)));
11863   format %{ "vpaddw  $dst,$src,$mem\t! add packed16S" %}
11864   ins_encode %{
11865     int vector_len = 1;
11866     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
11867   %}
11868   ins_pipe( pipe_slow );
11869 %}
11870 
11871 instruct vadd16S_mem_evex_special(vecY dst, vecY src, memory mem) %{
11872   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
11873   match(Set dst (AddVS dst (LoadVector mem)));
11874   effect(TEMP src);
11875   format %{ "vpaddw  $dst,$src,$mem\t! add packed16S" %}
11876   ins_encode %{
11877     int vector_len = 1;
11878     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
11879   %}
11880   ins_pipe( pipe_slow );
11881 %}
11882 
11883 instruct vadd32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
11884   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
11885   match(Set dst (AddVS src1 src2));
11886   format %{ "vpaddw  $dst,$src1,$src2\t! add packed32S" %}
11887   ins_encode %{
11888     int vector_len = 2;
11889     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
11890   %}
11891   ins_pipe( pipe_slow );
11892 %}
11893 
11894 instruct vadd32S_mem(vecZ dst, vecZ src, memory mem) %{
11895   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
11896   match(Set dst (AddVS src (LoadVector mem)));
11897   format %{ "vpaddw  $dst,$src,$mem\t! add packed32S" %}
11898   ins_encode %{
11899     int vector_len = 2;
11900     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
11901   %}
11902   ins_pipe( pipe_slow );
11903 %}
11904 
11905 // Integers vector add
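// Dword and qword vector adds need only AVX-512F, so unlike the byte and
// short cases above there are no separate _evex/_evex_special variants.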
11906 instruct vadd2I(vecD dst, vecD src) %{
11907   predicate(n->as_Vector()->length() == 2);
11908   match(Set dst (AddVI dst src));
11909   format %{ "paddd   $dst,$src\t! add packed2I" %}
11910   ins_encode %{
11911     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
11912   %}
11913   ins_pipe( pipe_slow );
11914 %}
11915 
11916 instruct vadd2I_reg(vecD dst, vecD src1, vecD src2) %{
11917   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
11918   match(Set dst (AddVI src1 src2));
11919   format %{ "vpaddd  $dst,$src1,$src2\t! add packed2I" %}
11920   ins_encode %{
11921     int vector_len = 0;
11922     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
11923   %}
11924   ins_pipe( pipe_slow );
11925 %}
11926 
11927 instruct vadd2I_mem(vecD dst, vecD src, memory mem) %{
11928   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
11929   match(Set dst (AddVI src (LoadVector mem)));
11930   format %{ "vpaddd  $dst,$src,$mem\t! add packed2I" %}
11931   ins_encode %{
11932     int vector_len = 0;
11933     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
11934   %}
11935   ins_pipe( pipe_slow );
11936 %}
11937 
11938 instruct vadd4I(vecX dst, vecX src) %{
11939   predicate(n->as_Vector()->length() == 4);
11940   match(Set dst (AddVI dst src));
11941   format %{ "paddd   $dst,$src\t! add packed4I" %}
11942   ins_encode %{
11943     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
11944   %}
11945   ins_pipe( pipe_slow );
11946 %}
11947 
11948 instruct vadd4I_reg(vecX dst, vecX src1, vecX src2) %{
11949   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
11950   match(Set dst (AddVI src1 src2));
11951   format %{ "vpaddd  $dst,$src1,$src2\t! add packed4I" %}
11952   ins_encode %{
11953     int vector_len = 0;
11954     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
11955   %}
11956   ins_pipe( pipe_slow );
11957 %}
11958 
11959 instruct vadd4I_mem(vecX dst, vecX src, memory mem) %{
11960   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
11961   match(Set dst (AddVI src (LoadVector mem)));
11962   format %{ "vpaddd  $dst,$src,$mem\t! add packed4I" %}
11963   ins_encode %{
11964     int vector_len = 0;
11965     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
11966   %}
11967   ins_pipe( pipe_slow );
11968 %}
11969 
11970 instruct vadd8I_reg(vecY dst, vecY src1, vecY src2) %{
11971   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
11972   match(Set dst (AddVI src1 src2));
11973   format %{ "vpaddd  $dst,$src1,$src2\t! add packed8I" %}
11974   ins_encode %{
11975     int vector_len = 1;
11976     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
11977   %}
11978   ins_pipe( pipe_slow );
11979 %}
11980 
11981 instruct vadd8I_mem(vecY dst, vecY src, memory mem) %{
11982   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
11983   match(Set dst (AddVI src (LoadVector mem)));
11984   format %{ "vpaddd  $dst,$src,$mem\t! add packed8I" %}
11985   ins_encode %{
11986     int vector_len = 1;
11987     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
11988   %}
11989   ins_pipe( pipe_slow );
11990 %}
11991 
11992 instruct vadd16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
11993   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
11994   match(Set dst (AddVI src1 src2));
11995   format %{ "vpaddd  $dst,$src1,$src2\t! add packed16I" %}
11996   ins_encode %{
11997     int vector_len = 2;
11998     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
11999   %}
12000   ins_pipe( pipe_slow );
12001 %}
12002 
12003 instruct vadd16I_mem(vecZ dst, vecZ src, memory mem) %{
12004   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
12005   match(Set dst (AddVI src (LoadVector mem)));
12006   format %{ "vpaddd  $dst,$src,$mem\t! add packed16I" %}
12007   ins_encode %{
12008     int vector_len = 2;
12009     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12010   %}
12011   ins_pipe( pipe_slow );
12012 %}
12013 
12014 // Longs vector add
12015 instruct vadd2L(vecX dst, vecX src) %{
12016   predicate(n->as_Vector()->length() == 2);
12017   match(Set dst (AddVL dst src));
12018   format %{ "paddq   $dst,$src\t! add packed2L" %}
12019   ins_encode %{
12020     __ paddq($dst$$XMMRegister, $src$$XMMRegister);
12021   %}
12022   ins_pipe( pipe_slow );
12023 %}
12024 
12025 instruct vadd2L_reg(vecX dst, vecX src1, vecX src2) %{
12026   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
12027   match(Set dst (AddVL src1 src2));
12028   format %{ "vpaddq  $dst,$src1,$src2\t! add packed2L" %}
12029   ins_encode %{
12030     int vector_len = 0;
12031     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12032   %}
12033   ins_pipe( pipe_slow );
12034 %}
12035 
12036 instruct vadd2L_mem(vecX dst, vecX src, memory mem) %{
12037   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
12038   match(Set dst (AddVL src (LoadVector mem)));
12039   format %{ "vpaddq  $dst,$src,$mem\t! add packed2L" %}
12040   ins_encode %{
12041     int vector_len = 0;
12042     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12043   %}
12044   ins_pipe( pipe_slow );
12045 %}
12046 
12047 instruct vadd4L_reg(vecY dst, vecY src1, vecY src2) %{
12048   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
12049   match(Set dst (AddVL src1 src2));
12050   format %{ "vpaddq  $dst,$src1,$src2\t! add packed4L" %}
12051   ins_encode %{
12052     int vector_len = 1;
12053     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12054   %}
12055   ins_pipe( pipe_slow );
12056 %}
12057 
12058 instruct vadd4L_mem(vecY dst, vecY src, memory mem) %{
12059   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
12060   match(Set dst (AddVL src (LoadVector mem)));
12061   format %{ "vpaddq  $dst,$src,$mem\t! add packed4L" %}
12062   ins_encode %{
12063     int vector_len = 1;
12064     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12065   %}
12066   ins_pipe( pipe_slow );
12067 %}
12068 
12069 instruct vadd8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
12070   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
12071   match(Set dst (AddVL src1 src2));
12072   format %{ "vpaddq  $dst,$src1,$src2\t! add packed8L" %}
12073   ins_encode %{
12074     int vector_len = 2;
12075     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12076   %}
12077   ins_pipe( pipe_slow );
12078 %}
12079 
12080 instruct vadd8L_mem(vecZ dst, vecZ src, memory mem) %{
12081   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
12082   match(Set dst (AddVL src (LoadVector mem)));
12083   format %{ "vpaddq  $dst,$src,$mem\t! add packed8L" %}
12084   ins_encode %{
12085     int vector_len = 2;
12086     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12087   %}
12088   ins_pipe( pipe_slow );
12089 %}
12090 
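// Note: the 256-bit integer forms above (vadd8I, vadd4L, ...) require
// AVX2 (UseAVX > 1) because AVX1 extended only the floating-point ops to
// 256 bits. The float/double rules below therefore need just UseAVX > 0
// for their 256-bit forms; the 512-bit forms of all types need UseAVX > 2.
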
12091 // Floats vector add
12092 instruct vadd2F(vecD dst, vecD src) %{
12093   predicate(n->as_Vector()->length() == 2);
12094   match(Set dst (AddVF dst src));
12095   format %{ "addps   $dst,$src\t! add packed2F" %}
12096   ins_encode %{
12097     __ addps($dst$$XMMRegister, $src$$XMMRegister);
12098   %}
12099   ins_pipe( pipe_slow );
12100 %}
12101 
12102 instruct vadd2F_reg(vecD dst, vecD src1, vecD src2) %{
12103   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
12104   match(Set dst (AddVF src1 src2));
12105   format %{ "vaddps  $dst,$src1,$src2\t! add packed2F" %}
12106   ins_encode %{
12107     int vector_len = 0;
12108     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12109   %}
12110   ins_pipe( pipe_slow );
12111 %}
12112 
12113 instruct vadd2F_mem(vecD dst, vecD src, memory mem) %{
12114   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
12115   match(Set dst (AddVF src (LoadVector mem)));
12116   format %{ "vaddps  $dst,$src,$mem\t! add packed2F" %}
12117   ins_encode %{
12118     int vector_len = 0;
12119     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12120   %}
12121   ins_pipe( pipe_slow );
12122 %}
12123 
12124 instruct vadd4F(vecX dst, vecX src) %{
12125   predicate(n->as_Vector()->length() == 4);
12126   match(Set dst (AddVF dst src));
12127   format %{ "addps   $dst,$src\t! add packed4F" %}
12128   ins_encode %{
12129     __ addps($dst$$XMMRegister, $src$$XMMRegister);
12130   %}
12131   ins_pipe( pipe_slow );
12132 %}
12133 
12134 instruct vadd4F_reg(vecX dst, vecX src1, vecX src2) %{
12135   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
12136   match(Set dst (AddVF src1 src2));
12137   format %{ "vaddps  $dst,$src1,$src2\t! add packed4F" %}
12138   ins_encode %{
12139     int vector_len = 0;
12140     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12141   %}
12142   ins_pipe( pipe_slow );
12143 %}
12144 
12145 instruct vadd4F_mem(vecX dst, vecX src, memory mem) %{
12146   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
12147   match(Set dst (AddVF src (LoadVector mem)));
12148   format %{ "vaddps  $dst,$src,$mem\t! add packed4F" %}
12149   ins_encode %{
12150     int vector_len = 0;
12151     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12152   %}
12153   ins_pipe( pipe_slow );
12154 %}
12155 
12156 instruct vadd8F_reg(vecY dst, vecY src1, vecY src2) %{
12157   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
12158   match(Set dst (AddVF src1 src2));
12159   format %{ "vaddps  $dst,$src1,$src2\t! add packed8F" %}
12160   ins_encode %{
12161     int vector_len = 1;
12162     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12163   %}
12164   ins_pipe( pipe_slow );
12165 %}
12166 
12167 instruct vadd8F_mem(vecY dst, vecY src, memory mem) %{
12168   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
12169   match(Set dst (AddVF src (LoadVector mem)));
12170   format %{ "vaddps  $dst,$src,$mem\t! add packed8F" %}
12171   ins_encode %{
12172     int vector_len = 1;
12173     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12174   %}
12175   ins_pipe( pipe_slow );
12176 %}
12177 
12178 instruct vadd16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
12179   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
12180   match(Set dst (AddVF src1 src2));
12181   format %{ "vaddps  $dst,$src1,$src2\t! add packed16F" %}
12182   ins_encode %{
12183     int vector_len = 2;
12184     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12185   %}
12186   ins_pipe( pipe_slow );
12187 %}
12188 
12189 instruct vadd16F_mem(vecZ dst, vecZ src, memory mem) %{
12190   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
12191   match(Set dst (AddVF src (LoadVector mem)));
12192   format %{ "vaddps  $dst,$src,$mem\t! add packed16F" %}
12193   ins_encode %{
12194     int vector_len = 2;
12195     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12196   %}
12197   ins_pipe( pipe_slow );
12198 %}
12199 
12200 // Doubles vector add
12201 instruct vadd2D(vecX dst, vecX src) %{
12202   predicate(n->as_Vector()->length() == 2);
12203   match(Set dst (AddVD dst src));
12204   format %{ "addpd   $dst,$src\t! add packed2D" %}
12205   ins_encode %{
12206     __ addpd($dst$$XMMRegister, $src$$XMMRegister);
12207   %}
12208   ins_pipe( pipe_slow );
12209 %}
12210 
12211 instruct vadd2D_reg(vecX dst, vecX src1, vecX src2) %{
12212   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
12213   match(Set dst (AddVD src1 src2));
12214   format %{ "vaddpd  $dst,$src1,$src2\t! add packed2D" %}
12215   ins_encode %{
12216     int vector_len = 0;
12217     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12218   %}
12219   ins_pipe( pipe_slow );
12220 %}
12221 
12222 instruct vadd2D_mem(vecX dst, vecX src, memory mem) %{
12223   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
12224   match(Set dst (AddVD src (LoadVector mem)));
12225   format %{ "vaddpd  $dst,$src,$mem\t! add packed2D" %}
12226   ins_encode %{
12227     int vector_len = 0;
12228     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12229   %}
12230   ins_pipe( pipe_slow );
12231 %}
12232 
12233 instruct vadd4D_reg(vecY dst, vecY src1, vecY src2) %{
12234   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
12235   match(Set dst (AddVD src1 src2));
12236   format %{ "vaddpd  $dst,$src1,$src2\t! add packed4D" %}
12237   ins_encode %{
12238     int vector_len = 1;
12239     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12240   %}
12241   ins_pipe( pipe_slow );
12242 %}
12243 
12244 instruct vadd4D_mem(vecY dst, vecY src, memory mem) %{
12245   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
12246   match(Set dst (AddVD src (LoadVector mem)));
12247   format %{ "vaddpd  $dst,$src,$mem\t! add packed4D" %}
12248   ins_encode %{
12249     int vector_len = 1;
12250     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12251   %}
12252   ins_pipe( pipe_slow );
12253 %}
12254 
12255 instruct vadd8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
12256   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
12257   match(Set dst (AddVD src1 src2));
12258   format %{ "vaddpd  $dst,$src1,$src2\t! add packed8D" %}
12259   ins_encode %{
12260     int vector_len = 2;
12261     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12262   %}
12263   ins_pipe( pipe_slow );
12264 %}
12265 
12266 instruct vadd8D_mem(vecZ dst, vecZ src, memory mem) %{
12267   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
12268   match(Set dst (AddVD src (LoadVector mem)));
12269   format %{ "vaddpd  $dst,$src,$mem\t! add packed8D" %}
12270   ins_encode %{
12271     int vector_len = 2;
12272     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12273   %}
12274   ins_pipe( pipe_slow );
12275 %}
12276 
12277 // --------------------------------- SUB --------------------------------------
12278 
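// Byte and short subtracts come in three predicate-selected flavors:
//   *_avx          (supports_avxonly):    VEX encoding on AVX/AVX2-only CPUs
//   *_evex         (supports_avx512bw):   EVEX encoding; AVX512BW supplies
//                                         byte/word ops on the full register set
//   *_evex_special (supports_avx512nobw): AVX-512 without BW, where byte/word
//       ops keep their narrower legacy encoding; these rules match dst as the
//       first input ("Set dst (SubVB dst src2)") and mark the spare source
//       operand TEMP so the allocator still reserves a register for it.
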
12279 // Bytes vector sub
12280 instruct vsub4B(vecS dst, vecS src) %{
12281   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
12282   match(Set dst (SubVB dst src));
12283   format %{ "psubb   $dst,$src\t! sub packed4B" %}
12284   ins_encode %{
12285     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
12286   %}
12287   ins_pipe( pipe_slow );
12288 %}
12289 
12290 instruct vsub4B_reg_avx(vecS dst, vecS src1, vecS src2) %{
12291   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
12292   match(Set dst (SubVB src1 src2));
12293   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed4B" %}
12294   ins_encode %{
12295     int vector_len = 0;
12296     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12297   %}
12298   ins_pipe( pipe_slow );
12299 %}
12300 
12301 instruct vsub4B_reg_evex(vecS dst, vecS src1, vecS src2) %{
12302   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
12303   match(Set dst (SubVB src1 src2));
12304   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed4B" %}
12305   ins_encode %{
12306     int vector_len = 0;
12307     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12308   %}
12309   ins_pipe( pipe_slow );
12310 %}
12311 
instruct vsub4B_reg_evex_special(vecS dst, vecS src1, vecS src2) %{
12313   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
12314   match(Set dst (SubVB dst src2));
12315   effect(TEMP src1);
12316   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed4B" %}
12317   ins_encode %{
12318     int vector_len = 0;
12319     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12320   %}
12321   ins_pipe( pipe_slow );
12322 %}
12323 
12324 instruct vsub4B_mem_avx(vecS dst, vecS src, memory mem) %{
12325   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
12326   match(Set dst (SubVB src (LoadVector mem)));
12327   format %{ "vpsubb  $dst,$src,$mem\t! sub packed4B" %}
12328   ins_encode %{
12329     int vector_len = 0;
12330     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12331   %}
12332   ins_pipe( pipe_slow );
12333 %}
12334 
12335 instruct vsub4B_mem_evex(vecS dst, vecS src, memory mem) %{
12336   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
12337   match(Set dst (SubVB src (LoadVector mem)));
12338   format %{ "vpsubb  $dst,$src,$mem\t! sub packed4B" %}
12339   ins_encode %{
12340     int vector_len = 0;
12341     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12342   %}
12343   ins_pipe( pipe_slow );
12344 %}
12345 
12346 instruct vsub4B_mem_evex_special(vecS dst, vecS src, memory mem) %{
12347   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
12348   match(Set dst (SubVB dst (LoadVector mem)));
12349   effect(TEMP src);
12350   format %{ "vpsubb  $dst,$src,$mem\t! sub packed4B" %}
12351   ins_encode %{
12352     int vector_len = 0;
12353     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12354   %}
12355   ins_pipe( pipe_slow );
12356 %}
12357 
12358 instruct vsub8B(vecD dst, vecD src) %{
12359   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
12360   match(Set dst (SubVB dst src));
12361   format %{ "psubb   $dst,$src\t! sub packed8B" %}
12362   ins_encode %{
12363     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
12364   %}
12365   ins_pipe( pipe_slow );
12366 %}
12367 
12368 instruct vsub8B_reg_avx(vecD dst, vecD src1, vecD src2) %{
12369   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
12370   match(Set dst (SubVB src1 src2));
12371   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed8B" %}
12372   ins_encode %{
12373     int vector_len = 0;
12374     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12375   %}
12376   ins_pipe( pipe_slow );
12377 %}
12378 
12379 instruct vsub8B_reg_evex(vecD dst, vecD src1, vecD src2) %{
12380   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
12381   match(Set dst (SubVB src1 src2));
12382   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed8B" %}
12383   ins_encode %{
12384     int vector_len = 0;
12385     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12386   %}
12387   ins_pipe( pipe_slow );
12388 %}
12389 
12390 instruct vsub8B_reg_evex_special(vecD dst, vecD src1, vecD src2) %{
12391   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
12392   match(Set dst (SubVB dst src2));
12393   effect(TEMP src1);
12394   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed8B" %}
12395   ins_encode %{
12396     int vector_len = 0;
12397     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12398   %}
12399   ins_pipe( pipe_slow );
12400 %}
12401 
12402 instruct vsub8B_mem_avx(vecD dst, vecD src, memory mem) %{
12403   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
12404   match(Set dst (SubVB src (LoadVector mem)));
12405   format %{ "vpsubb  $dst,$src,$mem\t! sub packed8B" %}
12406   ins_encode %{
12407     int vector_len = 0;
12408     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12409   %}
12410   ins_pipe( pipe_slow );
12411 %}
12412 
12413 instruct vsub8B_mem_evex(vecD dst, vecD src, memory mem) %{
12414   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
12415   match(Set dst (SubVB src (LoadVector mem)));
12416   format %{ "vpsubb  $dst,$src,$mem\t! sub packed8B" %}
12417   ins_encode %{
12418     int vector_len = 0;
12419     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12420   %}
12421   ins_pipe( pipe_slow );
12422 %}
12423 
12424 instruct vsub8B_mem_evex_special(vecD dst, vecD src, memory mem) %{
12425   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
12426   match(Set dst (SubVB dst (LoadVector mem)));
12427   effect(TEMP src);
12428   format %{ "vpsubb  $dst,$src,$mem\t! sub packed8B" %}
12429   ins_encode %{
12430     int vector_len = 0;
12431     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12432   %}
12433   ins_pipe( pipe_slow );
12434 %}
12435 
12436 instruct vsub16B(vecX dst, vecX src) %{
12437   predicate(UseAVX == 0 && n->as_Vector()->length() == 16);
12438   match(Set dst (SubVB dst src));
12439   format %{ "psubb   $dst,$src\t! sub packed16B" %}
12440   ins_encode %{
12441     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
12442   %}
12443   ins_pipe( pipe_slow );
12444 %}
12445 
12446 instruct vsub16B_reg_avx(vecX dst, vecX src1, vecX src2) %{
12447   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 16);
12448   match(Set dst (SubVB src1 src2));
12449   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed16B" %}
12450   ins_encode %{
12451     int vector_len = 0;
12452     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12453   %}
12454   ins_pipe( pipe_slow );
12455 %}
12456 
12457 instruct vsub16B_reg_evex(vecX dst, vecX src1, vecX src2) %{
12458   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
12459   match(Set dst (SubVB src1 src2));
12460   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed16B" %}
12461   ins_encode %{
12462     int vector_len = 0;
12463     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12464   %}
12465   ins_pipe( pipe_slow );
12466 %}
12467 
12468 instruct vsub16B_reg_evex_special(vecX dst, vecX src1, vecX src2) %{
12469   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
12470   match(Set dst (SubVB dst src2));
12471   effect(TEMP src1);
12472   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed16B" %}
12473   ins_encode %{
12474     int vector_len = 0;
12475     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12476   %}
12477   ins_pipe( pipe_slow );
12478 %}
12479 
12480 instruct vsub16B_mem_avx(vecX dst, vecX src, memory mem) %{
12481   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 16);
12482   match(Set dst (SubVB src (LoadVector mem)));
12483   format %{ "vpsubb  $dst,$src,$mem\t! sub packed16B" %}
12484   ins_encode %{
12485     int vector_len = 0;
12486     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12487   %}
12488   ins_pipe( pipe_slow );
12489 %}
12490 
12491 instruct vsub16B_mem_evex(vecX dst, vecX src, memory mem) %{
12492   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
12493   match(Set dst (SubVB src (LoadVector mem)));
12494   format %{ "vpsubb  $dst,$src,$mem\t! sub packed16B" %}
12495   ins_encode %{
12496     int vector_len = 0;
12497     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12498   %}
12499   ins_pipe( pipe_slow );
12500 %}
12501 
12502 instruct vsub16B_mem_evex_special(vecX dst, vecX src, memory mem) %{
12503   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
12504   match(Set dst (SubVB dst (LoadVector mem)));
12505   effect(TEMP src);
12506   format %{ "vpsubb  $dst,$src,$mem\t! sub packed16B" %}
12507   ins_encode %{
12508     int vector_len = 0;
12509     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12510   %}
12511   ins_pipe( pipe_slow );
12512 %}
12513 
12514 instruct vsub32B_reg_avx(vecY dst, vecY src1, vecY src2) %{
12515   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 32);
12516   match(Set dst (SubVB src1 src2));
12517   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed32B" %}
12518   ins_encode %{
12519     int vector_len = 1;
12520     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12521   %}
12522   ins_pipe( pipe_slow );
12523 %}
12524 
12525 instruct vsub32B_reg_evex(vecY dst, vecY src1, vecY src2) %{
12526   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
12527   match(Set dst (SubVB src1 src2));
12528   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed32B" %}
12529   ins_encode %{
12530     int vector_len = 1;
12531     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12532   %}
12533   ins_pipe( pipe_slow );
12534 %}
12535 
12536 instruct vsub32B_reg_evex_special(vecY dst, vecY src1, vecY src2) %{
12537   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 32);
12538   match(Set dst (SubVB dst src2));
12539   effect(TEMP src1);
12540   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed32B" %}
12541   ins_encode %{
12542     int vector_len = 1;
12543     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12544   %}
12545   ins_pipe( pipe_slow );
12546 %}
12547 
12548 instruct vsub32B_mem_avx(vecY dst, vecY src, memory mem) %{
12549   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 32);
12550   match(Set dst (SubVB src (LoadVector mem)));
12551   format %{ "vpsubb  $dst,$src,$mem\t! sub packed32B" %}
12552   ins_encode %{
12553     int vector_len = 1;
12554     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12555   %}
12556   ins_pipe( pipe_slow );
12557 %}
12558 
12559 instruct vsub32B_mem_evex(vecY dst, vecY src, memory mem) %{
12560   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
12561   match(Set dst (SubVB src (LoadVector mem)));
12562   format %{ "vpsubb  $dst,$src,$mem\t! sub packed32B" %}
12563   ins_encode %{
12564     int vector_len = 1;
12565     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12566   %}
12567   ins_pipe( pipe_slow );
12568 %}
12569 
12570 instruct vsub32B_mem_evex_special(vecY dst, vecY src, memory mem) %{
12571   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 32);
12572   match(Set dst (SubVB dst (LoadVector mem)));
12573   effect(TEMP src);
12574   format %{ "vpsubb  $dst,$src,$mem\t! sub packed32B" %}
12575   ins_encode %{
12576     int vector_len = 1;
12577     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12578   %}
12579   ins_pipe( pipe_slow );
12580 %}
12581 
12582 instruct vsub64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
12583   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
12584   match(Set dst (SubVB src1 src2));
12585   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed64B" %}
12586   ins_encode %{
12587     int vector_len = 2;
12588     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12589   %}
12590   ins_pipe( pipe_slow );
12591 %}
12592 
12593 instruct vsub64B_mem(vecZ dst, vecZ src, memory mem) %{
12594   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
12595   match(Set dst (SubVB src (LoadVector mem)));
12596   format %{ "vpsubb  $dst,$src,$mem\t! sub packed64B" %}
12597   ins_encode %{
12598     int vector_len = 2;
12599     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12600   %}
12601   ins_pipe( pipe_slow );
12602 %}
12603 
12604 // Shorts/Chars vector sub
12605 instruct vsub2S(vecS dst, vecS src) %{
12606   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
12607   match(Set dst (SubVS dst src));
12608   format %{ "psubw   $dst,$src\t! sub packed2S" %}
12609   ins_encode %{
12610     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
12611   %}
12612   ins_pipe( pipe_slow );
12613 %}
12614 
12615 instruct vsub2S_reg_avx(vecS dst, vecS src1, vecS src2) %{
12616   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
12617   match(Set dst (SubVS src1 src2));
12618   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed2S" %}
12619   ins_encode %{
12620     int vector_len = 0;
12621     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12622   %}
12623   ins_pipe( pipe_slow );
12624 %}
12625 
12626 instruct vsub2S_reg_evex(vecS dst, vecS src1, vecS src2) %{
12627   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
12628   match(Set dst (SubVS src1 src2));
12629   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed2S" %}
12630   ins_encode %{
12631     int vector_len = 0;
12632     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12633   %}
12634   ins_pipe( pipe_slow );
12635 %}
12636 
12637 instruct vsub2S_reg_evex_special(vecS dst, vecS src1, vecS src2) %{
12638   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
12639   match(Set dst (SubVS dst src2));
12640   effect(TEMP src1);
12641   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed2S" %}
12642   ins_encode %{
12643     int vector_len = 0;
12644     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12645   %}
12646   ins_pipe( pipe_slow );
12647 %}
12648 
12649 instruct vsub2S_mem_avx(vecS dst, vecS src, memory mem) %{
12650   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
12651   match(Set dst (SubVS src (LoadVector mem)));
12652   format %{ "vpsubw  $dst,$src,$mem\t! sub packed2S" %}
12653   ins_encode %{
12654     int vector_len = 0;
12655     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12656   %}
12657   ins_pipe( pipe_slow );
12658 %}
12659 
12660 instruct vsub2S_mem_evex(vecS dst, vecS src, memory mem) %{
12661   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
12662   match(Set dst (SubVS src (LoadVector mem)));
12663   format %{ "vpsubw  $dst,$src,$mem\t! sub packed2S" %}
12664   ins_encode %{
12665     int vector_len = 0;
12666     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12667   %}
12668   ins_pipe( pipe_slow );
12669 %}
12670 
12671 instruct vsub2S_mem_evex_special(vecS dst, vecS src, memory mem) %{
12672   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
12673   match(Set dst (SubVS dst (LoadVector mem)));
12674   effect(TEMP src);
12675   format %{ "vpsubw  $dst,$src,$mem\t! sub packed2S" %}
12676   ins_encode %{
12677     int vector_len = 0;
12678     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12679   %}
12680   ins_pipe( pipe_slow );
12681 %}
12682 
12683 instruct vsub4S(vecD dst, vecD src) %{
12684   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
12685   match(Set dst (SubVS dst src));
12686   format %{ "psubw   $dst,$src\t! sub packed4S" %}
12687   ins_encode %{
12688     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
12689   %}
12690   ins_pipe( pipe_slow );
12691 %}
12692 
12693 instruct vsub4S_reg_avx(vecD dst, vecD src1, vecD src2) %{
12694   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
12695   match(Set dst (SubVS src1 src2));
12696   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed4S" %}
12697   ins_encode %{
12698     int vector_len = 0;
12699     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12700   %}
12701   ins_pipe( pipe_slow );
12702 %}
12703 
12704 instruct vsub4S_reg_evex(vecD dst, vecD src1, vecD src2) %{
12705   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
12706   match(Set dst (SubVS src1 src2));
12707   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed4S" %}
12708   ins_encode %{
12709     int vector_len = 0;
12710     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12711   %}
12712   ins_pipe( pipe_slow );
12713 %}
12714 
12715 instruct vsub4S_reg_evex_special(vecD dst, vecD src1, vecD src2) %{
12716   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
12717   match(Set dst (SubVS dst src2));
12718   effect(TEMP src1);
12719   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed4S" %}
12720   ins_encode %{
12721     int vector_len = 0;
12722     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12723   %}
12724   ins_pipe( pipe_slow );
12725 %}
12726 
12727 instruct vsub4S_mem_avx(vecD dst, vecD src, memory mem) %{
12728   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
12729   match(Set dst (SubVS src (LoadVector mem)));
12730   format %{ "vpsubw  $dst,$src,$mem\t! sub packed4S" %}
12731   ins_encode %{
12732     int vector_len = 0;
12733     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12734   %}
12735   ins_pipe( pipe_slow );
12736 %}
12737 
12738 instruct vsub4S_mem_evex(vecD dst, vecD src, memory mem) %{
12739   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
12740   match(Set dst (SubVS src (LoadVector mem)));
12741   format %{ "vpsubw  $dst,$src,$mem\t! sub packed4S" %}
12742   ins_encode %{
12743     int vector_len = 0;
12744     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12745   %}
12746   ins_pipe( pipe_slow );
12747 %}
12748 
12749 instruct vsub4S_mem_evex_special(vecD dst, vecD src, memory mem) %{
12750   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
12751   match(Set dst (SubVS dst (LoadVector mem)));
12752   effect(TEMP src);
12753   format %{ "vpsubw  $dst,$src,$mem\t! sub packed4S" %}
12754   ins_encode %{
12755     int vector_len = 0;
12756     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12757   %}
12758   ins_pipe( pipe_slow );
12759 %}
12760 
12761 instruct vsub8S(vecX dst, vecX src) %{
12762   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
12763   match(Set dst (SubVS dst src));
12764   format %{ "psubw   $dst,$src\t! sub packed8S" %}
12765   ins_encode %{
12766     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
12767   %}
12768   ins_pipe( pipe_slow );
12769 %}
12770 
12771 instruct vsub8S_reg_avx(vecX dst, vecX src1, vecX src2) %{
12772   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
12773   match(Set dst (SubVS src1 src2));
12774   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed8S" %}
12775   ins_encode %{
12776     int vector_len = 0;
12777     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12778   %}
12779   ins_pipe( pipe_slow );
12780 %}
12781 
12782 instruct vsub8S_reg_evex(vecX dst, vecX src1, vecX src2) %{
12783   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
12784   match(Set dst (SubVS src1 src2));
12785   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed8S" %}
12786   ins_encode %{
12787     int vector_len = 0;
12788     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12789   %}
12790   ins_pipe( pipe_slow );
12791 %}
12792 
12793 instruct vsub8S_reg_evex_special(vecX dst, vecX src1, vecX src2) %{
12794   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
12795   match(Set dst (SubVS dst src2));
12796   effect(TEMP src1);
12797   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed8S" %}
12798   ins_encode %{
12799     int vector_len = 0;
12800     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12801   %}
12802   ins_pipe( pipe_slow );
12803 %}
12804 
12805 instruct vsub8S_mem_avx(vecX dst, vecX src, memory mem) %{
12806   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
12807   match(Set dst (SubVS src (LoadVector mem)));
12808   format %{ "vpsubw  $dst,$src,$mem\t! sub packed8S" %}
12809   ins_encode %{
12810     int vector_len = 0;
12811     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12812   %}
12813   ins_pipe( pipe_slow );
12814 %}
12815 
12816 instruct vsub8S_mem_evex(vecX dst, vecX src, memory mem) %{
12817   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
12818   match(Set dst (SubVS src (LoadVector mem)));
12819   format %{ "vpsubw  $dst,$src,$mem\t! sub packed8S" %}
12820   ins_encode %{
12821     int vector_len = 0;
12822     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12823   %}
12824   ins_pipe( pipe_slow );
12825 %}
12826 
12827 instruct vsub8S_mem_evex_special(vecX dst, vecX src, memory mem) %{
12828   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
12829   match(Set dst (SubVS dst (LoadVector mem)));
12830   effect(TEMP src);
12831   format %{ "vpsubw  $dst,$src,$mem\t! sub packed8S" %}
12832   ins_encode %{
12833     int vector_len = 0;
12834     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12835   %}
12836   ins_pipe( pipe_slow );
12837 %}
12838 
12839 instruct vsub16S_reg_avx(vecY dst, vecY src1, vecY src2) %{
12840   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
12841   match(Set dst (SubVS src1 src2));
12842   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed16S" %}
12843   ins_encode %{
12844     int vector_len = 1;
12845     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12846   %}
12847   ins_pipe( pipe_slow );
12848 %}
12849 
12850 instruct vsub16S_reg_evex(vecY dst, vecY src1, vecY src2) %{
12851   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
12852   match(Set dst (SubVS src1 src2));
12853   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed16S" %}
12854   ins_encode %{
12855     int vector_len = 1;
12856     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12857   %}
12858   ins_pipe( pipe_slow );
12859 %}
12860 
12861 instruct vsub16S_reg_evex_special(vecY dst, vecY src1, vecY src2) %{
12862   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
12863   match(Set dst (SubVS dst src2));
12864   effect(TEMP src1);
12865   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed16S" %}
12866   ins_encode %{
12867     int vector_len = 1;
12868     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12869   %}
12870   ins_pipe( pipe_slow );
12871 %}
12872 
12873 instruct vsub16S_mem_avx(vecY dst, vecY src, memory mem) %{
12874   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
12875   match(Set dst (SubVS src (LoadVector mem)));
12876   format %{ "vpsubw  $dst,$src,$mem\t! sub packed16S" %}
12877   ins_encode %{
12878     int vector_len = 1;
12879     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12880   %}
12881   ins_pipe( pipe_slow );
12882 %}
12883 
12884 instruct vsub16S_mem_evex(vecY dst, vecY src, memory mem) %{
12885   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
12886   match(Set dst (SubVS src (LoadVector mem)));
12887   format %{ "vpsubw  $dst,$src,$mem\t! sub packed16S" %}
12888   ins_encode %{
12889     int vector_len = 1;
12890     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12891   %}
12892   ins_pipe( pipe_slow );
12893 %}
12894 
12895 instruct vsub16S_mem_evex_special(vecY dst, vecY src, memory mem) %{
12896   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
12897   match(Set dst (SubVS dst (LoadVector mem)));
  effect(TEMP src);
12899   format %{ "vpsubw  $dst,$src,$mem\t! sub packed16S" %}
12900   ins_encode %{
12901     int vector_len = 1;
12902     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12903   %}
12904   ins_pipe( pipe_slow );
12905 %}
12906 
12907 instruct vsub32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
12908   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
12909   match(Set dst (SubVS src1 src2));
12910   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed32S" %}
12911   ins_encode %{
12912     int vector_len = 2;
12913     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12914   %}
12915   ins_pipe( pipe_slow );
12916 %}
12917 
12918 instruct vsub32S_mem(vecZ dst, vecZ src, memory mem) %{
12919   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
12920   match(Set dst (SubVS src (LoadVector mem)));
12921   format %{ "vpsubw  $dst,$src,$mem\t! sub packed32S" %}
12922   ins_encode %{
12923     int vector_len = 2;
12924     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12925   %}
12926   ins_pipe( pipe_slow );
12927 %}
12928 
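// The int/long/float/double subtracts below need no BW split: 32- and
// 64-bit element operations are part of base AVX-512F, so plain UseAVX
// predicates suffice.
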
12929 // Integers vector sub
12930 instruct vsub2I(vecD dst, vecD src) %{
12931   predicate(n->as_Vector()->length() == 2);
12932   match(Set dst (SubVI dst src));
12933   format %{ "psubd   $dst,$src\t! sub packed2I" %}
12934   ins_encode %{
12935     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
12936   %}
12937   ins_pipe( pipe_slow );
12938 %}
12939 
12940 instruct vsub2I_reg(vecD dst, vecD src1, vecD src2) %{
12941   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
12942   match(Set dst (SubVI src1 src2));
12943   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed2I" %}
12944   ins_encode %{
12945     int vector_len = 0;
12946     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12947   %}
12948   ins_pipe( pipe_slow );
12949 %}
12950 
12951 instruct vsub2I_mem(vecD dst, vecD src, memory mem) %{
12952   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
12953   match(Set dst (SubVI src (LoadVector mem)));
12954   format %{ "vpsubd  $dst,$src,$mem\t! sub packed2I" %}
12955   ins_encode %{
12956     int vector_len = 0;
12957     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12958   %}
12959   ins_pipe( pipe_slow );
12960 %}
12961 
12962 instruct vsub4I(vecX dst, vecX src) %{
12963   predicate(n->as_Vector()->length() == 4);
12964   match(Set dst (SubVI dst src));
12965   format %{ "psubd   $dst,$src\t! sub packed4I" %}
12966   ins_encode %{
12967     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
12968   %}
12969   ins_pipe( pipe_slow );
12970 %}
12971 
12972 instruct vsub4I_reg(vecX dst, vecX src1, vecX src2) %{
12973   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
12974   match(Set dst (SubVI src1 src2));
12975   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed4I" %}
12976   ins_encode %{
12977     int vector_len = 0;
12978     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12979   %}
12980   ins_pipe( pipe_slow );
12981 %}
12982 
12983 instruct vsub4I_mem(vecX dst, vecX src, memory mem) %{
12984   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
12985   match(Set dst (SubVI src (LoadVector mem)));
12986   format %{ "vpsubd  $dst,$src,$mem\t! sub packed4I" %}
12987   ins_encode %{
12988     int vector_len = 0;
12989     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12990   %}
12991   ins_pipe( pipe_slow );
12992 %}
12993 
12994 instruct vsub8I_reg(vecY dst, vecY src1, vecY src2) %{
12995   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
12996   match(Set dst (SubVI src1 src2));
12997   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed8I" %}
12998   ins_encode %{
12999     int vector_len = 1;
13000     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13001   %}
13002   ins_pipe( pipe_slow );
13003 %}
13004 
13005 instruct vsub8I_mem(vecY dst, vecY src, memory mem) %{
13006   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
13007   match(Set dst (SubVI src (LoadVector mem)));
13008   format %{ "vpsubd  $dst,$src,$mem\t! sub packed8I" %}
13009   ins_encode %{
13010     int vector_len = 1;
13011     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13012   %}
13013   ins_pipe( pipe_slow );
13014 %}
13015 
13016 instruct vsub16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
13017   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
13018   match(Set dst (SubVI src1 src2));
13019   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed16I" %}
13020   ins_encode %{
13021     int vector_len = 2;
13022     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13023   %}
13024   ins_pipe( pipe_slow );
13025 %}
13026 
13027 instruct vsub16I_mem(vecZ dst, vecZ src, memory mem) %{
13028   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
13029   match(Set dst (SubVI src (LoadVector mem)));
13030   format %{ "vpsubd  $dst,$src,$mem\t! sub packed16I" %}
13031   ins_encode %{
13032     int vector_len = 2;
13033     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13034   %}
13035   ins_pipe( pipe_slow );
13036 %}
13037 
13038 // Longs vector sub
13039 instruct vsub2L(vecX dst, vecX src) %{
13040   predicate(n->as_Vector()->length() == 2);
13041   match(Set dst (SubVL dst src));
13042   format %{ "psubq   $dst,$src\t! sub packed2L" %}
13043   ins_encode %{
13044     __ psubq($dst$$XMMRegister, $src$$XMMRegister);
13045   %}
13046   ins_pipe( pipe_slow );
13047 %}
13048 
13049 instruct vsub2L_reg(vecX dst, vecX src1, vecX src2) %{
13050   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
13051   match(Set dst (SubVL src1 src2));
13052   format %{ "vpsubq  $dst,$src1,$src2\t! sub packed2L" %}
13053   ins_encode %{
13054     int vector_len = 0;
13055     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13056   %}
13057   ins_pipe( pipe_slow );
13058 %}
13059 
13060 instruct vsub2L_mem(vecX dst, vecX src, memory mem) %{
13061   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
13062   match(Set dst (SubVL src (LoadVector mem)));
13063   format %{ "vpsubq  $dst,$src,$mem\t! sub packed2L" %}
13064   ins_encode %{
13065     int vector_len = 0;
13066     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13067   %}
13068   ins_pipe( pipe_slow );
13069 %}
13070 
13071 instruct vsub4L_reg(vecY dst, vecY src1, vecY src2) %{
13072   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
13073   match(Set dst (SubVL src1 src2));
13074   format %{ "vpsubq  $dst,$src1,$src2\t! sub packed4L" %}
13075   ins_encode %{
13076     int vector_len = 1;
13077     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13078   %}
13079   ins_pipe( pipe_slow );
13080 %}
13081 
13082 instruct vsub4L_mem(vecY dst, vecY src, memory mem) %{
13083   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
13084   match(Set dst (SubVL src (LoadVector mem)));
13085   format %{ "vpsubq  $dst,$src,$mem\t! sub packed4L" %}
13086   ins_encode %{
13087     int vector_len = 1;
13088     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13089   %}
13090   ins_pipe( pipe_slow );
13091 %}
13092 
13093 instruct vsub8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
13094   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
13095   match(Set dst (SubVL src1 src2));
13096   format %{ "vpsubq  $dst,$src1,$src2\t! sub packed8L" %}
13097   ins_encode %{
13098     int vector_len = 2;
13099     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13100   %}
13101   ins_pipe( pipe_slow );
13102 %}
13103 
13104 instruct vsub8L_mem(vecZ dst, vecZ src, memory mem) %{
13105   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
13106   match(Set dst (SubVL src (LoadVector mem)));
13107   format %{ "vpsubq  $dst,$src,$mem\t! sub packed8L" %}
13108   ins_encode %{
13109     int vector_len = 2;
13110     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13111   %}
13112   ins_pipe( pipe_slow );
13113 %}
13114 
13115 // Floats vector sub
13116 instruct vsub2F(vecD dst, vecD src) %{
13117   predicate(n->as_Vector()->length() == 2);
13118   match(Set dst (SubVF dst src));
13119   format %{ "subps   $dst,$src\t! sub packed2F" %}
13120   ins_encode %{
13121     __ subps($dst$$XMMRegister, $src$$XMMRegister);
13122   %}
13123   ins_pipe( pipe_slow );
13124 %}
13125 
13126 instruct vsub2F_reg(vecD dst, vecD src1, vecD src2) %{
13127   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
13128   match(Set dst (SubVF src1 src2));
13129   format %{ "vsubps  $dst,$src1,$src2\t! sub packed2F" %}
13130   ins_encode %{
13131     int vector_len = 0;
13132     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13133   %}
13134   ins_pipe( pipe_slow );
13135 %}
13136 
13137 instruct vsub2F_mem(vecD dst, vecD src, memory mem) %{
13138   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
13139   match(Set dst (SubVF src (LoadVector mem)));
13140   format %{ "vsubps  $dst,$src,$mem\t! sub packed2F" %}
13141   ins_encode %{
13142     int vector_len = 0;
13143     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13144   %}
13145   ins_pipe( pipe_slow );
13146 %}
13147 
13148 instruct vsub4F(vecX dst, vecX src) %{
13149   predicate(n->as_Vector()->length() == 4);
13150   match(Set dst (SubVF dst src));
13151   format %{ "subps   $dst,$src\t! sub packed4F" %}
13152   ins_encode %{
13153     __ subps($dst$$XMMRegister, $src$$XMMRegister);
13154   %}
13155   ins_pipe( pipe_slow );
13156 %}
13157 
13158 instruct vsub4F_reg(vecX dst, vecX src1, vecX src2) %{
13159   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
13160   match(Set dst (SubVF src1 src2));
13161   format %{ "vsubps  $dst,$src1,$src2\t! sub packed4F" %}
13162   ins_encode %{
13163     int vector_len = 0;
13164     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13165   %}
13166   ins_pipe( pipe_slow );
13167 %}
13168 
13169 instruct vsub4F_mem(vecX dst, vecX src, memory mem) %{
13170   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
13171   match(Set dst (SubVF src (LoadVector mem)));
13172   format %{ "vsubps  $dst,$src,$mem\t! sub packed4F" %}
13173   ins_encode %{
13174     int vector_len = 0;
13175     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13176   %}
13177   ins_pipe( pipe_slow );
13178 %}
13179 
13180 instruct vsub8F_reg(vecY dst, vecY src1, vecY src2) %{
13181   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
13182   match(Set dst (SubVF src1 src2));
13183   format %{ "vsubps  $dst,$src1,$src2\t! sub packed8F" %}
13184   ins_encode %{
13185     int vector_len = 1;
13186     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13187   %}
13188   ins_pipe( pipe_slow );
13189 %}
13190 
13191 instruct vsub8F_mem(vecY dst, vecY src, memory mem) %{
13192   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
13193   match(Set dst (SubVF src (LoadVector mem)));
13194   format %{ "vsubps  $dst,$src,$mem\t! sub packed8F" %}
13195   ins_encode %{
13196     int vector_len = 1;
13197     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13198   %}
13199   ins_pipe( pipe_slow );
13200 %}
13201 
13202 instruct vsub16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
13203   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
13204   match(Set dst (SubVF src1 src2));
13205   format %{ "vsubps  $dst,$src1,$src2\t! sub packed16F" %}
13206   ins_encode %{
13207     int vector_len = 2;
13208     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13209   %}
13210   ins_pipe( pipe_slow );
13211 %}
13212 
13213 instruct vsub16F_mem(vecZ dst, vecZ src, memory mem) %{
13214   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
13215   match(Set dst (SubVF src (LoadVector mem)));
13216   format %{ "vsubps  $dst,$src,$mem\t! sub packed16F" %}
13217   ins_encode %{
13218     int vector_len = 2;
13219     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13220   %}
13221   ins_pipe( pipe_slow );
13222 %}
13223 
13224 // Doubles vector sub
13225 instruct vsub2D(vecX dst, vecX src) %{
13226   predicate(n->as_Vector()->length() == 2);
13227   match(Set dst (SubVD dst src));
13228   format %{ "subpd   $dst,$src\t! sub packed2D" %}
13229   ins_encode %{
13230     __ subpd($dst$$XMMRegister, $src$$XMMRegister);
13231   %}
13232   ins_pipe( pipe_slow );
13233 %}
13234 
13235 instruct vsub2D_reg(vecX dst, vecX src1, vecX src2) %{
13236   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
13237   match(Set dst (SubVD src1 src2));
13238   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed2D" %}
13239   ins_encode %{
13240     int vector_len = 0;
13241     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13242   %}
13243   ins_pipe( pipe_slow );
13244 %}
13245 
13246 instruct vsub2D_mem(vecX dst, vecX src, memory mem) %{
13247   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
13248   match(Set dst (SubVD src (LoadVector mem)));
13249   format %{ "vsubpd  $dst,$src,$mem\t! sub packed2D" %}
13250   ins_encode %{
13251     int vector_len = 0;
13252     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13253   %}
13254   ins_pipe( pipe_slow );
13255 %}
13256 
13257 instruct vsub4D_reg(vecY dst, vecY src1, vecY src2) %{
13258   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
13259   match(Set dst (SubVD src1 src2));
13260   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed4D" %}
13261   ins_encode %{
13262     int vector_len = 1;
13263     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13264   %}
13265   ins_pipe( pipe_slow );
13266 %}
13267 
13268 instruct vsub4D_mem(vecY dst, vecY src, memory mem) %{
13269   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
13270   match(Set dst (SubVD src (LoadVector mem)));
13271   format %{ "vsubpd  $dst,$src,$mem\t! sub packed4D" %}
13272   ins_encode %{
13273     int vector_len = 1;
13274     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13275   %}
13276   ins_pipe( pipe_slow );
13277 %}
13278 
13279 instruct vsub8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
13280   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
13281   match(Set dst (SubVD src1 src2));
13282   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed8D" %}
13283   ins_encode %{
13284     int vector_len = 2;
13285     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13286   %}
13287   ins_pipe( pipe_slow );
13288 %}
13289 
13290 instruct vsub8D_mem(vecZ dst, vecZ src, memory mem) %{
13291   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
13292   match(Set dst (SubVD src (LoadVector mem)));
13293   format %{ "vsubpd  $dst,$src,$mem\t! sub packed8D" %}
13294   ins_encode %{
13295     int vector_len = 2;
13296     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13297   %}
13298   ins_pipe( pipe_slow );
13299 %}
13300 
13301 // --------------------------------- MUL --------------------------------------
13302 
13303 // Byte vector mul
13304 
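// x86 has no byte multiply (no "pmullb"), so MulVB is emulated: sign-extend
// each byte lane to a word (pmovsxbw), multiply words (pmullw), mask the
// products back to their low 8 bits (pand with vector_short_to_byte_mask,
// i.e. 0x00ff repeated), and narrow with packuswb. Masking first keeps every
// word in [0,255], so the unsigned-saturating pack degenerates into plain
// truncation -- exactly Java's byte multiply semantics. Per lane this is,
// as a scalar sketch (illustrative C, not JVM code):
//
//   int16_t wide   = (int16_t)(int8_t)a * (int16_t)(int8_t)b;  // pmovsxbw + pmullw
//   uint8_t result = (uint8_t)(wide & 0xff);                   // pand + packuswb
//
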
13305 instruct mul4B_reg(vecS dst, vecS src1, vecS src2, vecS tmp2, vecS tmp) %{
13306   predicate(UseSSE > 3 && n->as_Vector()->length() == 4);
13307   match(Set dst (MulVB src1 src2));
13308   effect(TEMP dst, TEMP tmp2, TEMP tmp);
13309   format %{"pmovsxbw  $tmp,$src1\n\t"
13310            "pmovsxbw  $tmp2,$src2\n\t"
13311            "pmullw    $tmp,$tmp2\n\t"
13312            "movdqu    $tmp2,[0x00ff00ff0x00ff00ff]\n\t"
13313            "pand      $tmp,$tmp2\n\t"
13314            "packuswb  $tmp,$tmp\n\t"
13315            "movss     $dst,$tmp\t! mul packed4B" %}
13316   ins_encode %{
13317     __ pmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister);
13318     __ pmovsxbw($tmp2$$XMMRegister, $src2$$XMMRegister);
13319     __ pmullw($tmp$$XMMRegister, $tmp2$$XMMRegister);
13320     __ movdqu($tmp2$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()));
13321     __ pand($tmp$$XMMRegister, $tmp2$$XMMRegister);
13322     __ packuswb($tmp$$XMMRegister, $tmp$$XMMRegister);
13323     __ movss($dst$$XMMRegister, $tmp$$XMMRegister);
13324   %}
13325   ins_pipe( pipe_slow );
13326 %}
13327 
13328 instruct mul8B_reg(vecD dst, vecD src1, vecD src2, vecD tmp2, vecD tmp) %{
13329   predicate(UseSSE > 3 && n->as_Vector()->length() == 8);
13330   match(Set dst (MulVB src1 src2));
13331   effect(TEMP dst, TEMP tmp2, TEMP tmp);
13332   format %{"pmovsxbw  $tmp,$src1\n\t"
13333            "pmovsxbw  $tmp2,$src2\n\t"
13334            "pmullw    $tmp,$tmp2\n\t"
13335            "movdqu    $tmp2,[0x00ff00ff0x00ff00ff]\n\t"
13336            "pand      $tmp,$tmp2\n\t"
13337            "packuswb  $tmp,$tmp\n\t"
13338            "movsd     $dst,$tmp\t! mul packed8B" %}
13339   ins_encode %{
13340     __ pmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister);
13341     __ pmovsxbw($tmp2$$XMMRegister, $src2$$XMMRegister);
13342     __ pmullw($tmp$$XMMRegister, $tmp2$$XMMRegister);
13343     __ movdqu($tmp2$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()));
13344     __ pand($tmp$$XMMRegister, $tmp2$$XMMRegister);
13345     __ packuswb($tmp$$XMMRegister, $tmp$$XMMRegister);
13346     __ movsd($dst$$XMMRegister, $tmp$$XMMRegister);
13347   %}
13348   ins_pipe( pipe_slow );
13349 %}
13350 
13351 instruct mul16B_reg(vecX dst, vecX src1, vecX src2, vecX tmp3, vecX tmp2, vecX tmp) %{
13352   predicate(UseSSE > 3 && n->as_Vector()->length() == 16);
13353   match(Set dst (MulVB src1 src2));
13354   effect(TEMP tmp3, TEMP tmp2, TEMP tmp);
13355   format %{"pmovsxbw  $tmp,$src1\n\t"
13356            "pmovsxbw  $tmp2,$src2\n\t"
13357            "pmullw    $tmp,$tmp2\n\t"
13358            "pshufd    $tmp2,$src1\n\t"
13359            "pshufd    $tmp3,$src2\n\t"
13360            "pmovsxbw  $tmp2,$tmp2\n\t"
13361            "pmovsxbw  $tmp3,$tmp3\n\t"
13362            "pmullw    $tmp2,$tmp3\n\t"
13363            "movdqu    $tmp3,[0x00ff00ff0x00ff00ff]\n\t"
13364            "pand      $tmp,$tmp3\n\t"
13365            "pand      $tmp2,$tmp3\n\t"
13366            "packuswb  $tmp,$tmp2\n\t"
13367            "movdqu    $dst,$tmp \n\t! mul packed16B" %}
13368   ins_encode %{
13369     __ pmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister);
13370     __ pmovsxbw($tmp2$$XMMRegister, $src2$$XMMRegister);
13371     __ pmullw($tmp$$XMMRegister, $tmp2$$XMMRegister);
13372     __ pshufd($tmp2$$XMMRegister, $src1$$XMMRegister, 238);
13373     __ pshufd($tmp3$$XMMRegister, $src2$$XMMRegister, 238);
13374     __ pmovsxbw($tmp2$$XMMRegister, $tmp2$$XMMRegister);
13375     __ pmovsxbw($tmp3$$XMMRegister, $tmp3$$XMMRegister);
13376     __ pmullw($tmp2$$XMMRegister, $tmp3$$XMMRegister);
13377     __ movdqu($tmp3$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()));
13378     __ pand($tmp$$XMMRegister, $tmp3$$XMMRegister);
13379     __ pand($tmp2$$XMMRegister, $tmp3$$XMMRegister);
13380     __ packuswb($tmp$$XMMRegister, $tmp2$$XMMRegister);
13381     __ movdqu($dst$$XMMRegister, $tmp$$XMMRegister);
13382   %}
13383   ins_pipe( pipe_slow );
13384 %}
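// With AVX2 the 16-byte multiply fits one 256-bit pass: both sources are
// widened into a single YMM of words, multiplied and masked there, and the
// result is narrowed by packing the high 128-bit half against the low half
// (vextracti128_high + a 128-bit vpackuswb, hence the literal 0 length).
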
13385 
13386 instruct vmul16B_reg_avx(vecX dst, vecX src1, vecX src2, vecY tmp2, vecY tmp) %{
13387   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
13388   match(Set dst (MulVB src1 src2));
13389   effect(TEMP dst, TEMP tmp2, TEMP tmp);
13390   format %{"vpmovsxbw  $tmp,$src1\n\t"
13391            "vpmovsxbw  $tmp2,$src2\n\t"
13392            "vpmullw    $tmp,$tmp2\n\t"
13393            "vmovdqu    $tmp2,[0x00ff00ff0x00ff00ff]\n\t"
13394            "vpand      $tmp,$tmp2\n\t"
13395            "vextracti128_high  $tmp2,$tmp\n\t"
13396            "vpackuswb  $dst,$tmp, $tmp2\n\t! mul packed16B" %}
13397   ins_encode %{
    int vector_len = 1;
13399     __ vpmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister, vector_len);
13400     __ vpmovsxbw($tmp2$$XMMRegister, $src2$$XMMRegister, vector_len);
13401     __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
13402     __ vmovdqu($tmp2$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()));
13403     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
13404     __ vextracti128_high($tmp2$$XMMRegister, $tmp$$XMMRegister);
13405     __ vpackuswb($dst$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
13406   %}
13407   ins_pipe( pipe_slow );
13408 %}
13409 
13410 instruct vmul32B_reg_avx(vecY dst, vecY src1, vecY src2, vecY tmp1, vecY tmp2, vecY tmp3) %{
13411   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
13412   match(Set dst (MulVB src1 src2));
13413   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3);
13414   format %{"vextracti128_high  $tmp1,$src1\n\t"
13415            "vextracti128_high  $tmp3,$src2\n\t"
13416            "vpmovsxbw $tmp1,$tmp1\n\t"
13417            "vpmovsxbw $tmp3,$tmp3\n\t"
13418            "vpmullw $tmp1,$tmp1,$tmp3\n\t"
13419            "vpmovsxbw $tmp2,$src1\n\t"
13420            "vpmovsxbw $tmp3,$src2\n\t"
13421            "vpmullw $tmp2,$tmp2,$tmp3\n\t"
           "vmovdqu $tmp3, [0x00ff00ff,0x00ff00ff]\n\t"
13423            "vpbroadcastd $tmp3, $tmp3\n\t"
13424            "vpand $tmp2,$tmp2,$tmp3\n\t"
13425            "vpand $tmp1,$tmp1,$tmp3\n\t"
13426            "vpackuswb $dst,$tmp2,$tmp1\n\t"
13427            "vpermq $dst, $dst, 0xD8\t! mul packed32B" %}
13428   ins_encode %{
13429     int vector_len = 1;
13430     __ vextracti128_high($tmp1$$XMMRegister, $src1$$XMMRegister);
13431     __ vextracti128_high($tmp3$$XMMRegister, $src2$$XMMRegister);
13432     __ vpmovsxbw($tmp1$$XMMRegister, $tmp1$$XMMRegister, vector_len);
13433     __ vpmovsxbw($tmp3$$XMMRegister, $tmp3$$XMMRegister, vector_len);
13434     __ vpmullw($tmp1$$XMMRegister, $tmp1$$XMMRegister, $tmp3$$XMMRegister, vector_len);
13435     __ vpmovsxbw($tmp2$$XMMRegister, $src1$$XMMRegister, vector_len);
13436     __ vpmovsxbw($tmp3$$XMMRegister, $src2$$XMMRegister, vector_len);
13437     __ vpmullw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, vector_len);
13438     __ vmovdqu($tmp3$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()));
13439     __ vpbroadcastd($tmp3$$XMMRegister, $tmp3$$XMMRegister);
13440     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $tmp3$$XMMRegister, vector_len);
13441     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, vector_len);
13442     __ vpackuswb($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp1$$XMMRegister, vector_len);
13443     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vector_len);
13444   %}
13445   ins_pipe( pipe_slow );
13446 %}
13447 
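// The 512-bit byte multiply widens each 256-bit half separately.  Since
// vpackuswb packs within 128-bit lanes, the packed result comes out
// lane-interleaved; the trailing vpermq/vpblendd/vinserti64x4 sequence
// restores the original element order before writing $dst.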
instruct vmul64B_reg_avx(vecZ dst, vecZ src1, vecZ src2, vecZ tmp1, vecZ tmp2, vecZ tmp3, vecZ tmp4) %{
13449   predicate(UseAVX > 2 && n->as_Vector()->length() == 64);
13450   match(Set dst (MulVB src1 src2));
13451   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4);
13452   format %{"vextracti64x4_high  $tmp1,$src1\n\t"
13453            "vextracti64x4_high  $tmp3,$src2\n\t"
13454            "vpmovsxbw $tmp1,$tmp1\n\t"
13455            "vpmovsxbw $tmp3,$tmp3\n\t"
13456            "vpmullw $tmp1,$tmp1,$tmp3\n\t"
13457            "vpmovsxbw $tmp2,$src1\n\t"
13458            "vpmovsxbw $tmp3,$src2\n\t"
13459            "vpmullw $tmp2,$tmp2,$tmp3\n\t"
           "vmovdqu $tmp3, [0x00ff00ff,0x00ff00ff]\n\t"
13461            "evpbroadcastd $tmp3, $tmp3\n\t"
13462            "evpandd $tmp1,$tmp1,$tmp3\n\t"
13463            "evpandd $tmp2,$tmp2,$tmp3\n\t"
13464            "vpackuswb $tmp1,$tmp2,$tmp1\n\t"
13465            "vextracti64x4_high  $tmp3,$tmp1\n\t"
13466            "vpermq $tmp3, $tmp3, 0x8D\n\t"
13467            "vpermq $tmp1, $tmp1, 0xD8\n\t"
13468            "vmovdqu  $tmp4,$tmp3\n\t"
13469            "vmovdqu  $tmp2,$tmp1\n\t"
13470            "vpblendd  $tmp3,$tmp3,$tmp1\n\t"
13471            "vpblendd  $tmp2,$tmp2,$tmp4\n\t"
13472            "vpermq $tmp2,$tmp2,0x4E\n\t"
13473            "vinserti64x4 $dst,$dst,$tmp3,0x00\n\t"
13474            "vinserti64x4 $dst,$dst,$tmp2,0x01\t! mul packed64B" %}
13475   ins_encode %{
13476     int vector_len = 2;
13477     KRegister ktmp = k1;
13478     __ vextracti64x4_high($tmp1$$XMMRegister, $src1$$XMMRegister);
13479     __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
13480     __ vpmovsxbw($tmp1$$XMMRegister, $tmp1$$XMMRegister, vector_len);
13481     __ vpmovsxbw($tmp3$$XMMRegister, $tmp3$$XMMRegister, vector_len);
13482     __ vpmullw($tmp1$$XMMRegister, $tmp1$$XMMRegister, $tmp3$$XMMRegister, vector_len);
13483     __ vpmovsxbw($tmp2$$XMMRegister, $src1$$XMMRegister, vector_len);
13484     __ vpmovsxbw($tmp3$$XMMRegister, $src2$$XMMRegister, vector_len);
13485     __ vpmullw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, vector_len);
13486     __ vmovdqu($tmp3$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()));
13487     __ evpbroadcastd($tmp3$$XMMRegister, $tmp3$$XMMRegister, vector_len);
13488     __ evpandd($tmp1$$XMMRegister, ktmp, $tmp1$$XMMRegister, $tmp3$$XMMRegister, false, vector_len);
13489     __ evpandd($tmp2$$XMMRegister, ktmp, $tmp2$$XMMRegister, $tmp3$$XMMRegister, false, vector_len);
13490     __ vpackuswb($tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp1$$XMMRegister, vector_len);
13491     __ vextracti64x4_high($tmp3$$XMMRegister, $tmp1$$XMMRegister);
13492     __ vpermq($tmp3$$XMMRegister, $tmp3$$XMMRegister, 0x8D, 1);
13493     __ vpermq($tmp1$$XMMRegister, $tmp1$$XMMRegister, 0xD8, 1);
13494     __ vmovdqu($tmp4$$XMMRegister, $tmp3$$XMMRegister);
13495     __ vmovdqu($tmp2$$XMMRegister, $tmp1$$XMMRegister);
13496     __ vpblendd($tmp3$$XMMRegister, $tmp3$$XMMRegister, $tmp1$$XMMRegister, 0x0F, 1);
13497     __ vpblendd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp4$$XMMRegister, 0x0F, 1);
13498     __ vpermq($tmp2$$XMMRegister, $tmp2$$XMMRegister, 0x4E, 1);
13499     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $tmp3$$XMMRegister, 0x00);
13500     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister, 0x01);
13501   %}
13502   ins_pipe( pipe_slow );
13503 %}
13504 
13505 // Shorts/Chars vector mul
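// Each size is matched by mutually exclusive rules: a two-address SSE form
// (dst doubles as the first source), a three-operand AVX form, an EVEX form
// for AVX-512BW targets, and an "_evex_special" fallback for AVX-512 CPUs
// without BW support, where vpmullw is only available in its VEX encoding
// and the two-address match is kept.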
13506 instruct vmul2S(vecS dst, vecS src) %{
13507   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
13508   match(Set dst (MulVS dst src));
13509   format %{ "pmullw $dst,$src\t! mul packed2S" %}
13510   ins_encode %{
13511     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
13512   %}
13513   ins_pipe( pipe_slow );
13514 %}
13515 
13516 instruct vmul2S_reg_avx(vecS dst, vecS src1, vecS src2) %{
13517   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
13518   match(Set dst (MulVS src1 src2));
13519   format %{ "vpmullw $dst,$src1,$src2\t! mul packed2S" %}
13520   ins_encode %{
13521     int vector_len = 0;
13522     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13523   %}
13524   ins_pipe( pipe_slow );
13525 %}
13526 
13527 instruct vmul2S_reg_evex(vecS dst, vecS src1, vecS src2) %{
13528   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
13529   match(Set dst (MulVS src1 src2));
13530   format %{ "vpmullw $dst,$src1,$src2\t! mul packed2S" %}
13531   ins_encode %{
13532     int vector_len = 0;
13533     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13534   %}
13535   ins_pipe( pipe_slow );
13536 %}
13537 
13538 instruct vmul2S_evex_special(vecS dst, vecS src1, vecS src2) %{
13539   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
13540   match(Set dst (MulVS dst src2));
13541   effect(TEMP src1);
  format %{ "vpmullw $dst,$dst,$src2\t! mul packed2S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpmullw($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister, vector_len);
13546   %}
13547   ins_pipe( pipe_slow );
13548 %}
13549 
13550 instruct vmul2S_mem_avx(vecS dst, vecS src, memory mem) %{
13551   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
13552   match(Set dst (MulVS src (LoadVector mem)));
13553   format %{ "vpmullw $dst,$src,$mem\t! mul packed2S" %}
13554   ins_encode %{
13555     int vector_len = 0;
13556     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13557   %}
13558   ins_pipe( pipe_slow );
13559 %}
13560 
13561 instruct vmul2S_mem_evex(vecS dst, vecS src, memory mem) %{
13562   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
13563   match(Set dst (MulVS src (LoadVector mem)));
13564   format %{ "vpmullw $dst,$src,$mem\t! mul packed2S" %}
13565   ins_encode %{
13566     int vector_len = 0;
13567     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13568   %}
13569   ins_pipe( pipe_slow );
13570 %}
13571 
13572 instruct vmul2S_mem_evex_special(vecS dst, vecS src, memory mem) %{
13573   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
13574   match(Set dst (MulVS dst (LoadVector mem)));
13575   effect(TEMP src);
  format %{ "vpmullw $dst,$dst,$mem\t! mul packed2S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpmullw($dst$$XMMRegister, $dst$$XMMRegister, $mem$$Address, vector_len);
13580   %}
13581   ins_pipe( pipe_slow );
13582 %}
13583 
13584 instruct vmul4S(vecD dst, vecD src) %{
13585   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
13586   match(Set dst (MulVS dst src));
13587   format %{ "pmullw  $dst,$src\t! mul packed4S" %}
13588   ins_encode %{
13589     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
13590   %}
13591   ins_pipe( pipe_slow );
13592 %}
13593 
13594 instruct vmul4S_reg_avx(vecD dst, vecD src1, vecD src2) %{
13595   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
13596   match(Set dst (MulVS src1 src2));
13597   format %{ "vpmullw $dst,$src1,$src2\t! mul packed4S" %}
13598   ins_encode %{
13599     int vector_len = 0;
13600     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13601   %}
13602   ins_pipe( pipe_slow );
13603 %}
13604 
13605 instruct vmul4S_reg_evex(vecD dst, vecD src1, vecD src2) %{
13606   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
13607   match(Set dst (MulVS src1 src2));
13608   format %{ "vpmullw $dst,$src1,$src2\t! mul packed4S" %}
13609   ins_encode %{
13610     int vector_len = 0;
13611     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13612   %}
13613   ins_pipe( pipe_slow );
13614 %}
13615 
13616 instruct vmul4S_reg_evex_special(vecD dst, vecD src1, vecD src2) %{
13617   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
13618   match(Set dst (MulVS dst src2));
13619   effect(TEMP src1);
  format %{ "vpmullw $dst,$dst,$src2\t! mul packed4S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpmullw($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister, vector_len);
13624   %}
13625   ins_pipe( pipe_slow );
13626 %}
13627 
13628 instruct vmul4S_mem_avx(vecD dst, vecD src, memory mem) %{
13629   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
13630   match(Set dst (MulVS src (LoadVector mem)));
13631   format %{ "vpmullw $dst,$src,$mem\t! mul packed4S" %}
13632   ins_encode %{
13633     int vector_len = 0;
13634     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13635   %}
13636   ins_pipe( pipe_slow );
13637 %}
13638 
13639 instruct vmul4S_mem_evex(vecD dst, vecD src, memory mem) %{
13640   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
13641   match(Set dst (MulVS src (LoadVector mem)));
13642   format %{ "vpmullw $dst,$src,$mem\t! mul packed4S" %}
13643   ins_encode %{
13644     int vector_len = 0;
13645     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13646   %}
13647   ins_pipe( pipe_slow );
13648 %}
13649 
13650 instruct vmul4S_mem_evex_special(vecD dst, vecD src, memory mem) %{
13651   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
13652   match(Set dst (MulVS dst (LoadVector mem)));
13653   effect(TEMP src);
  format %{ "vpmullw $dst,$dst,$mem\t! mul packed4S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpmullw($dst$$XMMRegister, $dst$$XMMRegister, $mem$$Address, vector_len);
13658   %}
13659   ins_pipe( pipe_slow );
13660 %}
13661 
13662 instruct vmul8S(vecX dst, vecX src) %{
13663   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
13664   match(Set dst (MulVS dst src));
13665   format %{ "pmullw  $dst,$src\t! mul packed8S" %}
13666   ins_encode %{
13667     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
13668   %}
13669   ins_pipe( pipe_slow );
13670 %}
13671 
13672 instruct vmul8S_reg_avx(vecX dst, vecX src1, vecX src2) %{
13673   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
13674   match(Set dst (MulVS src1 src2));
13675   format %{ "vpmullw $dst,$src1,$src2\t! mul packed8S" %}
13676   ins_encode %{
13677     int vector_len = 0;
13678     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13679   %}
13680   ins_pipe( pipe_slow );
13681 %}
13682 
13683 instruct vmul8S_reg_evex(vecX dst, vecX src1, vecX src2) %{
13684   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
13685   match(Set dst (MulVS src1 src2));
13686   format %{ "vpmullw $dst,$src1,$src2\t! mul packed8S" %}
13687   ins_encode %{
13688     int vector_len = 0;
13689     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13690   %}
13691   ins_pipe( pipe_slow );
13692 %}
13693 
13694 instruct vmul8S_reg_evex_special(vecX dst, vecX src1, vecX src2) %{
13695   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
13696   match(Set dst (MulVS dst src2));
13697   effect(TEMP src1);
  format %{ "vpmullw $dst,$dst,$src2\t! mul packed8S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpmullw($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister, vector_len);
13702   %}
13703   ins_pipe( pipe_slow );
13704 %}
13705 
13706 instruct vmul8S_mem_avx(vecX dst, vecX src, memory mem) %{
13707   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
13708   match(Set dst (MulVS src (LoadVector mem)));
13709   format %{ "vpmullw $dst,$src,$mem\t! mul packed8S" %}
13710   ins_encode %{
13711     int vector_len = 0;
13712     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13713   %}
13714   ins_pipe( pipe_slow );
13715 %}
13716 
13717 instruct vmul8S_mem_evex(vecX dst, vecX src, memory mem) %{
13718   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
13719   match(Set dst (MulVS src (LoadVector mem)));
13720   format %{ "vpmullw $dst,$src,$mem\t! mul packed8S" %}
13721   ins_encode %{
13722     int vector_len = 0;
13723     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13724   %}
13725   ins_pipe( pipe_slow );
13726 %}
13727 
13728 instruct vmul8S_mem_evex_special(vecX dst, vecX src, memory mem) %{
13729   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
13730   match(Set dst (MulVS dst (LoadVector mem)));
13731   effect(TEMP src);
  format %{ "vpmullw $dst,$dst,$mem\t! mul packed8S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpmullw($dst$$XMMRegister, $dst$$XMMRegister, $mem$$Address, vector_len);
13736   %}
13737   ins_pipe( pipe_slow );
13738 %}
13739 
13740 instruct vmul16S_reg_avx(vecY dst, vecY src1, vecY src2) %{
13741   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
13742   match(Set dst (MulVS src1 src2));
13743   format %{ "vpmullw $dst,$src1,$src2\t! mul packed16S" %}
13744   ins_encode %{
13745     int vector_len = 1;
13746     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13747   %}
13748   ins_pipe( pipe_slow );
13749 %}
13750 
13751 instruct vmul16S_reg_evex(vecY dst, vecY src1, vecY src2) %{
13752   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
13753   match(Set dst (MulVS src1 src2));
13754   format %{ "vpmullw $dst,$src1,$src2\t! mul packed16S" %}
13755   ins_encode %{
13756     int vector_len = 1;
13757     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13758   %}
13759   ins_pipe( pipe_slow );
13760 %}
13761 
13762 instruct vmul16S_reg_evex_special(vecY dst, vecY src1, vecY src2) %{
13763   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
13764   match(Set dst (MulVS dst src2));
13765   effect(TEMP src1);
  format %{ "vpmullw $dst,$dst,$src2\t! mul packed16S" %}
  ins_encode %{
    int vector_len = 1;
    __ vpmullw($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister, vector_len);
13770   %}
13771   ins_pipe( pipe_slow );
13772 %}
13773 
13774 instruct vmul16S_mem_avx(vecY dst, vecY src, memory mem) %{
13775   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
13776   match(Set dst (MulVS src (LoadVector mem)));
13777   format %{ "vpmullw $dst,$src,$mem\t! mul packed16S" %}
13778   ins_encode %{
13779     int vector_len = 1;
13780     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13781   %}
13782   ins_pipe( pipe_slow );
13783 %}
13784 
13785 instruct vmul16S_mem_evex(vecY dst, vecY src, memory mem) %{
13786   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
13787   match(Set dst (MulVS src (LoadVector mem)));
13788   format %{ "vpmullw $dst,$src,$mem\t! mul packed16S" %}
13789   ins_encode %{
13790     int vector_len = 1;
13791     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13792   %}
13793   ins_pipe( pipe_slow );
13794 %}
13795 
13796 instruct vmul16S_mem_evex_special(vecY dst, vecY src, memory mem) %{
13797   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
13798   match(Set dst (MulVS dst (LoadVector mem)));
13799   effect(TEMP src);
  format %{ "vpmullw $dst,$dst,$mem\t! mul packed16S" %}
  ins_encode %{
    int vector_len = 1;
    __ vpmullw($dst$$XMMRegister, $dst$$XMMRegister, $mem$$Address, vector_len);
13804   %}
13805   ins_pipe( pipe_slow );
13806 %}
13807 
13808 instruct vmul32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
13809   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
13810   match(Set dst (MulVS src1 src2));
13811   format %{ "vpmullw $dst,$src1,$src2\t! mul packed32S" %}
13812   ins_encode %{
13813     int vector_len = 2;
13814     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13815   %}
13816   ins_pipe( pipe_slow );
13817 %}
13818 
13819 instruct vmul32S_mem(vecZ dst, vecZ src, memory mem) %{
13820   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
13821   match(Set dst (MulVS src (LoadVector mem)));
13822   format %{ "vpmullw $dst,$src,$mem\t! mul packed32S" %}
13823   ins_encode %{
13824     int vector_len = 2;
13825     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13826   %}
13827   ins_pipe( pipe_slow );
13828 %}
13829 
13830 // Integers vector mul (sse4_1)
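// pmulld (packed dword multiply) was introduced with SSE4.1, hence the
// UseSSE > 3 guard on the non-AVX rules.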
13831 instruct vmul2I(vecD dst, vecD src) %{
13832   predicate(UseSSE > 3 && n->as_Vector()->length() == 2);
13833   match(Set dst (MulVI dst src));
13834   format %{ "pmulld  $dst,$src\t! mul packed2I" %}
13835   ins_encode %{
13836     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
13837   %}
13838   ins_pipe( pipe_slow );
13839 %}
13840 
13841 instruct vmul2I_reg(vecD dst, vecD src1, vecD src2) %{
13842   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
13843   match(Set dst (MulVI src1 src2));
13844   format %{ "vpmulld $dst,$src1,$src2\t! mul packed2I" %}
13845   ins_encode %{
13846     int vector_len = 0;
13847     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13848   %}
13849   ins_pipe( pipe_slow );
13850 %}
13851 
13852 instruct vmul2I_mem(vecD dst, vecD src, memory mem) %{
13853   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
13854   match(Set dst (MulVI src (LoadVector mem)));
13855   format %{ "vpmulld $dst,$src,$mem\t! mul packed2I" %}
13856   ins_encode %{
13857     int vector_len = 0;
13858     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13859   %}
13860   ins_pipe( pipe_slow );
13861 %}
13862 
13863 instruct vmul4I(vecX dst, vecX src) %{
13864   predicate(UseSSE > 3 && n->as_Vector()->length() == 4);
13865   match(Set dst (MulVI dst src));
13866   format %{ "pmulld  $dst,$src\t! mul packed4I" %}
13867   ins_encode %{
13868     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
13869   %}
13870   ins_pipe( pipe_slow );
13871 %}
13872 
13873 instruct vmul4I_reg(vecX dst, vecX src1, vecX src2) %{
13874   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
13875   match(Set dst (MulVI src1 src2));
13876   format %{ "vpmulld $dst,$src1,$src2\t! mul packed4I" %}
13877   ins_encode %{
13878     int vector_len = 0;
13879     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13880   %}
13881   ins_pipe( pipe_slow );
13882 %}
13883 
13884 instruct vmul4I_mem(vecX dst, vecX src, memory mem) %{
13885   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
13886   match(Set dst (MulVI src (LoadVector mem)));
13887   format %{ "vpmulld $dst,$src,$mem\t! mul packed4I" %}
13888   ins_encode %{
13889     int vector_len = 0;
13890     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13891   %}
13892   ins_pipe( pipe_slow );
13893 %}
13894 
13895 // Long vector mul
13896 
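// No packed 64x64->64 multiply exists below AVX-512DQ (vpmullq), so the
// SSE/AVX rules assemble it from 32-bit pieces.  Writing each lane as
// a = a_hi*2^32 + a_lo and b = b_hi*2^32 + b_lo:
//
//   a*b mod 2^64 = ((a_hi*b_lo + a_lo*b_hi) << 32) + a_lo*b_lo
//
// pshufd/pmulld/phaddd produce the summed cross terms, psllq 32 shifts them
// into the high dwords, and pmuludq supplies the unsigned lo*lo product.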
13897 instruct mul2L_reg(vecX dst, vecX src2, vecX tmp) %{
13898   predicate(UseSSE > 3 && n->as_Vector()->length() == 2 && VM_Version::supports_sse4_1());
13899   match(Set dst (MulVL dst src2));
13900   effect(TEMP dst, TEMP tmp);
13901   format %{ "pshufd $tmp,$src2, 177\n\t"
13902             "pmulld $tmp,$dst\n\t"
13903             "phaddd $tmp,$tmp\n\t"
13904             "pmovzxdq $tmp,$tmp\n\t"
13905             "psllq $tmp, 32\n\t"
13906             "pmuludq $dst,$src2\n\t"
            "paddq $dst,$tmp\t! mul packed2L" %}
13909   ins_encode %{
13910     int vector_len = 0;
13911     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 177);
13912     __ pmulld($tmp$$XMMRegister, $dst$$XMMRegister);
13913     __ phaddd($tmp$$XMMRegister, $tmp$$XMMRegister);
13914     __ pmovzxdq($tmp$$XMMRegister, $tmp$$XMMRegister);
13915     __ psllq($tmp$$XMMRegister, 32);
13916     __ pmuludq($dst$$XMMRegister, $src2$$XMMRegister);
13917     __ paddq($dst$$XMMRegister, $tmp$$XMMRegister);
13918   %}
13919   ins_pipe( pipe_slow );
13920 %}
13921 
13922 instruct vmul2L_reg_avx(vecX dst, vecX src1, vecX src2, vecX tmp1, vecX tmp) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && !VM_Version::supports_avx512dq());
13924   match(Set dst (MulVL src1 src2));
13925   effect(TEMP tmp1, TEMP tmp);
  format %{ "vpshufd $tmp,$src2, 177\n\t"
            "vpmulld $tmp,$src1,$tmp\n\t"
            "vphaddd $tmp,$tmp,$tmp\n\t"
            "vpmovzxdq $tmp,$tmp\n\t"
            "vpsllq $tmp,$tmp, 32\n\t"
            "vpmuludq $tmp1,$src1,$src2\n\t"
            "vpaddq $dst,$tmp,$tmp1\t! mul packed2L" %}
13933   ins_encode %{
13934     int vector_len = 0;
13935     __ vpshufd($tmp$$XMMRegister, $src2$$XMMRegister, 177, vector_len);
13936     __ vpmulld($tmp$$XMMRegister, $src1$$XMMRegister, $tmp$$XMMRegister, vector_len);
13937     __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
13938     __ vpmovzxdq($tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
13939     __ vpsllq($tmp$$XMMRegister, $tmp$$XMMRegister, 32, vector_len);
13940     __ vpmuludq($tmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13941     __ vpaddq($dst$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, vector_len);
13942   %}
13943   ins_pipe( pipe_slow );
13944 %}
13945 
13946 instruct vmul2L_reg(vecX dst, vecX src1, vecX src2) %{
13947   predicate(UseAVX > 2 && n->as_Vector()->length() == 2 && VM_Version::supports_avx512dq());
13948   match(Set dst (MulVL src1 src2));
13949   format %{ "vpmullq $dst,$src1,$src2\t! mul packed2L" %}
13950   ins_encode %{
13951     int vector_len = 0;
13952     __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13953   %}
13954   ins_pipe( pipe_slow );
13955 %}
13956 
13957 instruct vmul2L_mem(vecX dst, vecX src, memory mem) %{
13958   predicate(UseAVX > 2 && n->as_Vector()->length() == 2 && VM_Version::supports_avx512dq());
13959   match(Set dst (MulVL src (LoadVector mem)));
13960   format %{ "vpmullq $dst,$src,$mem\t! mul packed2L" %}
13961   ins_encode %{
13962     int vector_len = 0;
13963     __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13964   %}
13965   ins_pipe( pipe_slow );
13966 %}
13967 
instruct vmul4L_reg_avx(vecY dst, vecY src1, vecY src2, vecY tmp, vecY tmp1) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 4 && !VM_Version::supports_avx512dq());
13970   match(Set dst (MulVL src1 src2));
13971   effect(TEMP tmp1, TEMP tmp);
  format %{ "vpshufd $tmp,$src2, 177\n\t"
            "vpmulld $tmp,$src1,$tmp\n\t"
            "vextracti128_high $tmp1,$tmp\n\t"
            "vphaddd $tmp,$tmp,$tmp1\n\t"
            "vpmovzxdq $tmp,$tmp\n\t"
            "vpsllq $tmp,$tmp, 32\n\t"
            "vpmuludq $tmp1,$src1,$src2\n\t"
            "vpaddq $dst,$tmp,$tmp1\t! mul packed4L" %}
13979   ins_encode %{
13980     int vector_len = 1;
13981     __ vpshufd($tmp$$XMMRegister, $src2$$XMMRegister, 177, vector_len);
13982     __ vpmulld($tmp$$XMMRegister, $src1$$XMMRegister, $tmp$$XMMRegister, vector_len);
13983     __ vextracti128_high($tmp1$$XMMRegister, $tmp$$XMMRegister);
13984     __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, vector_len);
13985     __ vpmovzxdq($tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
13986     __ vpsllq($tmp$$XMMRegister, $tmp$$XMMRegister, 32, vector_len);
13987     __ vpmuludq($tmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13988     __ vpaddq($dst$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, vector_len);
13989   %}
13990   ins_pipe( pipe_slow );
13991 %}
13992 
13993 instruct vmul4L_reg(vecY dst, vecY src1, vecY src2) %{
13994   predicate(UseAVX > 2 && n->as_Vector()->length() == 4 && VM_Version::supports_avx512dq());
13995   match(Set dst (MulVL src1 src2));
13996   format %{ "vpmullq $dst,$src1,$src2\t! mul packed4L" %}
13997   ins_encode %{
13998     int vector_len = 1;
13999     __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14000   %}
14001   ins_pipe( pipe_slow );
14002 %}
14003 
14004 instruct vmul4L_mem(vecY dst, vecY src, memory mem) %{
14005   predicate(UseAVX > 2 && n->as_Vector()->length() == 4 && VM_Version::supports_avx512dq());
14006   match(Set dst (MulVL src (LoadVector mem)));
14007   format %{ "vpmullq $dst,$src,$mem\t! mul packed4L" %}
14008   ins_encode %{
14009     int vector_len = 1;
14010     __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
14011   %}
14012   ins_pipe( pipe_slow );
14013 %}
14014 
14015 instruct vmul8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
14016   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && VM_Version::supports_avx512dq());
14017   match(Set dst (MulVL src1 src2));
14018   format %{ "vpmullq $dst,$src1,$src2\t! mul packed8L" %}
14019   ins_encode %{
14020     int vector_len = 2;
14021     __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14022   %}
14023   ins_pipe( pipe_slow );
14024 %}
14025 
14026 instruct vmul8L_mem(vecZ dst, vecZ src, memory mem) %{
14027   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && VM_Version::supports_avx512dq());
14028   match(Set dst (MulVL src (LoadVector mem)));
14029   format %{ "vpmullq $dst,$src,$mem\t! mul packed8L" %}
14030   ins_encode %{
14031     int vector_len = 2;
14032     __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
14033   %}
14034   ins_pipe( pipe_slow );
14035 %}
14036 
14037 instruct vmul8I_reg(vecY dst, vecY src1, vecY src2) %{
14038   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
14039   match(Set dst (MulVI src1 src2));
14040   format %{ "vpmulld $dst,$src1,$src2\t! mul packed8I" %}
14041   ins_encode %{
14042     int vector_len = 1;
14043     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14044   %}
14045   ins_pipe( pipe_slow );
14046 %}
14047 
14048 instruct vmul8I_mem(vecY dst, vecY src, memory mem) %{
14049   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
14050   match(Set dst (MulVI src (LoadVector mem)));
14051   format %{ "vpmulld $dst,$src,$mem\t! mul packed8I" %}
14052   ins_encode %{
14053     int vector_len = 1;
14054     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
14055   %}
14056   ins_pipe( pipe_slow );
14057 %}
14058 
14059 instruct vmul16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
14060   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
14061   match(Set dst (MulVI src1 src2));
14062   format %{ "vpmulld $dst,$src1,$src2\t! mul packed16I" %}
14063   ins_encode %{
14064     int vector_len = 2;
14065     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14066   %}
14067   ins_pipe( pipe_slow );
14068 %}
14069 
14070 instruct vmul16I_mem(vecZ dst, vecZ src, memory mem) %{
14071   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
14072   match(Set dst (MulVI src (LoadVector mem)));
14073   format %{ "vpmulld $dst,$src,$mem\t! mul packed16I" %}
14074   ins_encode %{
14075     int vector_len = 2;
14076     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
14077   %}
14078   ins_pipe( pipe_slow );
14079 %}
14080 
14081 // Floats vector mul
14082 instruct vmul2F(vecD dst, vecD src) %{
14083   predicate(n->as_Vector()->length() == 2);
14084   match(Set dst (MulVF dst src));
14085   format %{ "mulps   $dst,$src\t! mul packed2F" %}
14086   ins_encode %{
14087     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
14088   %}
14089   ins_pipe( pipe_slow );
14090 %}
14091 
14092 instruct vmul2F_reg(vecD dst, vecD src1, vecD src2) %{
14093   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
14094   match(Set dst (MulVF src1 src2));
14095   format %{ "vmulps  $dst,$src1,$src2\t! mul packed2F" %}
14096   ins_encode %{
14097     int vector_len = 0;
14098     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14099   %}
14100   ins_pipe( pipe_slow );
14101 %}
14102 
14103 instruct vmul2F_mem(vecD dst, vecD src, memory mem) %{
14104   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
14105   match(Set dst (MulVF src (LoadVector mem)));
14106   format %{ "vmulps  $dst,$src,$mem\t! mul packed2F" %}
14107   ins_encode %{
14108     int vector_len = 0;
14109     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
14110   %}
14111   ins_pipe( pipe_slow );
14112 %}
14113 
14114 instruct vmul4F(vecX dst, vecX src) %{
14115   predicate(n->as_Vector()->length() == 4);
14116   match(Set dst (MulVF dst src));
14117   format %{ "mulps   $dst,$src\t! mul packed4F" %}
14118   ins_encode %{
14119     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
14120   %}
14121   ins_pipe( pipe_slow );
14122 %}
14123 
14124 instruct vmul4F_reg(vecX dst, vecX src1, vecX src2) %{
14125   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
14126   match(Set dst (MulVF src1 src2));
14127   format %{ "vmulps  $dst,$src1,$src2\t! mul packed4F" %}
14128   ins_encode %{
14129     int vector_len = 0;
14130     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14131   %}
14132   ins_pipe( pipe_slow );
14133 %}
14134 
14135 instruct vmul4F_mem(vecX dst, vecX src, memory mem) %{
14136   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
14137   match(Set dst (MulVF src (LoadVector mem)));
14138   format %{ "vmulps  $dst,$src,$mem\t! mul packed4F" %}
14139   ins_encode %{
14140     int vector_len = 0;
14141     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
14142   %}
14143   ins_pipe( pipe_slow );
14144 %}
14145 
14146 instruct vmul8F_reg(vecY dst, vecY src1, vecY src2) %{
14147   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
14148   match(Set dst (MulVF src1 src2));
14149   format %{ "vmulps  $dst,$src1,$src2\t! mul packed8F" %}
14150   ins_encode %{
14151     int vector_len = 1;
14152     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14153   %}
14154   ins_pipe( pipe_slow );
14155 %}
14156 
14157 instruct vmul8F_mem(vecY dst, vecY src, memory mem) %{
14158   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
14159   match(Set dst (MulVF src (LoadVector mem)));
14160   format %{ "vmulps  $dst,$src,$mem\t! mul packed8F" %}
14161   ins_encode %{
14162     int vector_len = 1;
14163     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
14164   %}
14165   ins_pipe( pipe_slow );
14166 %}
14167 
14168 instruct vmul16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
14169   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
14170   match(Set dst (MulVF src1 src2));
14171   format %{ "vmulps  $dst,$src1,$src2\t! mul packed16F" %}
14172   ins_encode %{
14173     int vector_len = 2;
14174     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14175   %}
14176   ins_pipe( pipe_slow );
14177 %}
14178 
14179 instruct vmul16F_mem(vecZ dst, vecZ src, memory mem) %{
14180   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
14181   match(Set dst (MulVF src (LoadVector mem)));
14182   format %{ "vmulps  $dst,$src,$mem\t! mul packed16F" %}
14183   ins_encode %{
14184     int vector_len = 2;
14185     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
14186   %}
14187   ins_pipe( pipe_slow );
14188 %}
14189 
14190 // Doubles vector mul
14191 instruct vmul2D(vecX dst, vecX src) %{
14192   predicate(n->as_Vector()->length() == 2);
14193   match(Set dst (MulVD dst src));
14194   format %{ "mulpd   $dst,$src\t! mul packed2D" %}
14195   ins_encode %{
14196     __ mulpd($dst$$XMMRegister, $src$$XMMRegister);
14197   %}
14198   ins_pipe( pipe_slow );
14199 %}
14200 
14201 instruct vmul2D_reg(vecX dst, vecX src1, vecX src2) %{
14202   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
14203   match(Set dst (MulVD src1 src2));
14204   format %{ "vmulpd  $dst,$src1,$src2\t! mul packed2D" %}
14205   ins_encode %{
14206     int vector_len = 0;
14207     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14208   %}
14209   ins_pipe( pipe_slow );
14210 %}
14211 
14212 instruct vmul2D_mem(vecX dst, vecX src, memory mem) %{
14213   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
14214   match(Set dst (MulVD src (LoadVector mem)));
14215   format %{ "vmulpd  $dst,$src,$mem\t! mul packed2D" %}
14216   ins_encode %{
14217     int vector_len = 0;
14218     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
14219   %}
14220   ins_pipe( pipe_slow );
14221 %}
14222 
14223 instruct vmul4D_reg(vecY dst, vecY src1, vecY src2) %{
14224   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
14225   match(Set dst (MulVD src1 src2));
14226   format %{ "vmulpd  $dst,$src1,$src2\t! mul packed4D" %}
14227   ins_encode %{
14228     int vector_len = 1;
14229     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14230   %}
14231   ins_pipe( pipe_slow );
14232 %}
14233 
14234 instruct vmul4D_mem(vecY dst, vecY src, memory mem) %{
14235   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
14236   match(Set dst (MulVD src (LoadVector mem)));
14237   format %{ "vmulpd  $dst,$src,$mem\t! mul packed4D" %}
14238   ins_encode %{
14239     int vector_len = 1;
14240     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
14241   %}
14242   ins_pipe( pipe_slow );
14243 %}
14244 
14245 instruct vmul8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
14246   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
14247   match(Set dst (MulVD src1 src2));
  format %{ "vmulpd  $dst,$src1,$src2\t! mul packed8D" %}
14249   ins_encode %{
14250     int vector_len = 2;
14251     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14252   %}
14253   ins_pipe( pipe_slow );
14254 %}
14255 
14256 instruct vmul8D_mem(vecZ dst, vecZ src, memory mem) %{
14257   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
14258   match(Set dst (MulVD src (LoadVector mem)));
  format %{ "vmulpd  $dst,$src,$mem\t! mul packed8D" %}
14260   ins_encode %{
14261     int vector_len = 2;
14262     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
14263   %}
14264   ins_pipe( pipe_slow );
14265 %}
14266 
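// Vector conditional move: a packed compare yields an all-ones/all-zeros
// mask per lane, and a variable blend then selects between src1 and src2
// under that mask.  These rules are limited to UseAVX 1-2, where compare
// results live in vector registers rather than AVX-512 opmask registers.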
14267 instruct vcmov8F_reg(vecY dst, vecY src1, vecY src2, immI8 cop, cmpOp_vcmppd copnd) %{
14268   predicate(UseAVX > 0 && UseAVX < 3 && n->as_Vector()->length() == 8);
14269   match(Set dst (CMoveVF (Binary copnd cop) (Binary src1 src2)));
14270   effect(TEMP dst, USE src1, USE src2);
  format %{ "vcmpps.$copnd  $dst, $src1, $src2  ! vcmovevf, cond=$cop\n\t"
            "vblendvps $dst,$src1,$src2,$dst ! vcmovevf"
         %}
14274   ins_encode %{
14275     int vector_len = 1;
14276     int cond = (Assembler::Condition)($copnd$$cmpcode);
14277     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vector_len);
14278     __ vblendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vector_len);
14279   %}
14280   ins_pipe( pipe_slow );
14281 %}
14282 
14283 instruct vcmov4D_reg(vecY dst, vecY src1, vecY src2, immI8 cop, cmpOp_vcmppd copnd) %{
14284   predicate(UseAVX > 0 && UseAVX < 3 && n->as_Vector()->length() == 4);
14285   match(Set dst (CMoveVD (Binary copnd cop) (Binary src1 src2)));
14286   effect(TEMP dst, USE src1, USE src2);
  format %{ "vcmppd.$copnd  $dst, $src1, $src2  ! vcmovevd, cond=$cop\n\t"
            "vblendvpd $dst,$src1,$src2,$dst ! vcmovevd"
         %}
14290   ins_encode %{
14291     int vector_len = 1;
14292     int cond = (Assembler::Condition)($copnd$$cmpcode);
14293     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vector_len);
14294     __ vblendvpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vector_len);
14295   %}
14296   ins_pipe( pipe_slow );
14297 %}
14298 
14299 // --------------------------------- DIV --------------------------------------
14300 
14301 // Floats vector div
14302 instruct vdiv2F(vecD dst, vecD src) %{
14303   predicate(n->as_Vector()->length() == 2);
14304   match(Set dst (DivVF dst src));
14305   format %{ "divps   $dst,$src\t! div packed2F" %}
14306   ins_encode %{
14307     __ divps($dst$$XMMRegister, $src$$XMMRegister);
14308   %}
14309   ins_pipe( pipe_slow );
14310 %}
14311 
14312 instruct vdiv2F_reg(vecD dst, vecD src1, vecD src2) %{
14313   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
14314   match(Set dst (DivVF src1 src2));
14315   format %{ "vdivps  $dst,$src1,$src2\t! div packed2F" %}
14316   ins_encode %{
14317     int vector_len = 0;
14318     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14319   %}
14320   ins_pipe( pipe_slow );
14321 %}
14322 
14323 instruct vdiv2F_mem(vecD dst, vecD src, memory mem) %{
14324   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
14325   match(Set dst (DivVF src (LoadVector mem)));
14326   format %{ "vdivps  $dst,$src,$mem\t! div packed2F" %}
14327   ins_encode %{
14328     int vector_len = 0;
14329     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
14330   %}
14331   ins_pipe( pipe_slow );
14332 %}
14333 
14334 instruct vdiv4F(vecX dst, vecX src) %{
14335   predicate(n->as_Vector()->length() == 4);
14336   match(Set dst (DivVF dst src));
14337   format %{ "divps   $dst,$src\t! div packed4F" %}
14338   ins_encode %{
14339     __ divps($dst$$XMMRegister, $src$$XMMRegister);
14340   %}
14341   ins_pipe( pipe_slow );
14342 %}
14343 
14344 instruct vdiv4F_reg(vecX dst, vecX src1, vecX src2) %{
14345   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
14346   match(Set dst (DivVF src1 src2));
14347   format %{ "vdivps  $dst,$src1,$src2\t! div packed4F" %}
14348   ins_encode %{
14349     int vector_len = 0;
14350     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14351   %}
14352   ins_pipe( pipe_slow );
14353 %}
14354 
14355 instruct vdiv4F_mem(vecX dst, vecX src, memory mem) %{
14356   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
14357   match(Set dst (DivVF src (LoadVector mem)));
14358   format %{ "vdivps  $dst,$src,$mem\t! div packed4F" %}
14359   ins_encode %{
14360     int vector_len = 0;
14361     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
14362   %}
14363   ins_pipe( pipe_slow );
14364 %}
14365 
14366 instruct vdiv8F_reg(vecY dst, vecY src1, vecY src2) %{
14367   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
14368   match(Set dst (DivVF src1 src2));
14369   format %{ "vdivps  $dst,$src1,$src2\t! div packed8F" %}
14370   ins_encode %{
14371     int vector_len = 1;
14372     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14373   %}
14374   ins_pipe( pipe_slow );
14375 %}
14376 
14377 instruct vdiv8F_mem(vecY dst, vecY src, memory mem) %{
14378   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
14379   match(Set dst (DivVF src (LoadVector mem)));
14380   format %{ "vdivps  $dst,$src,$mem\t! div packed8F" %}
14381   ins_encode %{
14382     int vector_len = 1;
14383     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
14384   %}
14385   ins_pipe( pipe_slow );
14386 %}
14387 
14388 instruct vdiv16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
14389   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
14390   match(Set dst (DivVF src1 src2));
14391   format %{ "vdivps  $dst,$src1,$src2\t! div packed16F" %}
14392   ins_encode %{
14393     int vector_len = 2;
14394     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14395   %}
14396   ins_pipe( pipe_slow );
14397 %}
14398 
14399 instruct vdiv16F_mem(vecZ dst, vecZ src, memory mem) %{
14400   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
14401   match(Set dst (DivVF src (LoadVector mem)));
14402   format %{ "vdivps  $dst,$src,$mem\t! div packed16F" %}
14403   ins_encode %{
14404     int vector_len = 2;
14405     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
14406   %}
14407   ins_pipe( pipe_slow );
14408 %}
14409 
14410 // Doubles vector div
14411 instruct vdiv2D(vecX dst, vecX src) %{
14412   predicate(n->as_Vector()->length() == 2);
14413   match(Set dst (DivVD dst src));
14414   format %{ "divpd   $dst,$src\t! div packed2D" %}
14415   ins_encode %{
14416     __ divpd($dst$$XMMRegister, $src$$XMMRegister);
14417   %}
14418   ins_pipe( pipe_slow );
14419 %}
14420 
14421 instruct vdiv2D_reg(vecX dst, vecX src1, vecX src2) %{
14422   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
14423   match(Set dst (DivVD src1 src2));
14424   format %{ "vdivpd  $dst,$src1,$src2\t! div packed2D" %}
14425   ins_encode %{
14426     int vector_len = 0;
14427     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14428   %}
14429   ins_pipe( pipe_slow );
14430 %}
14431 
14432 instruct vdiv2D_mem(vecX dst, vecX src, memory mem) %{
14433   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
14434   match(Set dst (DivVD src (LoadVector mem)));
14435   format %{ "vdivpd  $dst,$src,$mem\t! div packed2D" %}
14436   ins_encode %{
14437     int vector_len = 0;
14438     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
14439   %}
14440   ins_pipe( pipe_slow );
14441 %}
14442 
14443 instruct vdiv4D_reg(vecY dst, vecY src1, vecY src2) %{
14444   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
14445   match(Set dst (DivVD src1 src2));
14446   format %{ "vdivpd  $dst,$src1,$src2\t! div packed4D" %}
14447   ins_encode %{
14448     int vector_len = 1;
14449     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14450   %}
14451   ins_pipe( pipe_slow );
14452 %}
14453 
14454 instruct vdiv4D_mem(vecY dst, vecY src, memory mem) %{
14455   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
14456   match(Set dst (DivVD src (LoadVector mem)));
14457   format %{ "vdivpd  $dst,$src,$mem\t! div packed4D" %}
14458   ins_encode %{
14459     int vector_len = 1;
14460     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
14461   %}
14462   ins_pipe( pipe_slow );
14463 %}
14464 
14465 instruct vdiv8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
14466   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
14467   match(Set dst (DivVD src1 src2));
14468   format %{ "vdivpd  $dst,$src1,$src2\t! div packed8D" %}
14469   ins_encode %{
14470     int vector_len = 2;
14471     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14472   %}
14473   ins_pipe( pipe_slow );
14474 %}
14475 
14476 instruct vdiv8D_mem(vecZ dst, vecZ src, memory mem) %{
14477   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
14478   match(Set dst (DivVD src (LoadVector mem)));
14479   format %{ "vdivpd  $dst,$src,$mem\t! div packed8D" %}
14480   ins_encode %{
14481     int vector_len = 2;
14482     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
14483   %}
14484   ins_pipe( pipe_slow );
14485 %}
14486 
14487 // ------------------------------ Min ---------------------------------------
14488 // Byte vector Min
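// pminsb (signed byte min) is an SSE4.1 instruction, hence the UseSSE > 3
// guard on the non-AVX rules.  The SSE form is destructive, so src1 is
// copied into dst before pminsb overwrites it.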
14489 instruct min8B_reg(vecD dst, vecD src1, vecD src2) %{
14490   predicate(UseSSE > 3 && UseAVX == 0 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
14491   match(Set dst (MinV src1 src2));
14492   effect(TEMP dst);
14493   format %{ "movdqu  $dst,$src1\n\t"
            "pminsb  $dst,$src2\t! min packed8B" %}
14495   ins_encode %{
14496     __ movdqu($dst$$XMMRegister, $src1$$XMMRegister);
14497     __ pminsb($dst$$XMMRegister, $src2$$XMMRegister);
14498   %}
14499   ins_pipe( pipe_slow );
14500 %}
14501 
14502 instruct min8B_reg_avx(vecD dst, vecD src1, vecD src2) %{
14503   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
14504   match(Set dst (MinV src1 src2));
  format %{ "vpminsb  $dst,$src1,$src2\t! min packed8B" %}
14506   ins_encode %{
14507     __ vpminsb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, 0);
14508   %}
14509   ins_pipe( pipe_slow );
14510 %}
14511 
14512 instruct min16B_reg(vecX dst, vecX src1, vecX src2) %{
14513   predicate(UseSSE > 3 && UseAVX == 0 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
14514   match(Set dst (MinV src1 src2));
14515   effect(TEMP dst);
14516   format %{ "movdqu  $dst,$src1\n\t"
            "pminsb  $dst,$src2\t! min packed16B" %}
14518   ins_encode %{
14519     __ movdqu($dst$$XMMRegister, $src1$$XMMRegister);
14520     __ pminsb($dst$$XMMRegister, $src2$$XMMRegister);
14521   %}
14522   ins_pipe( pipe_slow );
14523 %}
14524 
14525 instruct min16B_reg_avx(vecX dst, vecX src1, vecX src2) %{
14526   predicate(UseAVX > 0 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
14527   match(Set dst (MinV src1 src2));
  format %{ "vpminsb    $dst,$src1,$src2\t! min packed16B" %}
14529   ins_encode %{
14530     int vector_len = 0;
14531     __ vpminsb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14532   %}
14533   ins_pipe( pipe_slow );
14534 %}
14535 
14536 instruct min32B_reg(vecY dst, vecY src1, vecY src2) %{
14537   predicate(UseAVX > 1 && n->as_Vector()->length() == 32 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
14538   match(Set dst (MinV src1 src2));
  format %{ "vpminsb    $dst,$src1,$src2\t! min packed32B" %}
14540   ins_encode %{
14541     int vector_len = 1;
14542     __ vpminsb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14543   %}
14544   ins_pipe( pipe_slow );
14545 %}
14546 
14547 instruct min64B_reg_evex(vecZ dst, vecZ src1, vecZ src2) %{
14548   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
14549   match(Set dst (MinV src1 src2));
  format %{ "vpminsb  $dst,$src1,$src2\t! min packed64B" %}
14551   ins_encode %{
14552     int vector_len = 2;
14553     __ vpminsb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14554   %}
14555   ins_pipe( pipe_slow );
14556 %}
14557 
// Short vector Min
14559 instruct min4S_reg(vecD dst, vecD src1, vecD src2) %{
14560   predicate(UseSSE >= 2 && UseAVX == 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
14561   match(Set dst (MinV src1 src2));
14562   effect(TEMP dst);
14563   format %{ "movsd   $dst,$src1\n\t"
            "pminsw  $dst,$src2\t! min packed4S" %}
14565   ins_encode %{
14566     __ movsd($dst$$XMMRegister, $src1$$XMMRegister);
14567     __ pminsw($dst$$XMMRegister, $src2$$XMMRegister);
14568   %}
14569   ins_pipe( pipe_slow );
14570 %}
14571 
14572 instruct min4S_reg_avx(vecD dst, vecD src1, vecD src2) %{
14573   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
14574   match(Set dst (MinV src1 src2));
14575   effect(TEMP dst);
  format %{ "vpminsw  $dst,$src1,$src2\t! min packed4S" %}
14577   ins_encode %{
14578     __ vpminsw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, 0);
14579   %}
14580   ins_pipe( pipe_slow );
14581 %}
14582 
14583 instruct min8S_reg(vecX dst, vecX src1, vecX src2) %{
14584   predicate(UseSSE >= 2 && UseAVX == 0 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
14585   match(Set dst (MinV src1 src2));
14586   effect(TEMP dst);
  format %{ "movdqu  $dst,$src1\n\t"
            "pminsw  $dst,$src2\t! min packed8S" %}
14589   ins_encode %{
14590     __ movdqu($dst$$XMMRegister, $src1$$XMMRegister);
14591     __ pminsw($dst$$XMMRegister, $src2$$XMMRegister);
14592   %}
14593   ins_pipe( pipe_slow );
14594 %}
14595 
14596 instruct min8S_reg_avx(vecX dst, vecX src1, vecX src2) %{
14597   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
14598   match(Set dst (MinV src1 src2));
  format %{ "vpminsw    $dst,$src1,$src2\t! min packed8S" %}
14600   ins_encode %{
14601     int vector_len = 0;
14602     __ vpminsw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14603   %}
14604   ins_pipe( pipe_slow );
14605 %}
14606 
14607 instruct min16S_reg(vecY dst, vecY src1, vecY src2) %{
14608   predicate(UseAVX > 1 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
14609   match(Set dst (MinV src1 src2));
  format %{ "vpminsw    $dst,$src1,$src2\t! min packed16S" %}
14611   ins_encode %{
14612     int vector_len = 1;
14613     __ vpminsw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14614   %}
14615   ins_pipe( pipe_slow );
14616 %}
14617 
14618 instruct min32S_reg_evex(vecZ dst, vecZ src1, vecZ src2) %{
14619   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
14620   match(Set dst (MinV src1 src2));
  format %{ "vpminsw  $dst,$src1,$src2\t! min packed32S" %}
14622   ins_encode %{
14623     int vector_len = 2;
14624     __ vpminsw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14625   %}
14626   ins_pipe( pipe_slow );
14627 %}
14628 
14629 // Int vector Min
14630 instruct min2I_reg(vecD dst, vecD src1, vecD src2) %{
14631   predicate(UseSSE > 3 && UseAVX == 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
14632   match(Set dst (MinV src1 src2));
14633   effect(TEMP dst);
14634   format %{ "movsd   $dst,$src1\n\t"
            "pminsd  $dst,$src2\t! min packed2I" %}
14636   ins_encode %{
14637     __ movsd($dst$$XMMRegister, $src1$$XMMRegister);
14638     __ pminsd($dst$$XMMRegister, $src2$$XMMRegister);
14639   %}
14640   ins_pipe( pipe_slow );
14641 %}
14642 
14643 instruct min2I_reg_avx(vecD dst, vecD src1, vecD src2) %{
14644   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
14645   match(Set dst (MinV src1 src2));
  format %{ "vpminsd    $dst,$src1,$src2\t! min packed2I" %}
14647   ins_encode %{
14648     int vector_len = 0;
14649     __ vpminsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14650   %}
14651   ins_pipe( pipe_slow );
14652 %}
14653 
14654 instruct min4I_reg(vecX dst, vecX src1, vecX src2) %{
14655   predicate(UseSSE > 3 && UseAVX == 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
14656   match(Set dst (MinV src1 src2));
14657   effect(TEMP dst);
  format %{ "movdqu  $dst,$src1\n\t"
            "pminsd  $dst,$src2\t! min packed4I" %}
14660   ins_encode %{
14661     __ movdqu($dst$$XMMRegister, $src1$$XMMRegister);
14662     __ pminsd($dst$$XMMRegister, $src2$$XMMRegister);
14663   %}
14664   ins_pipe( pipe_slow );
14665 %}
14666 
14667 instruct min4I_reg_avx(vecX dst, vecX src1, vecX src2) %{
14668   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
14669   match(Set dst (MinV src1 src2));
  format %{ "vpminsd    $dst,$src1,$src2\t! min packed4I" %}
14671   ins_encode %{
14672     int vector_len = 0;
14673     __ vpminsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14674   %}
14675   ins_pipe( pipe_slow );
14676 %}
14677 
14678 instruct min4I_reg_evex(vecX dst, vecX src1, vecX src2) %{
14679   predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
14680   match(Set dst (MinV src1 src2));
  format %{ "vpminsd  $dst,$src1,$src2\t! min packed4I" %}
14682   ins_encode %{
14683     int vector_len = 0;
14684     __ vpminsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14685   %}
14686   ins_pipe( pipe_slow );
14687 %}
14688 
14689 instruct min8I_reg_avx(vecY dst, vecY src1, vecY src2) %{
14690   predicate(UseAVX > 1 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
14691   match(Set dst (MinV src1 src2));
  format %{ "vpminsd    $dst,$src1,$src2\t! min packed8I" %}
14693   ins_encode %{
14694     int vector_len = 1;
14695     __ vpminsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14696   %}
14697   ins_pipe( pipe_slow );
14698 %}
14699 
14700 instruct min8I_reg_evex(vecY dst, vecY src1, vecY src2) %{
14701   predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
14702   match(Set dst (MinV src1 src2));
  format %{ "vpminsd  $dst,$src1,$src2\t! min packed8I" %}
14704   ins_encode %{
14705     int vector_len = 1;
14706     __ vpminsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14707   %}
14708   ins_pipe( pipe_slow );
14709 %}
14710 
14711 instruct min16I_reg_evex(vecZ dst, vecZ src1, vecZ src2) %{
14712   predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
14713   match(Set dst (MinV src1 src2));
  format %{ "vpminsd  $dst,$src1,$src2\t! min packed16I" %}
14715   ins_encode %{
14716     int vector_len = 2;
14717     __ vpminsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14718   %}
14719   ins_pipe( pipe_slow );
14720 %}
14721 
14722 // Long vector Min
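// There is no packed signed 64-bit min before AVX-512, so these rules
// synthesize it: pcmpgtq (SSE4.2) computes a per-lane src1 > src2 mask and
// blendvpd selects the smaller element.  The non-AVX blendvpd reads its
// mask implicitly from XMM0, which is why the scratch operand is pinned to
// rxmm0.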
instruct min1L_reg(vecD dst, vecD src1, vecD src2, rxmm0 tmp) %{
14724   predicate(UseAVX == 0 && UseSSE >= 4 && n->as_Vector()->length() == 1 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
14725   match(Set dst (MinV src1 src2));
14726   effect(TEMP dst, TEMP tmp);
14727   format %{ "movsd     $tmp,$src1\n\t"
14728             "movsd     $dst,$src1\n\t"
14729             "pcmpgtq   $tmp,$src2\n\t"
14730             "blendvpd  $dst,$src2\t! " %}
14731   ins_encode %{
14732     __ movsd($tmp$$XMMRegister, $src1$$XMMRegister);
14733     __ movsd($dst$$XMMRegister, $src1$$XMMRegister);
14734     __ pcmpgtq($tmp$$XMMRegister, $src2$$XMMRegister);
14735     __ blendvpd($dst$$XMMRegister, $src2$$XMMRegister);
14736   %}
14737   ins_pipe( pipe_slow );
14738 %}
14739 
14740 instruct min1L_reg_avx(vecD dst, vecD src1, vecD src2) %{
14741   predicate(UseAVX > 0 && n->as_Vector()->length() == 1 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
14742   match(Set dst (MinV src1 src2));
14743   effect(TEMP dst);
14744   format %{ "vpcmpgtq   $dst,$src1,$src2\n\t"
14745             "vblendvpd  $dst,$src1,$src2,$dst\t! " %}
14746   ins_encode %{
14747     int vector_len = 0;
14748     __ vpcmpgtq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14749     __ vblendvpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vector_len);
14750   %}
14751   ins_pipe( pipe_slow );
14752 %}
14753 
14754 instruct min2L_reg(vecX dst, vecX src1, vecX src2, rxmm0 tmp) %{
14755   predicate(UseAVX == 0 && UseSSE >= 4 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
14756   match(Set dst (MinV src1 src2));
14757   effect(TEMP dst, TEMP tmp);
14758   format %{ "movdqu    $tmp,$src1\n\t"
14759             "movdqu    $dst,$src1\n\t"
14760             "pcmpgtq   $tmp,$src2\n\t"
14761             "blendvpd  $dst,$src2\t! " %}
14762   ins_encode %{
14763     __ movdqu($tmp$$XMMRegister, $src1$$XMMRegister);
14764     __ movdqu($dst$$XMMRegister, $src1$$XMMRegister);
14765     __ pcmpgtq($tmp$$XMMRegister, $src2$$XMMRegister);
14766     __ blendvpd($dst$$XMMRegister, $src2$$XMMRegister);
14767   %}
14768   ins_pipe( pipe_slow );
14769 %}
14770 
14771 instruct min2L_reg_avx(vecX dst, vecX src1, vecX src2) %{
14772   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
14773   match(Set dst (MinV src1 src2));
14774   effect(TEMP dst);
14775   format %{ "vpcmpgtq   $dst,$src1,$src2\n\t"
14776             "vblendvpd  $dst,$src1,$src2,$dst\t! " %}
14777   ins_encode %{
14778     int vector_len = 0;
14779     __ vpcmpgtq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14780     __ vblendvpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vector_len);
14781   %}
14782   ins_pipe( pipe_slow );
14783 %}
14784 
14785 instruct min4L_reg_avx(vecY dst, vecY src1, vecY src2) %{
14786   predicate(UseAVX > 1 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
14787   match(Set dst (MinV src1 src2));
14788   effect(TEMP dst);
14789   format %{ "vpcmpgtq   $dst,$src1,$src2\n\t"
14790             "vblendvpd  $dst,$src1,$src2,$dst\t! " %}
14791   ins_encode %{
14792     int vector_len = 1;
14793     __ vpcmpgtq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14794     __ vblendvpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vector_len);
14795   %}
14796   ins_pipe( pipe_slow );
14797 %}
14798 
14799 instruct min2L_reg_evex(vecX dst, vecX src1, vecX src2) %{
14800   predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
14801   match(Set dst (MinV src1 src2));
14802   format %{ "vpminsq  $dst,$src1,src2\t! " %}
14803   ins_encode %{
14804     __ vpminsq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, 0);
14805   %}
14806   ins_pipe( pipe_slow );
14807 %}
14808 
14809 instruct min4L_reg_evex(vecY dst, vecY src1, vecY src2) %{
14810   predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
14811   match(Set dst (MinV src1 src2));
14812   format %{ "vpminsq  $dst,$src1,src2\t! " %}
14813   ins_encode %{
14814     __ vpminsq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, 1);
14815   %}
14816   ins_pipe( pipe_slow );
14817 %}
14818 
14819 instruct min8L_reg_evex(vecZ dst, vecZ src1, vecZ src2) %{
14820   predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
14821   match(Set dst (MinV src1 src2));
14822   format %{ "vpminsq  $dst,$src1,src2\t! " %}
14823   ins_encode %{
14824     __ vpminsq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, 2);
14825   %}
14826   ins_pipe( pipe_slow );
14827 %}
14828 
14829 // Float vector Min
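// Note: minps/minpd (and the vminps/vminpd forms below) follow the x86 rule of
// returning the second operand when either input is NaN or when both inputs
// are zero, so the result is order-sensitive for those inputs.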
14830 instruct min2F_reg(vecD dst, vecD src1, vecD src2) %{
14831   predicate(UseSSE > 0 && UseAVX == 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
14832   match(Set dst (MinV src1 src2));
14833   effect(TEMP dst);
14834   format %{ "movsd  $dst,$src1\n\t"
14835             "minps  $dst,$src2\t! " %}
14836   ins_encode %{
14837     __ movsd($dst$$XMMRegister, $src1$$XMMRegister);
14838     __ minps($dst$$XMMRegister, $src2$$XMMRegister);
14839   %}
14840   ins_pipe( pipe_slow );
14841 %}
14842 
14843 instruct min2F_reg_avx(vecD dst, vecD src1, vecD src2) %{
14844   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
14845   match(Set dst (MinV src1 src2));
14846   format %{ "vminps  $dst,$src1,$src2\t! " %}
14847   ins_encode %{
14848     int vector_len = 0;
14849     __ vminps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14850   %}
14851   ins_pipe( pipe_slow );
14852 %}
14853 
14854 instruct min4F_reg(vecX dst, vecX src1, vecX src2) %{
14855   predicate(UseSSE > 0 && UseAVX == 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
14856   match(Set dst (MinV src1 src2));
14857   effect(TEMP dst);
14858   format %{ "movdqu  $dst,$src1\n\t"
14859             "minps   $dst,$src2\t! " %}
14860   ins_encode %{
14861     __ movdqu($dst$$XMMRegister, $src1$$XMMRegister);
14862     __ minps($dst$$XMMRegister, $src2$$XMMRegister);
14863   %}
14864   ins_pipe( pipe_slow );
14865 %}
14866 
14867 instruct min4F_reg_avx(vecX dst, vecX src1, vecX src2) %{
14868   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
14869   match(Set dst (MinV src1 src2));
14870   format %{ "vminps  $dst,$src1,$src2\t! " %}
14871   ins_encode %{
14872     int vector_len = 0;
14873     __ vminps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14874   %}
14875   ins_pipe( pipe_slow );
14876 %}
14877 
14878 instruct min4F_reg_evex(vecX dst, vecX src1, vecX src2) %{
14879   predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
14880   match(Set dst (MinV src1 src2));
14881   format %{ "vminps  $dst,$src1,$src2\t! " %}
14882   ins_encode %{
14883     int vector_len = 0;
14884     __ vminps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14885   %}
14886   ins_pipe( pipe_slow );
14887 %}
14888 
14889 instruct min8F_reg_avx(vecY dst, vecY src1, vecY src2) %{
14890   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
14891   match(Set dst (MinV src1 src2));
14892   format %{ "vminps  $dst,$src1,$src2\t! " %}
14893   ins_encode %{
14894     int vector_len = 1;
14895     __ vminps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14896   %}
14897   ins_pipe( pipe_slow );
14898 %}
14899 
14900 instruct min8F_reg_evex(vecY dst, vecY src1, vecY src2) %{
14901   predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
14902   match(Set dst (MinV src1 src2));
14903   format %{ "vminps  $dst,$src1,$src2\t! " %}
14904   ins_encode %{
14905     int vector_len = 1;
14906     __ vminps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14907   %}
14908   ins_pipe( pipe_slow );
14909 %}
14910 
14911 instruct min16F_reg_evex(vecZ dst, vecZ src1, vecZ src2) %{
14912   predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
14913   match(Set dst (MinV src1 src2));
14914   format %{ "vminps  $dst,$src1,$src2\t! " %}
14915   ins_encode %{
14916     int vector_len = 2;
14917     __ vminps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14918   %}
14919   ins_pipe( pipe_slow );
14920 %}
14921 
14922 // Double vector Min
14923 instruct minD_reg(vecD dst, vecD src1, vecD src2) %{
14924   predicate(UseSSE > 0 && UseAVX == 0 && n->as_Vector()->length() == 1 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
14925   match(Set dst (MinV src1 src2));
14926   effect(TEMP dst);
14927   format %{ "movsd  $dst,$src1\n\t"
14928             "minpd  $dst,$src2\t! " %}
14929   ins_encode %{
14931     __ movsd($dst$$XMMRegister, $src1$$XMMRegister);
14932     __ minpd($dst$$XMMRegister, $src2$$XMMRegister);
14933   %}
14934   ins_pipe( pipe_slow );
14935 %}
14936 
14937 instruct min1D_reg_avx(vecD dst, vecD src1, vecD src2) %{
14938   predicate(UseAVX > 0 && n->as_Vector()->length() == 1 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
14939   match(Set dst (MinV src1 src2));
14940   format %{ "vminpd  $dst,$src1,$src2\t! " %}
14941   ins_encode %{
14942     int vector_len = 0;
14943     __ vminpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14944   %}
14945   ins_pipe( pipe_slow );
14946 %}
14947 
14948 instruct min2D_reg(vecX dst, vecX src1, vecX src2) %{
14949   predicate(UseSSE > 0 && UseAVX == 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
14950   match(Set dst (MinV src1 src2));
14951   effect(TEMP dst);
14952   format %{ "movdqu  $dst,$src1\n\t"
14953             "minpd   $dst,$src2\t! " %}
14954   ins_encode %{
14956     __ movdqu($dst$$XMMRegister, $src1$$XMMRegister);
14957     __ minpd($dst$$XMMRegister, $src2$$XMMRegister);
14958   %}
14959   ins_pipe( pipe_slow );
14960 %}
14961 
14962 instruct min2D_reg_avx(vecX dst, vecX src1, vecX src2) %{
14963   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
14964   match(Set dst (MinV src1 src2));
14965   format %{ "vminpd  $dst,$src1,$src2\t! " %}
14966   ins_encode %{
14967     int vector_len = 0;
14968     __ vminpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14969   %}
14970   ins_pipe( pipe_slow );
14971 %}
14972 
14973 instruct min2D_reg_evex(vecX dst, vecX src1, vecX src2) %{
14974   predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
14975   match(Set dst (MinV src1 src2));
14976   format %{ "vminpd  $dst,$src1,$src2\t! " %}
14977   ins_encode %{
14978     int vector_len = 0;
14979     __ vminpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14980   %}
14981   ins_pipe( pipe_slow );
14982 %}
14983 
14984 instruct min4D_reg_avx(vecY dst, vecY src1, vecY src2) %{
14985   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
14986   match(Set dst (MinV src1 src2));
14987   format %{ "vminpd  $dst,$src1,$src2\t! " %}
14988   ins_encode %{
14989     int vector_len = 1;
14990     __ vminpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14991   %}
14992   ins_pipe( pipe_slow );
14993 %}
14994 
14995 instruct min4D_reg_evex(vecY dst, vecY src1, vecY src2) %{
14996   predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
14997   match(Set dst (MinV src1 src2));
14998   format %{ "vminpd  $dst,$src1,$src2\t! " %}
14999   ins_encode %{
15000     int vector_len = 1;
15001     __ vminpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
15002   %}
15003   ins_pipe( pipe_slow );
15004 %}
15005 
15006 instruct min8D_reg_evex(vecZ dst, vecZ src1, vecZ src2) %{
15007   predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
15008   match(Set dst (MinV src1 src2));
15009   format %{ "vminpd  $dst,$src1,$src2\t! " %}
15010   ins_encode %{
15011     int vector_len = 2;
15012     __ vminpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
15013   %}
15014   ins_pipe( pipe_slow );
15015 %}
15016 
15017 // ------------------------------ Max ---------------------------------------
15018 // Byte vector Max
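// pmaxsb is an SSE4.1 instruction, hence the UseSSE > 3 guard; the packed-word
// forms further down need only SSE2 (UseSSE > 1).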
15019 instruct max8B_reg(vecD dst, vecD src1, vecD src2) %{
15020   predicate(UseSSE > 3 && UseAVX == 0 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
15021   match(Set dst (MaxV src1 src2));
15022   effect(TEMP dst);
15023   format %{ "movsd   $dst,$src1\n\t"
15024             "pmaxsb  $dst,$src2\t! " %}
15025   ins_encode %{
15026     __ movsd($dst$$XMMRegister, $src1$$XMMRegister);
15027     __ pmaxsb($dst$$XMMRegister, $src2$$XMMRegister);
15028   %}
15029   ins_pipe( pipe_slow );
15030 %}
15031 
15032 instruct max8B_reg_avx(vecD dst, vecD src1, vecD src2) %{
15033   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
15034   match(Set dst (MaxV src1 src2));
15035   format %{ "vpmaxsb    $dst,$src1,$src2\t! " %}
15036   ins_encode %{
15037     int vector_len = 0;
15038     __ vpmaxsb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
15039   %}
15040   ins_pipe( pipe_slow );
15041 %}
15042 
15043 instruct max16B_reg(vecX dst, vecX src1, vecX src2) %{
15044   predicate(UseSSE > 3 && UseAVX == 0 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
15045   match(Set dst (MaxV src1 src2));
15046   effect(TEMP dst);
15047   format %{ "movdqu  $dst,$src1\n\t"
15048             "pmaxsb  $dst,$src2\t! " %}
15049   ins_encode %{
15050     __ movdqu($dst$$XMMRegister, $src1$$XMMRegister);
15051     __ pmaxsb($dst$$XMMRegister, $src2$$XMMRegister);
15052   %}
15053   ins_pipe( pipe_slow );
15054 %}
15055 
15056 instruct max16B_reg_avx(vecX dst, vecX src1, vecX src2) %{
15057   predicate(UseAVX > 0 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
15058   match(Set dst (MaxV src1 src2));
15059   format %{ "vpmaxsb    $dst,$src1,$src2\t! " %}
15060   ins_encode %{
15061     int vector_len = 0;
15062     __ vpmaxsb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
15063   %}
15064   ins_pipe( pipe_slow );
15065 %}
15066 
15067 instruct max32B_reg(vecY dst, vecY src1, vecY src2) %{
15068   predicate(UseAVX > 1 && n->as_Vector()->length() == 32 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
15069   match(Set dst (MaxV src1 src2));
15070   format %{ "vpmaxsb    $dst,$src1,$src2\t! " %}
15071   ins_encode %{
15072     int vector_len = 1;
15073     __ vpmaxsb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
15074   %}
15075   ins_pipe( pipe_slow );
15076 %}
15077 
15078 instruct max64B_reg_evex(vecZ dst, vecZ src1, vecZ src2) %{
15079   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
15080   match(Set dst (MaxV src1 src2));
15081   format %{ "vpmaxsb  $dst,$src1,$src2\t! " %}
15082   ins_encode %{
15083     int vector_len = 2;
15084     __ vpmaxsb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
15085   %}
15086   ins_pipe( pipe_slow );
15087 %}
15088 
// Short vector Max
15090 instruct max4S_reg(vecD dst, vecD src1, vecD src2) %{
15091   predicate(UseSSE > 1 && UseAVX == 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
15092   match(Set dst (MaxV src1 src2));
15093   effect(TEMP dst);
15094   format %{ "movsd   $dst,$src1\n\t"
15095             "pmaxsw  $dst,$src2\t! " %}
15096   ins_encode %{
15097     __ movsd($dst$$XMMRegister, $src1$$XMMRegister);
15098     __ pmaxsw($dst$$XMMRegister, $src2$$XMMRegister);
15099   %}
15100   ins_pipe( pipe_slow );
15101 %}
15102 
15103 instruct max4S_reg_avx(vecD dst, vecD src1, vecD src2) %{
15104   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
15105   match(Set dst (MaxV src1 src2));
15106   format %{ "vpmaxsw    $dst,$src1,$src2\t! " %}
15107   ins_encode %{
15108     int vector_len = 0;
15109     __ vpmaxsw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
15110   %}
15111   ins_pipe( pipe_slow );
15112 %}
15113 
15114 instruct max8S_reg(vecX dst, vecX src1, vecX src2) %{
15115   predicate(UseSSE > 1 && UseAVX == 0 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
15116   match(Set dst (MaxV src1 src2));
15117   effect(TEMP dst);
15118   format %{ "movdqu  $dst,$src1\n\t"
15119             "pmaxsw  $dst,$src2\t! " %}
15120   ins_encode %{
15121     __ movdqu($dst$$XMMRegister, $src1$$XMMRegister);
15122     __ pmaxsw($dst$$XMMRegister, $src2$$XMMRegister);
15123   %}
15124   ins_pipe( pipe_slow );
15125 %}
15126 
15127 instruct max8S_reg_avx(vecX dst, vecX src1, vecX src2) %{
15128   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
15129   match(Set dst (MaxV src1 src2));
15130   format %{ "vpmaxsw    $dst,$src1,$src2\t! " %}
15131   ins_encode %{
15132     int vector_len = 0;
15133     __ vpmaxsw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
15134   %}
15135   ins_pipe( pipe_slow );
15136 %}
15137 
15138 instruct max16S_reg(vecY dst, vecY src1, vecY src2) %{
15139   predicate(UseAVX > 1 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
15140   match(Set dst (MaxV src1 src2));
15141   format %{ "vpmaxsw    $dst,$src1,$src2\t! " %}
15142   ins_encode %{
15143     int vector_len = 1;
15144     __ vpmaxsw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
15145   %}
15146   ins_pipe( pipe_slow );
15147 %}
15148 
15149 instruct max32S_reg_evex(vecZ dst, vecZ src1, vecZ src2) %{
15150   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
15151   match(Set dst (MaxV src1 src2));
15152   format %{ "vpmaxsw  $dst,$src1,$src2\t! " %}
15153   ins_encode %{
15154     int vector_len = 2;
15155     __ vpmaxsw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
15156   %}
15157   ins_pipe( pipe_slow );
15158 %}
15159 
15160 // Int vector Max
15161 instruct max2I_reg(vecD dst, vecD src1, vecD src2) %{
15162   predicate(UseSSE > 3 && UseAVX == 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
15163   match(Set dst (MaxV src1 src2));
15164   effect(TEMP dst);
15165   format %{ "movdqu  $dst,$src1\n\t"
15166             "pmaxsd  $dst,$src2\t! " %}
15167   ins_encode %{
15168     __ movdqu($dst$$XMMRegister, $src1$$XMMRegister);
15169     __ pmaxsd($dst$$XMMRegister, $src2$$XMMRegister);
15170   %}
15171   ins_pipe( pipe_slow );
15172 %}
15173 
15174 instruct max2I_reg_avx(vecD dst, vecD src1, vecD src2) %{
15175   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
15176   match(Set dst (MaxV src1 src2));
15177   format %{ "vpmaxsd    $dst,$src1,$src2\t! " %}
15178   ins_encode %{
15179     int vector_len = 0;
15180     __ vpmaxsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
15181   %}
15182   ins_pipe( pipe_slow );
15183 %}
15184 
15185 instruct max4I_reg(vecX dst, vecX src1, vecX src2) %{
15186   predicate(UseSSE > 3 && UseAVX == 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
15187   match(Set dst (MaxV src1 src2));
15188   effect(TEMP dst);
15189   format %{ "movdqu  $dst,$src1\n\t"
15190             "pmaxsd  $dst,$src2\t! " %}
15191   ins_encode %{
15192     __ movdqu($dst$$XMMRegister, $src1$$XMMRegister);
15193     __ pmaxsd($dst$$XMMRegister, $src2$$XMMRegister);
15194   %}
15195   ins_pipe( pipe_slow );
15196 %}
15197 
15198 instruct max4I_reg_avx(vecX dst, vecX src1, vecX src2) %{
15199   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
15200   match(Set dst (MaxV src1 src2));
15201   format %{ "vpmaxsd    $dst,$src1,$src2\t! " %}
15202   ins_encode %{
15203     int vector_len = 0;
15204     __ vpmaxsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
15205   %}
15206   ins_pipe( pipe_slow );
15207 %}
15208 
15209 instruct max4I_reg_evex(vecX dst, vecX src1, vecX src2) %{
15210   predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
15211   match(Set dst (MaxV src1 src2));
15212   format %{ "vpmaxsd  $dst,$src1,$src2\t! " %}
15213   ins_encode %{
15214     int vector_len = 0;
15215     __ vpmaxsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
15216   %}
15217   ins_pipe( pipe_slow );
15218 %}
15219 
15220 instruct max8I_reg_avx(vecY dst, vecY src1, vecY src2) %{
15221   predicate(UseAVX > 1 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
15222   match(Set dst (MaxV src1 src2));
15223   format %{ "vpmaxsd    $dst,$src1,$src2\t! " %}
15224   ins_encode %{
15225     int vector_len = 1;
15226     __ vpmaxsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
15227   %}
15228   ins_pipe( pipe_slow );
15229 %}
15230 
15231 instruct max8I_reg_evex(vecY dst, vecY src1, vecY src2) %{
15232   predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
15233   match(Set dst (MaxV src1 src2));
15234   format %{ "vpmaxsd  $dst,$src1,$src2\t! " %}
15235   ins_encode %{
15236     int vector_len = 1;
15237     __ vpmaxsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
15238   %}
15239   ins_pipe( pipe_slow );
15240 %}
15241 
15242 instruct max16I_reg_evex(vecZ dst, vecZ src1, vecZ src2) %{
15243   predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
15244   match(Set dst (MaxV src1 src2));
15245   format %{ "vpmaxsd  $dst,$src1,$src2\t! " %}
15246   ins_encode %{
15247     int vector_len = 2;
15248     __ vpmaxsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
15249   %}
15250   ins_pipe( pipe_slow );
15251 %}
15252 
// Long vector Max
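// As with the long vector min above, the pre-AVX-512 forms build the max from
// pcmpgtq and a blend: with mask = (src1 > src2), selecting src1 wherever the
// mask is set yields max(src1, src2).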
15254 instruct maxL_reg(vecD dst, vecD src1, vecD src2, rxmm0 tmp) %{
15255   predicate(UseAVX == 0 && UseSSE >= 4 && n->as_Vector()->length() == 1 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
15256   match(Set dst (MaxV src1 src2));
15257   effect(TEMP dst, TEMP tmp);
15258   format %{ "movsd     $tmp,$src1\n\t"
15259             "movsd     $dst,$src1\n\t"
15260             "pcmpgtq   $tmp,$src2\n\t"
15261             "blendvpd  $dst,$src2\t! " %}
15262   ins_encode %{
15263     __ movsd($tmp$$XMMRegister, $src1$$XMMRegister);
15264     __ movsd($dst$$XMMRegister, $src2$$XMMRegister);
15265     __ pcmpgtq($tmp$$XMMRegister, $src2$$XMMRegister);
15266     __ blendvpd($dst$$XMMRegister, $src1$$XMMRegister);
15267   %}
15268   ins_pipe( pipe_slow );
15269 %}
15270 
15271 instruct max1L_reg_avx(vecD dst, vecD src1, vecD src2) %{
15272   predicate(UseAVX > 0 && n->as_Vector()->length() == 1 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
15273   match(Set dst (MaxV src1 src2));
15274   effect(TEMP dst);
15275   format %{ "vpcmpgtq   $dst,$src1,$src2\n\t"
15276             "vblendvpd  $dst,$src2,$src1,$dst\t! " %}
15277   ins_encode %{
15278     int vector_len = 0;
15279     __ vpcmpgtq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
15280     __ vblendvpd($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, $dst$$XMMRegister, vector_len);
15281   %}
15282   ins_pipe( pipe_slow );
15283 %}
15284 
15285 instruct max2L_reg(vecX dst, vecX src1, vecX src2, rxmm0 tmp) %{
15286   predicate(UseAVX == 0 && UseSSE >= 4 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
15287   match(Set dst (MaxV src1 src2));
15288   effect(TEMP dst, TEMP tmp);
15289   format %{ "movdqu    $tmp,$src2\n\t"
15290             "movdqu    $dst,$src1\n\t"
15291             "pcmpgtq   $tmp,$src1\n\t"
15292             "blendvpd  $dst,$src2\t! " %}
15293   ins_encode %{
15294     __ movdqu($tmp$$XMMRegister, $src2$$XMMRegister);
15295     __ movdqu($dst$$XMMRegister, $src1$$XMMRegister);
15296     __ pcmpgtq($tmp$$XMMRegister, $src1$$XMMRegister);
15297     __ blendvpd($dst$$XMMRegister, $src2$$XMMRegister);
15298   %}
15299   ins_pipe( pipe_slow );
15300 %}
15301 
15302 instruct max2L_reg_avx(vecX dst, vecX src1, vecX src2) %{
15303   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
15304   match(Set dst (MaxV src1 src2));
15305   effect(TEMP dst);
15306   format %{ "vpcmpgtq   $dst,$src1,$src2\n\t"
15307             "vblendvpd  $dst,$src2,$src1,$dst\t! " %}
15308   ins_encode %{
15309     int vector_len = 0;
15310     __ vpcmpgtq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
15311     __ vblendvpd($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, $dst$$XMMRegister, vector_len);
15312   %}
15313   ins_pipe( pipe_slow );
15314 %}
15315 
15316 instruct max2L_reg_evex(vecX dst, vecX src1, vecX src2) %{
15317   predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
15318   match(Set dst (MaxV src1 src2));
15319   format %{ "vpmaxsq  $dst,$src1,src2\t! " %}
15320   ins_encode %{
15321     __ vpmaxsq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, 0);
15322   %}
15323   ins_pipe( pipe_slow );
15324 %}
15325 
15326 instruct max4L_reg_avx(vecY dst, vecY src1, vecY src2) %{
15327   predicate(UseAVX > 1 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
15328   match(Set dst (MaxV src1 src2));
15329   effect(TEMP dst);
15330   format %{ "vpcmpgtq   $dst,$src1,$src2\n\t"
15331             "vblendvpd  $dst,$src2,$src1,$dst\t! " %}
15332   ins_encode %{
15333     int vector_len = 1;
15334     __ vpcmpgtq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
15335     __ vblendvpd($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, $dst$$XMMRegister, vector_len);
15336   %}
15337   ins_pipe( pipe_slow );
15338 %}
15339 
15340 instruct max4L_reg_evex(vecY dst, vecY src1, vecY src2) %{
15341   predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
15342   match(Set dst (MaxV src1 src2));
15343   format %{ "vpmaxsq  $dst,$src1,src2\t! " %}
15344   ins_encode %{
15345     __ vpmaxsq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, 1);
15346   %}
15347   ins_pipe( pipe_slow );
15348 %}
15349 
15350 instruct max8L_reg_evex(vecZ dst, vecZ src1, vecZ src2) %{
15351   predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
15352   match(Set dst (MaxV src1 src2));
15353   format %{ "vpmaxsq  $dst,$src1,src2\t! " %}
15354   ins_encode %{
15355     __ vpmaxsq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, 2);
15356   %}
15357   ins_pipe( pipe_slow );
15358 %}
15359 
// Float vector Max
15361 instruct max2F_reg(vecD dst, vecD src1, vecD src2) %{
15362   predicate(UseSSE > 0 && UseAVX == 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
15363   match(Set dst (MaxV src1 src2));
15364   effect(TEMP dst);
15365   format %{ "movsd  $dst,$src1\n\t"
15366             "maxps  $dst,$src2\t! " %}
15367   ins_encode %{
15368     __ movsd($dst$$XMMRegister, $src1$$XMMRegister);
15369     __ maxps($dst$$XMMRegister, $src2$$XMMRegister);
15370   %}
15371   ins_pipe( pipe_slow );
15372 %}
15373 
15374 instruct max2F_reg_avx(vecD dst, vecD src1, vecD src2) %{
15375   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
15376   match(Set dst (MaxV src1 src2));
15377   format %{ "vmaxps  $dst,$src1,$src2\t! " %}
15378   ins_encode %{
15379     int vector_len = 0;
15380     __ vmaxps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
15381   %}
15382   ins_pipe( pipe_slow );
15383 %}
15384 
15385 instruct max4F_reg(vecX dst, vecX src1, vecX src2) %{
15386   predicate(UseSSE > 0 && UseAVX == 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
15387   match(Set dst (MaxV src1 src2));
15388   effect(TEMP dst);
15389   format %{ "movdqu  $dst,$src1\n\t"
15390             "maxps   $dst,$src2\t! " %}
15391   ins_encode %{
15392     __ movdqu($dst$$XMMRegister, $src1$$XMMRegister);
15393     __ maxps($dst$$XMMRegister, $src2$$XMMRegister);
15394   %}
15395   ins_pipe( pipe_slow );
15396 %}
15397 
15398 instruct max4F_reg_avx(vecX dst, vecX src1, vecX src2) %{
15399   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
15400   match(Set dst (MaxV src1 src2));
15401   format %{ "vmaxps  $dst,$src1,$src2\t! " %}
15402   ins_encode %{
15403     int vector_len = 0;
15404     __ vmaxps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
15405   %}
15406   ins_pipe( pipe_slow );
15407 %}
15408 
15409 instruct max4F_reg_evex(vecX dst, vecX src1, vecX src2) %{
15410   predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
15411   match(Set dst (MaxV src1 src2));
15412   format %{ "vmaxps  $dst,$src1,$src2\t! " %}
15413   ins_encode %{
15414     int vector_len = 0;
15415     __ vmaxps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
15416   %}
15417   ins_pipe( pipe_slow );
15418 %}
15419 
15420 instruct max8F_reg_avx(vecY dst, vecY src1, vecY src2) %{
15421   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
15422   match(Set dst (MaxV src1 src2));
15423   format %{ "vmaxps  $dst,$src1,$src2\t! " %}
15424   ins_encode %{
15425     int vector_len = 1;
15426     __ vmaxps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
15427   %}
15428   ins_pipe( pipe_slow );
15429 %}
15430 
15431 instruct max8F_reg_evex(vecY dst, vecY src1, vecY src2) %{
15432   predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
15433   match(Set dst (MaxV src1 src2));
15434   format %{ "vmaxps  $dst,$src1,$src2\t! " %}
15435   ins_encode %{
15436     int vector_len = 1;
15437     __ vmaxps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
15438   %}
15439   ins_pipe( pipe_slow );
15440 %}
15441 
15442 instruct max16F_reg_evex(vecZ dst, vecZ src1, vecZ src2) %{
15443   predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
15444   match(Set dst (MaxV src1 src2));
15445   format %{ "vmaxps  $dst,$src1,$src2\t! " %}
15446   ins_encode %{
15447     int vector_len = 2;
15448     __ vmaxps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
15449   %}
15450   ins_pipe( pipe_slow );
15451 %}
15452 
// Double vector Max
15454 instruct maxD_reg(vecD dst, vecD src1, vecD src2) %{
15455   predicate(UseSSE > 0 && UseAVX == 0 && n->as_Vector()->length() == 1 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
15456   match(Set dst (MaxV src1 src2));
15457   effect(TEMP dst);
15458   format %{ "movsd  $dst,$src1\n\t"
15459             "maxpd  $dst,$src2\t! " %}
15460   ins_encode %{
15461     __ movsd($dst$$XMMRegister, $src1$$XMMRegister);
15462     __ maxpd($dst$$XMMRegister, $src2$$XMMRegister);
15463   %}
15464   ins_pipe( pipe_slow );
15465 %}
15466 
15467 instruct max1D_reg_avx(vecD dst, vecD src1, vecD src2) %{
15468   predicate(UseAVX > 0 && n->as_Vector()->length() == 1 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
15469   match(Set dst (MaxV src1 src2));
15470   format %{ "vmaxpd  $dst,$src1,$src2\t! " %}
15471   ins_encode %{
15472     int vector_len = 0;
15473     __ vmaxpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
15474   %}
15475   ins_pipe( pipe_slow );
15476 %}
15477 
15478 instruct max2D_reg(vecX dst, vecX src1, vecX src2) %{
15479   predicate(UseSSE > 0 && UseAVX == 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
15480   match(Set dst (MaxV src1 src2));
15481   effect(TEMP dst);
15482   format %{ "movdqu  $dst,$src1\n\t"
15483             "maxpd   $dst,$src2\t! " %}
15484   ins_encode %{
15485     __ movdqu($dst$$XMMRegister, $src1$$XMMRegister);
15486     __ maxpd($dst$$XMMRegister, $src2$$XMMRegister);
15487   %}
15488   ins_pipe( pipe_slow );
15489 %}
15490 
15491 instruct max2D_reg_avx(vecX dst, vecX src1, vecX src2) %{
15492   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
15493   match(Set dst (MaxV src1 src2));
15494   format %{ "vmaxpd  $dst,$src1,$src2\t! " %}
15495   ins_encode %{
15496     int vector_len = 0;
15497     __ vmaxpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
15498   %}
15499   ins_pipe( pipe_slow );
15500 %}
15501 
15502 instruct max2D_reg_evex(vecX dst, vecX src1, vecX src2) %{
15503   predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
15504   match(Set dst (MaxV src1 src2));
15505   format %{ "vmaxpd  $dst,$src1,$src2\t! " %}
15506   ins_encode %{
15507     int vector_len = 0;
15508     __ vmaxpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
15509   %}
15510   ins_pipe( pipe_slow );
15511 %}
15512 
15513 instruct max4D_reg_avx(vecY dst, vecY src1, vecY src2) %{
15514   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
15515   match(Set dst (MaxV src1 src2));
15516   format %{ "vmaxpd  $dst,$src1,$src2\t! " %}
15517   ins_encode %{
15518     int vector_len = 1;
15519     __ vmaxpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
15520   %}
15521   ins_pipe( pipe_slow );
15522 %}
15523 
15524 instruct max4D_reg_evex(vecY dst, vecY src1, vecY src2) %{
15525   predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
15526   match(Set dst (MaxV src1 src2));
15527   format %{ "vmaxpd  $dst,$src1,$src2\t! " %}
15528   ins_encode %{
15529     int vector_len = 1;
15530     __ vmaxpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
15531   %}
15532   ins_pipe( pipe_slow );
15533 %}
15534 
15535 instruct max8D_reg_evex(vecZ dst, vecZ src1, vecZ src2) %{
15536   predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
15537   match(Set dst (MaxV src1 src2));
15538   format %{ "vmaxpd  $dst,$src1,$src2\t! " %}
15539   ins_encode %{
15540     int vector_len = 2;
15541     __ vmaxpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
15542   %}
15543   ins_pipe( pipe_slow );
15544 %}
15545 
15546 // ------------------------------ Shift ---------------------------------------
15547 
// Left and right shift count vectors are the same on x86
// (only the low 64 bits of the xmm register are used for the count).
15550 instruct vshiftcnt(vecS dst, rRegI cnt) %{
15551   match(Set dst (LShiftCntV cnt));
15552   match(Set dst (RShiftCntV cnt));
15553   format %{ "movd    $dst,$cnt\t! load shift count" %}
15554   ins_encode %{
15555     __ movdl($dst$$XMMRegister, $cnt$$Register);
15556   %}
15557   ins_pipe( pipe_slow );
15558 %}
15559 
15560 // --------------------------------- Sqrt --------------------------------------
15561 
15562 // Floating point vector sqrt
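// Each width has a register form and a _mem form; the latter folds the vector
// load into the sqrt by passing the memory operand ($mem$$Address) directly.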
15563 instruct vsqrt2D_reg(vecX dst, vecX src) %{
15564   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
15565   match(Set dst (SqrtVD src));
15566   format %{ "vsqrtpd  $dst,$src\t! sqrt packed2D" %}
15567   ins_encode %{
15568     int vector_len = 0;
15569     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
15570   %}
15571   ins_pipe( pipe_slow );
15572 %}
15573 
15574 instruct vsqrt2D_mem(vecX dst, memory mem) %{
15575   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
15576   match(Set dst (SqrtVD (LoadVector mem)));
15577   format %{ "vsqrtpd  $dst,$mem\t! sqrt packed2D" %}
15578   ins_encode %{
15579     int vector_len = 0;
15580     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
15581   %}
15582   ins_pipe( pipe_slow );
15583 %}
15584 
15585 instruct vsqrt4D_reg(vecY dst, vecY src) %{
15586   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
15587   match(Set dst (SqrtVD src));
15588   format %{ "vsqrtpd  $dst,$src\t! sqrt packed4D" %}
15589   ins_encode %{
15590     int vector_len = 1;
15591     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
15592   %}
15593   ins_pipe( pipe_slow );
15594 %}
15595 
15596 instruct vsqrt4D_mem(vecY dst, memory mem) %{
15597   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
15598   match(Set dst (SqrtVD (LoadVector mem)));
15599   format %{ "vsqrtpd  $dst,$mem\t! sqrt packed4D" %}
15600   ins_encode %{
15601     int vector_len = 1;
15602     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
15603   %}
15604   ins_pipe( pipe_slow );
15605 %}
15606 
15607 instruct vsqrt8D_reg(vecZ dst, vecZ src) %{
15608   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
15609   match(Set dst (SqrtVD src));
15610   format %{ "vsqrtpd  $dst,$src\t! sqrt packed8D" %}
15611   ins_encode %{
15612     int vector_len = 2;
15613     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
15614   %}
15615   ins_pipe( pipe_slow );
15616 %}
15617 
15618 instruct vsqrt8D_mem(vecZ dst, memory mem) %{
15619   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
15620   match(Set dst (SqrtVD (LoadVector mem)));
15621   format %{ "vsqrtpd  $dst,$mem\t! sqrt packed8D" %}
15622   ins_encode %{
15623     int vector_len = 2;
15624     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
15625   %}
15626   ins_pipe( pipe_slow );
15627 %}
15628 
15629 instruct vsqrt2F_reg(vecD dst, vecD src) %{
15630   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
15631   match(Set dst (SqrtVF src));
15632   format %{ "vsqrtps  $dst,$src\t! sqrt packed2F" %}
15633   ins_encode %{
15634     int vector_len = 0;
15635     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
15636   %}
15637   ins_pipe( pipe_slow );
15638 %}
15639 
15640 instruct vsqrt2F_mem(vecD dst, memory mem) %{
15641   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
15642   match(Set dst (SqrtVF (LoadVector mem)));
15643   format %{ "vsqrtps  $dst,$mem\t! sqrt packed2F" %}
15644   ins_encode %{
15645     int vector_len = 0;
15646     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vector_len);
15647   %}
15648   ins_pipe( pipe_slow );
15649 %}
15650 
15651 instruct vsqrt4F_reg(vecX dst, vecX src) %{
15652   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
15653   match(Set dst (SqrtVF src));
15654   format %{ "vsqrtps  $dst,$src\t! sqrt packed4F" %}
15655   ins_encode %{
15656     int vector_len = 0;
15657     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
15658   %}
15659   ins_pipe( pipe_slow );
15660 %}
15661 
15662 instruct vsqrt4F_mem(vecX dst, memory mem) %{
15663   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
15664   match(Set dst (SqrtVF (LoadVector mem)));
15665   format %{ "vsqrtps  $dst,$mem\t! sqrt packed4F" %}
15666   ins_encode %{
15667     int vector_len = 0;
15668     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vector_len);
15669   %}
15670   ins_pipe( pipe_slow );
15671 %}
15672 
15673 instruct vsqrt8F_reg(vecY dst, vecY src) %{
15674   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
15675   match(Set dst (SqrtVF src));
15676   format %{ "vsqrtps  $dst,$src\t! sqrt packed8F" %}
15677   ins_encode %{
15678     int vector_len = 1;
15679     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
15680   %}
15681   ins_pipe( pipe_slow );
15682 %}
15683 
15684 instruct vsqrt8F_mem(vecY dst, memory mem) %{
15685   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
15686   match(Set dst (SqrtVF (LoadVector mem)));
15687   format %{ "vsqrtps  $dst,$mem\t! sqrt packed8F" %}
15688   ins_encode %{
15689     int vector_len = 1;
15690     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vector_len);
15691   %}
15692   ins_pipe( pipe_slow );
15693 %}
15694 
15695 instruct vsqrt16F_reg(vecZ dst, vecZ src) %{
15696   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
15697   match(Set dst (SqrtVF src));
15698   format %{ "vsqrtps  $dst,$src\t! sqrt packed16F" %}
15699   ins_encode %{
15700     int vector_len = 2;
15701     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
15702   %}
15703   ins_pipe( pipe_slow );
15704 %}
15705 
15706 instruct vsqrt16F_mem(vecZ dst, memory mem) %{
15707   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
15708   match(Set dst (SqrtVF (LoadVector mem)));
15709   format %{ "vsqrtps  $dst,$mem\t! sqrt packed16F" %}
15710   ins_encode %{
15711     int vector_len = 2;
15712     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vector_len);
15713   %}
15714   ins_pipe( pipe_slow );
15715 %}
15716 
15717 // ------------------------------ LeftShift -----------------------------------
15718 
15719 // Shorts/Chars vector left shift
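// Each packed-short shift comes in several flavors keyed to the CPU features:
// a destructive SSE form (dst shifted in place), VEX-encoded forms for plain
// AVX/AVX2 (supports_avxonly), EVEX forms when AVX512BW is present, and
// *_special forms for AVX-512 chips without BW support, which fall back to a
// destructive update with the source register as a temp.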
15720 instruct vsll2S(vecS dst, vecS shift) %{
15721   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
15722   match(Set dst (LShiftVS dst shift));
15723   format %{ "psllw   $dst,$shift\t! left shift packed2S" %}
15724   ins_encode %{
15725     __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
15726   %}
15727   ins_pipe( pipe_slow );
15728 %}
15729 
15730 instruct vsll2S_imm(vecS dst, immI8 shift) %{
15731   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
15732   match(Set dst (LShiftVS dst shift));
15733   format %{ "psllw   $dst,$shift\t! left shift packed2S" %}
15734   ins_encode %{
15735     __ psllw($dst$$XMMRegister, (int)$shift$$constant);
15736   %}
15737   ins_pipe( pipe_slow );
15738 %}
15739 
15740 instruct vsll2S_reg_avx(vecS dst, vecS src, vecS shift) %{
15741   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
15742   match(Set dst (LShiftVS src shift));
15743   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
15744   ins_encode %{
15745     int vector_len = 0;
15746     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
15747   %}
15748   ins_pipe( pipe_slow );
15749 %}
15750 
15751 instruct vsll2S_reg_evex(vecS dst, vecS src, vecS shift) %{
15752   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
15753   match(Set dst (LShiftVS src shift));
15754   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
15755   ins_encode %{
15756     int vector_len = 0;
15757     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
15758   %}
15759   ins_pipe( pipe_slow );
15760 %}
15761 
15762 instruct vsll2S_reg_evex_special(vecS dst, vecS src, vecS shift) %{
15763   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
15764   match(Set dst (LShiftVS dst shift));
15765   effect(TEMP src);
15766   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
15767   ins_encode %{
15768     int vector_len = 0;
15769     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
15770   %}
15771   ins_pipe( pipe_slow );
15772 %}
15773 
15774 instruct vsll2S_reg_imm_avx(vecS dst, vecS src, immI8 shift) %{
15775   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
15776   match(Set dst (LShiftVS src shift));
15777   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
15778   ins_encode %{
15779     int vector_len = 0;
15780     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
15781   %}
15782   ins_pipe( pipe_slow );
15783 %}
15784 
15785 instruct vsll2S_reg_imm_evex(vecS dst, vecS src, immI8 shift) %{
15786   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
15787   match(Set dst (LShiftVS src shift));
15788   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
15789   ins_encode %{
15790     int vector_len = 0;
15791     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
15792   %}
15793   ins_pipe( pipe_slow );
15794 %}
15795 
15796 instruct vsll2S_reg_imm_evex_special(vecS dst, vecS src, immI8 shift) %{
15797   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
15798   match(Set dst (LShiftVS dst shift));
15799   effect(TEMP src);
15800   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
15801   ins_encode %{
15802     int vector_len = 0;
15803     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
15804   %}
15805   ins_pipe( pipe_slow );
15806 %}
15807 
15808 instruct vsll4S(vecD dst, vecS shift) %{
15809   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
15810   match(Set dst (LShiftVS dst shift));
15811   format %{ "psllw   $dst,$shift\t! left shift packed4S" %}
15812   ins_encode %{
15813     __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
15814   %}
15815   ins_pipe( pipe_slow );
15816 %}
15817 
15818 instruct vsll4S_imm(vecD dst, immI8 shift) %{
15819   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
15820   match(Set dst (LShiftVS dst shift));
15821   format %{ "psllw   $dst,$shift\t! left shift packed4S" %}
15822   ins_encode %{
15823     __ psllw($dst$$XMMRegister, (int)$shift$$constant);
15824   %}
15825   ins_pipe( pipe_slow );
15826 %}
15827 
15828 instruct vsll4S_reg_avx(vecD dst, vecD src, vecS shift) %{
15829   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
15830   match(Set dst (LShiftVS src shift));
15831   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
15832   ins_encode %{
15833     int vector_len = 0;
15834     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
15835   %}
15836   ins_pipe( pipe_slow );
15837 %}
15838 
15839 instruct vsll4S_reg_evex(vecD dst, vecD src, vecS shift) %{
15840   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
15841   match(Set dst (LShiftVS src shift));
15842   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
15843   ins_encode %{
15844     int vector_len = 0;
15845     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
15846   %}
15847   ins_pipe( pipe_slow );
15848 %}
15849 
15850 instruct vsll4S_reg_evex_special(vecD dst, vecD src, vecS shift) %{
15851   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
15852   match(Set dst (LShiftVS dst shift));
15853   effect(TEMP src);
15854   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
15855   ins_encode %{
15856     int vector_len = 0;
15857     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
15858   %}
15859   ins_pipe( pipe_slow );
15860 %}
15861 
15862 instruct vsll4S_reg_imm_avx(vecD dst, vecD src, immI8 shift) %{
15863   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
15864   match(Set dst (LShiftVS src shift));
15865   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
15866   ins_encode %{
15867     int vector_len = 0;
15868     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
15869   %}
15870   ins_pipe( pipe_slow );
15871 %}
15872 
15873 instruct vsll4S_reg_imm_evex(vecD dst, vecD src, immI8 shift) %{
15874   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
15875   match(Set dst (LShiftVS src shift));
15876   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
15877   ins_encode %{
15878     int vector_len = 0;
15879     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
15880   %}
15881   ins_pipe( pipe_slow );
15882 %}
15883 
15884 instruct vsll4S_reg_imm_evex_special(vecD dst, vecD src, immI8 shift) %{
15885   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
15886   match(Set dst (LShiftVS dst shift));
15887   effect(TEMP src);
15888   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
15889   ins_encode %{
15890     int vector_len = 0;
15891     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
15892   %}
15893   ins_pipe( pipe_slow );
15894 %}
15895 
15896 instruct vsll8S(vecX dst, vecS shift) %{
15897   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
15898   match(Set dst (LShiftVS dst shift));
15899   format %{ "psllw   $dst,$shift\t! left shift packed8S" %}
15900   ins_encode %{
15901     __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
15902   %}
15903   ins_pipe( pipe_slow );
15904 %}
15905 
15906 instruct vsll8S_imm(vecX dst, immI8 shift) %{
15907   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
15908   match(Set dst (LShiftVS dst shift));
15909   format %{ "psllw   $dst,$shift\t! left shift packed8S" %}
15910   ins_encode %{
15911     __ psllw($dst$$XMMRegister, (int)$shift$$constant);
15912   %}
15913   ins_pipe( pipe_slow );
15914 %}
15915 
15916 instruct vsll8S_reg_avx(vecX dst, vecX src, vecS shift) %{
15917   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
15918   match(Set dst (LShiftVS src shift));
15919   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
15920   ins_encode %{
15921     int vector_len = 0;
15922     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
15923   %}
15924   ins_pipe( pipe_slow );
15925 %}
15926 
15927 instruct vsll8S_reg_evex(vecX dst, vecX src, vecS shift) %{
15928   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
15929   match(Set dst (LShiftVS src shift));
15930   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
15931   ins_encode %{
15932     int vector_len = 0;
15933     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
15934   %}
15935   ins_pipe( pipe_slow );
15936 %}
15937 
15938 instruct vsll8S_reg_evex_special(vecX dst, vecX src, vecS shift) %{
15939   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
15940   match(Set dst (LShiftVS dst shift));
15941   effect(TEMP src);
15942   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
15943   ins_encode %{
15944     int vector_len = 0;
15945     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
15946   %}
15947   ins_pipe( pipe_slow );
15948 %}
15949 
15950 instruct vsll8S_reg_imm_avx(vecX dst, vecX src, immI8 shift) %{
15951   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
15952   match(Set dst (LShiftVS src shift));
15953   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
15954   ins_encode %{
15955     int vector_len = 0;
15956     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
15957   %}
15958   ins_pipe( pipe_slow );
15959 %}
15960 
instruct vsll8S_reg_imm_evex(vecX dst, vecX src, immI8 shift) %{
  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
  match(Set dst (LShiftVS src shift));
  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll8S_reg_imm_evex_special(vecX dst, vecX src, immI8 shift) %{
  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
  match(Set dst (LShiftVS dst shift));
  effect(TEMP src);
  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll16S_reg_avx(vecY dst, vecY src, vecS shift) %{
  predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
  match(Set dst (LShiftVS src shift));
  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
  ins_encode %{
    int vector_len = 1;
    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll16S_reg_evex(vecY dst, vecY src, vecS shift) %{
  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
  match(Set dst (LShiftVS src shift));
  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
  ins_encode %{
    int vector_len = 1;
    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll16S_reg_evex_special(vecY dst, vecY src, vecS shift) %{
  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
  match(Set dst (LShiftVS dst shift));
  effect(TEMP src);
  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
  ins_encode %{
    int vector_len = 1;
    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll16S_reg_imm_avx(vecY dst, vecY src, immI8 shift) %{
  predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
  match(Set dst (LShiftVS src shift));
  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
  ins_encode %{
    int vector_len = 1;
    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll16S_reg_imm_evex(vecY dst, vecY src, immI8 shift) %{
  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
  match(Set dst (LShiftVS src shift));
  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
  ins_encode %{
    int vector_len = 1;
    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll16S_reg_imm_evex_special(vecY dst, vecY src, immI8 shift) %{
  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
  match(Set dst (LShiftVS dst shift));
  effect(TEMP src);
  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
  ins_encode %{
    int vector_len = 1;
    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll32S_reg(vecZ dst, vecZ src, vecS shift) %{
  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
  match(Set dst (LShiftVS src shift));
  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed32S" %}
  ins_encode %{
    int vector_len = 2;
    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
  match(Set dst (LShiftVS src shift));
  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed32S" %}
  ins_encode %{
    int vector_len = 2;
    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Integers vector left shift
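// (Descriptive note added here; not in the original source.) In the
// variable-count rules below, the hardware takes the shift count from the low
// 64 bits of the xmm shift operand (the PSLLD/VPSLLD count convention); the
// *_imm rules encode the count as an immediate instead. The non-AVX rules
// match the destructive two-operand SSE form, so dst is both input and output.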
instruct vsll2I(vecD dst, vecS shift) %{
  predicate(n->as_Vector()->length() == 2);
  match(Set dst (LShiftVI dst shift));
  format %{ "pslld   $dst,$shift\t! left shift packed2I" %}
  ins_encode %{
    __ pslld($dst$$XMMRegister, $shift$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll2I_imm(vecD dst, immI8 shift) %{
  predicate(n->as_Vector()->length() == 2);
  match(Set dst (LShiftVI dst shift));
  format %{ "pslld   $dst,$shift\t! left shift packed2I" %}
  ins_encode %{
    __ pslld($dst$$XMMRegister, (int)$shift$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll2I_reg(vecD dst, vecD src, vecS shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (LShiftVI src shift));
  format %{ "vpslld  $dst,$src,$shift\t! left shift packed2I" %}
  ins_encode %{
    int vector_len = 0;
    __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll2I_reg_imm(vecD dst, vecD src, immI8 shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (LShiftVI src shift));
  format %{ "vpslld  $dst,$src,$shift\t! left shift packed2I" %}
  ins_encode %{
    int vector_len = 0;
    __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll4I(vecX dst, vecS shift) %{
  predicate(n->as_Vector()->length() == 4);
  match(Set dst (LShiftVI dst shift));
  format %{ "pslld   $dst,$shift\t! left shift packed4I" %}
  ins_encode %{
    __ pslld($dst$$XMMRegister, $shift$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll4I_imm(vecX dst, immI8 shift) %{
  predicate(n->as_Vector()->length() == 4);
  match(Set dst (LShiftVI dst shift));
  format %{ "pslld   $dst,$shift\t! left shift packed4I" %}
  ins_encode %{
    __ pslld($dst$$XMMRegister, (int)$shift$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll4I_reg(vecX dst, vecX src, vecS shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (LShiftVI src shift));
  format %{ "vpslld  $dst,$src,$shift\t! left shift packed4I" %}
  ins_encode %{
    int vector_len = 0;
    __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll4I_reg_imm(vecX dst, vecX src, immI8 shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (LShiftVI src shift));
  format %{ "vpslld  $dst,$src,$shift\t! left shift packed4I" %}
  ins_encode %{
    int vector_len = 0;
    __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll8I_reg(vecY dst, vecY src, vecS shift) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
  match(Set dst (LShiftVI src shift));
  format %{ "vpslld  $dst,$src,$shift\t! left shift packed8I" %}
  ins_encode %{
    int vector_len = 1;
    __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll8I_reg_imm(vecY dst, vecY src, immI8 shift) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
  match(Set dst (LShiftVI src shift));
  format %{ "vpslld  $dst,$src,$shift\t! left shift packed8I" %}
  ins_encode %{
    int vector_len = 1;
    __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll16I_reg(vecZ dst, vecZ src, vecS shift) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
  match(Set dst (LShiftVI src shift));
  format %{ "vpslld  $dst,$src,$shift\t! left shift packed16I" %}
  ins_encode %{
    int vector_len = 2;
    __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll16I_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
  match(Set dst (LShiftVI src shift));
  format %{ "vpslld  $dst,$src,$shift\t! left shift packed16I" %}
  ins_encode %{
    int vector_len = 2;
    __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Longs vector left shift
instruct vsll2L(vecX dst, vecS shift) %{
  predicate(n->as_Vector()->length() == 2);
  match(Set dst (LShiftVL dst shift));
  format %{ "psllq   $dst,$shift\t! left shift packed2L" %}
  ins_encode %{
    __ psllq($dst$$XMMRegister, $shift$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll2L_imm(vecX dst, immI8 shift) %{
  predicate(n->as_Vector()->length() == 2);
  match(Set dst (LShiftVL dst shift));
  format %{ "psllq   $dst,$shift\t! left shift packed2L" %}
  ins_encode %{
    __ psllq($dst$$XMMRegister, (int)$shift$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll2L_reg(vecX dst, vecX src, vecS shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (LShiftVL src shift));
  format %{ "vpsllq  $dst,$src,$shift\t! left shift packed2L" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll2L_reg_imm(vecX dst, vecX src, immI8 shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (LShiftVL src shift));
  format %{ "vpsllq  $dst,$src,$shift\t! left shift packed2L" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll4L_reg(vecY dst, vecY src, vecS shift) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
  match(Set dst (LShiftVL src shift));
  format %{ "vpsllq  $dst,$src,$shift\t! left shift packed4L" %}
  ins_encode %{
    int vector_len = 1;
    __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll4L_reg_imm(vecY dst, vecY src, immI8 shift) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
  match(Set dst (LShiftVL src shift));
  format %{ "vpsllq  $dst,$src,$shift\t! left shift packed4L" %}
  ins_encode %{
    int vector_len = 1;
    __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll8L_reg(vecZ dst, vecZ src, vecS shift) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
  match(Set dst (LShiftVL src shift));
  format %{ "vpsllq  $dst,$src,$shift\t! left shift packed8L" %}
  ins_encode %{
    int vector_len = 2;
    __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll8L_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
  match(Set dst (LShiftVL src shift));
  format %{ "vpsllq  $dst,$src,$shift\t! left shift packed8L" %}
  ins_encode %{
    int vector_len = 2;
    __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// ----------------------- LogicalRightShift -----------------------------------

// A logical right shift of a shorts vector produces an incorrect Java result
// for negative data, because Java code converts the short value into an int
// with sign extension before shifting. Char vectors are fine, though, since
// chars are unsigned values.
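// Worked example of the discrepancy (illustrative, not from the original
// source): for short s = -1 (0xFFFF), Java computes (short)(s >>> 2) as
// (short)(0xFFFFFFFF >>> 2) = (short)0x3FFFFFFF = 0xFFFF = -1, while a 16-bit
// lane shift (psrlw) yields 0xFFFF >>> 2 = 0x3FFF = 16383. For char c = 0xFFFF
// the promotion zero-extends, so (c >>> 2) = 0x3FFF matches the lane result.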

instruct vsrl2S(vecS dst, vecS shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
  match(Set dst (URShiftVS dst shift));
  format %{ "psrlw   $dst,$shift\t! logical right shift packed2S" %}
  ins_encode %{
    __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl2S_imm(vecS dst, immI8 shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
  match(Set dst (URShiftVS dst shift));
  format %{ "psrlw   $dst,$shift\t! logical right shift packed2S" %}
  ins_encode %{
    __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl2S_reg_avx(vecS dst, vecS src, vecS shift) %{
  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
  match(Set dst (URShiftVS src shift));
  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl2S_reg_evex(vecS dst, vecS src, vecS shift) %{
  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
  match(Set dst (URShiftVS src shift));
  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl2S_reg_evex_special(vecS dst, vecS src, vecS shift) %{
  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
  match(Set dst (URShiftVS dst shift));
  effect(TEMP src);
  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl2S_reg_imm_avx(vecS dst, vecS src, immI8 shift) %{
  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
  match(Set dst (URShiftVS src shift));
  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl2S_reg_imm_evex(vecS dst, vecS src, immI8 shift) %{
  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
  match(Set dst (URShiftVS src shift));
  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl2S_reg_imm_evex_special(vecS dst, vecS src, immI8 shift) %{
  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
  match(Set dst (URShiftVS dst shift));
  effect(TEMP src);
  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl4S(vecD dst, vecS shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
  match(Set dst (URShiftVS dst shift));
  format %{ "psrlw   $dst,$shift\t! logical right shift packed4S" %}
  ins_encode %{
    __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl4S_imm(vecD dst, immI8 shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
  match(Set dst (URShiftVS dst shift));
  format %{ "psrlw   $dst,$shift\t! logical right shift packed4S" %}
  ins_encode %{
    __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl4S_reg_avx(vecD dst, vecD src, vecS shift) %{
  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
  match(Set dst (URShiftVS src shift));
  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl4S_reg_evex(vecD dst, vecD src, vecS shift) %{
  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
  match(Set dst (URShiftVS src shift));
  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl4S_reg_evex_special(vecD dst, vecD src, vecS shift) %{
  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
  match(Set dst (URShiftVS dst shift));
  effect(TEMP src);
  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl4S_reg_imm_avx(vecD dst, vecD src, immI8 shift) %{
  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
  match(Set dst (URShiftVS src shift));
  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl4S_reg_imm_evex(vecD dst, vecD src, immI8 shift) %{
  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
  match(Set dst (URShiftVS src shift));
  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl4S_reg_imm_evex_special(vecD dst, vecD src, immI8 shift) %{
  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
  match(Set dst (URShiftVS dst shift));
  effect(TEMP src);
  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl8S(vecX dst, vecS shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
  match(Set dst (URShiftVS dst shift));
  format %{ "psrlw   $dst,$shift\t! logical right shift packed8S" %}
  ins_encode %{
    __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl8S_imm(vecX dst, immI8 shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
  match(Set dst (URShiftVS dst shift));
  format %{ "psrlw   $dst,$shift\t! logical right shift packed8S" %}
  ins_encode %{
    __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl8S_reg_avx(vecX dst, vecX src, vecS shift) %{
  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
  match(Set dst (URShiftVS src shift));
  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl8S_reg_evex(vecX dst, vecX src, vecS shift) %{
  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
  match(Set dst (URShiftVS src shift));
  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl8S_reg_evex_special(vecX dst, vecX src, vecS shift) %{
  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
  match(Set dst (URShiftVS dst shift));
  effect(TEMP src);
  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl8S_reg_imm_avx(vecX dst, vecX src, immI8 shift) %{
  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
  match(Set dst (URShiftVS src shift));
  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl8S_reg_imm_evex(vecX dst, vecX src, immI8 shift) %{
  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
  match(Set dst (URShiftVS src shift));
  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl8S_reg_imm_evex_special(vecX dst, vecX src, immI8 shift) %{
  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
  match(Set dst (URShiftVS dst shift));
  effect(TEMP src);
  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl16S_reg_avx(vecY dst, vecY src, vecS shift) %{
  predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
  match(Set dst (URShiftVS src shift));
  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
  ins_encode %{
    int vector_len = 1;
    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl16S_reg_evex(vecY dst, vecY src, vecS shift) %{
  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
  match(Set dst (URShiftVS src shift));
  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
  ins_encode %{
    int vector_len = 1;
    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl16S_reg_evex_special(vecY dst, vecY src, vecS shift) %{
  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
  match(Set dst (URShiftVS dst shift));
  effect(TEMP src);
  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
  ins_encode %{
    int vector_len = 1;
    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl16S_reg_imm_avx(vecY dst, vecY src, immI8 shift) %{
  predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
  match(Set dst (URShiftVS src shift));
  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
  ins_encode %{
    int vector_len = 1;
    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl16S_reg_imm_evex(vecY dst, vecY src, immI8 shift) %{
  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
  match(Set dst (URShiftVS src shift));
  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
  ins_encode %{
    int vector_len = 1;
    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl16S_reg_imm_evex_special(vecY dst, vecY src, immI8 shift) %{
  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
  match(Set dst (URShiftVS dst shift));
  effect(TEMP src);
  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
  ins_encode %{
    int vector_len = 1;
    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl32S_reg(vecZ dst, vecZ src, vecS shift) %{
  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
  match(Set dst (URShiftVS src shift));
  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed32S" %}
  ins_encode %{
    int vector_len = 2;
    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
  match(Set dst (URShiftVS src shift));
  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed32S" %}
  ins_encode %{
    int vector_len = 2;
    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Integers vector logical right shift
instruct vsrl2I(vecD dst, vecS shift) %{
  predicate(n->as_Vector()->length() == 2);
  match(Set dst (URShiftVI dst shift));
  format %{ "psrld   $dst,$shift\t! logical right shift packed2I" %}
  ins_encode %{
    __ psrld($dst$$XMMRegister, $shift$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl2I_imm(vecD dst, immI8 shift) %{
  predicate(n->as_Vector()->length() == 2);
  match(Set dst (URShiftVI dst shift));
  format %{ "psrld   $dst,$shift\t! logical right shift packed2I" %}
  ins_encode %{
    __ psrld($dst$$XMMRegister, (int)$shift$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl2I_reg(vecD dst, vecD src, vecS shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (URShiftVI src shift));
  format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed2I" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl2I_reg_imm(vecD dst, vecD src, immI8 shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (URShiftVI src shift));
  format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed2I" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl4I(vecX dst, vecS shift) %{
  predicate(n->as_Vector()->length() == 4);
  match(Set dst (URShiftVI dst shift));
  format %{ "psrld   $dst,$shift\t! logical right shift packed4I" %}
  ins_encode %{
    __ psrld($dst$$XMMRegister, $shift$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl4I_imm(vecX dst, immI8 shift) %{
  predicate(n->as_Vector()->length() == 4);
  match(Set dst (URShiftVI dst shift));
  format %{ "psrld   $dst,$shift\t! logical right shift packed4I" %}
  ins_encode %{
    __ psrld($dst$$XMMRegister, (int)$shift$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl4I_reg(vecX dst, vecX src, vecS shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (URShiftVI src shift));
  format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed4I" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl4I_reg_imm(vecX dst, vecX src, immI8 shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (URShiftVI src shift));
  format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed4I" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl8I_reg(vecY dst, vecY src, vecS shift) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
  match(Set dst (URShiftVI src shift));
  format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed8I" %}
  ins_encode %{
    int vector_len = 1;
    __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl8I_reg_imm(vecY dst, vecY src, immI8 shift) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
  match(Set dst (URShiftVI src shift));
  format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed8I" %}
  ins_encode %{
    int vector_len = 1;
    __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl16I_reg(vecZ dst, vecZ src, vecS shift) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
  match(Set dst (URShiftVI src shift));
  format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed16I" %}
  ins_encode %{
    int vector_len = 2;
    __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl16I_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
  match(Set dst (URShiftVI src shift));
  format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed16I" %}
  ins_encode %{
    int vector_len = 2;
    __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Longs vector logical right shift
instruct vsrl2L(vecX dst, vecS shift) %{
  predicate(n->as_Vector()->length() == 2);
  match(Set dst (URShiftVL dst shift));
  format %{ "psrlq   $dst,$shift\t! logical right shift packed2L" %}
  ins_encode %{
    __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl2L_imm(vecX dst, immI8 shift) %{
  predicate(n->as_Vector()->length() == 2);
  match(Set dst (URShiftVL dst shift));
  format %{ "psrlq   $dst,$shift\t! logical right shift packed2L" %}
  ins_encode %{
    __ psrlq($dst$$XMMRegister, (int)$shift$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl2L_reg(vecX dst, vecX src, vecS shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (URShiftVL src shift));
  format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed2L" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl2L_reg_imm(vecX dst, vecX src, immI8 shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (URShiftVL src shift));
  format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed2L" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl4L_reg(vecY dst, vecY src, vecS shift) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
  match(Set dst (URShiftVL src shift));
  format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed4L" %}
  ins_encode %{
    int vector_len = 1;
    __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl4L_reg_imm(vecY dst, vecY src, immI8 shift) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
  match(Set dst (URShiftVL src shift));
  format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed4L" %}
  ins_encode %{
    int vector_len = 1;
    __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl8L_reg(vecZ dst, vecZ src, vecS shift) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
  match(Set dst (URShiftVL src shift));
  format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed8L" %}
  ins_encode %{
    int vector_len = 2;
    __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl8L_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
  match(Set dst (URShiftVL src shift));
  format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed8L" %}
  ins_encode %{
    int vector_len = 2;
    __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// ------------------- ArithmeticRightShift -----------------------------------

// Shorts/Chars vector arithmetic right shift
instruct vsra2S(vecS dst, vecS shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
  match(Set dst (RShiftVS dst shift));
  format %{ "psraw   $dst,$shift\t! arithmetic right shift packed2S" %}
  ins_encode %{
    __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra2S_imm(vecS dst, immI8 shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
  match(Set dst (RShiftVS dst shift));
  format %{ "psraw   $dst,$shift\t! arithmetic right shift packed2S" %}
  ins_encode %{
    __ psraw($dst$$XMMRegister, (int)$shift$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra2S_reg_avx(vecS dst, vecS src, vecS shift) %{
  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
  match(Set dst (RShiftVS src shift));
  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra2S_reg_evex(vecS dst, vecS src, vecS shift) %{
  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
  match(Set dst (RShiftVS src shift));
  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra2S_reg_evex_special(vecS dst, vecS src, vecS shift) %{
  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
  match(Set dst (RShiftVS dst shift));
  effect(TEMP src);
  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra2S_reg_imm_avx(vecS dst, vecS src, immI8 shift) %{
  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
  match(Set dst (RShiftVS src shift));
  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra2S_reg_imm_evex(vecS dst, vecS src, immI8 shift) %{
  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
  match(Set dst (RShiftVS src shift));
  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra2S_reg_imm_evex_special(vecS dst, vecS src, immI8 shift) %{
  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
  match(Set dst (RShiftVS dst shift));
  effect(TEMP src);
  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra4S(vecD dst, vecS shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
  match(Set dst (RShiftVS dst shift));
  format %{ "psraw   $dst,$shift\t! arithmetic right shift packed4S" %}
  ins_encode %{
    __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra4S_imm(vecD dst, immI8 shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
  match(Set dst (RShiftVS dst shift));
  format %{ "psraw   $dst,$shift\t! arithmetic right shift packed4S" %}
  ins_encode %{
    __ psraw($dst$$XMMRegister, (int)$shift$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra4S_reg_avx(vecD dst, vecD src, vecS shift) %{
  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
  match(Set dst (RShiftVS src shift));
  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra4S_reg_evex(vecD dst, vecD src, vecS shift) %{
  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
  match(Set dst (RShiftVS src shift));
  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra4S_reg_evex_special(vecD dst, vecD src, vecS shift) %{
  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
  match(Set dst (RShiftVS dst shift));
  effect(TEMP src);
  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra4S_reg_imm_avx(vecD dst, vecD src, immI8 shift) %{
  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
  match(Set dst (RShiftVS src shift));
  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra4S_reg_imm_evex(vecD dst, vecD src, immI8 shift) %{
  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
  match(Set dst (RShiftVS src shift));
  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra4S_reg_imm_evex_special(vecD dst, vecD src, immI8 shift) %{
  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
  match(Set dst (RShiftVS dst shift));
  effect(TEMP src);
  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra8S(vecX dst, vecS shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
  match(Set dst (RShiftVS dst shift));
  format %{ "psraw   $dst,$shift\t! arithmetic right shift packed8S" %}
  ins_encode %{
    __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra8S_imm(vecX dst, immI8 shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
  match(Set dst (RShiftVS dst shift));
  format %{ "psraw   $dst,$shift\t! arithmetic right shift packed8S" %}
  ins_encode %{
    __ psraw($dst$$XMMRegister, (int)$shift$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra8S_reg_avx(vecX dst, vecX src, vecS shift) %{
  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
  match(Set dst (RShiftVS src shift));
  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra8S_reg_evex(vecX dst, vecX src, vecS shift) %{
  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
  match(Set dst (RShiftVS src shift));
  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra8S_reg_evex_special(vecX dst, vecX src, vecS shift) %{
  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
  match(Set dst (RShiftVS dst shift));
  effect(TEMP src);
  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra8S_reg_imm_avx(vecX dst, vecX src, immI8 shift) %{
  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
  match(Set dst (RShiftVS src shift));
  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra8S_reg_imm_evex(vecX dst, vecX src, immI8 shift) %{
  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
  match(Set dst (RShiftVS src shift));
  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra8S_reg_imm_evex_special(vecX dst, vecX src, immI8 shift) %{
  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
  match(Set dst (RShiftVS dst shift));
  effect(TEMP src);
  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra16S_reg_avx(vecY dst, vecY src, vecS shift) %{
  predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
  match(Set dst (RShiftVS src shift));
  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
  ins_encode %{
    int vector_len = 1;
    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra16S_reg_evex(vecY dst, vecY src, vecS shift) %{
  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
  match(Set dst (RShiftVS src shift));
  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
  ins_encode %{
    int vector_len = 1;
    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra16S_reg_evex_special(vecY dst, vecY src, vecS shift) %{
  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
  match(Set dst (RShiftVS dst shift));
  effect(TEMP src);
  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
  ins_encode %{
    int vector_len = 1;
    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra16S_reg_imm_avx(vecY dst, vecY src, immI8 shift) %{
  predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
  match(Set dst (RShiftVS src shift));
  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
  ins_encode %{
    int vector_len = 1;
    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra16S_reg_imm_evex(vecY dst, vecY src, immI8 shift) %{
  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
  match(Set dst (RShiftVS src shift));
  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
  ins_encode %{
    int vector_len = 1;
    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra16S_reg_imm_evex_special(vecY dst, vecY src, immI8 shift) %{
  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
  match(Set dst (RShiftVS dst shift));
  effect(TEMP src);
  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
  ins_encode %{
    int vector_len = 1;
    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra32S_reg(vecZ dst, vecZ src, vecS shift) %{
  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
  match(Set dst (RShiftVS src shift));
  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed32S" %}
  ins_encode %{
    int vector_len = 2;
    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
  match(Set dst (RShiftVS src shift));
  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed32S" %}
  ins_encode %{
    int vector_len = 2;
    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Integers vector arithmetic right shift
instruct vsra2I(vecD dst, vecS shift) %{
  predicate(n->as_Vector()->length() == 2);
  match(Set dst (RShiftVI dst shift));
  format %{ "psrad   $dst,$shift\t! arithmetic right shift packed2I" %}
  ins_encode %{
    __ psrad($dst$$XMMRegister, $shift$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra2I_imm(vecD dst, immI8 shift) %{
  predicate(n->as_Vector()->length() == 2);
  match(Set dst (RShiftVI dst shift));
  format %{ "psrad   $dst,$shift\t! arithmetic right shift packed2I" %}
  ins_encode %{
    __ psrad($dst$$XMMRegister, (int)$shift$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra2I_reg(vecD dst, vecD src, vecS shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (RShiftVI src shift));
  format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed2I" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra2I_reg_imm(vecD dst, vecD src, immI8 shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (RShiftVI src shift));
  format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed2I" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra4I(vecX dst, vecS shift) %{
  predicate(n->as_Vector()->length() == 4);
  match(Set dst (RShiftVI dst shift));
  format %{ "psrad   $dst,$shift\t! arithmetic right shift packed4I" %}
  ins_encode %{
    __ psrad($dst$$XMMRegister, $shift$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra4I_imm(vecX dst, immI8 shift) %{
  predicate(n->as_Vector()->length() == 4);
  match(Set dst (RShiftVI dst shift));
  format %{ "psrad   $dst,$shift\t! arithmetic right shift packed4I" %}
  ins_encode %{
    __ psrad($dst$$XMMRegister, (int)$shift$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra4I_reg(vecX dst, vecX src, vecS shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (RShiftVI src shift));
  format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed4I" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra4I_reg_imm(vecX dst, vecX src, immI8 shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (RShiftVI src shift));
  format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed4I" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra8I_reg(vecY dst, vecY src, vecS shift) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
  match(Set dst (RShiftVI src shift));
  format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed8I" %}
  ins_encode %{
    int vector_len = 1;
    __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra8I_reg_imm(vecY dst, vecY src, immI8 shift) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
  match(Set dst (RShiftVI src shift));
  format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed8I" %}
  ins_encode %{
    int vector_len = 1;
    __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra16I_reg(vecZ dst, vecZ src, vecS shift) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
  match(Set dst (RShiftVI src shift));
  format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed16I" %}
  ins_encode %{
    int vector_len = 2;
    __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra16I_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
  match(Set dst (RShiftVI src shift));
  format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed16I" %}
  ins_encode %{
    int vector_len = 2;
    __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Longs vector arithmetic right shift
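// SSE/AVX provide no packed arithmetic right shift for 64-bit lanes (it only
// arrives with the AVX-512 evpsraq, used in the *_evex rule below), so the
// rules that follow emulate it. Sketch of the identity the code relies on
// (inferred from the code, not stated in the original source): with
// m = 0x8000000000000000 and t = m >>> n,
//   x >> n == ((x >>> n) ^ t) - t
// since the XOR/SUB pair sign-extends the logically shifted value from bit
// (63 - n). Example: x = -8, n = 1 gives x >>> 1 = 0x7FFFFFFFFFFFFFFC and
// t = 0x4000000000000000, so (0x3FFFFFFFFFFFFFFC) - t = 0xFFFFFFFFFFFFFFFC = -4.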
instruct vsra1L(vecD dst, vecD src, vecS shift, vecD tmp) %{
  predicate(n->as_Vector()->length() == 1);
  match(Set dst (RShiftVL src shift));
  effect(TEMP dst, TEMP tmp);
  format %{ "movdqu  $dst,$src\n\t"
            "psrlq   $dst,$shift\n\t"
            "movdqu  $tmp,[0x8000000000000000]\n\t"
            "psrlq   $tmp,$shift\n\t"
            "pxor    $dst,$tmp\n\t"
            "psubq   $dst,$tmp\t! arithmetic right shift packed1L" %}
  ins_encode %{
    __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
    __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
    __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()));
    __ psrlq($tmp$$XMMRegister, $shift$$XMMRegister);
    __ pxor($dst$$XMMRegister, $tmp$$XMMRegister);
    __ psubq($dst$$XMMRegister, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra1L_imm(vecD dst, vecD src, immI8 shift, vecD tmp) %{
  predicate(n->as_Vector()->length() == 1);
  match(Set dst (RShiftVL src shift));
  effect(TEMP dst, TEMP tmp);
  format %{ "movdqu  $dst,$src\n\t"
            "psrlq   $dst,$shift\n\t"
            "movdqu  $tmp,[0x8000000000000000]\n\t"
            "psrlq   $tmp,$shift\n\t"
            "pxor    $dst,$tmp\n\t"
            "psubq   $dst,$tmp\t! arithmetic right shift packed1L" %}
  ins_encode %{
    __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
    __ psrlq($dst$$XMMRegister, (int)$shift$$constant);
    __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()));
    __ psrlq($tmp$$XMMRegister, (int)$shift$$constant);
    __ pxor($dst$$XMMRegister, $tmp$$XMMRegister);
    __ psubq($dst$$XMMRegister, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra1L_reg(vecD dst, vecD src, vecS shift, vecD tmp) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 1);
  match(Set dst (RShiftVL src shift));
  effect(TEMP dst, TEMP tmp);
  format %{ "vpsrlq   $dst,$src,$shift\n\t"
            "vmovdqu  $tmp,[0x8000000000000000]\n\t"
            "vpsrlq   $tmp,$tmp,$shift\n\t"
            "vpxor    $dst,$dst,$tmp\n\t"
            "vpsubq   $dst,$dst,$tmp\t! arithmetic right shift packed1L" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17408     __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()));
17409     __ vpsrlq($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vector_len);
17410     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
17411     __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
17412   %}
17413   ins_pipe( pipe_slow );
17414 %}
17415 
17416 instruct vsra1L_reg_imm(vecD dst, vecD src, immI8 shift, vecD tmp) %{
17417   predicate(UseAVX > 0 && n->as_Vector()->length() == 1);
17418   match(Set dst (RShiftVL src shift));
17419   effect(TEMP dst, TEMP tmp);
17420   format %{ "vpsrlq   $dst,$src,$shift\n\t"
17421             "vmovdqu  $tmp,[0x8000000000000000]\n\t"
17422             "vpsrlq   $tmp,$tmp,$shift\n\t"
17423             "vpxor    $dst,$dst,$tmp\n\t"
17424             "vpsubq   $dst,$dst,$tmp\t! arithmetic right shift packed1L" %}
17425   ins_encode %{
17426     int vector_len = 0;
17427     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
17428     __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()));
17429     __ vpsrlq($tmp$$XMMRegister, $tmp$$XMMRegister, (int)$shift$$constant, vector_len);
17430     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
17431     __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
17432   %}
17433   ins_pipe( pipe_slow );
17434 %}
17435 
17436 instruct vsra1L_reg_evex(vecD dst, vecD src, vecS shift) %{
17437   predicate(UseAVX > 2 && n->as_Vector()->length() == 1);
17438   match(Set dst (RShiftVL src shift));
17439   format %{ "evpsraq  $dst,$src,$shift\t! arithmetic right shift packed1L" %}
17440   ins_encode %{
17441     int vector_len = 0;
17442     __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17443   %}
17444   ins_pipe( pipe_slow );
17445 %}
17446 
17447 instruct vsra2L_reg_imm(vecX dst, vecX src, immI8 shift, vecX tmp) %{
17448   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
17449   match(Set dst (RShiftVL src shift));
17450   effect(TEMP dst, TEMP tmp);
17451   format %{ "vpsrlq   $dst,$src,$shift\n\t"
17452             "vmovdqu  $tmp,[0x8000000000000000]\n\t"
17453             "vpsrlq   $tmp,$tmp,$shift\n\t"
17454             "vpxor    $dst,$dst,$tmp\n\t"
17455             "vpsubq   $dst,$dst,$tmp\t! arithmetic right shift packed2L" %}
17456   ins_encode %{
17457     int vector_len = 0;
17458     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
17459     __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()));
17460     __ vpsrlq($tmp$$XMMRegister, $tmp$$XMMRegister, (int)$shift$$constant, vector_len);
17461     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
17462     __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
17463   %}
17464   ins_pipe( pipe_slow );
17465 %}
17466 
17467 instruct vsra2L_reg(vecX dst, vecX src, vecS shift, vecX tmp) %{
17468   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
17469   match(Set dst (RShiftVL src shift));
17470   effect(TEMP dst, TEMP tmp);
17471   format %{ "vpsrlq   $dst,$src,$shift\n\t"
17472             "vmovdqu  $tmp,[0x8000000000000000]\n\t"
17473             "vpsrlq   $tmp,$tmp,$shift\n\t"
17474             "vpxor    $dst,$dst,$tmp\n\t"
17475             "vpsubq   $dst,$dst,$tmp\t! arithmetic right shift packed2L" %}
17476   ins_encode %{
17477     int vector_len = 0;
17478     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17479     __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()));
17480     __ vpsrlq($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vector_len);
17481     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
17482     __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
17483   %}
17484   ins_pipe( pipe_slow );
17485 %}
17486 
17487 instruct vsra2L_reg_evex_imm(vecX dst, vecX src, immI8 shift) %{
17488   predicate(UseAVX > 2 && n->as_Vector()->length() == 2);
17489   match(Set dst (RShiftVL src shift));
17490   format %{ "evpsraq  $dst,$src,$shift\t! arithmetic right shift packed2L" %}
17491   ins_encode %{
17492     int vector_len = 0;
17493     __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
17494   %}
17495   ins_pipe( pipe_slow );
17496 %}
17497 
17498 instruct vsra2L_reg_evex(vecX dst, vecX src, vecS shift) %{
17499   predicate(UseAVX > 2 && n->as_Vector()->length() == 2);
17500   match(Set dst (RShiftVL src shift));
17501   format %{ "evpsraq  $dst,$src,$shift\t! arithmetic right shift packed2L" %}
17502   ins_encode %{
17503     int vector_len = 0;
17504     __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17505   %}
17506   ins_pipe( pipe_slow );
17507 %}
17508 
17509 instruct vsra4L_reg_imm(vecY dst, vecY src, immI8 shift, vecY tmp) %{
17510   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
17511   match(Set dst (RShiftVL src shift));
17512   effect(TEMP dst, TEMP tmp);
17513   format %{ "vpsrlq   $dst,$src,$shift\n\t"
17514             "vmovdqu  $tmp,[0x8000000000000000]\n\t"
17515             "vpsrlq   $tmp,$tmp,$shift\n\t"
17516             "vpxor    $dst,$dst,$tmp\n\t"
17517             "vpsubq   $dst,$dst,$tmp\t! arithmetic right shift packed4L" %}
17518   ins_encode %{
17519     int vector_len = 1;
17520     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
17521     __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()));
17522     __ vpsrlq($tmp$$XMMRegister, $tmp$$XMMRegister, (int)$shift$$constant, vector_len);
17523     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
17524     __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
17525   %}
17526   ins_pipe( pipe_slow );
17527 %}
17528 
17529 instruct vsra4L_reg(vecY dst, vecY src, vecS shift, vecY tmp) %{
17530   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
17531   match(Set dst (RShiftVL src shift));
17532   effect(TEMP dst, TEMP tmp);
17533   format %{ "vpsrlq   $dst,$src,$shift\n\t"
17534             "vmovdqu  $tmp,[0x8000000000000000]\n\t"
17535             "vpsrlq   $tmp,$tmp,$shift\n\t"
17536             "vpxor    $dst,$dst,$tmp\n\t"
17537             "vpsubq   $dst,$dst,$tmp\t! arithmetic right shift packed4L" %}
17538   ins_encode %{
17539     int vector_len = 1;
17540     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17541     __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()));
17542     __ vpsrlq($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vector_len);
17543     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
17544     __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
17545   %}
17546   ins_pipe( pipe_slow );
17547 %}
17548 
17549 instruct vsra4L_reg_evex_imm(vecY dst, vecY src, immI8 shift) %{
17550   predicate(UseAVX > 2 && n->as_Vector()->length() == 4);
17551   match(Set dst (RShiftVL src shift));
17552   format %{ "evpsraq  $dst,$src,$shift\t! arithmetic right shift packed4L" %}
17553   ins_encode %{
17554     int vector_len = 1;
17555     __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
17556   %}
17557   ins_pipe( pipe_slow );
17558 %}
17559 
17560 instruct vsra4L_reg_evex(vecY dst, vecY src, vecS shift) %{
17561   predicate(UseAVX > 2 && n->as_Vector()->length() == 4);
17562   match(Set dst (RShiftVL src shift));
17563   format %{ "evpsraq  $dst,$src,$shift\t! arithmetic right shift packed4L" %}
17564   ins_encode %{
17565     int vector_len = 1;
17566     __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17567   %}
17568   ins_pipe( pipe_slow );
17569 %}
17570 
17571 instruct vsra8L_reg_evex_imm(vecZ dst, vecZ src, immI8 shift) %{
17572   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
17573   match(Set dst (RShiftVL src shift));
17574   format %{ "evpsraq  $dst,$src,$shift\t! arithmetic right shift packed8L" %}
17575   ins_encode %{
17576     int vector_len = 2;
17577     __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
17578   %}
17579   ins_pipe( pipe_slow );
17580 %}
17581 
17582 instruct vsra8L_reg_evex(vecZ dst, vecZ src, vecS shift) %{
17583   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
17584   match(Set dst (RShiftVL src shift));
17585   format %{ "evpsraq  $dst,$src,$shift\t! arithmetic right shift packed8L" %}
17586   ins_encode %{
17587     int vector_len = 2;
17588     __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17589   %}
17590   ins_pipe( pipe_slow );
17591 %}
17592 
17593 // ------------------- Variable Bit Shift Left Logical -----------------------------
17594 // Integer variable left shift
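// Unlike the fixed-count shift rules above, the variable-shift rules take a
// whole vector of per-lane shift counts (AVX2 vpsllvd and friends). The
// n->in(2)->Opcode() != Op_LShiftCntV test keeps them from matching when the
// count is a broadcast scalar, which the fixed-count rules already handle;
// the same pattern is used by all variable-shift rules in this section.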
17595 instruct vsllv2I(vecD dst, vecD src, vecD shift) %{
17596   predicate(UseAVX > 1 && n->as_Vector()->length() == 2 && n->in(2)->Opcode() != Op_LShiftCntV);
17597   match(Set dst (LShiftVI src shift));
17598   format %{ "vpsllvd  $dst,$src,$shift\t! variable bit shift left shift packed2I" %}
17599   ins_encode %{
17600     int vector_len = 0;
17601     __ vpsllvd($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17602   %}
17603   ins_pipe( pipe_slow );
17604 %}
17605 
17606 instruct vsllv4I_reg(vecX dst, vecX src, vecX shift) %{
17607   predicate(UseAVX > 1 && n->as_Vector()->length() == 4 && n->in(2)->Opcode() != Op_LShiftCntV);
17608   match(Set dst (LShiftVI src shift));
17609   format %{ "vpsllvd  $dst,$src,$shift\t! variable bit shift left shift packed4I" %}
17610   ins_encode %{
17611     int vector_len = 0;
17612     __ vpsllvd($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17613   %}
17614   ins_pipe( pipe_slow );
17615 %}
17616 
17617 instruct vsllv4I_reg_evex(vecX dst, vecX src, vecX shift) %{
17618   predicate(UseAVX > 2 && n->as_Vector()->length() == 4 && n->in(2)->Opcode() != Op_LShiftCntV);
17619   match(Set dst (LShiftVI src shift));
17620   format %{ "vpsllvd  $dst,$src,$shift\t! variable bit shift left shift packed4I" %}
17621   ins_encode %{
17622     int vector_len = 0;
17623     __ vpsllvd($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17624   %}
17625   ins_pipe( pipe_slow );
17626 %}
17627 
17628 instruct vsllv8I_reg(vecY dst, vecY src, vecY shift) %{
17629   predicate(UseAVX > 1 && n->as_Vector()->length() == 8 && n->in(2)->Opcode() != Op_LShiftCntV);
17630   match(Set dst (LShiftVI src shift));
17631   format %{ "vpsllvd  $dst,$src,$shift\t! variable bit shift left shift packed8I" %}
17632   ins_encode %{
17633     int vector_len = 1;
17634     __ vpsllvd($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17635   %}
17636   ins_pipe( pipe_slow );
17637 %}
17638 
17639 instruct vsllv8I_reg_evex(vecY dst, vecY src, vecY shift) %{
17640   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && n->in(2)->Opcode() != Op_LShiftCntV);
17641   match(Set dst (LShiftVI src shift));
17642   format %{ "vpsllvd  $dst,$src,$shift\t! variable bit shift left shift packed8I" %}
17643   ins_encode %{
17644     int vector_len = 1;
17645     __ vpsllvd($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17646   %}
17647   ins_pipe( pipe_slow );
17648 %}
17649 
17650 instruct vsllv16I_reg_evex(vecZ dst, vecZ src, vecZ shift) %{
17651   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 && n->in(2)->Opcode() != Op_LShiftCntV);
17652   match(Set dst (LShiftVI src shift));
17653   format %{ "vpsllvd  $dst,$src,$shift\t! variable bit shift left shift packed16I" %}
17654   ins_encode %{
17655     int vector_len = 2;
17656     __ vpsllvd($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17657   %}
17658   ins_pipe( pipe_slow );
17659 %}
17660 
17661 // Long variable left shift
17662 instruct vsllv1L_reg(vecD dst, vecD src, vecD shift) %{
17663   predicate(UseAVX > 1 && n->as_Vector()->length() == 1 && n->in(2)->Opcode() != Op_LShiftCntV);
17664   match(Set dst (LShiftVL src shift));
17665   format %{ "vpsllvq  $dst,$src,$shift\t! variable bit shift left shift packed1L" %}
17666   ins_encode %{
17667     int vector_len = 0;
17668     __ vpsllvq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17669   %}
17670   ins_pipe( pipe_slow );
17671 %}
17672 
17673 instruct vsllv2L_reg(vecX dst, vecX src, vecX shift) %{
17674   predicate(UseAVX > 1 && n->as_Vector()->length() == 2 && n->in(2)->Opcode() != Op_LShiftCntV);
17675   match(Set dst (LShiftVL src shift));
17676   format %{ "vpsllvq  $dst,$src,$shift\t! variable bit shift left shift packed2L" %}
17677   ins_encode %{
17678     int vector_len = 0;
17679     __ vpsllvq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17680   %}
17681   ins_pipe( pipe_slow );
17682 %}
17683 
17684 instruct vsllv2L_reg_evex(vecX dst, vecX src, vecX shift) %{
17685   predicate(UseAVX > 2 && n->as_Vector()->length() == 2 && n->in(2)->Opcode() != Op_LShiftCntV);
17686   match(Set dst (LShiftVL src shift));
17687   format %{ "vpsllvq  $dst,$src,$shift\t! variable bit shift left shift packed2L" %}
17688   ins_encode %{
17689     int vector_len = 0;
17690     __ vpsllvq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17691   %}
17692   ins_pipe( pipe_slow );
17693 %}
17694 
17695 instruct vsllv4L_reg(vecY dst, vecY src, vecY shift) %{
17696   predicate(UseAVX > 1 && n->as_Vector()->length() == 4 && n->in(2)->Opcode() != Op_LShiftCntV);
17697   match(Set dst (LShiftVL src shift));
17698   format %{ "vpsllvq  $dst,$src,$shift\t! variable bit shift left shift packed4L" %}
17699   ins_encode %{
17700     int vector_len = 1;
17701     __ vpsllvq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17702   %}
17703   ins_pipe( pipe_slow );
17704 %}
17705 
17706 instruct vsllv4L_reg_evex(vecY dst, vecY src, vecY shift) %{
17707   predicate(UseAVX > 2 && n->as_Vector()->length() == 4 && n->in(2)->Opcode() != Op_LShiftCntV);
17708   match(Set dst (LShiftVL src shift));
17709   format %{ "vpsllvq  $dst,$src,$shift\t! variable bit shift left shift packed4L" %}
17710   ins_encode %{
17711     int vector_len = 1;
17712     __ vpsllvq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17713   %}
17714   ins_pipe( pipe_slow );
17715 %}
17716 
17717 instruct vsllv8L_reg_evex(vecZ dst, vecZ src, vecZ shift) %{
17718   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && n->in(2)->Opcode() != Op_LShiftCntV);
17719   match(Set dst (LShiftVL src shift));
17720   format %{ "vpsllvq  $dst,$src,$shift\t! variable bit shift left shift packed8L" %}
17721   ins_encode %{
17722     int vector_len = 2;
17723     __ vpsllvq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17724   %}
17725   ins_pipe( pipe_slow );
17726 %}
17727 
17728 // ------------------- Variable Bit Shift Right Logical -----------------------------
17729 // Integer variable right shift
17730 instruct vsrlv2I_reg(vecD dst, vecD src, vecD shift) %{
17731   predicate(UseAVX > 1 && n->as_Vector()->length() == 2 && n->in(2)->Opcode() != Op_RShiftCntV);
17732   match(Set dst (URShiftVI src shift));
17733   format %{ "vpsrlvd  $dst,$src,$shift\t! variable bit shift right shift packed2I" %}
17734   ins_encode %{
17735     int vector_len = 0;
17736     __ vpsrlvd($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17737   %}
17738   ins_pipe( pipe_slow );
17739 %}
17740 
17741 instruct vsrlv4I_reg(vecX dst, vecX src, vecX shift) %{
17742   predicate(UseAVX > 1 && n->as_Vector()->length() == 4 && n->in(2)->Opcode() != Op_RShiftCntV);
17743   match(Set dst (URShiftVI src shift));
17744   format %{ "vpsrlvd  $dst,$src,$shift\t! variable bit shift right shift packed4I" %}
17745   ins_encode %{
17746     int vector_len = 0;
17747     __ vpsrlvd($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17748   %}
17749   ins_pipe( pipe_slow );
17750 %}
17751 
17752 instruct vsrlv4I_reg_evex(vecX dst, vecX src, vecX shift) %{
17753   predicate(UseAVX > 2 && n->as_Vector()->length() == 4 && n->in(2)->Opcode() != Op_RShiftCntV);
17754   match(Set dst (URShiftVI src shift));
17755   format %{ "vpsrlvd  $dst,$src,$shift\t! variable bit shift right shift packed4I" %}
17756   ins_encode %{
17757     int vector_len = 0;
17758     __ vpsrlvd($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17759   %}
17760   ins_pipe( pipe_slow );
17761 %}
17762 
17763 instruct vsrlv8I_reg(vecY dst, vecY src, vecY shift) %{
17764   predicate(UseAVX > 1 && n->as_Vector()->length() == 8 && n->in(2)->Opcode() != Op_RShiftCntV);
17765   match(Set dst (URShiftVI src shift));
17766   format %{ "vpsrlvd  $dst,$src,$shift\t! variable bit shift right shift packed8I" %}
17767   ins_encode %{
17768     int vector_len = 1;
17769     __ vpsrlvd($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17770   %}
17771   ins_pipe( pipe_slow );
17772 %}
17773 
17774 instruct vsrlv8I_reg_evex(vecY dst, vecY src, vecY shift) %{
17775   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && n->in(2)->Opcode() != Op_RShiftCntV);
17776   match(Set dst (URShiftVI src shift));
17777   format %{ "vpsrlvd  $dst,$src,$shift\t! variable bit shift right shift packed8I" %}
17778   ins_encode %{
17779     int vector_len = 1;
17780     __ vpsrlvd($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17781   %}
17782   ins_pipe( pipe_slow );
17783 %}
17784 
17785 instruct vsrlv16I_reg_evex(vecZ dst, vecZ src, vecZ shift) %{
17786   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 && n->in(2)->Opcode() != Op_RShiftCntV);
17787   match(Set dst (URShiftVI src shift));
17788   format %{ "vpsrlvd  $dst,$src,$shift\t! variable bit shift right shift packed16I" %}
17789   ins_encode %{
17790     int vector_len = 2;
17791     __ vpsrlvd($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17792   %}
17793   ins_pipe( pipe_slow );
17794 %}
17795 
17796 // Long variable right shift
17797 instruct vsrlv1L_reg(vecD dst, vecD src, vecD shift) %{
17798   predicate(UseAVX > 1 && n->as_Vector()->length() == 1 && n->in(2)->Opcode() != Op_RShiftCntV);
17799   match(Set dst (URShiftVL src shift));
17800   format %{ "vpsrlvq  $dst,$src,$shift\t! variable bit shift right shift packed1L" %}
17801   ins_encode %{
17802     int vector_len = 0;
17803     __ vpsrlvq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17804   %}
17805   ins_pipe( pipe_slow );
17806 %}
17807 
17808 instruct vsrlv2L_reg(vecX dst, vecX src, vecX shift) %{
17809   predicate(UseAVX > 1 && n->as_Vector()->length() == 2 && n->in(2)->Opcode() != Op_RShiftCntV);
17810   match(Set dst (URShiftVL src shift));
17811   format %{ "vpsrlvq  $dst,$src,$shift\t! variable bit shift right shift packed2L" %}
17812   ins_encode %{
17813     int vector_len = 0;
17814     __ vpsrlvq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17815   %}
17816   ins_pipe( pipe_slow );
17817 %}
17818 
17819 instruct vsrlv2L_reg_evex(vecX dst, vecX src, vecX shift) %{
17820   predicate(UseAVX > 2 && n->as_Vector()->length() == 2 && n->in(2)->Opcode() != Op_RShiftCntV);
17821   match(Set dst (URShiftVL src shift));
17822   format %{ "vpsrlvq  $dst,$src,$shift\t! variable bit shift right shift packed2L" %}
17823   ins_encode %{
17824     int vector_len = 0;
17825     __ vpsrlvq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17826   %}
17827   ins_pipe( pipe_slow );
17828 %}
17829 
17830 instruct vsrlv4L_reg(vecY dst, vecY src, vecY shift) %{
17831   predicate(UseAVX > 1 && n->as_Vector()->length() == 4 && n->in(2)->Opcode() != Op_RShiftCntV);
17832   match(Set dst (URShiftVL src shift));
17833   format %{ "vpsrlvq  $dst,$src,$shift\t! variable bit shift right shift packed4L" %}
17834   ins_encode %{
17835     int vector_len = 1;
17836     __ vpsrlvq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17837   %}
17838   ins_pipe( pipe_slow );
17839 %}
17840 
17841 instruct vsrlv4L_reg_evex(vecY dst, vecY src, vecY shift) %{
17842   predicate(UseAVX > 2 && n->as_Vector()->length() == 4 && n->in(2)->Opcode() != Op_RShiftCntV);
17843   match(Set dst (URShiftVL src shift));
17844   format %{ "vpsrlvq  $dst,$src,$shift\t! variable bit shift right shift packed4L" %}
17845   ins_encode %{
17846     int vector_len = 1;
17847     __ vpsrlvq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17848   %}
17849   ins_pipe( pipe_slow );
17850 %}
17851 
17852 instruct vsrlv8L_reg(vecZ dst, vecZ src, vecZ shift) %{
17853   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && n->in(2)->Opcode() != Op_RShiftCntV);
17854   match(Set dst (URShiftVL src shift));
17855   format %{ "vpsrlvq  $dst,$src,$shift\t! variable bit shift right shift packed8L" %}
17856   ins_encode %{
17857     int vector_len = 2;
17858     __ vpsrlvq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17859   %}
17860   ins_pipe( pipe_slow );
17861 %}
17862 
17863 // ------------------- Variable Bit Shift Right Arithmetic -----------------------------
17864 // Integer variable arithmetic right shift
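// vpsravd is available from AVX2 on, so the int forms need no emulation.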
17865 instruct vsrav2I_reg(vecD dst, vecD src, vecD shift) %{
17866   predicate(UseAVX > 1 && n->as_Vector()->length() == 2 && n->in(2)->Opcode() != Op_RShiftCntV);
17867   match(Set dst (RShiftVI src shift));
17868   format %{ "vpsravd  $dst,$src,$shift\t! variable bit shift right shift packed2I" %}
17869   ins_encode %{
17870     int vector_len = 0;
17871     __ vpsravd($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17872   %}
17873   ins_pipe( pipe_slow );
17874 %}
17875 
17876 instruct vsrav4I_reg(vecX dst, vecX src, vecX shift) %{
17877   predicate(UseAVX > 1 && n->as_Vector()->length() == 4 && n->in(2)->Opcode() != Op_RShiftCntV);
17878   match(Set dst (RShiftVI src shift));
17879   format %{ "vpsravd  $dst,$src,$shift\t! variable bit shift right shift packed4I" %}
17880   ins_encode %{
17881     int vector_len = 0;
17882     __ vpsravd($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17883   %}
17884   ins_pipe( pipe_slow );
17885 %}
17886 
17887 instruct vsrav4I_reg_evex(vecX dst, vecX src, vecX shift) %{
17888   predicate(UseAVX > 2 && n->as_Vector()->length() == 4 && n->in(2)->Opcode() != Op_RShiftCntV);
17889   match(Set dst (RShiftVI src shift));
17890   format %{ "vpsravd  $dst,$src,$shift\t! variable bit shift right shift packed4I" %}
17891   ins_encode %{
17892     int vector_len = 0;
17893     __ vpsravd($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17894   %}
17895   ins_pipe( pipe_slow );
17896 %}
17897 
17898 instruct vsrav8I_reg(vecY dst, vecY src, vecY shift) %{
17899   predicate(UseAVX > 1 && n->as_Vector()->length() == 8 && n->in(2)->Opcode() != Op_RShiftCntV);
17900   match(Set dst (RShiftVI src shift));
17901   format %{ "vpsravd  $dst,$src,$shift\t! variable bit shift right shift packed8I" %}
17902   ins_encode %{
17903     int vector_len = 1;
17904     __ vpsravd($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17905   %}
17906   ins_pipe( pipe_slow );
17907 %}
17908 
17909 instruct vsrav8I_reg_evex(vecY dst, vecY src, vecY shift) %{
17910   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && n->in(2)->Opcode() != Op_RShiftCntV);
17911   match(Set dst (RShiftVI src shift));
17912   format %{ "vpsravd  $dst,$src,$shift\t! variable bit shift right shift packed8I" %}
17913   ins_encode %{
17914     int vector_len = 1;
17915     __ vpsravd($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17916   %}
17917   ins_pipe( pipe_slow );
17918 %}
17919 
17920 instruct vsrav16I_reg_evex(vecZ dst, vecZ src, vecZ shift) %{
17921   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 && n->in(2)->Opcode() != Op_RShiftCntV);
17922   match(Set dst (RShiftVI src shift));
17923   format %{ "vpsravd  $dst,$src,$shift\t! variable bit shift right shift packed16I" %}
17924   ins_encode %{
17925     int vector_len = 2;
17926     __ vpsravd($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17927   %}
17928   ins_pipe( pipe_slow );
17929 %}
17930 
17931 // Long variable arithmetic right shift
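// There is no variable-count vpsravq until AVX-512, so the AVX2 rules below
// reuse the (srl(x, n) ^ m) - m sign-extension trick from the fixed-count
// long shifts, with vpsrlvq applying a distinct count to each lane; the
// *_evex rules use evpsravq directly.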
17932 instruct vsrav1L_reg(vecD dst, vecD src, vecD shift, vecD tmp) %{
17933   predicate(UseAVX > 1 && n->as_Vector()->length() == 1 && n->in(2)->Opcode() != Op_RShiftCntV);
17934   match(Set dst (RShiftVL src shift));
17935   effect(TEMP dst, TEMP tmp);
17936   format %{ "vpsrlvq   $dst,$src,$shift\n\t"
17937             "vmovdqu   $tmp,[0x8000000000000000]\n\t"
17938             "vpsrlvq   $tmp,$tmp,$shift\n\t"
17939             "vpxor     $dst,$dst,$tmp\n\t"
17940             "vpsubq    $dst,$dst,$tmp\t! variable arithmetic right shift packed1L" %}
17941   ins_encode %{
17942     int vector_len = 0;
17943     __ vpsrlvq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17944     __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()));
17945     __ vpsrlvq($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vector_len);
17946     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
17947     __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
17948   %}
17949   ins_pipe( pipe_slow );
17950 %}
17951 
17952 instruct vsrav1L_reg_evex(vecD dst, vecD src, vecD shift) %{
17953   predicate(UseAVX > 2 && n->as_Vector()->length() == 1 && n->in(2)->Opcode() != Op_RShiftCntV);
17954   match(Set dst (RShiftVL src shift));
17955   format %{ "evpsravq  $dst,$src,$shift\t! variable arithmetic right shift packed1L" %}
17956   ins_encode %{
17957     int vector_len = 0;
17958     __ evpsravq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17959   %}
17960   ins_pipe( pipe_slow );
17961 %}
17962 
17963 instruct vsrav2L_reg(vecX dst, vecX src, vecX shift, vecX tmp) %{
17964   predicate(UseAVX > 1 && n->as_Vector()->length() == 2 && n->in(2)->Opcode() != Op_RShiftCntV);
17965   match(Set dst (RShiftVL src shift));
17966   effect(TEMP dst, TEMP tmp);
17967   format %{ "vpsrlvq   $dst,$src,$shift\n\t"
17968             "vmovdqu   $tmp,[0x8000000000000000]\n\t"
17969             "vpsrlvq   $tmp,$tmp,$shift\n\t"
17970             "vpxor     $dst,$dst,$tmp\n\t"
17971             "vpsubq    $dst,$dst,$tmp\t! variable arithmetic right shift packed2L" %}
17972   ins_encode %{
17973     int vector_len = 0;
17974     __ vpsrlvq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17975     __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()));
17976     __ vpsrlvq($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vector_len);
17977     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
17978     __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
17979   %}
17980   ins_pipe( pipe_slow );
17981 %}
17982 
17983 instruct vsrav2L_reg_evex(vecX dst, vecX src, vecX shift) %{
17984   predicate(UseAVX > 2 && n->as_Vector()->length() == 2 && n->in(2)->Opcode() != Op_RShiftCntV);
17985   match(Set dst (RShiftVL src shift));
17986   format %{ "evpsravq  $dst,$src,$shift\t! variable arithmetic right shift packed2L" %}
17987   ins_encode %{
17988     int vector_len = 0;
17989     __ evpsravq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17990   %}
17991   ins_pipe( pipe_slow );
17992 %}
17993 
17994 instruct vsrav4L_reg(vecY dst, vecY src, vecY shift, vecY tmp) %{
17995   predicate(UseAVX > 1 && n->as_Vector()->length() == 4 && n->in(2)->Opcode() != Op_RShiftCntV);
17996   match(Set dst (RShiftVL src shift));
17997   effect(TEMP dst, TEMP tmp);
17998   format %{ "vpsrlvq   $dst,$src,$shift\n\t"
17999             "vmovdqu   $tmp,[0x8000000000000000]\n\t"
18000             "vpsrlvq   $tmp,$tmp,$shift\n\t"
18001             "vpxor     $dst,$dst,$tmp\n\t"
18002             "vpsubq    $dst,$dst,$tmp\t! variable arithmetic right shift packed4L" %}
18003   ins_encode %{
18004     int vector_len = 1;
18005     __ vpsrlvq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
18006     __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()));
18007     __ vpsrlvq($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vector_len);
18008     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
18009     __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
18010   %}
18011   ins_pipe( pipe_slow );
18012 %}
18013 
18014 instruct vsrav4L_reg_evex(vecY dst, vecY src, vecY shift) %{
18015   predicate(UseAVX > 2 && n->as_Vector()->length() == 4 && n->in(2)->Opcode() != Op_RShiftCntV);
18016   match(Set dst (RShiftVL src shift));
18017   format %{ "evpsravq  $dst,$src,$shift\t! variable arithmetic right shift packed4L" %}
18018   ins_encode %{
18019     int vector_len = 1;
18020     __ evpsravq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
18021   %}
18022   ins_pipe( pipe_slow );
18023 %}
18024 
18025 instruct vsrav8L_reg_evex(vecZ dst, vecZ src, vecZ shift) %{
18026   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && n->in(2)->Opcode() != Op_RShiftCntV);
18027   match(Set dst (RShiftVL src shift));
18028   format %{ "evpsravq  $dst,$src,$shift\t! variable arithmetic right shift packed8L" %}
18029   ins_encode %{
18030     int vector_len = 2;
18031     __ evpsravq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
18032   %}
18033   ins_pipe( pipe_slow );
18034 %}
18035 
18036 // --------------------------------- AND --------------------------------------
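// The bitwise rules are element-type agnostic and match on the vector size in
// bytes only. The plain pand form is the destructive two-operand SSE encoding
// (dst &= src), the vpand forms are the non-destructive three-operand AVX
// encoding, and the _mem variants fold the load of the second operand into
// the instruction. The OR and XOR rules below follow the same layout.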
18037 
18038 instruct vand4B(vecS dst, vecS src) %{
18039   predicate(n->as_Vector()->length_in_bytes() == 4);
18040   match(Set dst (AndV dst src));
18041   format %{ "pand    $dst,$src\t! and vectors (4 bytes)" %}
18042   ins_encode %{
18043     __ pand($dst$$XMMRegister, $src$$XMMRegister);
18044   %}
18045   ins_pipe( pipe_slow );
18046 %}
18047 
18048 instruct vand4B_reg(vecS dst, vecS src1, vecS src2) %{
18049   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
18050   match(Set dst (AndV src1 src2));
18051   format %{ "vpand   $dst,$src1,$src2\t! and vectors (4 bytes)" %}
18052   ins_encode %{
18053     int vector_len = 0;
18054     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
18055   %}
18056   ins_pipe( pipe_slow );
18057 %}
18058 
18059 instruct vand4B_mem(vecS dst, vecS src, memory mem) %{
18060   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
18061   match(Set dst (AndV src (LoadVector mem)));
18062   format %{ "vpand   $dst,$src,$mem\t! and vectors (4 bytes)" %}
18063   ins_encode %{
18064     int vector_len = 0;
18065     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
18066   %}
18067   ins_pipe( pipe_slow );
18068 %}
18069 
18070 instruct vand8B(vecD dst, vecD src) %{
18071   predicate(n->as_Vector()->length_in_bytes() == 8);
18072   match(Set dst (AndV dst src));
18073   format %{ "pand    $dst,$src\t! and vectors (8 bytes)" %}
18074   ins_encode %{
18075     __ pand($dst$$XMMRegister, $src$$XMMRegister);
18076   %}
18077   ins_pipe( pipe_slow );
18078 %}
18079 
18080 instruct vand8B_reg(vecD dst, vecD src1, vecD src2) %{
18081   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
18082   match(Set dst (AndV src1 src2));
18083   format %{ "vpand   $dst,$src1,$src2\t! and vectors (8 bytes)" %}
18084   ins_encode %{
18085     int vector_len = 0;
18086     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
18087   %}
18088   ins_pipe( pipe_slow );
18089 %}
18090 
18091 instruct vand8B_mem(vecD dst, vecD src, memory mem) %{
18092   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
18093   match(Set dst (AndV src (LoadVector mem)));
18094   format %{ "vpand   $dst,$src,$mem\t! and vectors (8 bytes)" %}
18095   ins_encode %{
18096     int vector_len = 0;
18097     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
18098   %}
18099   ins_pipe( pipe_slow );
18100 %}
18101 
18102 instruct vand16B(vecX dst, vecX src) %{
18103   predicate(n->as_Vector()->length_in_bytes() == 16);
18104   match(Set dst (AndV dst src));
18105   format %{ "pand    $dst,$src\t! and vectors (16 bytes)" %}
18106   ins_encode %{
18107     __ pand($dst$$XMMRegister, $src$$XMMRegister);
18108   %}
18109   ins_pipe( pipe_slow );
18110 %}
18111 
18112 instruct vand16B_reg(vecX dst, vecX src1, vecX src2) %{
18113   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
18114   match(Set dst (AndV src1 src2));
18115   format %{ "vpand   $dst,$src1,$src2\t! and vectors (16 bytes)" %}
18116   ins_encode %{
18117     int vector_len = 0;
18118     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
18119   %}
18120   ins_pipe( pipe_slow );
18121 %}
18122 
18123 instruct vand16B_mem(vecX dst, vecX src, memory mem) %{
18124   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
18125   match(Set dst (AndV src (LoadVector mem)));
18126   format %{ "vpand   $dst,$src,$mem\t! and vectors (16 bytes)" %}
18127   ins_encode %{
18128     int vector_len = 0;
18129     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
18130   %}
18131   ins_pipe( pipe_slow );
18132 %}
18133 
18134 instruct vand32B_reg(vecY dst, vecY src1, vecY src2) %{
18135   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
18136   match(Set dst (AndV src1 src2));
18137   format %{ "vpand   $dst,$src1,$src2\t! and vectors (32 bytes)" %}
18138   ins_encode %{
18139     int vector_len = 1;
18140     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
18141   %}
18142   ins_pipe( pipe_slow );
18143 %}
18144 
18145 instruct vand32B_mem(vecY dst, vecY src, memory mem) %{
18146   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
18147   match(Set dst (AndV src (LoadVector mem)));
18148   format %{ "vpand   $dst,$src,$mem\t! and vectors (32 bytes)" %}
18149   ins_encode %{
18150     int vector_len = 1;
18151     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
18152   %}
18153   ins_pipe( pipe_slow );
18154 %}
18155 
18156 instruct vand64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
18157   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
18158   match(Set dst (AndV src1 src2));
18159   format %{ "vpand   $dst,$src1,$src2\t! and vectors (64 bytes)" %}
18160   ins_encode %{
18161     int vector_len = 2;
18162     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
18163   %}
18164   ins_pipe( pipe_slow );
18165 %}
18166 
18167 instruct vand64B_mem(vecZ dst, vecZ src, memory mem) %{
18168   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
18169   match(Set dst (AndV src (LoadVector mem)));
18170   format %{ "vpand   $dst,$src,$mem\t! and vectors (64 bytes)" %}
18171   ins_encode %{
18172     int vector_len = 2;
18173     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
18174   %}
18175   ins_pipe( pipe_slow );
18176 %}
18177 
18178 // --------------------------------- OR ---------------------------------------
18179 
18180 instruct vor4B(vecS dst, vecS src) %{
18181   predicate(n->as_Vector()->length_in_bytes() == 4);
18182   match(Set dst (OrV dst src));
18183   format %{ "por     $dst,$src\t! or vectors (4 bytes)" %}
18184   ins_encode %{
18185     __ por($dst$$XMMRegister, $src$$XMMRegister);
18186   %}
18187   ins_pipe( pipe_slow );
18188 %}
18189 
18190 instruct vor4B_reg(vecS dst, vecS src1, vecS src2) %{
18191   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
18192   match(Set dst (OrV src1 src2));
18193   format %{ "vpor    $dst,$src1,$src2\t! or vectors (4 bytes)" %}
18194   ins_encode %{
18195     int vector_len = 0;
18196     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
18197   %}
18198   ins_pipe( pipe_slow );
18199 %}
18200 
18201 instruct vor4B_mem(vecS dst, vecS src, memory mem) %{
18202   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
18203   match(Set dst (OrV src (LoadVector mem)));
18204   format %{ "vpor    $dst,$src,$mem\t! or vectors (4 bytes)" %}
18205   ins_encode %{
18206     int vector_len = 0;
18207     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
18208   %}
18209   ins_pipe( pipe_slow );
18210 %}
18211 
18212 instruct vor8B(vecD dst, vecD src) %{
18213   predicate(n->as_Vector()->length_in_bytes() == 8);
18214   match(Set dst (OrV dst src));
18215   format %{ "por     $dst,$src\t! or vectors (8 bytes)" %}
18216   ins_encode %{
18217     __ por($dst$$XMMRegister, $src$$XMMRegister);
18218   %}
18219   ins_pipe( pipe_slow );
18220 %}
18221 
18222 instruct vor8B_reg(vecD dst, vecD src1, vecD src2) %{
18223   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
18224   match(Set dst (OrV src1 src2));
18225   format %{ "vpor    $dst,$src1,$src2\t! or vectors (8 bytes)" %}
18226   ins_encode %{
18227     int vector_len = 0;
18228     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
18229   %}
18230   ins_pipe( pipe_slow );
18231 %}
18232 
18233 instruct vor8B_mem(vecD dst, vecD src, memory mem) %{
18234   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
18235   match(Set dst (OrV src (LoadVector mem)));
18236   format %{ "vpor    $dst,$src,$mem\t! or vectors (8 bytes)" %}
18237   ins_encode %{
18238     int vector_len = 0;
18239     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
18240   %}
18241   ins_pipe( pipe_slow );
18242 %}
18243 
18244 instruct vor16B(vecX dst, vecX src) %{
18245   predicate(n->as_Vector()->length_in_bytes() == 16);
18246   match(Set dst (OrV dst src));
18247   format %{ "por     $dst,$src\t! or vectors (16 bytes)" %}
18248   ins_encode %{
18249     __ por($dst$$XMMRegister, $src$$XMMRegister);
18250   %}
18251   ins_pipe( pipe_slow );
18252 %}
18253 
18254 instruct vor16B_reg(vecX dst, vecX src1, vecX src2) %{
18255   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
18256   match(Set dst (OrV src1 src2));
18257   format %{ "vpor    $dst,$src1,$src2\t! or vectors (16 bytes)" %}
18258   ins_encode %{
18259     int vector_len = 0;
18260     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
18261   %}
18262   ins_pipe( pipe_slow );
18263 %}
18264 
18265 instruct vor16B_mem(vecX dst, vecX src, memory mem) %{
18266   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
18267   match(Set dst (OrV src (LoadVector mem)));
18268   format %{ "vpor    $dst,$src,$mem\t! or vectors (16 bytes)" %}
18269   ins_encode %{
18270     int vector_len = 0;
18271     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
18272   %}
18273   ins_pipe( pipe_slow );
18274 %}
18275 
18276 instruct vor32B_reg(vecY dst, vecY src1, vecY src2) %{
18277   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
18278   match(Set dst (OrV src1 src2));
18279   format %{ "vpor    $dst,$src1,$src2\t! or vectors (32 bytes)" %}
18280   ins_encode %{
18281     int vector_len = 1;
18282     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
18283   %}
18284   ins_pipe( pipe_slow );
18285 %}
18286 
18287 instruct vor32B_mem(vecY dst, vecY src, memory mem) %{
18288   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
18289   match(Set dst (OrV src (LoadVector mem)));
18290   format %{ "vpor    $dst,$src,$mem\t! or vectors (32 bytes)" %}
18291   ins_encode %{
18292     int vector_len = 1;
18293     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
18294   %}
18295   ins_pipe( pipe_slow );
18296 %}
18297 
18298 instruct vor64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
18299   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
18300   match(Set dst (OrV src1 src2));
18301   format %{ "vpor    $dst,$src1,$src2\t! or vectors (64 bytes)" %}
18302   ins_encode %{
18303     int vector_len = 2;
18304     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
18305   %}
18306   ins_pipe( pipe_slow );
18307 %}
18308 
18309 instruct vor64B_mem(vecZ dst, vecZ src, memory mem) %{
18310   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
18311   match(Set dst (OrV src (LoadVector mem)));
18312   format %{ "vpor    $dst,$src,$mem\t! or vectors (64 bytes)" %}
18313   ins_encode %{
18314     int vector_len = 2;
18315     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
18316   %}
18317   ins_pipe( pipe_slow );
18318 %}
18319 
18320 // --------------------------------- XOR --------------------------------------
18321 
18322 instruct vxor4B(vecS dst, vecS src) %{
18323   predicate(n->as_Vector()->length_in_bytes() == 4);
18324   match(Set dst (XorV dst src));
18325   format %{ "pxor    $dst,$src\t! xor vectors (4 bytes)" %}
18326   ins_encode %{
18327     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
18328   %}
18329   ins_pipe( pipe_slow );
18330 %}
18331 
18332 instruct vxor4B_reg(vecS dst, vecS src1, vecS src2) %{
18333   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
18334   match(Set dst (XorV src1 src2));
18335   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (4 bytes)" %}
18336   ins_encode %{
18337     int vector_len = 0;
18338     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
18339   %}
18340   ins_pipe( pipe_slow );
18341 %}
18342 
18343 instruct vxor4B_mem(vecS dst, vecS src, memory mem) %{
18344   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
18345   match(Set dst (XorV src (LoadVector mem)));
18346   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (4 bytes)" %}
18347   ins_encode %{
18348     int vector_len = 0;
18349     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
18350   %}
18351   ins_pipe( pipe_slow );
18352 %}
18353 
18354 instruct vxor8B(vecD dst, vecD src) %{
18355   predicate(n->as_Vector()->length_in_bytes() == 8);
18356   match(Set dst (XorV dst src));
18357   format %{ "pxor    $dst,$src\t! xor vectors (8 bytes)" %}
18358   ins_encode %{
18359     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
18360   %}
18361   ins_pipe( pipe_slow );
18362 %}
18363 
18364 instruct vxor8B_reg(vecD dst, vecD src1, vecD src2) %{
18365   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
18366   match(Set dst (XorV src1 src2));
18367   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (8 bytes)" %}
18368   ins_encode %{
18369     int vector_len = 0;
18370     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
18371   %}
18372   ins_pipe( pipe_slow );
18373 %}
18374 
18375 instruct vxor8B_mem(vecD dst, vecD src, memory mem) %{
18376   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
18377   match(Set dst (XorV src (LoadVector mem)));
18378   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (8 bytes)" %}
18379   ins_encode %{
18380     int vector_len = 0;
18381     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
18382   %}
18383   ins_pipe( pipe_slow );
18384 %}
18385 
18386 instruct vxor16B(vecX dst, vecX src) %{
18387   predicate(n->as_Vector()->length_in_bytes() == 16);
18388   match(Set dst (XorV dst src));
18389   format %{ "pxor    $dst,$src\t! xor vectors (16 bytes)" %}
18390   ins_encode %{
18391     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
18392   %}
18393   ins_pipe( pipe_slow );
18394 %}
18395 
18396 instruct vxor16B_reg(vecX dst, vecX src1, vecX src2) %{
18397   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
18398   match(Set dst (XorV src1 src2));
18399   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (16 bytes)" %}
18400   ins_encode %{
18401     int vector_len = 0;
18402     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
18403   %}
18404   ins_pipe( pipe_slow );
18405 %}
18406 
18407 instruct vxor16B_mem(vecX dst, vecX src, memory mem) %{
18408   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
18409   match(Set dst (XorV src (LoadVector mem)));
18410   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (16 bytes)" %}
18411   ins_encode %{
18412     int vector_len = 0;
18413     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
18414   %}
18415   ins_pipe( pipe_slow );
18416 %}
18417 
18418 instruct vxor32B_reg(vecY dst, vecY src1, vecY src2) %{
18419   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
18420   match(Set dst (XorV src1 src2));
18421   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (32 bytes)" %}
18422   ins_encode %{
18423     int vector_len = 1;
18424     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
18425   %}
18426   ins_pipe( pipe_slow );
18427 %}
18428 
18429 instruct vxor32B_mem(vecY dst, vecY src, memory mem) %{
18430   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
18431   match(Set dst (XorV src (LoadVector mem)));
18432   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (32 bytes)" %}
18433   ins_encode %{
18434     int vector_len = 1;
18435     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
18436   %}
18437   ins_pipe( pipe_slow );
18438 %}
18439 
18440 instruct vxor64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
18441   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
18442   match(Set dst (XorV src1 src2));
18443   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (64 bytes)" %}
18444   ins_encode %{
18445     int vector_len = 2;
18446     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
18447   %}
18448   ins_pipe( pipe_slow );
18449 %}
18450 
18451 instruct vxor64B_mem(vecZ dst, vecZ src, memory mem) %{
18452   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
18453   match(Set dst (XorV src (LoadVector mem)));
18454   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (64 bytes)" %}
18455   ins_encode %{
18456     int vector_len = 2;
18457     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
18458   %}
18459   ins_pipe( pipe_slow );
18460 %}
18461 
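// --------------------------------- VectorCast --------------------------------

// Widening byte casts (B2S/B2I/B2L) are a single sign-extending move
// (vpmovsxbw/vpmovsxbd/vpmovsxbq); the float and double forms sign-extend to
// int lanes first and then convert (vcvtdq2ps/vcvtdq2pd). Narrowing
// short-to-byte casts mask off the high byte of each short and pack, or use
// AVX-512 evpmovwb where it is available.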
18462 instruct vcvt4Bto4S_reg(vecD dst, vecS src) %{
18463   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
18464   match(Set dst (VectorCastB2X src));
18465   format %{ "vpmovsxbw   $dst,$src\t! convert 4B to 4S vector" %}
18466   ins_encode %{
18467     int vector_len = 0;
18468     __ vpmovsxbw($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18469   %}
18470   ins_pipe( pipe_slow );
18471 %}
18472 
18473 instruct vcvt8Bto8S_reg(vecX dst, vecD src) %{
18474   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
18475   match(Set dst (VectorCastB2X src));
18476   format %{ "vpmovsxbw   $dst,$src\t! convert 8B to 8S vector" %}
18477   ins_encode %{
18478     int vector_len = 0;
18479     __ vpmovsxbw($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18480   %}
18481   ins_pipe( pipe_slow );
18482 %}
18483 
18484 instruct vcvt16Bto16S_reg(vecY dst, vecX src) %{
18485   predicate(UseAVX >= 2 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
18486   match(Set dst (VectorCastB2X src));
18487   format %{ "vpmovsxbw   $dst,$src\t! convert 16B to 16S vector" %}
18488   ins_encode %{
18489     int vector_len = 1;
18490     __ vpmovsxbw($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18491   %}
18492   ins_pipe( pipe_slow );
18493 %}
18494 
18495 instruct vcvt32Bto32S_reg(vecZ dst, vecY src) %{
18496   predicate(UseAVX > 2 && n->as_Vector()->length() == 32 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
18497   match(Set dst (VectorCastB2X src));
18498   format %{ "vpmovsxbw   $dst,$src\t! convert 32B to 32S vector" %}
18499   ins_encode %{
18500     int vector_len = 2;
18501     __ vpmovsxbw($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18502   %}
18503   ins_pipe( pipe_slow );
18504 %}
18505 
18506 instruct vcvt4Bto4I_reg(vecX dst, vecS src) %{
18507   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
18508   match(Set dst (VectorCastB2X src));
18509   format %{ "vpmovsxbd   $dst,$src\t! convert 4B to 4I vector" %}
18510   ins_encode %{
18511     int vector_len = 0;
18512     __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18513   %}
18514   ins_pipe( pipe_slow );
18515 %}
18516 
18517 instruct vcvt8Bto8I_reg(vecY dst, vecD src) %{
18518   predicate(UseAVX >= 2 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
18519   match(Set dst (VectorCastB2X src));
18520   format %{ "vpmovsxbd   $dst,$src\t! convert 8B to 8I vector" %}
18521   ins_encode %{
18522     int vector_len = 1;
18523     __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18524   %}
18525   ins_pipe( pipe_slow );
18526 %}
18527 
18528 instruct vcvt16Bto16I_reg(vecZ dst, vecX src) %{
18529   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
18530   match(Set dst (VectorCastB2X src));
18531   format %{ "vpmovsxbd   $dst,$src\t! convert 16B to 16I vector" %}
18532   ins_encode %{
18533     int vector_len = 2;
18534     __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18535   %}
18536   ins_pipe( pipe_slow );
18537 %}
18538 
18539 instruct vcvt4Bto4L_reg(vecY dst, vecS src) %{
18540   predicate(UseAVX >= 2 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
18541   match(Set dst (VectorCastB2X src));
18542   format %{ "vpmovsxbq   $dst,$src\t! convert 4B to 4L vector" %}
18543   ins_encode %{
18544     int vector_len = 1;
18545     __ vpmovsxbq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18546   %}
18547   ins_pipe( pipe_slow );
18548 %}
18549 
18550 instruct vcvt8Bto8L_reg(vecZ dst, vecD src) %{
18551   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
18552   match(Set dst (VectorCastB2X src));
18553   format %{ "vpmovsxbq   $dst,$src\t! convert 8B to 8L vector" %}
18554   ins_encode %{
18555     int vector_len = 2;
18556     __ vpmovsxbq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18557   %}
18558   ins_pipe( pipe_slow );
18559 %}
18560 
18561 instruct vcvt4Bto4F_reg(vecX dst, vecS src) %{
18562   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
18563   match(Set dst (VectorCastB2X src));
18564   format %{ "vpmovsxbd   $dst,$src\n\t"
18565             "vcvtdq2ps   $dst,$dst\t! convert 4B to 4F vector" %}
18566   ins_encode %{
18567     int vector_len = 0;
18568     __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18569     __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
18570   %}
18571   ins_pipe( pipe_slow );
18572 %}
18573 
18574 instruct vcvt8Bto8F_reg(vecY dst, vecD src) %{
18575   predicate(UseAVX >= 2 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
18576   match(Set dst (VectorCastB2X src));
18577   format %{ "vpmovsxbd   $dst,$src\n\t"
18578             "vcvtdq2ps   $dst,$dst\t! convert 8B to 8F vector" %}
18579   ins_encode %{
18580     int vector_len = 1;
18581     __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18582     __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
18583   %}
18584   ins_pipe( pipe_slow );
18585 %}
18586 
18587 instruct vcvt16Bto16F_reg(vecZ dst, vecX src) %{
18588   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
18589   match(Set dst (VectorCastB2X src));
18590   format %{ "vpmovsxbd   $dst,$src\n\t"
18591             "vcvtdq2ps   $dst,$dst\t! convert 16B to 16F vector" %}
18592   ins_encode %{
18593     int vector_len = 2;
18594     __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18595     __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
18596   %}
18597   ins_pipe( pipe_slow );
18598 %}
18599 
18600 instruct vcvt4Bto4D_reg(vecY dst, vecS src) %{
18601   predicate(UseAVX >= 2 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
18602   match(Set dst (VectorCastB2X src));
18603   format %{ "vpmovsxbd   $dst,$src\n\t"
18604             "vcvtdq2pd   $dst,$dst\t! convert 4B to 4D vector" %}
18605   ins_encode %{
18606     __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, 0);
18607     __ vcvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister, 1);
18608   %}
18609   ins_pipe( pipe_slow );
18610 %}
18611 
18612 instruct vcvt8Bto8D_reg(vecZ dst, vecD src) %{
18613   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
18614   match(Set dst (VectorCastB2X src));
18615   format %{ "vpmovsxbd   $dst,$src\n\t"
18616             "vcvtdq2pd   $dst,$dst\t! convert 8B to 8D vector" %}
18617   ins_encode %{
18618     __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, 1);
18619     __ vcvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister, 2);
18620   %}
18621   ins_pipe( pipe_slow );
18622 %}
18623 
18624 instruct vcvt4Sto4B_reg(vecS dst, vecD src, rRegL scratch) %{
18625   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
18626   effect(TEMP scratch);
18627   match(Set dst (VectorCastS2X src));
18628   format %{ "vpand      $dst,$src,[0x00FF00FF00FF00FF]\n\t"
18629             "vpackuswb  $dst,$dst\t! convert 4S to 4B vector" %}
18630   ins_encode %{
18631     int vector_len = 0;
18632     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vector_len, $scratch$$Register);
18633     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
18634   %}
18635   ins_pipe( pipe_slow );
18636 %}
18637 
18638 instruct vcvt8Sto8B_reg(vecD dst, vecX src, rRegL scratch) %{
18639   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
18640   effect(TEMP scratch);
18641   match(Set dst (VectorCastS2X src));
18642   format %{ "vpand      $dst,$src,[0x00FF00FF00FF00FF]\n\t"
18643             "vpackuswb  $dst,$dst\t! convert 8S to 8B vector" %}
18644   ins_encode %{
18645     int vector_len = 0;
18646     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vector_len, $scratch$$Register);
18647     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
18648   %}
18649   ins_pipe( pipe_slow );
18650 %}
18651 
18652 instruct vcvt16Sto16B_reg(vecX dst, vecY src, vecY tmp, rRegL scratch) %{
18653   predicate(UseAVX > 0 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
18654   effect(TEMP scratch, TEMP tmp);
18655   match(Set dst (VectorCastS2X src));
18656   format %{ "vpand      $dst,$src,[0x00FF00FF00FF00FF]\n\t"
18657             "vextracti128 $tmp,$dst,0x1\n\t"
18658             "vpackuswb  $dst,$dst,$tmp\t! convert 16S to 16B vector" %}
18659   ins_encode %{
18660     int vector_len = 1;
18661     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vector_len, $scratch$$Register);
18662     __ vextracti128($tmp$$XMMRegister, $dst$$XMMRegister, 0x1);
18663     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
18664   %}
18665   ins_pipe( pipe_slow );
18666 %}
18667 
18668 instruct vcvt32Sto32B_reg(vecY dst, vecZ src) %{
18669   predicate(UseAVX > 2 && n->as_Vector()->length() == 32 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
18670   match(Set dst (VectorCastS2X src));
  format %{ "evpmovwb   $dst,$src\t! convert 32S to 32B vector" %}
18672   ins_encode %{
18673     int vector_len = 2;
18674     __ evpmovwb($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18675   %}
18676   ins_pipe( pipe_slow );
18677 %}
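
// Widening short casts use the sign-extending moves directly:
// vpmovsxwd for short->int and vpmovsxwq for short->long.  The float
// and double forms widen to int first and then convert with
// vcvtdq2ps/vcvtdq2pd.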
18678 
18679 instruct vcvt2Sto2I_reg(vecD dst, vecS src) %{
18680   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
18681   match(Set dst (VectorCastS2X src));
18682   format %{ "vpmovsxwd   $dst,$src\t! convert 2S to 2I vector" %}
18683   ins_encode %{
18684     int vector_len = 0;
18685     __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18686   %}
18687   ins_pipe( pipe_slow );
18688 %}
18689 
18690 instruct vcvt4Sto4I_reg(vecX dst, vecD src) %{
18691   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
18692   match(Set dst (VectorCastS2X src));
18693   format %{ "vpmovsxwd   $dst,$src\t! convert 4S to 4I vector" %}
18694   ins_encode %{
18695     int vector_len = 0;
18696     __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18697   %}
18698   ins_pipe( pipe_slow );
18699 %}
18700 
18701 instruct vcvt8Sto8I_reg(vecY dst, vecX src) %{
18702   predicate(UseAVX >= 2 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
18703   match(Set dst (VectorCastS2X src));
18704   format %{ "vpmovsxwd   $dst,$src\t! convert 8S to 8I vector" %}
18705   ins_encode %{
18706     int vector_len = 1;
18707     __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18708   %}
18709   ins_pipe( pipe_slow );
18710 %}
18711 
18712 instruct vcvt16Sto16I_reg(vecZ dst, vecY src) %{
18713   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
18714   match(Set dst (VectorCastS2X src));
18715   format %{ "vpmovsxwd   $dst,$src\t! convert 16S to 16I vector" %}
18716   ins_encode %{
18717     int vector_len = 2;
18718     __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18719   %}
18720   ins_pipe( pipe_slow );
18721 %}
18722 
18723 instruct vcvt2Sto2L_reg(vecX dst, vecS src) %{
18724   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
18725   match(Set dst (VectorCastS2X src));
18726   format %{ "vpmovsxwq   $dst,$src\t! convert 2S to 2L vector" %}
18727   ins_encode %{
18728     int vector_len = 0;
18729     __ vpmovsxwq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18730   %}
18731   ins_pipe( pipe_slow );
18732 %}
18733 
18734 instruct vcvt4Sto4L_reg(vecY dst, vecD src) %{
18735   predicate(UseAVX >= 2 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
18736   match(Set dst (VectorCastS2X src));
18737   format %{ "vpmovsxwq   $dst,$src\t! convert 4S to 4L vector" %}
18738   ins_encode %{
18739     int vector_len = 1;
18740     __ vpmovsxwq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18741   %}
18742   ins_pipe( pipe_slow );
18743 %}
18744 
18745 instruct vcvt8Sto8L_reg(vecZ dst, vecX src) %{
18746   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
18747   match(Set dst (VectorCastS2X src));
18748   format %{ "vpmovsxwq   $dst,$src\t! convert 8S to 8L vector" %}
18749   ins_encode %{
18750     int vector_len = 2;
18751     __ vpmovsxwq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18752   %}
18753   ins_pipe( pipe_slow );
18754 %}
18755 
18756 instruct vcvt2Sto2F_reg(vecD dst, vecS src) %{
18757   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
18758   match(Set dst (VectorCastS2X src));
18759   format %{ "vpmovsxwd   $dst,$src\n\t"
18760             "vcvtdq2ps   $dst,$dst\t! convert 2S to 2F vector" %}
18761   ins_encode %{
18762     int vector_len = 0;
18763     __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18764     __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
18765   %}
18766   ins_pipe( pipe_slow );
18767 %}
18768 
18769 instruct vcvt4Sto4F_reg(vecX dst, vecD src) %{
18770   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
18771   match(Set dst (VectorCastS2X src));
18772   format %{ "vpmovsxwd   $dst,$src\n\t"
18773             "vcvtdq2ps   $dst,$dst\t! convert 4S to 4F vector" %}
18774   ins_encode %{
18775     int vector_len = 0;
18776     __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18777     __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
18778   %}
18779   ins_pipe( pipe_slow );
18780 %}
18781 
18782 instruct vcvt8Sto8F_reg(vecY dst, vecX src) %{
18783   predicate(UseAVX >= 2 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
18784   match(Set dst (VectorCastS2X src));
18785   format %{ "vpmovsxwd   $dst,$src\n\t"
18786             "vcvtdq2ps   $dst,$dst\t! convert 8S to 8F vector" %}
18787   ins_encode %{
18788     int vector_len = 1;
18789     __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18790     __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
18791   %}
18792   ins_pipe( pipe_slow );
18793 %}
18794 
18795 instruct vcvt16Sto16F_reg(vecZ dst, vecY src) %{
18796   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
18797   match(Set dst (VectorCastS2X src));
18798   format %{ "vpmovsxwd   $dst,$src\n\t"
18799             "vcvtdq2ps   $dst,$dst\t! convert 16S to 16F vector" %}
18800   ins_encode %{
18801     int vector_len = 2;
18802     __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18803     __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
18804   %}
18805   ins_pipe( pipe_slow );
18806 %}
18807 
18808 instruct vcvt2Sto2D_reg(vecX dst, vecS src) %{
18809   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
18810   match(Set dst (VectorCastS2X src));
18811   format %{ "vpmovsxwd   $dst,$src\n\t"
18812             "vcvtdq2pd   $dst,$dst\t! convert 2S to 2D vector" %}
18813   ins_encode %{
18814     int vector_len = 0;
18815     __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18816     __ vcvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
18817   %}
18818   ins_pipe( pipe_slow );
18819 %}
18820 
18821 instruct vcvt4Sto4D_reg(vecY dst, vecD src) %{
18822   predicate(UseAVX >= 2 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
18823   match(Set dst (VectorCastS2X src));
18824   format %{ "vpmovsxwd   $dst,$src\n\t"
18825             "vcvtdq2pd   $dst,$dst\t! convert 4S to 4D vector" %}
18826   ins_encode %{
18827     int vector_len = 1;
18828     __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18829     __ vcvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
18830   %}
18831   ins_pipe( pipe_slow );
18832 %}
18833 
18834 instruct vcvt8Sto8D_reg(vecZ dst, vecX src) %{
18835   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
18836   match(Set dst (VectorCastS2X src));
18837   format %{ "vpmovsxwd   $dst,$src\n\t"
18838             "vcvtdq2pd   $dst,$dst\t! convert 8S to 8D vector" %}
18839   ins_encode %{
18840     int vector_len = 2;
18841     __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18842     __ vcvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
18843   %}
18844   ins_pipe( pipe_slow );
18845 %}
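
// Int-to-byte narrows need two pack steps (dword->word, then
// word->byte), again masking first so out-of-range ints truncate
// instead of saturating.  For the 8-element case the upper 128-bit
// lane is folded in with vextracti128 before packing, because the pack
// instructions operate within each 128-bit lane.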
18846 
18847 instruct vcvt4Ito4B_reg(vecS dst, vecX src, rRegL scratch) %{
18848   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
18849   effect(TEMP scratch);
18850   match(Set dst (VectorCastI2X src));
18851   format %{ "vpand      $dst,$src,[0x000000FF000000FF]\n\t"
18852             "vpackusdw  $dst,$dst\n\t"
18853             "vpackuswb  $dst,$dst\t! convert 4I to 4B vector" %}
18854   ins_encode %{
18855     int vector_len = 0;
18856     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vector_len, $scratch$$Register);
18857     __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
18858     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
18859   %}
18860   ins_pipe( pipe_slow );
18861 %}
18862 
18863 instruct vcvt8Ito8B_reg(vecD dst, vecY src, vecY tmp, rRegL scratch) %{
18864   predicate(UseAVX >= 2 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
18865   effect(TEMP scratch, TEMP tmp);
18866   match(Set dst (VectorCastI2X src));
18867   format %{ "vpand      $dst,$src,[0x000000FF000000FF]\n\t"
18868             "vextracti128 $tmp,$dst,0x1\n\t"
18869             "vpackusdw  $dst,$dst,$tmp\n\t"
18870             "vpackuswb  $dst,$dst\t! convert 8I to 8B vector" %}
18871   ins_encode %{
18872     int vector_len = 1;
18873     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vector_len, $scratch$$Register);
18874     __ vextracti128($tmp$$XMMRegister, $dst$$XMMRegister, 0x1);
18875     __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
    __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);  // 128-bit pack: the 8 shorts already sit in the low 128 bits
18877   %}
18878   ins_pipe( pipe_slow );
18879 %}
18880 
18881 instruct vcvt16Ito16B_reg(vecX dst, vecZ src) %{
18882   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
18883   match(Set dst (VectorCastI2X src));
  format %{ "evpmovdb   $dst,$src\t! convert 16I to 16B vector" %}
18885   ins_encode %{
18886     int vector_len = 2;
18887     __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18888   %}
18889   ins_pipe( pipe_slow );
18890 %}
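
// Int-to-short narrows follow the same mask-then-pack idea with
// vpackusdw.  For example, 0xFFFF8001 (-32767) is first masked to
// 0x00008001 so the unsigned-saturating pack yields 0x8001 rather than
// clamping a negative input to 0x0000.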
18891 
18892 instruct vcvt2Ito2S_reg(vecS dst, vecD src, rRegL scratch) %{
18893   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
18894   effect(TEMP scratch);
18895   match(Set dst (VectorCastI2X src));
18896   format %{ "vpand      $dst,$src,[0x0000FFFF0000FFFF]\n\t"
18897             "vpackusdw  $dst,$dst\t! convert 2I to 2S vector" %}
18898   ins_encode %{
18899     int vector_len = 0;
18900     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vector_len, $scratch$$Register);
18901     __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
18902   %}
18903   ins_pipe( pipe_slow );
18904 %}
18905 
18906 instruct vcvt4Ito4S_reg(vecD dst, vecX src, rRegL scratch) %{
18907   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
18908   effect(TEMP scratch);
18909   match(Set dst (VectorCastI2X src));
18910   format %{ "vpand      $dst,$src,[0x0000FFFF0000FFFF]\n\t"
18911             "vpackusdw  $dst,$dst\t! convert 4I to 4S vector" %}
18912   ins_encode %{
18913     int vector_len = 0;
18914     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vector_len, $scratch$$Register);
18915     __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
18916   %}
18917   ins_pipe( pipe_slow );
18918 %}
18919 
18920 instruct vcvt8Ito8S_reg(vecX dst, vecY src, vecY tmp, rRegL scratch) %{
18921   predicate(UseAVX >= 2 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
18922   effect(TEMP scratch, TEMP tmp);
18923   match(Set dst (VectorCastI2X src));
18924   format %{ "vpand      $dst,$src,[0x0000FFFF0000FFFF]\n\t"
18925             "vextracti128 $tmp,$dst,0x1\n\t"
18926             "vpackusdw  $dst,$dst,$tmp\t! convert 8I to 8S vector" %}
18927   ins_encode %{
18928     int vector_len = 1;
18929     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vector_len, $scratch$$Register);
18930     __ vextracti128($tmp$$XMMRegister, $dst$$XMMRegister, 0x1);
18931     __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
18932   %}
18933   ins_pipe( pipe_slow );
18934 %}
18935 
18936 instruct vcvt16Ito16S_reg(vecY dst, vecZ src) %{
18937   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
18938   match(Set dst (VectorCastI2X src));
  format %{ "evpmovdw   $dst,$src\t! convert 16I to 16S vector" %}
18940   ins_encode %{
18941     int vector_len = 2;
18942     __ evpmovdw($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18943   %}
18944   ins_pipe( pipe_slow );
18945 %}
18946 
18947 instruct vcvt2Ito2L_reg(vecX dst, vecD src) %{
18948   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
18949   match(Set dst (VectorCastI2X src));
18950   format %{ "vpmovsxdq   $dst,$src\t! convert 2I to 2L vector" %}
18951   ins_encode %{
18952     int vector_len = 0;
18953     __ vpmovsxdq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18954   %}
18955   ins_pipe( pipe_slow );
18956 %}
18957 
18958 instruct vcvt4Ito4L_reg(vecY dst, vecX src) %{
18959   predicate(UseAVX >= 2 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
18960   match(Set dst (VectorCastI2X src));
18961   format %{ "vpmovsxdq   $dst,$src\t! convert 4I to 4L vector" %}
18962   ins_encode %{
18963     int vector_len = 1;
18964     __ vpmovsxdq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18965   %}
18966   ins_pipe( pipe_slow );
18967 %}
18968 
18969 instruct vcvt8Ito8L_reg(vecZ dst, vecY src) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
18971   match(Set dst (VectorCastI2X src));
18972   format %{ "vpmovsxdq   $dst,$src\t! convert 8I to 8L vector" %}
18973   ins_encode %{
18974     int vector_len = 2;
18975     __ vpmovsxdq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18976   %}
18977   ins_pipe( pipe_slow );
18978 %}
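
// Int-to-float and int-to-double casts map straight onto
// vcvtdq2ps/vcvtdq2pd; no shuffling or masking is needed, only the
// vector_len selection.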
18979 
18980 instruct vcvt2Ito2F_reg(vecD dst, vecD src) %{
18981   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
18982   match(Set dst (VectorCastI2X src));
18983   format %{ "vcvtdq2ps   $dst,$src\t! convert 2I to 2F vector" %}
18984   ins_encode %{
18985     int vector_len = 0;
18986     __ vcvtdq2ps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18987   %}
18988   ins_pipe( pipe_slow );
18989 %}
18990 
18991 instruct vcvt4Ito4F_reg(vecX dst, vecX src) %{
18992   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
18993   match(Set dst (VectorCastI2X src));
18994   format %{ "vcvtdq2ps   $dst,$src\t! convert 4I to 4F vector" %}
18995   ins_encode %{
18996     int vector_len = 0;
18997     __ vcvtdq2ps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18998   %}
18999   ins_pipe( pipe_slow );
19000 %}
19001 
19002 instruct vcvt8Ito8F_reg(vecY dst, vecY src) %{
19003   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
19004   match(Set dst (VectorCastI2X src));
19005   format %{ "vcvtdq2ps   $dst,$src\t! convert 8I to 8F vector" %}
19006   ins_encode %{
19007     int vector_len = 1;
19008     __ vcvtdq2ps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
19009   %}
19010   ins_pipe( pipe_slow );
19011 %}
19012 
instruct vcvt16Ito16F_reg(vecZ dst, vecZ src) %{
19014   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
19015   match(Set dst (VectorCastI2X src));
19016   format %{ "vcvtdq2ps   $dst,$src\t! convert 16I to 16F vector" %}
19017   ins_encode %{
19018     int vector_len = 2;
19019     __ vcvtdq2ps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
19020   %}
19021   ins_pipe( pipe_slow );
19022 %}
19023 
19024 instruct vcvt2Ito2D_reg(vecX dst, vecD src) %{
19025   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19026   match(Set dst (VectorCastI2X src));
19027   format %{ "vcvtdq2pd   $dst,$src\t! convert 2I to 2D vector" %}
19028   ins_encode %{
19029     int vector_len = 0;
19030     __ vcvtdq2pd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
19031   %}
19032   ins_pipe( pipe_slow );
19033 %}
19034 
19035 instruct vcvt4Ito4D_reg(vecY dst, vecX src) %{
19036   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19037   match(Set dst (VectorCastI2X src));
19038   format %{ "vcvtdq2pd   $dst,$src\t! convert 4I to 4D vector" %}
19039   ins_encode %{
19040     int vector_len = 1;
19041     __ vcvtdq2pd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
19042   %}
19043   ins_pipe( pipe_slow );
19044 %}
19045 
19046 instruct vcvt8Ito8D_reg(vecZ dst, vecY src) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19048   match(Set dst (VectorCastI2X src));
19049   format %{ "vcvtdq2pd   $dst,$src\t! convert 8I to 8D vector" %}
19050   ins_encode %{
19051     int vector_len = 2;
19052     __ vcvtdq2pd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
19053   %}
19054   ins_pipe( pipe_slow );
19055 %}
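
// Long-to-narrow casts have no truncating move before AVX-512, so the
// AVX2 paths first gather the low dword of each qword with a
// vpermilps+vpermpd pair (immediate 8 selects elements 0 and 2) and
// then reuse the int mask-and-pack sequence.  A sketch for the
// 4-element case, with | marking the 128-bit lane boundary:
//
//   src (4 longs)  : q0 q1 | q2 q3
//   vpermilps,8    : lo(q0) lo(q1) .. .. | lo(q2) lo(q3) .. ..
//   vpermpd,8      : lo(q0) lo(q1) lo(q2) lo(q3) | .. .. .. ..
//
// On AVX-512 the evpmovq* truncating moves replace the whole sequence.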
19056 
19057 instruct vcvt4Lto4B_reg(vecS dst, vecY src, rRegL scratch) %{
19058   predicate(UseAVX >= 2 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
19059   match(Set dst (VectorCastL2X src));
19060   effect(TEMP scratch);
19061   format %{ "vpermilps  $dst,$src,8\n\t"
19062             "vpermpd    $dst,$dst,8\n\t"
19063             "vpand      $dst,$dst,[0x000000FF000000FF]\n\t"
19064             "vpackusdw  $dst,$dst\n\t"
19065             "vpackuswb  $dst,$dst\t! convert 4L to 4B vector" %}
19066   ins_encode %{
19067     int vector_len = 1;
19068     __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, vector_len);
19069     __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, vector_len);
    // The elements have been narrowed to ints, so do the remaining operations at 128-bit width.
19071     vector_len = 0;
19072     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vector_len, $scratch$$Register);
19073     __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
19074     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
19075   %}
19076   ins_pipe( pipe_slow );
19077 %}
19078 
19079 instruct vcvt8Lto8B_reg(vecD dst, vecZ src) %{
19080   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
19081   match(Set dst (VectorCastL2X src));
  format %{ "evpmovqb   $dst,$src\t! convert 8L to 8B vector" %}
19083   ins_encode %{
19084     int vector_len = 2;
19085     __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, vector_len);
19086   %}
19087   ins_pipe( pipe_slow );
19088 %}
19089 
19090 instruct vcvt2Lto2S_reg(vecS dst, vecX src, rRegL scratch) %{
19091   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
19092   match(Set dst (VectorCastL2X src));
19093   effect(TEMP scratch);
19094   format %{ "vpshufd    $dst,$src,8\n\t"
19095             "vpand      $dst,$dst,[0x0000FFFF0000FFFF]\n\t"
19096             "vpackusdw  $dst,$dst\t! convert 2L to 2S vector" %}
19097   ins_encode %{
19098     int vector_len = 0;
19099     __ vpshufd($dst$$XMMRegister, $src$$XMMRegister, 8, vector_len);
19100     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vector_len, $scratch$$Register);
19101     __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
19102   %}
19103   ins_pipe( pipe_slow );
19104 %}
19105 
19106 instruct vcvt4Lto4S_reg(vecD dst, vecY src, rRegL scratch) %{
19107   predicate(UseAVX >= 2 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
19108   match(Set dst (VectorCastL2X src));
19109   effect(TEMP scratch);
19110   format %{ "vpermilps  $dst,$src,8\n\t"
19111             "vpermpd    $dst,$dst,8\n\t"
19112             "vpand      $dst,$dst,[0x0000FFFF0000FFFF]\n\t"
19113             "vpackusdw  $dst,$dst\t! convert 4L to 4S vector" %}
19114   ins_encode %{
19115     int vector_len = 1;
19116     __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, vector_len);
19117     __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, vector_len);
    // The elements have been narrowed to ints, so do the remaining operations at 128-bit width.
19119     vector_len = 0;
19120     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vector_len, $scratch$$Register);
19121     __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
19122   %}
19123   ins_pipe( pipe_slow );
19124 %}
19125 
19126 instruct vcvt8Lto8S_reg(vecX dst, vecZ src) %{
19127   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
19128   match(Set dst (VectorCastL2X src));
  format %{ "evpmovqw   $dst,$src\t! convert 8L to 8S vector" %}
19130   ins_encode %{
19131     int vector_len = 2;
19132     __ evpmovqw($dst$$XMMRegister, $src$$XMMRegister, vector_len);
19133   %}
19134   ins_pipe( pipe_slow );
19135 %}
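
// Long-to-int truncation just keeps the low dword of each qword: the
// 1-element form is at most a register move, and the 2-element form is
// a single pshufd/vpshufd with immediate 8 (select dwords 0 and 2).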
19136 
19137 instruct vcvt1Lto1I_reg(vecS dst, vecD src) %{
19138   predicate(n->as_Vector()->length() == 1 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
19139   match(Set dst (VectorCastL2X src));
19140   format %{ "movdqu   $dst,$src\t! convert 1L to 1I vector" %}
19141   ins_encode %{
    // If the source and destination registers are the same, no move is needed.
19143     if ($dst$$XMMRegister != $src$$XMMRegister) {
19144       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
19145     }
19146   %}
19147   ins_pipe( pipe_slow );
19148 %}
19149 
19150 instruct vcvt2Lto2I_reg(vecD dst, vecX src) %{
19151   predicate(UseAVX == 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
19152   match(Set dst (VectorCastL2X src));
19153   format %{ "pshufd   $dst,$src,8\t! convert 2L to 2I vector" %}
19154   ins_encode %{
19155     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 8);
19156   %}
19157   ins_pipe( pipe_slow );
19158 %}
19159 
19160 instruct vcvt2Lto2I_reg_avx(vecD dst, vecX src) %{
19161   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
19162   match(Set dst (VectorCastL2X src));
19163   format %{ "vpshufd   $dst,$src,8\t! convert 2L to 2I vector" %}
19164   ins_encode %{
19165     int vector_len = 0;
19166     __ vpshufd($dst$$XMMRegister, $src$$XMMRegister, 8, vector_len);
19167   %}
19168   ins_pipe( pipe_slow );
19169 %}
19170 
19171 instruct vcvt4Lto4I_reg(vecX dst, vecY src) %{
19172   predicate(UseAVX >= 2 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
19173   match(Set dst (VectorCastL2X src));
19174   format %{ "vpermilps  $dst,$src,8\n\t"
            "vpermpd    $dst,$dst,8\t! convert 4L to 4I vector" %}
19176   ins_encode %{
19177     int vector_len = 1;
19178     __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, vector_len);
19179     __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, vector_len);
19180   %}
19181   ins_pipe( pipe_slow );
19182 %}
19183 
19184 instruct vcvt8Lto8I_reg(vecY dst, vecZ src) %{
19185   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
19186   match(Set dst (VectorCastL2X src));
  format %{ "evpmovqd   $dst,$src\t! convert 8L to 8I vector" %}
19188   ins_encode %{
19189     int vector_len = 2;
19190     __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
19191   %}
19192   ins_pipe( pipe_slow );
19193 %}
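
// Long-to-floating casts use vcvtqq2ps/vcvtqq2pd, which exist only
// with AVX512DQ, hence the supports_avx512dq() checks.  The 128- and
// 256-bit forms use EVEX encodings and so presumably also rely on
// AVX512VL being present alongside DQ on the targeted CPUs.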
19194 
19195 instruct vcvt2Lto2F_reg(vecD dst, vecX src) %{
19196   predicate(UseAVX > 2 && VM_Version::supports_avx512dq() && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
19197   match(Set dst (VectorCastL2X src));
19198   format %{ "vcvtqq2ps   $dst,$src\t! convert 2L to 2F vector" %}
19199   ins_encode %{
19200     int vector_len = 0;
19201     __ evcvtqq2ps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
19202   %}
19203   ins_pipe( pipe_slow );
19204 %}
19205 
19206 instruct vcvt4Lto4F_reg(vecX dst, vecY src) %{
19207   predicate(UseAVX > 2 && VM_Version::supports_avx512dq() && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
19208   match(Set dst (VectorCastL2X src));
19209   format %{ "vcvtqq2ps   $dst,$src\t! convert 4L to 4F vector" %}
19210   ins_encode %{
19211     int vector_len = 1;
19212     __ evcvtqq2ps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
19213   %}
19214   ins_pipe( pipe_slow );
19215 %}
19216 
19217 instruct vcvt8Lto8F_reg(vecY dst, vecZ src) %{
19218   predicate(UseAVX > 2 && VM_Version::supports_avx512dq() && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
19219   match(Set dst (VectorCastL2X src));
19220   format %{ "vcvtqq2ps   $dst,$src\t! convert 8L to 8F vector" %}
19221   ins_encode %{
19222     int vector_len = 2;
19223     __ evcvtqq2ps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
19224   %}
19225   ins_pipe( pipe_slow );
19226 %}
19227 
19228 instruct vcvt1Lto1D_reg(vecD dst, vecD src) %{
19229   predicate(UseAVX > 2 && VM_Version::supports_avx512dq() && n->as_Vector()->length() == 1 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19230   match(Set dst (VectorCastL2X src));
19231   format %{ "vcvtqq2pd   $dst,$src\t! convert 1L to 1D vector" %}
19232   ins_encode %{
19233     int vector_len = 0;
19234     __ evcvtqq2pd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
19235   %}
19236   ins_pipe( pipe_slow );
19237 %}
19238 
19239 instruct vcvt2Lto2D_reg(vecX dst, vecX src) %{
19240   predicate(UseAVX > 2 && VM_Version::supports_avx512dq() && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19241   match(Set dst (VectorCastL2X src));
19242   format %{ "vcvtqq2pd   $dst,$src\t! convert 2L to 2D vector" %}
19243   ins_encode %{
19244     int vector_len = 0;
19245     __ evcvtqq2pd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
19246   %}
19247   ins_pipe( pipe_slow );
19248 %}
19249 
19250 instruct vcvt4Lto4D_reg(vecY dst, vecY src) %{
19251   predicate(UseAVX > 2 && VM_Version::supports_avx512dq() && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19252   match(Set dst (VectorCastL2X src));
19253   format %{ "vcvtqq2pd   $dst,$src\t! convert 4L to 4D vector" %}
19254   ins_encode %{
19255     int vector_len = 1;
19256     __ evcvtqq2pd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
19257   %}
19258   ins_pipe( pipe_slow );
19259 %}
19260 
19261 instruct vcvt8Lto8D_reg(vecZ dst, vecZ src) %{
19262   predicate(UseAVX > 2 && VM_Version::supports_avx512dq() && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19263   match(Set dst (VectorCastL2X src));
19264   format %{ "vcvtqq2pd   $dst,$src\t! convert 8L to 8D vector" %}
19265   ins_encode %{
19266     int vector_len = 2;
19267     __ evcvtqq2pd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
19268   %}
19269   ins_pipe( pipe_slow );
19270 %}
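
// Float<->double casts map directly onto vcvtps2pd/vcvtpd2ps, which
// change the element width as part of the conversion, so no separate
// pack or extend step is required.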
19271 
19272 instruct vcvt2Fto2D_reg(vecX dst, vecD src) %{
19273   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19274   match(Set dst (VectorCastF2X src));
19275   format %{ "vcvtps2pd   $dst,$src\t! convert 2F to 2D vector" %}
19276   ins_encode %{
19277     int vector_len = 0;
19278     __ vcvtps2pd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
19279   %}
19280   ins_pipe( pipe_slow );
19281 %}
19282 
19283 instruct vcvt4Fto4D_reg(vecY dst, vecX src) %{
19284   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19285   match(Set dst (VectorCastF2X src));
19286   format %{ "vcvtps2pd   $dst,$src\t! convert 4F to 4D vector" %}
19287   ins_encode %{
19288     int vector_len = 1;
19289     __ vcvtps2pd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
19290   %}
19291   ins_pipe( pipe_slow );
19292 %}
19293 
19294 instruct vcvt8Fto8D_reg(vecZ dst, vecY src) %{
19295   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19296   match(Set dst (VectorCastF2X src));
19297   format %{ "evcvtps2pd   $dst,$src\t! convert 8F to 8D vector" %}
19298   ins_encode %{
19299     int vector_len = 2;
19300     __ evcvtps2pd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
19301   %}
19302   ins_pipe( pipe_slow );
19303 %}
19304 
19305 instruct vcvt2Dto2F_reg(vecD dst, vecX src) %{
19306   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
19307   match(Set dst (VectorCastD2X src));
19308   format %{ "vcvtpd2ps   $dst,$src\t! convert 2D to 2F vector" %}
19309   ins_encode %{
19310     int vector_len = 0;
19311     __ vcvtpd2ps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
19312   %}
19313   ins_pipe( pipe_slow );
19314 %}
19315 
19316 instruct vcvt4Dto4F_reg(vecX dst, vecY src) %{
19317   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
19318   match(Set dst (VectorCastD2X src));
19319   format %{ "vcvtpd2ps   $dst,$src\t! convert 4D to 4F vector" %}
19320   ins_encode %{
19321     int vector_len = 1;
19322     __ vcvtpd2ps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
19323   %}
19324   ins_pipe( pipe_slow );
19325 %}
19326 
19327 instruct vcvt8Dto8F_reg(vecY dst, vecZ src) %{
19328   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
19329   match(Set dst (VectorCastD2X src));
19330   format %{ "evcvtpd2ps   $dst,$src\t! convert 8D to 8F vector" %}
19331   ins_encode %{
19332     int vector_len = 2;
19333     __ evcvtpd2ps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
19334   %}
19335   ins_pipe( pipe_slow );
19336 %}
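
// Vector mask compares produce an all-ones lane where the predicate
// holds and an all-zeroes lane where it does not.  On AVX/AVX2 the
// immediate forms of vcmpps/vcmppd deliver that mask directly into the
// destination vector.  The ordered, quiet predicates (EQ_OQ, LT_OQ,
// GT_OQ, ...) evaluate to false when either operand is NaN, matching
// Java semantics for <, <=, >, >= and ==; only != uses the unordered
// NEQ_UQ, since per JLS 15.21.1 a NaN compares unequal to everything.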
19337 
19338 instruct vcmpeq2F(vecD dst, vecD src1, vecD src2) %{
19339   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
19340             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
19341             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
19342   match(Set dst (VectorMaskCmp src1 src2));
19343   format %{ "vcmpeqps  $dst,$src1,$src2\t! cmpeq packed2F" %}
19344   ins_encode %{
19345     int vector_len = 0;
19346     Assembler::ComparisonPredicateFP cmp = Assembler::EQ_OQ;  // ordered non-signaling
19347     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19348   %}
19349   ins_pipe( pipe_slow );
19350 %}
19351 
19352 instruct vcmpeq4F(vecX dst, vecX src1, vecX src2) %{
19353   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
19354             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
19355             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
19356   match(Set dst (VectorMaskCmp src1 src2));
19357   format %{ "vcmpeqps  $dst,$src1,$src2\t! cmpeq packed4F" %}
19358   ins_encode %{
19359     int vector_len = 0;
19360     Assembler::ComparisonPredicateFP cmp = Assembler::EQ_OQ;  // ordered non-signaling
19361     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19362   %}
19363   ins_pipe( pipe_slow );
19364 %}
19365 
19366 instruct vcmpeq8F(vecY dst, vecY src1, vecY src2) %{
19367   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
19368             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
19369             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
19370   match(Set dst (VectorMaskCmp src1 src2));
19371   format %{ "vcmpeqps  $dst,$src1,$src2\t! cmpeq packed8F" %}
19372   ins_encode %{
19373     int vector_len = 1;
19374     Assembler::ComparisonPredicateFP cmp = Assembler::EQ_OQ;  // ordered non-signaling
19375     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19376   %}
19377   ins_pipe( pipe_slow );
19378 %}
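
// On AVX-512 (512-bit operands) the compare writes a k mask register
// rather than a vector, so the instructs below materialize the lane
// mask with a zeroing masked load of an all-bits-set constant:
//
//   evcmpps   k2, k0, src1, src2, cmp    // k2 bit i = result for lane i
//   evmovdqul dst, k2{z}, [all ones]     // lane i = all ones iff k2[i]
//
// k2 is hardcoded because this file does not register-allocate the k
// registers; k0 as the mask operand means the compare itself is
// unmasked.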
19379 
19380 instruct vcmpeq16F(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
19381   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 &&
19382             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
19383             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
19384   match(Set dst (VectorMaskCmp src1 src2));
19385   effect(TEMP dst, TEMP scratch);
19386   format %{ "vcmpeqps  k2,$src1,$src2\n\t"
            "vmovdqul  $dst,k2{z},[0xFFFFFFFFFFFFFFFF]\t! cmpeq packed16F" %}
19388   ins_encode %{
19389     int vector_len = 2;
19390     Assembler::ComparisonPredicateFP cmp = Assembler::EQ_OQ;  // ordered non-signaling
19391     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
19392     KRegister mask = k0; // The comparison itself is not being masked.
19393     __ evcmpps(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19394     __ evmovdqul($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
19395   %}
19396   ins_pipe( pipe_slow );
19397 %}
19398 
19399 instruct vcmplt2F(vecD dst, vecD src1, vecD src2) %{
19400   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
19401             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
19402             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
19403   match(Set dst (VectorMaskCmp src1 src2));
19404   format %{ "vcmpltps  $dst,$src1,$src2\t! cmplt packed2F" %}
19405   ins_encode %{
19406     int vector_len = 0;
19407     Assembler::ComparisonPredicateFP cmp = Assembler::LT_OQ; //ordered non-signaling
19408     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19409   %}
19410   ins_pipe( pipe_slow );
19411 %}
19412 
19413 instruct vcmplt4F(vecX dst, vecX src1, vecX src2) %{
19414   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
19415             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
19416             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
19417   match(Set dst (VectorMaskCmp src1 src2));
19418   format %{ "vcmpltps  $dst,$src1,$src2\t! cmplt packed4F" %}
19419   ins_encode %{
19420     int vector_len = 0;
19421     Assembler::ComparisonPredicateFP cmp = Assembler::LT_OQ; //ordered non-signaling
19422     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19423   %}
19424   ins_pipe( pipe_slow );
19425 %}
19426 
19427 instruct vcmplt8F(vecY dst, vecY src1, vecY src2) %{
19428   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
19429             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
19430             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
19431   match(Set dst (VectorMaskCmp src1 src2));
19432   format %{ "vcmpltps  $dst,$src1,$src2\t! cmplt packed8F" %}
19433   ins_encode %{
19434     int vector_len = 1;
19435     Assembler::ComparisonPredicateFP cmp = Assembler::LT_OQ; //ordered non-signaling
19436     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19437   %}
19438   ins_pipe( pipe_slow );
19439 %}
19440 
19441 instruct vcmplt16F(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
19442   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 &&
19443             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
19444             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
19445   match(Set dst (VectorMaskCmp src1 src2));
19446   effect(TEMP dst, TEMP scratch);
19447   format %{ "vcmpltps  k2,$src1,$src2\n\t"
            "vmovdqul  $dst,k2{z},[0xFFFFFFFFFFFFFFFF]\t! cmplt packed16F" %}
19449   ins_encode %{
19450     int vector_len = 2;
19451     Assembler::ComparisonPredicateFP cmp = Assembler::LT_OQ; //ordered non-signaling
19452     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
19453     KRegister mask = k0; // The comparison itself is not being masked.
19454     __ evcmpps(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19455     __ evmovdqul($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
19456   %}
19457   ins_pipe( pipe_slow );
19458 %}
19459 
19460 instruct vcmpgt2F(vecD dst, vecD src1, vecD src2) %{
19461   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
19462             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
19463             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
19464   match(Set dst (VectorMaskCmp src1 src2));
19465   format %{ "vcmpgtps  $dst,$src1,$src2\t! cmpgt packed2F" %}
19466   ins_encode %{
19467     int vector_len = 0;
19468     Assembler::ComparisonPredicateFP cmp = Assembler::GT_OQ; //ordered non-signaling
19469     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19470   %}
19471   ins_pipe( pipe_slow );
19472 %}
19473 
19474 instruct vcmpgt4F(vecX dst, vecX src1, vecX src2) %{
19475   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
19476             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
19477             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
19478   match(Set dst (VectorMaskCmp src1 src2));
19479   format %{ "vcmpgtps  $dst,$src1,$src2\t! cmpgt packed4F" %}
19480   ins_encode %{
19481     int vector_len = 0;
19482     Assembler::ComparisonPredicateFP cmp = Assembler::GT_OQ; //ordered non-signaling
19483     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19484   %}
19485   ins_pipe( pipe_slow );
19486 %}
19487 
19488 instruct vcmpgt8F(vecY dst, vecY src1, vecY src2) %{
19489   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
19490             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
19491             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
19492   match(Set dst (VectorMaskCmp src1 src2));
19493   format %{ "vcmpgtps  $dst,$src1,$src2\t! cmpgt packed8F" %}
19494   ins_encode %{
19495     int vector_len = 1;
19496     Assembler::ComparisonPredicateFP cmp = Assembler::GT_OQ; //ordered non-signaling
19497     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19498   %}
19499   ins_pipe( pipe_slow );
19500 %}
19501 
19502 instruct vcmpgt16F(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
19503   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 &&
19504             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
19505             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
19506   match(Set dst (VectorMaskCmp src1 src2));
19507   effect(TEMP dst, TEMP scratch);
19508   format %{ "vcmpgtps  k2,$src1,$src2\n\t"
            "vmovdqul  $dst,k2{z},[0xFFFFFFFFFFFFFFFF]\t! cmpgt packed16F" %}
19510   ins_encode %{
19511     int vector_len = 2;
19512     Assembler::ComparisonPredicateFP cmp = Assembler::GT_OQ; //ordered non-signaling
19513     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
19514     KRegister mask = k0; // The comparison itself is not being masked.
19515     __ evcmpps(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19516     __ evmovdqul($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
19517   %}
19518   ins_pipe( pipe_slow );
19519 %}
19520 
19521 instruct vcmpge2F(vecD dst, vecD src1, vecD src2) %{
19522   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
19523             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
19524             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
19525   match(Set dst (VectorMaskCmp src1 src2));
19526   format %{ "vcmpgeps  $dst,$src1,$src2\t! cmpge packed2F" %}
19527   ins_encode %{
19528     int vector_len = 0;
19529     Assembler::ComparisonPredicateFP cmp = Assembler::GE_OQ; //ordered non-signaling
19530     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19531   %}
19532   ins_pipe( pipe_slow );
19533 %}
19534 
19535 instruct vcmpge4F(vecX dst, vecX src1, vecX src2) %{
19536   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
19537             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
19538             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
19539   match(Set dst (VectorMaskCmp src1 src2));
19540   format %{ "vcmpgeps  $dst,$src1,$src2\t! cmpge packed4F" %}
19541   ins_encode %{
19542     int vector_len = 0;
19543     Assembler::ComparisonPredicateFP cmp = Assembler::GE_OQ; //ordered non-signaling
19544     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19545   %}
19546   ins_pipe( pipe_slow );
19547 %}
19548 
19549 instruct vcmpge8F(vecY dst, vecY src1, vecY src2) %{
19550   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
19551             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
19552             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
19553   match(Set dst (VectorMaskCmp src1 src2));
19554   format %{ "vcmpgeps  $dst,$src1,$src2\t! cmpge packed8F" %}
19555   ins_encode %{
19556     int vector_len = 1;
19557     Assembler::ComparisonPredicateFP cmp = Assembler::GE_OQ; //ordered non-signaling
19558     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19559   %}
19560   ins_pipe( pipe_slow );
19561 %}
19562 
19563 instruct vcmpge16F(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
19564   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 &&
19565             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
19566             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
19567   match(Set dst (VectorMaskCmp src1 src2));
19568   effect(TEMP dst, TEMP scratch);
19569   format %{ "vcmpgeps  k2,$src1,$src2\n\t"
            "vmovdqul  $dst,k2{z},[0xFFFFFFFFFFFFFFFF]\t! cmpge packed16F" %}
19571   ins_encode %{
19572     int vector_len = 2;
19573     Assembler::ComparisonPredicateFP cmp = Assembler::GE_OQ; //ordered non-signaling
19574     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
19575     KRegister mask = k0; // The comparison itself is not being masked.
19576     __ evcmpps(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19577     __ evmovdqul($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
19578   %}
19579   ins_pipe( pipe_slow );
19580 %}
19581 
19582 instruct vcmple2F(vecD dst, vecD src1, vecD src2) %{
19583   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
19584             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
19585             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
19586   match(Set dst (VectorMaskCmp src1 src2));
19587   format %{ "vcmpleps  $dst,$src1,$src2\t! cmple packed2F" %}
19588   ins_encode %{
19589     int vector_len = 0;
19590     Assembler::ComparisonPredicateFP cmp = Assembler::LE_OQ; //ordered non-signaling
19591     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19592   %}
19593   ins_pipe( pipe_slow );
19594 %}
19595 
19596 instruct vcmple4F(vecX dst, vecX src1, vecX src2) %{
19597   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
19598             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
19599             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
19600   match(Set dst (VectorMaskCmp src1 src2));
19601   format %{ "vcmpleps  $dst,$src1,$src2\t! cmple packed4F" %}
19602   ins_encode %{
19603     int vector_len = 0;
19604     Assembler::ComparisonPredicateFP cmp = Assembler::LE_OQ; //ordered non-signaling
19605     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19606   %}
19607   ins_pipe( pipe_slow );
19608 %}
19609 
19610 instruct vcmple8F(vecY dst, vecY src1, vecY src2) %{
19611   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
19612             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
19613             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
19614   match(Set dst (VectorMaskCmp src1 src2));
19615   format %{ "vcmpleps  $dst,$src1,$src2\t! cmple packed8F" %}
19616   ins_encode %{
19617     int vector_len = 1;
19618     Assembler::ComparisonPredicateFP cmp = Assembler::LE_OQ; //ordered non-signaling
19619     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19620   %}
19621   ins_pipe( pipe_slow );
19622 %}
19623 
19624 instruct vcmple16F(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
19625   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 &&
19626             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
19627             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
19628   match(Set dst (VectorMaskCmp src1 src2));
19629   effect(TEMP dst, TEMP scratch);
19630   format %{ "vcmpleps  k2,$src1,$src2\n\t"
            "vmovdqul  $dst,k2{z},[0xFFFFFFFFFFFFFFFF]\t! cmple packed16F" %}
19632   ins_encode %{
19633     int vector_len = 2;
19634     Assembler::ComparisonPredicateFP cmp = Assembler::LE_OQ; //ordered non-signaling
19635     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
19636     KRegister mask = k0; // The comparison itself is not being masked.
19637     __ evcmpps(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19638     __ evmovdqul($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
19639   %}
19640   ins_pipe( pipe_slow );
19641 %}
19642 
19643 instruct vcmpne2F(vecD dst, vecD src1, vecD src2) %{
19644   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
19645             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
19646             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
19647   match(Set dst (VectorMaskCmp src1 src2));
19648   format %{ "vcmpneps  $dst,$src1,$src2\t! cmpne packed2F" %}
19649   ins_encode %{
19650     int vector_len = 0;
19651     // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare.
19652     Assembler::ComparisonPredicateFP cmp = Assembler::NEQ_UQ; //unordered non-signaling
19653     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19654   %}
19655   ins_pipe( pipe_slow );
19656 %}
19657 
19658 instruct vcmpne4F(vecX dst, vecX src1, vecX src2) %{
19659   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
19660             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
19661             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
19662   match(Set dst (VectorMaskCmp src1 src2));
19663   format %{ "vcmpneps  $dst,$src1,$src2\t! cmpne packed4F" %}
19664   ins_encode %{
19665     int vector_len = 0;
19666     // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare.
19667     Assembler::ComparisonPredicateFP cmp = Assembler::NEQ_UQ; //unordered non-signaling
19668     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19669   %}
19670   ins_pipe( pipe_slow );
19671 %}
19672 
19673 instruct vcmpne8F(vecY dst, vecY src1, vecY src2) %{
19674   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
19675             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
19676             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
19677   match(Set dst (VectorMaskCmp src1 src2));
19678   format %{ "vcmpneps  $dst,$src1,$src2\t! cmpne packed8F" %}
19679   ins_encode %{
19680     int vector_len = 1;
19681     // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare.
19682     Assembler::ComparisonPredicateFP cmp = Assembler::NEQ_UQ; //unordered non-signaling
19683     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19684   %}
19685   ins_pipe( pipe_slow );
19686 %}
19687 
19688 instruct vcmpne16F(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
19689   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 &&
19690             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
19691             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
19692   match(Set dst (VectorMaskCmp src1 src2));
19693   effect(TEMP dst, TEMP scratch);
19694   format %{ "vcmpneps  k2,$src1,$src2\n\t"
            "vmovdqul  $dst,k2{z},[0xFFFFFFFFFFFFFFFF]\t! cmpne packed16F" %}
19696   ins_encode %{
19697     int vector_len = 2;
19698     // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare.
19699     Assembler::ComparisonPredicateFP cmp = Assembler::NEQ_UQ; //unordered non-signaling
19700     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
19701     KRegister mask = k0; // The comparison itself is not being masked.
19702     __ evcmpps(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19703     __ evmovdqul($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
19704   %}
19705   ins_pipe( pipe_slow );
19706 %}
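
// The packed-double compares below mirror the float forms using
// vcmppd with the same predicate choices; the 1-element variants still
// run through the vector path so the result keeps the same
// all-ones/all-zeroes lane encoding.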
19707 
19708 instruct vcmpeq1D(vecD dst, vecD src1, vecD src2) %{
19709   predicate(UseAVX > 0 && n->as_Vector()->length() == 1 &&
19710             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
19711             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19712   match(Set dst (VectorMaskCmp src1 src2));
19713   format %{ "vcmpeqpd  $dst,$src1,$src2\t! cmpeq packed1D" %}
19714   ins_encode %{
19715     int vector_len = 0;
19716     Assembler::ComparisonPredicateFP cmp = Assembler::EQ_OQ;  // ordered non-signaling
19717     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19718   %}
19719   ins_pipe( pipe_slow );
19720 %}
19721 
19722 instruct vcmpeq2D(vecX dst, vecX src1, vecX src2) %{
19723   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
19724             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
19725             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19726   match(Set dst (VectorMaskCmp src1 src2));
19727   format %{ "vcmpeqpd  $dst,$src1,$src2\t! cmpeq packed2D" %}
19728   ins_encode %{
19729     int vector_len = 0;
19730     Assembler::ComparisonPredicateFP cmp = Assembler::EQ_OQ;  // ordered non-signaling
19731     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19732   %}
19733   ins_pipe( pipe_slow );
19734 %}
19735 
19736 instruct vcmpeq4D(vecY dst, vecY src1, vecY src2) %{
19737   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
19738             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
19739             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19740   match(Set dst (VectorMaskCmp src1 src2));
19741   format %{ "vcmpeqpd  $dst,$src1,$src2\t! cmpeq packed4D" %}
19742   ins_encode %{
19743     int vector_len = 1;
19744     Assembler::ComparisonPredicateFP cmp = Assembler::EQ_OQ;  // ordered non-signaling
19745     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19746   %}
19747   ins_pipe( pipe_slow );
19748 %}
19749 
19750 instruct vcmpeq8D(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
19751   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 &&
19752             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
19753             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19754   match(Set dst (VectorMaskCmp src1 src2));
19755   effect(TEMP dst, TEMP scratch);
19756   format %{ "vcmpeqpd  k2,$src1,$src2\n\t"
            "vmovdquq  $dst,k2{z},[0xFFFFFFFFFFFFFFFF]\t! cmpeq packed8D" %}
19758   ins_encode %{
19759     int vector_len = 2;
19760     Assembler::ComparisonPredicateFP cmp = Assembler::EQ_OQ;  // ordered non-signaling
19761     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
19762     KRegister mask = k0; // The comparison itself is not being masked.
19763     __ evcmppd(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19764     __ evmovdquq($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
19765   %}
19766   ins_pipe( pipe_slow );
19767 %}
19768 
19769 instruct vcmplt1D(vecD dst, vecD src1, vecD src2) %{
19770   predicate(UseAVX > 0 && n->as_Vector()->length() == 1 &&
19771             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
19772             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19773   match(Set dst (VectorMaskCmp src1 src2));
19774   format %{ "vcmpltpd  $dst,$src1,$src2\t! cmplt packed1D" %}
19775   ins_encode %{
19776     int vector_len = 0;
19777     Assembler::ComparisonPredicateFP cmp = Assembler::LT_OQ; //ordered non-signaling
19778     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19779   %}
19780   ins_pipe( pipe_slow );
19781 %}
19782 
19783 instruct vcmplt2D(vecX dst, vecX src1, vecX src2) %{
19784   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
19785             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
19786             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19787   match(Set dst (VectorMaskCmp src1 src2));
19788   format %{ "vcmpltpd  $dst,$src1,$src2\t! cmplt packed2D" %}
19789   ins_encode %{
19790     int vector_len = 0;
19791     Assembler::ComparisonPredicateFP cmp = Assembler::LT_OQ; //ordered non-signaling
19792     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19793   %}
19794   ins_pipe( pipe_slow );
19795 %}
19796 
19797 instruct vcmplt4D(vecY dst, vecY src1, vecY src2) %{
19798   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
19799             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
19800             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19801   match(Set dst (VectorMaskCmp src1 src2));
19802   format %{ "vcmpltpd  $dst,$src1,$src2\t! cmplt packed4D" %}
19803   ins_encode %{
19804     int vector_len = 1;
19805     Assembler::ComparisonPredicateFP cmp = Assembler::LT_OQ; //ordered non-signaling
19806     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19807   %}
19808   ins_pipe( pipe_slow );
19809 %}
19810 
19811 instruct vcmplt8D(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
19812   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 &&
19813             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
19814             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19815   match(Set dst (VectorMaskCmp src1 src2));
19816   effect(TEMP dst, TEMP scratch);
19817   format %{ "vcmpltpd  k2,$src1,$src2\n\t"
19818             "vmovdqu8  $dst, k2{z}, 0xFFFFFFFFFF \t! cmplt packed8D" %}
19819   ins_encode %{
19820     int vector_len = 2;
19821     Assembler::ComparisonPredicateFP cmp = Assembler::LT_OQ; //ordered non-signaling
19822     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
19823     KRegister mask = k0; // The comparison itself is not being masked.
19824     __ evcmppd(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19825     __ evmovdquq($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
19826   %}
19827   ins_pipe( pipe_slow );
19828 %}
19829 
19830 instruct vcmpgt1D(vecD dst, vecD src1, vecD src2) %{
19831   predicate(UseAVX > 0 && n->as_Vector()->length() == 1 &&
19832             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
19833             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19834   match(Set dst (VectorMaskCmp src1 src2));
19835   format %{ "vcmpgtpd  $dst,$src1,$src2\t! cmpgt packed1D" %}
19836   ins_encode %{
19837     int vector_len = 0;
19838     Assembler::ComparisonPredicateFP cmp = Assembler::GT_OQ; //ordered non-signaling
19839     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19840   %}
19841   ins_pipe( pipe_slow );
19842 %}
19843 
19844 instruct vcmpgt2D(vecX dst, vecX src1, vecX src2) %{
19845   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
19846             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
19847             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19848   match(Set dst (VectorMaskCmp src1 src2));
19849   format %{ "vcmpgtpd  $dst,$src1,$src2\t! cmpgt packed2D" %}
19850   ins_encode %{
19851     int vector_len = 0;
19852     Assembler::ComparisonPredicateFP cmp = Assembler::GT_OQ; //ordered non-signaling
19853     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19854   %}
19855   ins_pipe( pipe_slow );
19856 %}
19857 
19858 instruct vcmpgt4D(vecY dst, vecY src1, vecY src2) %{
19859   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
19860             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
19861             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19862   match(Set dst (VectorMaskCmp src1 src2));
19863   format %{ "vcmpgtpd  $dst,$src1,$src2\t! cmpgt packed4D" %}
19864   ins_encode %{
19865     int vector_len = 1;
19866     Assembler::ComparisonPredicateFP cmp = Assembler::GT_OQ; //ordered non-signaling
19867     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19868   %}
19869   ins_pipe( pipe_slow );
19870 %}
19871 
19872 instruct vcmpgt8D(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
19873   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 &&
19874             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
19875             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19876   match(Set dst (VectorMaskCmp src1 src2));
19877   effect(TEMP dst, TEMP scratch);
19878   format %{ "vcmpgtpd  k2,$src1,$src2\n\t"
19879             "vmovdqu8  $dst, k2{z}, 0xFFFFFFFFFF \t! cmpgt packed8D" %}
19880   ins_encode %{
19881     int vector_len = 2;
19882     Assembler::ComparisonPredicateFP cmp = Assembler::GT_OQ; //ordered non-signaling
19883     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
19884     KRegister mask = k0; // The comparison itself is not being masked.
19885     __ evcmppd(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19886     __ evmovdquq($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
19887   %}
19888   ins_pipe( pipe_slow );
19889 %}
19890 
19891 instruct vcmpge1D(vecD dst, vecD src1, vecD src2) %{
19892   predicate(UseAVX > 0 && n->as_Vector()->length() == 1 &&
19893             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
19894             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19895   match(Set dst (VectorMaskCmp src1 src2));
19896   format %{ "vcmpgepd  $dst,$src1,$src2\t! cmpge packed1D" %}
19897   ins_encode %{
19898     int vector_len = 0;
19899     Assembler::ComparisonPredicateFP cmp = Assembler::GE_OQ; //ordered non-signaling
19900     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19901   %}
19902   ins_pipe( pipe_slow );
19903 %}
19904 
19905 instruct vcmpge2D(vecX dst, vecX src1, vecX src2) %{
19906   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
19907             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
19908             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19909   match(Set dst (VectorMaskCmp src1 src2));
19910   format %{ "vcmpgepd  $dst,$src1,$src2\t! cmpge packed2D" %}
19911   ins_encode %{
19912     int vector_len = 0;
19913     Assembler::ComparisonPredicateFP cmp = Assembler::GE_OQ; //ordered non-signaling
19914     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19915   %}
19916   ins_pipe( pipe_slow );
19917 %}
19918 
19919 instruct vcmpge4D(vecY dst, vecY src1, vecY src2) %{
19920   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
19921             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
19922             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19923   match(Set dst (VectorMaskCmp src1 src2));
19924   format %{ "vcmpgepd  $dst,$src1,$src2\t! cmpge packed4D" %}
19925   ins_encode %{
19926     int vector_len = 1;
19927     Assembler::ComparisonPredicateFP cmp = Assembler::GE_OQ; //ordered non-signaling
19928     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19929   %}
19930   ins_pipe( pipe_slow );
19931 %}
19932 
19933 instruct vcmpge8D(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
19934   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 &&
19935             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
19936             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19937   match(Set dst (VectorMaskCmp src1 src2));
19938   effect(TEMP dst, TEMP scratch);
19939   format %{ "vcmpgepd  k2,$src1,$src2\n\t"
19940             "vmovdqu8  $dst, k2{z}, 0xFFFFFFFFFF \t! cmpge packed8D" %}
19941   ins_encode %{
19942     int vector_len = 2;
19943     Assembler::ComparisonPredicateFP cmp = Assembler::GE_OQ; //ordered non-signaling
19944     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
19945     KRegister mask = k0; // The comparison itself is not being masked.
19946     __ evcmppd(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19947     __ evmovdquq($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
19948   %}
19949   ins_pipe( pipe_slow );
19950 %}
19951 
19952 instruct vcmple1D(vecD dst, vecD src1, vecD src2) %{
19953   predicate(UseAVX > 0 && n->as_Vector()->length() == 1 &&
19954             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
19955             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19956   match(Set dst (VectorMaskCmp src1 src2));
19957   format %{ "vcmplepd  $dst,$src1,$src2\t! cmple packed1D" %}
19958   ins_encode %{
19959     int vector_len = 0;
19960     Assembler::ComparisonPredicateFP cmp = Assembler::LE_OQ; //ordered non-signaling
19961     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19962   %}
19963   ins_pipe( pipe_slow );
19964 %}
19965 
19966 instruct vcmple2D(vecX dst, vecX src1, vecX src2) %{
19967   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
19968             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
19969             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19970   match(Set dst (VectorMaskCmp src1 src2));
19971   format %{ "vcmplepd  $dst,$src1,$src2\t! cmple packed2D" %}
19972   ins_encode %{
19973     int vector_len = 0;
19974     Assembler::ComparisonPredicateFP cmp = Assembler::LE_OQ; //ordered non-signaling
19975     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19976   %}
19977   ins_pipe( pipe_slow );
19978 %}
19979 
19980 instruct vcmple4D(vecY dst, vecY src1, vecY src2) %{
19981   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
19982             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
19983             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19984   match(Set dst (VectorMaskCmp src1 src2));
19985   format %{ "vcmplepd  $dst,$src1,$src2\t! cmple packed4D" %}
19986   ins_encode %{
19987     int vector_len = 1;
19988     Assembler::ComparisonPredicateFP cmp = Assembler::LE_OQ; //ordered non-signaling
19989     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19990   %}
19991   ins_pipe( pipe_slow );
19992 %}
19993 
19994 instruct vcmple8D(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
19995   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 &&
19996             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
19997             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19998   match(Set dst (VectorMaskCmp src1 src2));
19999   effect(TEMP dst, TEMP scratch);
20000   format %{ "vcmplepd  k2,$src1,$src2\n\t"
20001             "vmovdqu8  $dst, k2{z}, 0xFFFFFFFFFF \t! cmple packed8D" %}
20002   ins_encode %{
20003     int vector_len = 2;
20004     Assembler::ComparisonPredicateFP cmp = Assembler::LE_OQ; //ordered non-signaling
20005     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
20006     KRegister mask = k0; // The comparison itself is not being masked.
20007     __ evcmppd(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
20008     __ evmovdquq($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
20009   %}
20010   ins_pipe( pipe_slow );
20011 %}
20012 
20013 instruct vcmpne1D(vecD dst, vecD src1, vecD src2) %{
20014   predicate(UseAVX > 0 && n->as_Vector()->length() == 1 &&
20015             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
20016             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
20017   match(Set dst (VectorMaskCmp src1 src2));
20018   format %{ "vcmpnepd  $dst,$src1,$src2\t! cmpne packed1D" %}
20019   ins_encode %{
20020     int vector_len = 0;
20021     // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare.
20022     Assembler::ComparisonPredicateFP cmp = Assembler::NEQ_UQ; //unordered non-signaling
20023     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
20024   %}
20025   ins_pipe( pipe_slow );
20026 %}
20027 
20028 instruct vcmpne2D(vecX dst, vecX src1, vecX src2) %{
20029   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
20030             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
20031             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
20032   match(Set dst (VectorMaskCmp src1 src2));
20033   format %{ "vcmpnepd  $dst,$src1,$src2\t! cmpne packed2D" %}
20034   ins_encode %{
20035     int vector_len = 0;
20036     // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare.
20037     Assembler::ComparisonPredicateFP cmp = Assembler::NEQ_UQ; //unordered non-signaling
20038     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
20039   %}
20040   ins_pipe( pipe_slow );
20041 %}
20042 
20043 instruct vcmpne4D(vecY dst, vecY src1, vecY src2) %{
20044   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
20045             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
20046             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
20047   match(Set dst (VectorMaskCmp src1 src2));
20048   format %{ "vcmpnepd  $dst,$src1,$src2\t! cmpne packed4D" %}
20049   ins_encode %{
20050     int vector_len = 1;
20051     // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare.
20052     Assembler::ComparisonPredicateFP cmp = Assembler::NEQ_UQ; //unordered non-signaling
20053     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
20054   %}
20055   ins_pipe( pipe_slow );
20056 %}
20057 
20058 instruct vcmpne8D(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
20059   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 &&
20060             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
20061             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
20062   match(Set dst (VectorMaskCmp src1 src2));
20063   effect(TEMP dst, TEMP scratch);
20064   format %{ "vcmpnepd  k2,$src1,$src2\n\t"
20065             "vmovdqu8  $dst, k2{z}, 0xFFFFFFFFFF \t! cmpne packed8D" %}
20066   ins_encode %{
20067     int vector_len = 2;
20068     // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare.
20069     Assembler::ComparisonPredicateFP cmp = Assembler::NEQ_UQ; //unordered non-signaling
20070     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
20071     KRegister mask = k0; // The comparison itself is not being masked.
20072     __ evcmppd(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
20073     __ evmovdquq($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
20074   %}
20075   ins_pipe( pipe_slow );
20076 %}
20077 
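// Integer vector compares produce the mask directly: vpcmpeq*/vpcmpgt*
// set each lane to all-ones when the predicate holds and to all-zeroes
// otherwise. The 512-bit forms compare into a k register and then
// materialize the vector mask with a zero-masked load of an all-bits-set
// constant.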
20078 instruct vcmpeq2I(vecD dst, vecD src1, vecD src2) %{
20079   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
20080             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
20081             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
20082   match(Set dst (VectorMaskCmp src1 src2));
20083   format %{ "vpcmpeqd  $dst,$src1,$src2\n\t! cmpeq packed2I" %}
20084   ins_encode %{
20085     int vector_len = 0;
20086     __ vpcmpeqd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
20087   %}
20088   ins_pipe( pipe_slow );
20089 %}
20090 
20091 instruct vcmpeq4I(vecX dst, vecX src1, vecX src2) %{
20092   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
20093             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
20094             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
20095   match(Set dst (VectorMaskCmp src1 src2));
20096   format %{ "vpcmpeqd  $dst,$src1,$src2\n\t! cmpeq packed4I" %}
20097   ins_encode %{
20098     int vector_len = 0;
20099     __ vpcmpeqd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
20100   %}
20101   ins_pipe( pipe_slow );
20102 %}
20103 
20104 instruct vcmpeq8I(vecY dst, vecY src1, vecY src2) %{
20105   predicate(UseAVX > 1 && n->as_Vector()->length() == 8 &&
20106             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
20107             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
20108   match(Set dst (VectorMaskCmp src1 src2));
20109   format %{ "vpcmpeqd  $dst,$src1,$src2\n\t! cmpeq packed8I" %}
20110   ins_encode %{
20111     int vector_len = 1;
20112     __ vpcmpeqd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
20113   %}
20114   ins_pipe( pipe_slow );
20115 %}
20116 
20117 instruct vcmpeq16I(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
20118   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 &&
20119             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
20120             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
20121   match(Set dst (VectorMaskCmp src1 src2));
20122   effect(TEMP dst, TEMP scratch);
20123   format %{ "vpcmpeqd  k2,$src1,$src2\n\t"
20124             "vmovdqu32 $dst, k2{z}, 0xFFFFFFFFFF \t! cmpeq packed16I" %}
20125   ins_encode %{
20126     int vector_len = 2;
20127     Assembler::ComparisonPredicate cmp = Assembler::eq;
20128     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
20129     KRegister mask = k0; // The comparison itself is not being masked.
20130     __ evpcmpd(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
20131     __ evmovdqul($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
20132   %}
20133   ins_pipe( pipe_slow );
20134 %}
20135 
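// There is no packed-integer "less than" compare: lt is gt with the
// operands swapped.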
20136 instruct vcmplt2I(vecD dst, vecD src1, vecD src2) %{
20137   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
20138             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
20139             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
20140   match(Set dst (VectorMaskCmp src1 src2));
20141   format %{ "vpcmpgtd  $dst,$src2,$src1\t! cmplt packed2I" %}
20142   ins_encode %{
20143     int vector_len = 0;
20144     __ vpcmpgtd($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
20145   %}
20146   ins_pipe( pipe_slow );
20147 %}
20148 
20149 instruct vcmplt4I(vecX dst, vecX src1, vecX src2) %{
20150   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
20151             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
20152             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
20153   match(Set dst (VectorMaskCmp src1 src2));
20154   format %{ "vpcmpgtd  $dst,$src2,$src1\t! cmplt packed4I" %}
20155   ins_encode %{
20156     int vector_len = 0;
20157     __ vpcmpgtd($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
20158   %}
20159   ins_pipe( pipe_slow );
20160 %}
20161 
20162 instruct vcmplt8I(vecY dst, vecY src1, vecY src2) %{
20163   predicate(UseAVX > 1 && n->as_Vector()->length() == 8 &&
20164             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
20165             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
20166   match(Set dst (VectorMaskCmp src1 src2));
20167   format %{ "vpcmpgtd  $dst,$src2,$src1\t! cmplt packed8I" %}
20168   ins_encode %{
20169     int vector_len = 1;
20170     __ vpcmpgtd($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
20171   %}
20172   ins_pipe( pipe_slow );
20173 %}
20174 
20175 instruct vcmplt16I(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
20176   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 &&
20177             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
20178             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
20179   match(Set dst (VectorMaskCmp src1 src2));
20180   effect(TEMP dst, TEMP scratch);
20181   format %{ "vpcmpnled  k2,$src1,$src2\n\t"
20182             "vmovdqu32   $dst, k2{z}, 0xFFFFFFFFFF \t! cmpgt packed16I" %}
20183   ins_encode %{
20184     int vector_len = 2;
20185     Assembler::ComparisonPredicate cmp = Assembler::lt;
20186     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
20187     KRegister mask = k0; // The comparison itself is not being masked.
20188     __ evpcmpd(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
20189     __ evmovdqul($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
20190   %}
20191   ins_pipe( pipe_slow );
20192 %}
20193 
20194 instruct vcmpgt2I(vecD dst, vecD src1, vecD src2) %{
20195   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
20196             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
20197             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
20198   match(Set dst (VectorMaskCmp src1 src2));
20199   format %{ "vpcmpgtd  $dst,$src1,$src2\t! cmpgt packed2I" %}
20200   ins_encode %{
20201     int vector_len = 0;
20202     __ vpcmpgtd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
20203   %}
20204   ins_pipe( pipe_slow );
20205 %}
20206 
20207 instruct vcmpgt4I(vecX dst, vecX src1, vecX src2) %{
20208   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
20209             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
20210             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
20211   match(Set dst (VectorMaskCmp src1 src2));
20212   format %{ "vpcmpgtd  $dst,$src1,$src2\t! cmpgt packed4I" %}
20213   ins_encode %{
20214     int vector_len = 0;
20215     __ vpcmpgtd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
20216   %}
20217   ins_pipe( pipe_slow );
20218 %}
20219 
20220 instruct vcmpgt8I(vecY dst, vecY src1, vecY src2) %{
20221   predicate(UseAVX > 1 && n->as_Vector()->length() == 8 &&
20222             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
20223             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
20224   match(Set dst (VectorMaskCmp src1 src2));
20225   format %{ "vpcmpgtd  $dst,$src1,$src2\t! cmpgt packed8I" %}
20226   ins_encode %{
20227     int vector_len = 1;
20228     __ vpcmpgtd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
20229   %}
20230   ins_pipe( pipe_slow );
20231 %}
20232 
20233 instruct vcmpgt16I(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
20234   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 &&
20235             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
20236             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
20237   match(Set dst (VectorMaskCmp src1 src2));
20238   effect(TEMP dst, TEMP scratch);
20239   format %{ "vpcmpnled  k2,$src1,$src2\n\t"
20240             "vmovdqu32   $dst, k2{z}, 0xFFFFFFFFFF \t! cmpgt packed16I" %}
20241   ins_encode %{
20242     int vector_len = 2;
20243     Assembler::ComparisonPredicate cmp = Assembler::nle;
20244     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
20245     KRegister mask = k0; // The comparison itself is not being masked.
20246     __ evpcmpd(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
20247     __ evmovdqul($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
20248   %}
20249   ins_pipe( pipe_slow );
20250 %}
20251 
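// ge is computed as NOT(src2 > src1): compare with swapped operands,
// then invert the result by xor-ing with an all-bits-set constant
// addressed through scratch.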
20252 instruct vcmpge2I(vecD dst, vecD src1, vecD src2, rRegL scratch) %{
20253   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
20254             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
20255             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
20256   match(Set dst (VectorMaskCmp src1 src2));
20257   effect(TEMP scratch);
20258   format %{ "vpcmpgtd  $dst,$src2,$src1\n\t"
20259             "vpxor $dst,$dst,0xFFFFFFFF\t! cmpge packed2I" %}
20260   ins_encode %{
20261     int vector_len = 0;
20262     __ vpcmpgtd($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
20263     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
20264   %}
20265   ins_pipe( pipe_slow );
20266 %}
20267 
20268 instruct vcmpge4I(vecX dst, vecX src1, vecX src2, rRegL scratch) %{
20269   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
20270             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
20271             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
20272   match(Set dst (VectorMaskCmp src1 src2));
20273   effect(TEMP scratch);
20274   format %{ "vpcmpgtd  $dst,$src2,$src1\n\t"
20275             "vpxor $dst,$dst,0xFFFFFFFF\t! cmpge packed4I" %}
20276   ins_encode %{
20277     int vector_len = 0;
20278     __ vpcmpgtd($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
20279     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
20280   %}
20281   ins_pipe( pipe_slow );
20282 %}
20283 
20284 instruct vcmpge8I(vecY dst, vecY src1, vecY src2, rRegL scratch) %{
20285   predicate(UseAVX > 1 && n->as_Vector()->length() == 8 &&
20286             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
20287             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
20288   match(Set dst (VectorMaskCmp src1 src2));
20289   effect(TEMP scratch);
20290   format %{ "vpcmpgtd  $dst,$src2,$src1\n\t"
20291             "vpxor $dst,$dst,0xFFFFFFFF\t! cmpge packed8I" %}
20292   ins_encode %{
20293     int vector_len = 1;
20294     __ vpcmpgtd($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
20295     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
20296   %}
20297   ins_pipe( pipe_slow );
20298 %}
20299 
20300 instruct vcmpge16I(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
20301   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 &&
20302             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
20303             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
20304   match(Set dst (VectorMaskCmp src1 src2));
20305   effect(TEMP dst, TEMP scratch);
20306   format %{ "vpcmpnltd  k2,$src1,$src2\n\t"
20307             "vmovdqu32   $dst, k2{z}, 0xFFFFFFFFFF \t! cmpge packed16I" %}
20308   ins_encode %{
20309     int vector_len = 2;
20310     Assembler::ComparisonPredicate cmp = Assembler::nlt;
20311     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
20312     KRegister mask = k0; // The comparison itself is not being masked.
20313     __ evpcmpd(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
20314     __ evmovdqul($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
20315   %}
20316   ins_pipe( pipe_slow );
20317 %}
20318 
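// le is computed as NOT(src1 > src2), inverted the same way with vpxor.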
20319 instruct vcmple2I(vecD dst, vecD src1, vecD src2, rRegL scratch) %{
20320   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
20321             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
20322             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
20323   match(Set dst (VectorMaskCmp src1 src2));
20324   effect(TEMP scratch);
20325   format %{ "vpcmpgtd  $dst,$src1,$src2\n\t"
20326             "vpxor $dst,$dst,0xFFFFFFFF\t! cmple packed2I" %}
20327   ins_encode %{
20328     int vector_len = 0;
20329     __ vpcmpgtd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
20330     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
20331   %}
20332   ins_pipe( pipe_slow );
20333 %}
20334 
20335 instruct vcmple4I(vecX dst, vecX src1, vecX src2, rRegL scratch) %{
20336   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
20337             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
20338             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
20339   match(Set dst (VectorMaskCmp src1 src2));
20340   effect(TEMP scratch);
20341   format %{ "vpcmpgtd  $dst,$src1,$src2\n\t"
20342             "vpxor $dst,$dst,0xFFFFFFFF\t! cmple packed4I" %}
20343   ins_encode %{
20344     int vector_len = 0;
20345     __ vpcmpgtd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
20346     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
20347   %}
20348   ins_pipe( pipe_slow );
20349 %}
20350 
20351 instruct vcmple8I(vecY dst, vecY src1, vecY src2, rRegL scratch) %{
20352   predicate(UseAVX > 1 && n->as_Vector()->length() == 8 &&
20353             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
20354             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
20355   match(Set dst (VectorMaskCmp src1 src2));
20356   effect(TEMP scratch);
20357   format %{ "vpcmpgtd  $dst,$src1,$src2\n\t"
20358             "vpxor $dst,$dst,0xFFFFFFFF\t! cmple packed8I" %}
20359   ins_encode %{
20360     int vector_len = 1;
20361     __ vpcmpgtd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
20362     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
20363   %}
20364   ins_pipe( pipe_slow );
20365 %}
20366 
20367 instruct vcmple16I(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
20368   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 &&
20369             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
20370             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
20371   match(Set dst (VectorMaskCmp src1 src2));
20372   effect(TEMP dst, TEMP scratch);
20373   format %{ "vpcmpled  k2,$src1,$src2\n\t"
20374             "vmovdqu32   $dst, k2{z}, 0xFFFFFFFFFF \t! cmple packed16I" %}
20375   ins_encode %{
20376     int vector_len = 2;
20377     Assembler::ComparisonPredicate cmp = Assembler::le;
20378     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
20379     KRegister mask = k0; // The comparison itself is not being masked.
20380     __ evpcmpd(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
20381     __ evmovdqul($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
20382   %}
20383   ins_pipe( pipe_slow );
20384 %}
20385 
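// ne is computed as NOT(src1 == src2).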
20386 instruct vcmpne2I(vecD dst, vecD src1, vecD src2, rRegL scratch) %{
20387   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
20388             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
20389             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
20390   match(Set dst (VectorMaskCmp src1 src2));
20391   effect(TEMP scratch);
20392   format %{ "vpcmpeqd  $dst,$src1,$src2\n\t"
20393             "vpxor $dst,$dst,0xFFFFFFFF\t! cmpneq packed2I" %}
20394   ins_encode %{
20395     int vector_len = 0;
20396     __ vpcmpeqd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
20397     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
20398   %}
20399   ins_pipe( pipe_slow );
20400 %}
20401 
20402 instruct vcmpne4I(vecX dst, vecX src1, vecX src2, rRegL scratch) %{
20403   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
20404             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
20405             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
20406   match(Set dst (VectorMaskCmp src1 src2));
20407   effect(TEMP scratch);
20408   format %{ "vpcmpeqd  $dst,$src1,$src2\n\t"
20409             "vpxor $dst,$dst,0xFFFFFFFF\t! cmpneq packed4I" %}
20410   ins_encode %{
20411     int vector_len = 0;
20412     __ vpcmpeqd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
20413     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
20414   %}
20415   ins_pipe( pipe_slow );
20416 %}
20417 
20418 instruct vcmpne8I(vecY dst, vecY src1, vecY src2, rRegL scratch) %{
20419   predicate(UseAVX > 1 && n->as_Vector()->length() == 8 &&
20420             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
20421             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
20422   match(Set dst (VectorMaskCmp src1 src2));
20423   effect(TEMP scratch);
20424   format %{ "vpcmpeqd  $dst,$src1,$src2\n\t"
20425             "vpxor $dst,$dst,0xFFFFFFFF\t! cmpneq packed8I" %}
20426   ins_encode %{
20427     int vector_len = 1;
20428     __ vpcmpeqd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
20429     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
20430   %}
20431   ins_pipe( pipe_slow );
20432 %}
20433 
20434 instruct vcmpne16I(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
20435   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 &&
20436             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
20437             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
20438   match(Set dst (VectorMaskCmp src1 src2));
20439   effect(TEMP dst, TEMP scratch);
20440   format %{ "vpcmpneqd  k2,$src1,$src2\n\t"
20441             "vmovdqu32   $dst, k2{z}, 0xFFFFFFFFFF \t! cmpneq packed16I" %}
20442   ins_encode %{
20443     int vector_len = 2;
20444     Assembler::ComparisonPredicate cmp = Assembler::neq;
20445     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
20446     KRegister mask = k0; // The comparison itself is not being masked.
20447     __ evpcmpd(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
20448     __ evmovdqul($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
20449   %}
20450   ins_pipe( pipe_slow );
20451 %}
20452 
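// Byte compares follow the same patterns as the int compares above;
// the 64-byte forms additionally require AVX512BW for the EVEX byte
// compare.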
20453 instruct vcmpeq8B(vecD dst, vecD src1, vecD src2) %{
20454   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
20455             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
20456             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
20457   match(Set dst (VectorMaskCmp src1 src2));
20458   format %{ "vpcmpeqb  $dst,$src1,$src2\n\t! cmpeq packed8B" %}
20459   ins_encode %{
20460     int vector_len = 0;
20461     __ vpcmpeqb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
20462   %}
20463   ins_pipe( pipe_slow );
20464 %}
20465 
20466 instruct vcmpeq16B(vecX dst, vecX src1, vecX src2) %{
20467   predicate(UseAVX > 0 && n->as_Vector()->length() == 16 &&
20468             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
20469             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
20470   match(Set dst (VectorMaskCmp src1 src2));
20471   format %{ "vpcmpeqb  $dst,$src1,$src2\n\t! cmpeq packed16B" %}
20472   ins_encode %{
20473     int vector_len = 0;
20474     __ vpcmpeqb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
20475   %}
20476   ins_pipe( pipe_slow );
20477 %}
20478 
20479 instruct vcmpeq32B(vecY dst, vecY src1, vecY src2) %{
20480   predicate(UseAVX > 1 && n->as_Vector()->length() == 32 &&
20481             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
20482             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
20483   match(Set dst (VectorMaskCmp src1 src2));
20484   format %{ "vpcmpeqb  $dst,$src1,$src2\n\t! cmpeq packed32B" %}
20485   ins_encode %{
20486     int vector_len = 1;
20487     __ vpcmpeqb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
20488   %}
20489   ins_pipe( pipe_slow );
20490 %}
20491 
20492 instruct vcmpeq64B(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
20493   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64 &&
20494             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
20495             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
20496   match(Set dst (VectorMaskCmp src1 src2));
20497   effect(TEMP dst, TEMP scratch);
20498   format %{ "vpcmpeqb  k2,$src1,$src2\n\t"
20499             "vmovdqu8 $dst, k2{z}, 0xFFFFFFFFFF \t! cmpeq packed64B" %}
20500   ins_encode %{
20501     int vector_len = 2;
20502     Assembler::ComparisonPredicate cmp = Assembler::eq;
20503     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
20504     KRegister mask = k0; // The comparison itself is not being masked.
20505     __ evpcmpb(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
20506     __ evmovdqub($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
20507   %}
20508   ins_pipe( pipe_slow );
20509 %}
20510 
20511 instruct vcmplt8B(vecD dst, vecD src1, vecD src2) %{
20512   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
20513             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
20514             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
20515   match(Set dst (VectorMaskCmp src1 src2));
20516   format %{ "vpcmpgtb  $dst,$src2,$src1\t! cmplt packed8B" %}
20517   ins_encode %{
20518     int vector_len = 0;
20519     __ vpcmpgtb($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
20520   %}
20521   ins_pipe( pipe_slow );
20522 %}
20523 
20524 instruct vcmplt16B(vecX dst, vecX src1, vecX src2) %{
20525   predicate(UseAVX > 0 && n->as_Vector()->length() == 16 &&
20526             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
20527             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
20528   match(Set dst (VectorMaskCmp src1 src2));
20529   format %{ "vpcmpgtb  $dst,$src2,$src1\t! cmplt packed16B" %}
20530   ins_encode %{
20531     int vector_len = 0;
20532     __ vpcmpgtb($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
20533   %}
20534   ins_pipe( pipe_slow );
20535 %}
20536 
20537 instruct vcmplt32B(vecY dst, vecY src1, vecY src2) %{
20538   predicate(UseAVX > 1 && n->as_Vector()->length() == 32 &&
20539             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
20540             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
20541   match(Set dst (VectorMaskCmp src1 src2));
20542   format %{ "vpcmpgtb  $dst,$src2,$src1\t! cmplt packed32B" %}
20543   ins_encode %{
20544     int vector_len = 1;
20545     __ vpcmpgtb($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
20546   %}
20547   ins_pipe( pipe_slow );
20548 %}
20549 
20550 instruct vcmplt64B(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
20551   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64 &&
20552             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
20553             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
20554   match(Set dst (VectorMaskCmp src1 src2));
20555   effect(TEMP dst, TEMP scratch);
20556   format %{ "vpcmpnleb  k2,$src1,$src2\n\t"
20557             "vmovdqu8   $dst, k2{z}, 0xFFFFFFFFFF \t! cmpgt packed64B" %}
20558   ins_encode %{
20559     int vector_len = 2;
20560     Assembler::ComparisonPredicate cmp = Assembler::lt;
20561     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
20562     KRegister mask = k0; // The comparison itself is not being masked.
20563     __ evpcmpb(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
20564     __ evmovdqub($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
20565   %}
20566   ins_pipe( pipe_slow );
20567 %}
20568 
20569 instruct vcmpgt8B(vecD dst, vecD src1, vecD src2) %{
20570   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
20571             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
20572             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
20573   match(Set dst (VectorMaskCmp src1 src2));
20574   format %{ "vpcmpgtb  $dst,$src1,$src2\t! cmpgt packed8B" %}
20575   ins_encode %{
20576     int vector_len = 0;
20577     __ vpcmpgtb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
20578   %}
20579   ins_pipe( pipe_slow );
20580 %}
20581 
20582 instruct vcmpgt16B(vecX dst, vecX src1, vecX src2) %{
20583   predicate(UseAVX > 0 && n->as_Vector()->length() == 16 &&
20584             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
20585             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
20586   match(Set dst (VectorMaskCmp src1 src2));
20587   format %{ "vpcmpgtb  $dst,$src1,$src2\t! cmpgt packed16B" %}
20588   ins_encode %{
20589     int vector_len = 0;
20590     __ vpcmpgtb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
20591   %}
20592   ins_pipe( pipe_slow );
20593 %}
20594 
20595 instruct vcmpgt32B(vecY dst, vecY src1, vecY src2) %{
20596   predicate(UseAVX > 1 && n->as_Vector()->length() == 32 &&
20597             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
20598             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
20599   match(Set dst (VectorMaskCmp src1 src2));
20600   format %{ "vpcmpgtb  $dst,$src1,$src2\t! cmpgt packed32B" %}
20601   ins_encode %{
20602     int vector_len = 1;
20603     __ vpcmpgtb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
20604   %}
20605   ins_pipe( pipe_slow );
20606 %}
20607 
20608 instruct vcmpgt64B(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
20609   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64 &&
20610             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
20611             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
20612   match(Set dst (VectorMaskCmp src1 src2));
20613   effect(TEMP dst, TEMP scratch);
20614   format %{ "vpcmpnleb  k2,$src1,$src2\n\t"
20615             "vmovdqu8   $dst, k2{z}, 0xFFFFFFFFFF \t! cmpgt packed64B" %}
20616   ins_encode %{
20617     int vector_len = 2;
20618     Assembler::ComparisonPredicate cmp = Assembler::nle;
20619     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
20620     KRegister mask = k0; // The comparison itself is not being masked.
20621     __ evpcmpb(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
20622     __ evmovdqub($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
20623   %}
20624   ins_pipe( pipe_slow );
20625 %}
20626 
20627 instruct vcmpge8B(vecD dst, vecD src1, vecD src2, rRegL scratch) %{
20628   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
20629             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
20630             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
20631   match(Set dst (VectorMaskCmp src1 src2));
20632   effect(TEMP scratch);
20633   format %{ "vpcmpgtb  $dst,$src2,$src1\n\t"
20634             "vpxor $dst,$dst,0xFFFFFFFF\t! cmpge packed8B" %}
20635   ins_encode %{
20636     int vector_len = 0;
20637     __ vpcmpgtb($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
20638     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
20639   %}
20640   ins_pipe( pipe_slow );
20641 %}
20642 
20643 instruct vcmpge16B(vecX dst, vecX src1, vecX src2, rRegL scratch) %{
20644   predicate(UseAVX > 0 && n->as_Vector()->length() == 16 &&
20645             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
20646             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
20647   match(Set dst (VectorMaskCmp src1 src2));
20648   effect(TEMP scratch);
20649   format %{ "vpcmpgtb  $dst,$src2,$src1\n\t"
20650             "vpxor $dst,$dst,0xFFFFFFFF\t! cmpge packed16B" %}
20651   ins_encode %{
20652     int vector_len = 0;
20653     __ vpcmpgtb($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
20654     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
20655   %}
20656   ins_pipe( pipe_slow );
20657 %}
20658 
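// Element extraction: index 0 is a plain register move, an index within
// the low 128-bit lane is an in-lane shuffle, and a higher index first
// extracts the 128-bit lane holding the element and then shuffles within
// that lane.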
20659 instruct extract8d(regD dst, vecZ src, vecZ tmp, immI idx) %{
20660   predicate(UseAVX > 2 && n->in(1)->bottom_type()->is_vect()->length() == 8);
20661   match(Set dst (ExtractD src idx));
20662   effect(TEMP tmp);
20663   ins_encode %{
20664     int vector_len = 2;
20665     int midx = 0x7 & $idx$$constant;
20666     if (midx == 0 && $dst$$XMMRegister != $src$$XMMRegister) {
20667       __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
    } else if (midx == 1) {
      __ vpshufpd($dst$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, midx, vector_len);
    } else if (midx > 1 && midx <= 7) {
      int extr_idx1 = midx / 2;
      int extr_idx2 = midx % 2;
      __ vextractf32x4($tmp$$XMMRegister, $src$$XMMRegister, extr_idx1);
      __ vpshufpd($dst$$XMMRegister, $tmp$$XMMRegister, $tmp$$XMMRegister, extr_idx2, vector_len);
    }
20676   %}
20677   ins_pipe( pipe_slow );
20678 %}
20679 
20680 instruct extract4d(regD dst, vecY src, vecY tmp, immI idx) %{
20681   predicate(UseAVX > 0 && n->in(1)->bottom_type()->is_vect()->length() == 4);
20682   match(Set dst (ExtractD src idx));
20683   effect(TEMP tmp);
20684   ins_encode %{
20685     int vector_len = 1;
20686     int midx = 0x3 & $idx$$constant;
20687     if (midx == 0 && $dst$$XMMRegister != $src$$XMMRegister) {
20688       __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
20689     } else if (midx == 1) {
      __ vpshufpd($dst$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, midx, vector_len);
    } else if (midx > 1 && midx <= 3) {
      __ vextractf128($tmp$$XMMRegister, $src$$XMMRegister, 0x1);
      __ vpshufpd($dst$$XMMRegister, $tmp$$XMMRegister, $tmp$$XMMRegister, midx - 2, vector_len);
    }
20696   %}
20697   ins_pipe( pipe_slow );
20698 %}
20699 
20700 instruct extract2d(regD dst, vecX src, immI idx) %{
20701   predicate(UseAVX > 0 && n->in(1)->bottom_type()->is_vect()->length() == 2);
20702   match(Set dst (ExtractD src idx));
20703   ins_encode %{
20704     int vector_len = 0;
20705     int midx = 0x1 & $idx$$constant;
20706     if (midx == 0 && $dst$$XMMRegister != $src$$XMMRegister) {
20707       __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
    } else if (midx >= 1) {
      __ vpshufpd($dst$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, midx, vector_len);
    }
20711   %}
20712   ins_pipe( pipe_slow );
20713 %}
20714 
20715 instruct extract1d(regD dst, vecD src, immI idx) %{
20716   predicate(UseAVX > 0 && n->in(1)->bottom_type()->is_vect()->length() == 1);
20717   match(Set dst (ExtractD src idx));
20718   ins_encode %{
20719     int vector_len = 0;
20720     int midx = 0x1 & $idx$$constant;
20721     if (midx == 0 && $dst$$XMMRegister != $src$$XMMRegister) {
20722       __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
20723     } 
20724   %}
20725   ins_pipe( pipe_slow );
20726 %}
20727 
20728 instruct extract16f(regF dst, vecZ src, vecZ tmp, immI idx) %{
20729   predicate(UseAVX > 2 && n->in(1)->bottom_type()->is_vect()->length() == 16);
20730   match(Set dst (ExtractF src idx));
20731   effect(TEMP tmp);
20732   ins_encode %{
    int vector_len = 2;
    int midx = 0xF & $idx$$constant;
    if (midx == 0 && $dst$$XMMRegister != $src$$XMMRegister) {
      __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
    } else if (midx >= 1 && midx <= 3) {
      __ vpshufps($dst$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, midx, vector_len);
    } else {
      int extr_idx1 = midx / 4;
      int extr_idx2 = midx % 4;
      __ vextractf32x4($tmp$$XMMRegister, $src$$XMMRegister, extr_idx1);
      __ vpshufps($dst$$XMMRegister, $tmp$$XMMRegister, $tmp$$XMMRegister, extr_idx2, vector_len);
    }
20745   %}
20746   ins_pipe( pipe_slow );
20747 %}
20748 
20749 instruct extract8f(regF dst, vecY src, vecY tmp, immI idx) %{
20750   predicate(UseAVX > 0 && n->in(1)->bottom_type()->is_vect()->length() == 8);
20751   match(Set dst (ExtractF src idx));
20752   effect(TEMP tmp);
20753   ins_encode %{
    int vector_len = 1;
    int midx = 0x7 & $idx$$constant;
    if (midx == 0 && $dst$$XMMRegister != $src$$XMMRegister) {
      __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
    } else if (midx >= 1 && midx <= 3) {
      __ vpshufps($dst$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, midx, vector_len);
20760     } else if (midx >= 4) {
20761       __ vextractf128($tmp$$XMMRegister, $src$$XMMRegister, 0x1);
20762       __ vpshufps($dst$$XMMRegister, $tmp$$XMMRegister, $tmp$$XMMRegister, midx - 4, vector_len);
20763     }
20764   %}
20765   ins_pipe( pipe_slow );
20766 %}
20767 
20768 instruct extract4f(regF dst, vecX src, immI idx) %{
20769   predicate(UseAVX > 0 && n->in(1)->bottom_type()->is_vect()->length() == 4);
20770   match(Set dst (ExtractF src idx));
20771   ins_encode %{
    int vector_len = 0;
    int midx = 0x3 & $idx$$constant;
    if (midx == 0 && $dst$$XMMRegister != $src$$XMMRegister) {
      __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
    } else if (midx >= 1 && midx <= 3) {
      __ vpshufps($dst$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, midx, vector_len);
    }
20779   %}
20780   ins_pipe( pipe_slow );
20781 %}
20782 
20783 instruct extract2f(regF dst, vecD src, immI idx) %{
20784   predicate(UseAVX > 0 && n->in(1)->bottom_type()->is_vect()->length() == 2);
20785   match(Set dst (ExtractF src idx));
20786   ins_encode %{
    int vector_len = 0;
    int midx = 0x1 & $idx$$constant;
    if (midx == 0 && $dst$$XMMRegister != $src$$XMMRegister) {
      __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
    } else {
      __ vpshufps($dst$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, midx, vector_len);
    }
20794   %}
20795   ins_pipe( pipe_slow );
20796 %}
20797  
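// Long extraction: movdq/pextrq, with 2 longs per 128-bit lane.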
20798 instruct extract8l(rRegL dst, vecZ src, vecZ tmp, immI idx) %{
20799   predicate(UseAVX > 2 && n->in(1)->bottom_type()->is_vect()->length() == 8);
20800   match(Set dst (ExtractL src idx));
20801   effect(TEMP tmp);
20802   ins_encode %{
    int midx = 0x7 & $idx$$constant;
    if (midx == 0) {
      __ movdq($dst$$Register, $src$$XMMRegister);
    } else if (midx == 1) {
      __ pextrq($dst$$Register, $src$$XMMRegister, midx);
    } else {
      // pextrq can only address the 2 longs of an xmm, so extract the
      // 128-bit lane holding the element first.
      int extr_idx1 = midx / 2;
      int extr_idx2 = midx % 2;
      __ vextracti32x4($tmp$$XMMRegister, $src$$XMMRegister, extr_idx1);
      __ pextrq($dst$$Register, $tmp$$XMMRegister, extr_idx2);
    }
20813   %}
20814   ins_pipe( pipe_slow );
20815 %}
20816 
instruct extract4l(rRegL dst, vecY src, vecY tmp, immI idx) %{
  predicate(UseAVX > 0 && n->in(1)->bottom_type()->is_vect()->length() == 4);
  match(Set dst (ExtractL src idx));
  effect(TEMP tmp);
  ins_encode %{
    int midx = 0x3 & $idx$$constant;
    if (midx == 0) {
      __ movdq($dst$$Register, $src$$XMMRegister);
    } else if (midx == 1) {
      __ pextrq($dst$$Register, $src$$XMMRegister, midx);
    } else {
      // Longs 2 and 3 live in the upper 128-bit lane.
      __ vextracti128($tmp$$XMMRegister, $src$$XMMRegister, 0x1);
      __ pextrq($dst$$Register, $tmp$$XMMRegister, midx - 2);
    }
  %}
20828   ins_pipe( pipe_slow );
20829 %}
20830 
20831 instruct extract2l(rRegL dst, vecX src, immI idx) %{
20832   predicate(UseAVX > 0 && n->in(1)->bottom_type()->is_vect()->length() == 2);
20833   match(Set dst (ExtractL src idx));
20834   ins_encode %{
    int midx = 0x1 & $idx$$constant;
    if (midx == 0) {
      __ movdq($dst$$Register, $src$$XMMRegister);
    } else if (midx >= 1) {
      __ pextrq($dst$$Register, $src$$XMMRegister, midx);
    }
20841   %}
20842   ins_pipe( pipe_slow );
20843 %}
20844 
20845 instruct extract1l(rRegL dst, vecD src, immI idx) %{
20846   predicate(UseAVX > 0 && n->in(1)->bottom_type()->is_vect()->length() == 1);
20847   match(Set dst (ExtractL src idx));
20848   ins_encode %{
    int midx = 0x1 & $idx$$constant;
    if (midx == 0) {
      __ movdq($dst$$Register, $src$$XMMRegister);
    }
20853   %}
20854   ins_pipe( pipe_slow );
20855 %}
20856 
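// Int extraction: movdl/pextrd, with 4 ints per 128-bit lane.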
20857 instruct extract16i(rRegI dst, vecZ src, vecZ tmp, immI idx) %{
20858   predicate(UseAVX > 2 && n->in(1)->bottom_type()->is_vect()->length() == 16);
20859   match(Set dst (ExtractI src idx));
20860   effect(TEMP tmp);
  ins_encode %{
    int midx = 0xF & $idx$$constant;
    if (midx == 0) {
      __ movdl($dst$$Register, $src$$XMMRegister);
    } else if (midx >= 1 && midx <= 3) {
      __ pextrd($dst$$Register, $src$$XMMRegister, midx);
    } else {
      // 4 ints per 128-bit lane: extract the lane, then the dword within it.
      int extr_idx1 = midx / 4;
      int extr_idx2 = midx % 4;
      __ vextracti32x4($tmp$$XMMRegister, $src$$XMMRegister, extr_idx1);
      __ pextrd($dst$$Register, $tmp$$XMMRegister, extr_idx2);
    }
20876   %}
20877   ins_pipe( pipe_slow );
20878 %}
20879  
20880 instruct extract8i(rRegI dst, vecY src, vecY tmp, immI idx) %{
20881   predicate(UseAVX > 0 && n->in(1)->bottom_type()->is_vect()->length() == 8);
20882   match(Set dst (ExtractI src idx));
20883   effect(TEMP tmp);
20884   ins_encode %{
20885     int midx = 0x7 & $idx$$constant;
20886     if (midx == 0) {
20887       __ movdl($dst$$Register, $src$$XMMRegister);
20888     } else if (midx >= 1 && midx <= 3) {
20889       __ pextrd($dst$$Register, $src$$XMMRegister, midx);
20890     } else if (midx >= 4) {
20891       __ vextracti128($tmp$$XMMRegister, $src$$XMMRegister, 0x1);
20892       __ pextrd($dst$$Register, $tmp$$XMMRegister, midx - 4);
20893     }
20894   %}
20895   ins_pipe( pipe_slow );
20896 %}
20897 
20898 instruct extract4i(rRegI dst, vecX src, immI idx) %{
20899   predicate(UseAVX > 0 && n->in(1)->bottom_type()->is_vect()->length() == 4);
20900   match(Set dst (ExtractI src idx));
20901   ins_encode %{
20902     int midx = 0x3 & $idx$$constant;
20903     if (midx == 0) {
20904       __ movdl($dst$$Register, $src$$XMMRegister);
20905     } else if (midx >= 1 && midx <= 3) {
20906       __ pextrd($dst$$Register, $src$$XMMRegister, midx);
20907     } 
20908   %}
20909   ins_pipe( pipe_slow );
20910 %}
20911 
20912 instruct extract2i(rRegI dst, vecD src, immI idx) %{
20913   predicate(UseAVX > 0 && n->in(1)->bottom_type()->is_vect()->length() == 2);
20914   match(Set dst (ExtractI src idx));
20915   ins_encode %{
20916     int midx = 0x1 & $idx$$constant;
20917     if (midx == 0) {
20918       __ movdl($dst$$Register, $src$$XMMRegister);
20919     } else if (midx >= 1) {
20920       __ pextrd($dst$$Register, $src$$XMMRegister, midx);
20921     } 
20922   %}
20923   ins_pipe( pipe_slow );
20924 %}
20925  
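// pextrw zero-extends the selected word, so each short extraction is
// followed by movswl to sign-extend the result to 32 bits.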
20926 instruct extract32s(rRegI dst, vecZ src, vecZ tmp, immI idx) %{
20927   predicate(UseAVX > 2 && n->in(1)->bottom_type()->is_vect()->length() == 32);
20928   match(Set dst (ExtractS src idx));
20929   effect(TEMP tmp);
  ins_encode %{
    int midx = 0x1F & $idx$$constant;
    if (midx == 0) {
      __ movdl($dst$$Register, $src$$XMMRegister);
      __ movswl($dst$$Register, $dst$$Register);
    } else if (midx >= 1 && midx <= 7) {
      __ pextrw($dst$$Register, $src$$XMMRegister, midx);
      __ movswl($dst$$Register, $dst$$Register);
    } else {
      int extr_idx1 = midx / 8;
      int extr_idx2 = midx % 8;
      __ vextracti32x4($tmp$$XMMRegister, $src$$XMMRegister, extr_idx1);
      __ pextrw($dst$$Register, $tmp$$XMMRegister, extr_idx2);
      __ movswl($dst$$Register, $dst$$Register);
    }
20947   %}
20948   ins_pipe( pipe_slow );
20949 %}
20950 
20951 instruct extract16s(rRegI dst, vecY src, vecY tmp, immI idx) %{
20952   predicate(UseAVX > 0 && n->in(1)->bottom_type()->is_vect()->length() == 16);
20953   match(Set dst (ExtractS src idx));
20954   effect(TEMP tmp);
20955   ins_encode %{
20956     int midx = 0xF & $idx$$constant;
20957     if (midx == 0) {
20958       __ movdl($dst$$Register, $src$$XMMRegister);
20959       __ movswl($dst$$Register, $dst$$Register);
20960     } else if (midx >= 1 && midx <= 7) {
20961       __ pextrw($dst$$Register, $src$$XMMRegister, midx);
20962       __ movswl($dst$$Register, $dst$$Register);
20963     }
20964   else if (midx >= 8 && midx <= 15) {
20965     __ vextracti128($tmp$$XMMRegister, $src$$XMMRegister, 0x1);
20966     __ pextrw($dst$$Register, $tmp$$XMMRegister, midx-8);
20967       __ movswl($dst$$Register, $dst$$Register);
20968   }
20969   %}
20970   ins_pipe( pipe_slow );
20971 %}
20972  
20973 instruct extract8s(rRegI dst, vecX src, immI idx) %{
20974   predicate(UseAVX > 0 && n->in(1)->bottom_type()->is_vect()->length() == 8);
20975   match(Set dst (ExtractS src idx));
20976   ins_encode %{
20977     int midx = 0x7 & $idx$$constant;
20978     if (midx == 0) {
20979       __ movdl($dst$$Register, $src$$XMMRegister);
20980       __ movswl($dst$$Register, $dst$$Register);
20981     } else if (midx >= 1) {
20982       __ pextrw($dst$$Register, $src$$XMMRegister, midx);
20983       __ movswl($dst$$Register, $dst$$Register);
20984     }
20985   %}
20986   ins_pipe( pipe_slow );
20987 %}
20988 
20989 instruct extract4s(rRegI dst, vecD src, immI idx) %{
20990   predicate(UseAVX > 0 && n->in(1)->bottom_type()->is_vect()->length() == 4);
20991   match(Set dst (ExtractS src idx));
20992   ins_encode %{
20993     int midx = 0x3 & $idx$$constant;
20994     if (midx == 0) {
20995       __ movdl($dst$$Register, $src$$XMMRegister);
20996       __ movswl($dst$$Register, $dst$$Register);
20997     } else if (midx >= 1) {
20998       __ pextrw($dst$$Register, $src$$XMMRegister, midx);
20999       __ movswl($dst$$Register, $dst$$Register);
21000     }
21001   %}
21002   ins_pipe( pipe_slow );
21003 %}
21004  
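      // Extract a byte lane into a GPR.  Same shape as the short extracts:
      // movdl for lane 0, pextrb for lanes 1-15, vextracti32x4 to reach the
      // upper 128-bit lanes.  pextrb zero-extends, so movsbl restores the sign.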
21005 instruct extract64b(rRegI dst, vecZ src, vecZ tmp, immI idx) %{
21006   predicate(UseAVX > 2 && n->in(1)->bottom_type()->is_vect()->length() == 64);
21007   match(Set dst (ExtractB src idx));
21008   effect(TEMP tmp);
21009   ins_encode %{
21010     int midx = 0x3F & $idx$$constant;
21011     if (midx == 0) {
21012       __ movdl($dst$$Register, $src$$XMMRegister);
21013       __ movsbl($dst$$Register, $dst$$Register);
21015     } else if (midx >= 1 && midx <= 15) {
21016       __ pextrb($dst$$Register, $src$$XMMRegister, midx);
21017       __ movsbl($dst$$Register, $dst$$Register);
21019     } else {
21020       int extr_idx1 = midx / 16;
21021       int extr_idx2 = midx % 16;
21022       __ vextracti32x4($tmp$$XMMRegister, $src$$XMMRegister, extr_idx1);
21023       __ pextrb($dst$$Register, $tmp$$XMMRegister, extr_idx2);
21024       __ movsbl($dst$$Register, $dst$$Register);
21025     }
21026   %}
21027   ins_pipe( pipe_slow );
21028 %}
21029 
21030 instruct extract32b(rRegI dst, vecY src, vecY tmp, immI idx) %{
21031   predicate(UseAVX > 0 && n->in(1)->bottom_type()->is_vect()->length() == 32);
21032   match(Set dst (ExtractB src idx));
21033   effect(TEMP tmp);
21034   ins_encode %{
21035     int midx = 0x1F & $idx$$constant;
21036     if (midx == 0) {
21037       __ movdl($dst$$Register, $src$$XMMRegister);
21038       __ movsbl($dst$$Register, $dst$$Register);
21040     } else if (midx >= 1 && midx <= 15) {
21041       __ pextrb($dst$$Register, $src$$XMMRegister, midx);
21042       __ movsbl($dst$$Register, $dst$$Register);
21044     } else {
21045       int extr_idx1 = midx / 16;
21046       int extr_idx2 = midx % 16;
21047       __ vextracti32x4($tmp$$XMMRegister, $src$$XMMRegister, extr_idx1);
21048       __ pextrb($dst$$Register, $tmp$$XMMRegister, extr_idx2);
21049       __ movsbl($dst$$Register, $dst$$Register);
21050     }
21051   %}
21052   ins_pipe( pipe_slow );
21053 %}
21054 
21055 instruct extract16b(rRegI dst, vecX src, immI idx) %{
21056   predicate(UseAVX > 0 && n->in(1)->bottom_type()->is_vect()->length() == 16);
21057   match(Set dst (ExtractB src idx));
21058   ins_encode %{
21059     int midx = 0xF & $idx$$constant;
21060     if (midx == 0) {
21061       __ movdl($dst$$Register, $src$$XMMRegister);
21062       __ movsbl($dst$$Register, $dst$$Register);
21063     } else if (midx >= 1) {
21064       __ pextrb($dst$$Register, $src$$XMMRegister, midx);
21065       __ movsbl($dst$$Register, $dst$$Register);
21066     }
21067   %}
21068   ins_pipe( pipe_slow );
21069 %}
21070 
21071 instruct extract8b(rRegI dst, vecD src, immI idx) %{
21072   predicate(UseAVX > 0 && n->in(1)->bottom_type()->is_vect()->length() == 8);
21073   match(Set dst (ExtractB src idx));
21074   ins_encode %{
21075     int midx = 0x7 & $idx$$constant;
21076     if (midx == 0) {
21077       __ movdl($dst$$Register, $src$$XMMRegister);
21078       __ movsbl($dst$$Register, $dst$$Register);
21079     } else if (midx >= 1) {
21080       __ pextrb($dst$$Register, $src$$XMMRegister, midx);
21081       __ movsbl($dst$$Register, $dst$$Register);
21082     }
21083   %}
21084   ins_pipe( pipe_slow );
21085 %}
21086 
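      // There is no direct vector instruction for "greater or equal", so it is
      // synthesized: "src1 >= src2" is NOT(src2 > src1) - compare with the
      // operands swapped, then invert the lane mask by XORing with all-ones
      // (vector_all_bits_set(), loaded through the scratch register).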
21087 instruct vcmpge32B(vecY dst, vecY src1, vecY src2, rRegL scratch) %{
21088   predicate(UseAVX > 1 && n->as_Vector()->length() == 32 &&
21089             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
21090             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
21091   match(Set dst (VectorMaskCmp src1 src2));
21092   effect(TEMP scratch);
21093   format %{ "vpcmpgtb  $dst,$src2,$src1\n\t"
21094             "vpxor $dst,$dst,0xFFFFFFFF\t! cmpge packed32B" %}
21095   ins_encode %{
21096     int vector_len = 1;
21097     __ vpcmpgtb($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
21098     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
21099   %}
21100   ins_pipe( pipe_slow );
21101 %}
21102 
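      // 512-bit compares write a k mask register rather than a vector.  The
      // vector mask the IR expects is rebuilt with a zero-masked load of
      // all-ones: lanes where the compare succeeded become all-ones, the
      // remaining lanes are zeroed.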
21103 instruct vcmpge64B(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
21104   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64 &&
21105             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
21106             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
21107   match(Set dst (VectorMaskCmp src1 src2));
21108   effect(TEMP dst, TEMP scratch);
21109   format %{ "vpcmpnltb  k2,$src1,$src2\n\t"
21110             "vmovdqu8   $dst, k2{z}, 0xFFFFFFFFFF \t! cmpge packed64B" %}
21111   ins_encode %{
21112     int vector_len = 2;
21113     Assembler::ComparisonPredicate cmp = Assembler::nlt;
21114     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
21115     KRegister mask = k0; // The comparison itself is not being masked.
21116     __ evpcmpb(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
21117     __ evmovdqub($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
21118   %}
21119   ins_pipe( pipe_slow );
21120 %}
21121 
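      // "src1 <= src2" is NOT(src1 > src2): a plain greater-than compare
      // followed by the same XOR-with-all-ones inversion.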
21122 instruct vcmple8B(vecD dst, vecD src1, vecD src2, rRegL scratch) %{
21123   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
21124             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
21125             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
21126   match(Set dst (VectorMaskCmp src1 src2));
21127   effect(TEMP scratch);
21128   format %{ "vpcmpgtb  $dst,$src1,$src2\n\t"
21129             "vpxor $dst,$dst,0xFFFFFFFF\t! cmple packed8B" %}
21130   ins_encode %{
21131     int vector_len = 0;
21132     __ vpcmpgtb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
21133     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
21134   %}
21135   ins_pipe( pipe_slow );
21136 %}
21137 
21138 instruct vcmple16B(vecX dst, vecX src1, vecX src2, rRegL scratch) %{
21139   predicate(UseAVX > 0 && n->as_Vector()->length() == 16 &&
21140             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
21141             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
21142   match(Set dst (VectorMaskCmp src1 src2));
21143   effect(TEMP scratch);
21144   format %{ "vpcmpgtb  $dst,$src1,$src2\n\t"
21145             "vpxor $dst,$dst,0xFFFFFFFF\t! cmple packed16B" %}
21146   ins_encode %{
21147     int vector_len = 0;
21148     __ vpcmpgtb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
21149     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
21150   %}
21151   ins_pipe( pipe_slow );
21152 %}
21153 
21154 instruct vcmple32B(vecY dst, vecY src1, vecY src2, rRegL scratch) %{
21155   predicate(UseAVX > 1 && n->as_Vector()->length() == 32 &&
21156             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
21157             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
21158   match(Set dst (VectorMaskCmp src1 src2));
21159   effect(TEMP scratch);
21160   format %{ "vpcmpgtb  $dst,$src1,$src2\n\t"
21161             "vpxor $dst,$dst,0xFFFFFFFF\t! cmple packed32B" %}
21162   ins_encode %{
21163     int vector_len = 1;
21164     __ vpcmpgtb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
21165     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
21166   %}
21167   ins_pipe( pipe_slow );
21168 %}
21169 
21170 instruct vcmple64B(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
21171   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64 &&
21172             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
21173             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
21174   match(Set dst (VectorMaskCmp src1 src2));
21175   effect(TEMP dst, TEMP scratch);
21176   format %{ "vpcmpleb  k2,$src1,$src2\n\t"
21177             "vmovdqu8   $dst, k2{z}, 0xFFFFFFFFFF \t! cmple packed64B" %}
21178   ins_encode %{
21179     int vector_len = 2;
21180     Assembler::ComparisonPredicate cmp = Assembler::le;
21181     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
21182     KRegister mask = k0; // The comparison itself is not being masked.
21183     __ evpcmpb(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
21184     __ evmovdqub($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
21185   %}
21186   ins_pipe( pipe_slow );
21187 %}
21188 
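      // "src1 != src2" is NOT(src1 == src2): an equality compare followed by
      // the XOR-with-all-ones inversion.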
21189 instruct vcmpne8B(vecD dst, vecD src1, vecD src2, rRegL scratch) %{
21190   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
21191             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
21192             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
21193   match(Set dst (VectorMaskCmp src1 src2));
21194   effect(TEMP scratch);
21195   format %{ "vpcmpeqb  $dst,$src1,$src2\n\t"
21196             "vpxor $dst,$dst,0xFFFFFFFF\t! cmpneq packed8B" %}
21197   ins_encode %{
21198     int vector_len = 0;
21199     __ vpcmpeqb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
21200     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
21201   %}
21202   ins_pipe( pipe_slow );
21203 %}
21204 
21205 instruct vcmpne16B(vecX dst, vecX src1, vecX src2, rRegL scratch) %{
21206   predicate(UseAVX > 0 && n->as_Vector()->length() == 16 &&
21207             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
21208             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
21209   match(Set dst (VectorMaskCmp src1 src2));
21210   effect(TEMP scratch);
21211   format %{ "vpcmpeqb  $dst,$src1,$src2\n\t"
21212             "vpxor $dst,$dst,0xFFFFFFFF\t! cmpneq packed16B" %}
21213   ins_encode %{
21214     int vector_len = 0;
21215     __ vpcmpeqb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
21216     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
21217   %}
21218   ins_pipe( pipe_slow );
21219 %}
21220 
21221 instruct vcmpne32B(vecY dst, vecY src1, vecY src2, rRegL scratch) %{
21222   predicate(UseAVX > 1 && n->as_Vector()->length() == 32 &&
21223             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
21224             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
21225   match(Set dst (VectorMaskCmp src1 src2));
21226   effect(TEMP scratch);
21227   format %{ "vpcmpeqb  $dst,$src1,$src2\n\t"
21228             "vpxor $dst,$dst,0xFFFFFFFF\t! cmpneq packed32B" %}
21229   ins_encode %{
21230     int vector_len = 1;
21231     __ vpcmpeqb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
21232     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
21233   %}
21234   ins_pipe( pipe_slow );
21235 %}
21236 
21237 instruct vcmpne64B(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
21238   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64 &&
21239             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
21240             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
21241   match(Set dst (VectorMaskCmp src1 src2));
21242   effect(TEMP dst, TEMP scratch);
21243   format %{ "vpcmpneqb  k2,$src1,$src2\n\t"
21244             "vmovdqu8   $dst, k2{z}, 0xFFFFFFFFFF \t! cmpneq packed64B" %}
21245   ins_encode %{
21246     int vector_len = 2;
21247     Assembler::ComparisonPredicate cmp = Assembler::neq;
21248     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
21249     KRegister mask = k0; // The comparison itself is not being masked.
21250     __ evpcmpb(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
21251     __ evmovdqub($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
21252   %}
21253   ins_pipe( pipe_slow );
21254 %}
21255 
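      // Packed-short compares mirror the byte patterns with word-sized
      // instructions: vpcmpeqw/vpcmpgtw for eq/gt, swapped operands for lt,
      // an XOR inversion for ge/le/ne, and evpcmpw into k2 for 512-bit
      // vectors.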
21256 instruct vcmpeq4S(vecD dst, vecD src1, vecD src2) %{
21257   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
21258             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
21259             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
21260   match(Set dst (VectorMaskCmp src1 src2));
21261   format %{ "vpcmpeqw  $dst,$src1,$src2\t! cmpeq packed4S" %}
21262   ins_encode %{
21263     int vector_len = 0;
21264     __ vpcmpeqw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
21265   %}
21266   ins_pipe( pipe_slow );
21267 %}
21268 
21269 instruct vcmpeq8S(vecX dst, vecX src1, vecX src2) %{
21270   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
21271             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
21272             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
21273   match(Set dst (VectorMaskCmp src1 src2));
21274   format %{ "vpcmpeqw  $dst,$src1,$src2\t! cmpeq packed8S" %}
21275   ins_encode %{
21276     int vector_len = 0;
21277     __ vpcmpeqw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
21278   %}
21279   ins_pipe( pipe_slow );
21280 %}
21281 
21282 instruct vcmpeq16S(vecY dst, vecY src1, vecY src2) %{
21283   predicate(UseAVX > 1 && n->as_Vector()->length() == 16 &&
21284             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
21285             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
21286   match(Set dst (VectorMaskCmp src1 src2));
21287   format %{ "vpcmpeqw  $dst,$src1,$src2\t! cmpeq packed16S" %}
21288   ins_encode %{
21289     int vector_len = 1;
21290     __ vpcmpeqw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
21291   %}
21292   ins_pipe( pipe_slow );
21293 %}
21294 
21295 instruct vcmpeq32S(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
21296   predicate(UseAVX > 2 && n->as_Vector()->length() == 32 &&
21297             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
21298             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
21299   match(Set dst (VectorMaskCmp src1 src2));
21300   effect(TEMP dst, TEMP scratch);
21301   format %{ "vpcmpeqw  k2,$src1,$src2\n\t"
21302             "vmovdqu16 $dst, k2{z}, 0xFFFFFFFFFF \t! cmpeq packed32S" %}
21303   ins_encode %{
21304     int vector_len = 2;
21305     Assembler::ComparisonPredicate cmp = Assembler::eq;
21306     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
21307     KRegister mask = k0; // The comparison itself is not being masked.
21308     __ evpcmpw(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
21309     __ evmovdquw($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
21310   %}
21311   ins_pipe( pipe_slow );
21312 %}
21313 
21314 instruct vcmplt4S(vecD dst, vecD src1, vecD src2) %{
21315   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
21316             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
21317             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
21318   match(Set dst (VectorMaskCmp src1 src2));
21319   format %{ "vpcmpgtw  $dst,$src2,$src1\t! cmplt packed4S" %}
21320   ins_encode %{
21321     int vector_len = 0;
21322     __ vpcmpgtw($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
21323   %}
21324   ins_pipe( pipe_slow );
21325 %}
21326 
21327 instruct vcmplt8S(vecX dst, vecX src1, vecX src2) %{
21328   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
21329             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
21330             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
21331   match(Set dst (VectorMaskCmp src1 src2));
21332   format %{ "vpcmpgtw  $dst,$src2,$src1\t! cmplt packed8S" %}
21333   ins_encode %{
21334     int vector_len = 0;
21335     __ vpcmpgtw($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
21336   %}
21337   ins_pipe( pipe_slow );
21338 %}
21339 
21340 instruct vcmplt16S(vecY dst, vecY src1, vecY src2) %{
21341   predicate(UseAVX > 1 && n->as_Vector()->length() == 16 &&
21342             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
21343             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
21344   match(Set dst (VectorMaskCmp src1 src2));
21345   format %{ "vpcmpgtw  $dst,$src2,$src1\t! cmplt packed16S" %}
21346   ins_encode %{
21347     int vector_len = 1;
21348     __ vpcmpgtw($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
21349   %}
21350   ins_pipe( pipe_slow );
21351 %}
21352 
21353 instruct vcmplt32S(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
21354   predicate(UseAVX > 2 && n->as_Vector()->length() == 32 &&
21355             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
21356             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
21357   match(Set dst (VectorMaskCmp src1 src2));
21358   effect(TEMP dst, TEMP scratch);
21359   format %{ "vpcmpltw  k2,$src1,$src2\n\t"
21360             "vmovdqu16   $dst, k2{z}, 0xFFFFFFFFFF \t! cmplt packed32S" %}
21361   ins_encode %{
21362     int vector_len = 2;
21363     Assembler::ComparisonPredicate cmp = Assembler::lt;
21364     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
21365     KRegister mask = k0; // The comparison itself is not being masked.
21366     __ evpcmpw(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
21367     __ evmovdquw($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
21368   %}
21369   ins_pipe( pipe_slow );
21370 %}
21371 
21372 instruct vcmpgt4S(vecD dst, vecD src1, vecD src2) %{
21373   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
21374             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
21375             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
21376   match(Set dst (VectorMaskCmp src1 src2));
21377   format %{ "vpcmpgtw  $dst,$src1,$src2\t! cmpgt packed4S" %}
21378   ins_encode %{
21379     int vector_len = 0;
21380     __ vpcmpgtw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
21381   %}
21382   ins_pipe( pipe_slow );
21383 %}
21384 
21385 instruct vcmpgt8S(vecX dst, vecX src1, vecX src2) %{
21386   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
21387             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
21388             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
21389   match(Set dst (VectorMaskCmp src1 src2));
21390   format %{ "vpcmpgtw  $dst,$src1,$src2\t! cmpgt packed8S" %}
21391   ins_encode %{
21392     int vector_len = 0;
21393     __ vpcmpgtw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
21394   %}
21395   ins_pipe( pipe_slow );
21396 %}
21397 
21398 instruct vcmpgt16S(vecY dst, vecY src1, vecY src2) %{
21399   predicate(UseAVX > 1 && n->as_Vector()->length() == 16 &&
21400             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
21401             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
21402   match(Set dst (VectorMaskCmp src1 src2));
21403   format %{ "vpcmpgtw  $dst,$src1,$src2\t! cmpgt packed16S" %}
21404   ins_encode %{
21405     int vector_len = 1;
21406     __ vpcmpgtw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
21407   %}
21408   ins_pipe( pipe_slow );
21409 %}
21410 
21411 instruct vcmpgt32S(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
21412   predicate(UseAVX > 2 && n->as_Vector()->length() == 32 &&
21413             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
21414             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
21415   match(Set dst (VectorMaskCmp src1 src2));
21416   effect(TEMP dst, TEMP scratch);
21417   format %{ "vpcmpnlew  k2,$src1,$src2\n\t"
21418             "vmovdqu16   $dst, k2{z}, 0xFFFFFFFFFF \t! cmpgt packed32S" %}
21419   ins_encode %{
21420     int vector_len = 2;
21421     Assembler::ComparisonPredicate cmp = Assembler::nle;
21422     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
21423     KRegister mask = k0; // The comparison itself is not being masked.
21424     __ evpcmpw(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
21425     __ evmovdquw($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
21426   %}
21427   ins_pipe( pipe_slow );
21428 %}
21429 
21430 instruct vcmpge4S(vecD dst, vecD src1, vecD src2, rRegL scratch) %{
21431   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
21432             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
21433             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
21434   match(Set dst (VectorMaskCmp src1 src2));
21435   effect(TEMP scratch);
21436   format %{ "vpcmpgtw  $dst,$src2,$src1\n\t"
21437             "vpxor $dst,$dst,0xFFFFFFFF\t! cmpge packed4S" %}
21438   ins_encode %{
21439     int vector_len = 0;
21440     __ vpcmpgtw($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
21441     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
21442   %}
21443   ins_pipe( pipe_slow );
21444 %}
21445 
21446 instruct vcmpge8S(vecX dst, vecX src1, vecX src2, rRegL scratch) %{
21447   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
21448             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
21449             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
21450   match(Set dst (VectorMaskCmp src1 src2));
21451   effect(TEMP scratch);
21452   format %{ "vpcmpgtw  $dst,$src2,$src1\n\t"
21453             "vpxor $dst,$dst,0xFFFFFFFF\t! cmpge packed8S" %}
21454   ins_encode %{
21455     int vector_len = 0;
21456     __ vpcmpgtw($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
21457     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
21458   %}
21459   ins_pipe( pipe_slow );
21460 %}
21461 
21462 instruct vcmpge16S(vecY dst, vecY src1, vecY src2, rRegL scratch) %{
21463   predicate(UseAVX > 1 && n->as_Vector()->length() == 16 &&
21464             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
21465             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
21466   match(Set dst (VectorMaskCmp src1 src2));
21467   effect(TEMP scratch);
21468   format %{ "vpcmpgtw  $dst,$src2,$src1\n\t"
21469             "vpxor $dst,$dst,0xFFFFFFFF\t! cmpge packed16S" %}
21470   ins_encode %{
21471     int vector_len = 1;
21472     __ vpcmpgtw($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
21473     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
21474   %}
21475   ins_pipe( pipe_slow );
21476 %}
21477 
21478 instruct vcmpge32S(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
21479   predicate(UseAVX > 2 && n->as_Vector()->length() == 32 &&
21480             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
21481             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
21482   match(Set dst (VectorMaskCmp src1 src2));
21483   effect(TEMP dst, TEMP scratch);
21484   format %{ "vpcmpnltw  k2,$src1,$src2\n\t"
21485             "vmovdqu16   $dst, k2{z}, 0xFFFFFFFFFF \t! cmpge packed32S" %}
21486   ins_encode %{
21487     int vector_len = 2;
21488     Assembler::ComparisonPredicate cmp = Assembler::nlt;
21489     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
21490     KRegister mask = k0; // The comparison itself is not being masked.
21491     __ evpcmpw(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
21492     __ evmovdquw($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
21493   %}
21494   ins_pipe( pipe_slow );
21495 %}
21496 
21497 instruct vcmple4S(vecD dst, vecD src1, vecD src2, rRegL scratch) %{
21498   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
21499             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
21500             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
21501   match(Set dst (VectorMaskCmp src1 src2));
21502   effect(TEMP scratch);
21503   format %{ "vpcmpgtw  $dst,$src1,$src2\n\t"
21504             "vpxor $dst,$dst,0xFFFFFFFF\t! cmple packed4S" %}
21505   ins_encode %{
21506     int vector_len = 0;
21507     __ vpcmpgtw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
21508     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
21509   %}
21510   ins_pipe( pipe_slow );
21511 %}
21512 
21513 instruct vcmple8S(vecX dst, vecX src1, vecX src2, rRegL scratch) %{
21514   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
21515             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
21516             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
21517   match(Set dst (VectorMaskCmp src1 src2));
21518   effect(TEMP scratch);
21519   format %{ "vpcmpgtw  $dst,$src1,$src2\n\t"
21520             "vpxor $dst,$dst,0xFFFFFFFF\t! cmple packed8S" %}
21521   ins_encode %{
21522     int vector_len = 0;
21523     __ vpcmpgtw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
21524     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
21525   %}
21526   ins_pipe( pipe_slow );
21527 %}
21528 
21529 instruct vcmple16S(vecY dst, vecY src1, vecY src2, rRegL scratch) %{
21530   predicate(UseAVX > 1 && n->as_Vector()->length() == 16 &&
21531             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
21532             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
21533   match(Set dst (VectorMaskCmp src1 src2));
21534   effect(TEMP scratch);
21535   format %{ "vpcmpgtw  $dst,$src1,$src2\n\t"
21536             "vpxor $dst,$dst,0xFFFFFFFF\t! cmple packed16S" %}
21537   ins_encode %{
21538     int vector_len = 1;
21539     __ vpcmpgtw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
21540     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
21541   %}
21542   ins_pipe( pipe_slow );
21543 %}
21544 
21545 instruct vcmple32S(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
21546   predicate(UseAVX > 2 && n->as_Vector()->length() == 32 &&
21547             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
21548             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
21549   match(Set dst (VectorMaskCmp src1 src2));
21550   effect(TEMP dst, TEMP scratch);
21551   format %{ "vpcmplew  k2,$src1,$src2\n\t"
21552             "vmovdqu16   $dst, k2{z}, 0xFFFFFFFFFF \t! cmple packed32S" %}
21553   ins_encode %{
21554     int vector_len = 2;
21555     Assembler::ComparisonPredicate cmp = Assembler::le;
21556     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
21557     KRegister mask = k0; // The comparison itself is not being masked.
21558     __ evpcmpw(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
21559     __ evmovdquw($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
21560   %}
21561   ins_pipe( pipe_slow );
21562 %}
21563 
21564 instruct vcmpne4S(vecD dst, vecD src1, vecD src2, rRegL scratch) %{
21565   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
21566             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
21567             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
21568   match(Set dst (VectorMaskCmp src1 src2));
21569   effect(TEMP scratch);
21570   format %{ "vpcmpeqw  $dst,$src1,$src2\n\t"
21571             "vpxor $dst,$dst,0xFFFFFFFF\t! cmpneq packed4S" %}
21572   ins_encode %{
21573     int vector_len = 0;
21574     __ vpcmpeqw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
21575     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
21576   %}
21577   ins_pipe( pipe_slow );
21578 %}
21579 
21580 instruct vcmpne8S(vecX dst, vecX src1, vecX src2, rRegL scratch) %{
21581   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
21582             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
21583             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
21584   match(Set dst (VectorMaskCmp src1 src2));
21585   effect(TEMP scratch);
21586   format %{ "vpcmpeqw  $dst,$src1,$src2\n\t"
21587             "vpxor $dst,$dst,0xFFFFFFFF\t! cmpneq packed8S" %}
21588   ins_encode %{
21589     int vector_len = 0;
21590     __ vpcmpeqw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
21591     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
21592   %}
21593   ins_pipe( pipe_slow );
21594 %}
21595 
21596 instruct vcmpne16S(vecY dst, vecY src1, vecY src2, rRegL scratch) %{
21597   predicate(UseAVX > 1 && n->as_Vector()->length() == 16 &&
21598             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
21599             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
21600   match(Set dst (VectorMaskCmp src1 src2));
21601   effect(TEMP scratch);
21602   format %{ "vpcmpeqw  $dst,$src1,$src2\n\t"
21603             "vpxor $dst,$dst,0xFFFFFFFF\t! cmpneq packed16S" %}
21604   ins_encode %{
21605     int vector_len = 1;
21606     __ vpcmpeqw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
21607     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
21608   %}
21609   ins_pipe( pipe_slow );
21610 %}
21611 
21612 instruct vcmpne32S(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
21613   predicate(UseAVX > 2 && n->as_Vector()->length() == 32 &&
21614             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
21615             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
21616   match(Set dst (VectorMaskCmp src1 src2));
21617   effect(TEMP dst, TEMP scratch);
21618   format %{ "vpcmpneqw  k2,$src1,$src2\n\t"
21619             "vmovdqu16   $dst, k2{z}, 0xFFFFFFFFFF \t! cmpneq packed32S" %}
21620   ins_encode %{
21621     int vector_len = 2;
21622     Assembler::ComparisonPredicate cmp = Assembler::neq;
21623     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
21624     KRegister mask = k0; // The comparison itself is not being masked.
21625     __ evpcmpw(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
21626     __ evmovdquw($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
21627   %}
21628   ins_pipe( pipe_slow );
21629 %}
21630 
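      // Packed-long compares use the VEX-encoded vpcmpeqq/vpcmpgtq.  As with
      // the narrower types, lt swaps the operands of gt, ge/le/ne invert a
      // gt or eq compare, and the 512-bit forms go through evpcmpq and a
      // k mask.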
21631 instruct vcmpeq1L(vecD dst, vecD src1, vecD src2) %{
21632   predicate(UseAVX > 0 && n->as_Vector()->length() == 1 &&
21633             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
21634             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
21635   match(Set dst (VectorMaskCmp src1 src2));
21636   format %{ "vpcmpeqq  $dst,$src1,$src2\t! cmpeq packed1L" %}
21637   ins_encode %{
21638     int vector_len = 0;
21639     __ vpcmpeqq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
21640   %}
21641   ins_pipe( pipe_slow );
21642 %}
21643 
21644 instruct vcmpeq2L(vecX dst, vecX src1, vecX src2) %{
21645   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
21646             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
21647             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
21648   match(Set dst (VectorMaskCmp src1 src2));
21649   format %{ "vpcmpeqq  $dst,$src1,$src2\t! cmpeq packed2L" %}
21650   ins_encode %{
21651     int vector_len = 0;
21652     __ vpcmpeqq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
21653   %}
21654   ins_pipe( pipe_slow );
21655 %}
21656 
21657 instruct vcmpeq4L(vecY dst, vecY src1, vecY src2) %{
21658   predicate(UseAVX > 1 && n->as_Vector()->length() == 4 &&
21659             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
21660             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
21661   match(Set dst (VectorMaskCmp src1 src2));
21662   format %{ "vpcmpeqq  $dst,$src1,$src2\t! cmpeq packed4L" %}
21663   ins_encode %{
21664     int vector_len = 1;
21665     __ vpcmpeqq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
21666   %}
21667   ins_pipe( pipe_slow );
21668 %}
21669 
21670 instruct vcmpeq8L(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
21671   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 &&
21672             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
21673             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
21674   match(Set dst (VectorMaskCmp src1 src2));
21675   effect(TEMP dst, TEMP scratch);
21676   format %{ "vpcmpeqq  k2,$src1,$src2\n\t"
21677             "vmovdqu64 $dst, k2{z}, 0xFFFFFFFFFF \t! cmpeq packed8L" %}
21678   ins_encode %{
21679     int vector_len = 2;
21680     Assembler::ComparisonPredicate cmp = Assembler::eq;
21681     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
21682     KRegister mask = k0; // The comparison itself is not being masked.
21683     __ evpcmpq(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
21684     __ evmovdquq($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
21685   %}
21686   ins_pipe( pipe_slow );
21687 %}
21688 
21689 instruct vcmplt1L(vecD dst, vecD src1, vecD src2) %{
21690   predicate(UseAVX > 0 && n->as_Vector()->length() == 1 &&
21691             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
21692             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
21693   match(Set dst (VectorMaskCmp src1 src2));
21694   format %{ "vpcmpgtq  $dst,$src2,$src1\t! cmplt packed1L" %}
21695   ins_encode %{
21696     int vector_len = 0;
21697     __ vpcmpgtq($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
21698   %}
21699   ins_pipe( pipe_slow );
21700 %}
21701 
21702 instruct vcmplt2L(vecX dst, vecX src1, vecX src2) %{
21703   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
21704             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
21705             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
21706   match(Set dst (VectorMaskCmp src1 src2));
21707   format %{ "vpcmpgtq  $dst,$src2,$src1\t! cmplt packed2L" %}
21708   ins_encode %{
21709     int vector_len = 0;
21710     __ vpcmpgtq($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
21711   %}
21712   ins_pipe( pipe_slow );
21713 %}
21714 
21715 instruct vcmplt4L(vecY dst, vecY src1, vecY src2) %{
21716   predicate(UseAVX > 1 && n->as_Vector()->length() == 4 &&
21717             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
21718             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
21719   match(Set dst (VectorMaskCmp src1 src2));
21720   format %{ "vpcmpgtq  $dst,$src2,$src1\t! cmplt packed4L" %}
21721   ins_encode %{
21722     int vector_len = 1;
21723     __ vpcmpgtq($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
21724   %}
21725   ins_pipe( pipe_slow );
21726 %}
21727 
21728 instruct vcmplt8L(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
21729   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 &&
21730             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
21731             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
21732   match(Set dst (VectorMaskCmp src1 src2));
21733   effect(TEMP dst, TEMP scratch);
21734   format %{ "vpcmpltq  k2,$src1,$src2\n\t"
21735             "vmovdqu64   $dst, k2{z}, 0xFFFFFFFFFF \t! cmplt packed8L" %}
21736   ins_encode %{
21737     int vector_len = 2;
21738     Assembler::ComparisonPredicate cmp = Assembler::lt;
21739     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
21740     KRegister mask = k0; // The comparison itself is not being masked.
21741     __ evpcmpq(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
21742     __ evmovdquq($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
21743   %}
21744   ins_pipe( pipe_slow );
21745 %}
21746 
21747 instruct vcmpgt1L(vecD dst, vecD src1, vecD src2) %{
21748   predicate(UseAVX > 0 && n->as_Vector()->length() == 1 &&
21749             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
21750             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
21751   match(Set dst (VectorMaskCmp src1 src2));
21752   format %{ "vpcmpgtq  $dst,$src1,$src2\t! cmpgt packed1L" %}
21753   ins_encode %{
21754     int vector_len = 0;
21755     __ vpcmpgtq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
21756   %}
21757   ins_pipe( pipe_slow );
21758 %}
21759 
21760 instruct vcmpgt2L(vecX dst, vecX src1, vecX src2) %{
21761   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
21762             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
21763             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
21764   match(Set dst (VectorMaskCmp src1 src2));
21765   format %{ "vpcmpgtq  $dst,$src1,$src2\t! cmpgt packed2L" %}
21766   ins_encode %{
21767     int vector_len = 0;
21768     __ vpcmpgtq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
21769   %}
21770   ins_pipe( pipe_slow );
21771 %}
21772 
21773 instruct vcmpgt4L(vecY dst, vecY src1, vecY src2) %{
21774   predicate(UseAVX > 1 && n->as_Vector()->length() == 4 &&
21775             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
21776             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
21777   match(Set dst (VectorMaskCmp src1 src2));
21778   format %{ "vpcmpgtq  $dst,$src1,$src2\t! cmpgt packed4L" %}
21779   ins_encode %{
21780     int vector_len = 1;
21781     __ vpcmpgtq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
21782   %}
21783   ins_pipe( pipe_slow );
21784 %}
21785 
21786 instruct vcmpgt8L(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
21787   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 &&
21788             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
21789             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
21790   match(Set dst (VectorMaskCmp src1 src2));
21791   effect(TEMP dst, TEMP scratch);
21792   format %{ "vpcmpnleq  k2,$src1,$src2\n\t"
21793             "vmovdqu64   $dst, k2{z}, 0xFFFFFFFFFF \t! cmpgt packed8L" %}
21794   ins_encode %{
21795     int vector_len = 2;
21796     Assembler::ComparisonPredicate cmp = Assembler::nle;
21797     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
21798     KRegister mask = k0; // The comparison itself is not being masked.
21799     __ evpcmpq(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
21800     __ evmovdquq($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
21801   %}
21802   ins_pipe( pipe_slow );
21803 %}
21804 
21805 instruct vcmpge1L(vecD dst, vecD src1, vecD src2, rRegL scratch) %{
21806   predicate(UseAVX > 0 && n->as_Vector()->length() == 1 &&
21807             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
21808             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
21809   match(Set dst (VectorMaskCmp src1 src2));
21810   effect(TEMP scratch);
21811   format %{ "vpcmpgtq  $dst,$src2,$src1\n\t"
21812             "vpxor $dst,$dst,0xFFFFFFFF\t! cmpge packed1L" %}
21813   ins_encode %{
21814     int vector_len = 0;
21815     __ vpcmpgtq($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
21816     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
21817   %}
21818   ins_pipe( pipe_slow );
21819 %}
21820 
21821 instruct vcmpge2L(vecX dst, vecX src1, vecX src2, rRegL scratch) %{
21822   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
21823             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
21824             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
21825   match(Set dst (VectorMaskCmp src1 src2));
21826   effect(TEMP scratch);
21827   format %{ "vpcmpgtq  $dst,$src2,$src1\n\t"
21828             "vpxor $dst,$dst,0xFFFFFFFF\t! cmpge packed2L" %}
21829   ins_encode %{
21830     int vector_len = 0;
21831     __ vpcmpgtq($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
21832     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
21833   %}
21834   ins_pipe( pipe_slow );
21835 %}
21836 
21837 instruct vcmpge4L(vecY dst, vecY src1, vecY src2, rRegL scratch) %{
21838   predicate(UseAVX > 1 && n->as_Vector()->length() == 4 &&
21839             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
21840             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
21841   match(Set dst (VectorMaskCmp src1 src2));
21842   effect(TEMP scratch);
21843   format %{ "vpcmpgtq  $dst,$src2,$src1\n\t"
21844             "vpxor $dst,$dst,0xFFFFFFFF\t! cmpge packed4L" %}
21845   ins_encode %{
21846     int vector_len = 1;
21847     __ vpcmpgtq($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
21848     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
21849   %}
21850   ins_pipe( pipe_slow );
21851 %}
21852 
21853 instruct vcmpge8L(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
21854   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 &&
21855             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
21856             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
21857   match(Set dst (VectorMaskCmp src1 src2));
21858   effect(TEMP dst, TEMP scratch);
21859   format %{ "vpcmpnltq  k2,$src1,$src2\n\t"
21860             "vmovdqu64   $dst, k2{z}, 0xFFFFFFFFFF \t! cmpge packed8L" %}
21861   ins_encode %{
21862     int vector_len = 2;
21863     Assembler::ComparisonPredicate cmp = Assembler::nlt;
21864     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
21865     KRegister mask = k0; // The comparison itself is not being masked.
21866     __ evpcmpq(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
21867     __ evmovdquq($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
21868   %}
21869   ins_pipe( pipe_slow );
21870 %}
21871 
21872 instruct vcmple1L(vecD dst, vecD src1, vecD src2, rRegL scratch) %{
21873   predicate(UseAVX > 0 && n->as_Vector()->length() == 1 &&
21874             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
21875             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
21876   match(Set dst (VectorMaskCmp src1 src2));
21877   effect(TEMP scratch);
21878   format %{ "vpcmpgtq  $dst,$src1,$src2\n\t"
21879             "vpxor $dst,$dst,0xFFFFFFFF\t! cmple packed1L" %}
21880   ins_encode %{
21881     int vector_len = 0;
21882     __ vpcmpgtq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
21883     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
21884   %}
21885   ins_pipe( pipe_slow );
21886 %}
21887 
21888 instruct vcmple2L(vecX dst, vecX src1, vecX src2, rRegL scratch) %{
21889   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
21890             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
21891             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
21892   match(Set dst (VectorMaskCmp src1 src2));
21893   effect(TEMP scratch);
21894   format %{ "vpcmpgtq  $dst,$src1,$src2\n\t"
21895             "vpxor $dst,$dst,0xFFFFFFFF\t! cmple packed2L" %}
21896   ins_encode %{
21897     int vector_len = 0;
21898     __ vpcmpgtq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
21899     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
21900   %}
21901   ins_pipe( pipe_slow );
21902 %}
21903 
21904 instruct vcmple4L(vecY dst, vecY src1, vecY src2, rRegL scratch) %{
21905   predicate(UseAVX > 1 && n->as_Vector()->length() == 4 &&
21906             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
21907             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
21908   match(Set dst (VectorMaskCmp src1 src2));
21909   effect(TEMP scratch);
21910   format %{ "vpcmpgtq  $dst,$src1,$src2\n\t"
21911             "vpxor $dst,$dst,0xFFFFFFFF\t! cmple packed4L" %}
21912   ins_encode %{
21913     int vector_len = 1;
21914     __ vpcmpgtq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
21915     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
21916   %}
21917   ins_pipe( pipe_slow );
21918 %}
21919 
21920 instruct vcmple8L(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
21921   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 &&
21922             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
21923             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
21924   match(Set dst (VectorMaskCmp src1 src2));
21925   effect(TEMP dst, TEMP scratch);
21926   format %{ "vpcmpleq  k2,$src1,$src2\n\t"
21927             "vmovdqu64   $dst, k2{z}, 0xFFFFFFFFFF \t! cmple packed8L" %}
21928   ins_encode %{
21929     int vector_len = 2;
21930     Assembler::ComparisonPredicate cmp = Assembler::le;
21931     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
21932     KRegister mask = k0; // The comparison itself is not being masked.
21933     __ evpcmpq(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
21934     __ evmovdquq($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
21935   %}
21936   ins_pipe( pipe_slow );
21937 %}
21938 
21939 instruct vcmpne1L(vecD dst, vecD src1, vecD src2, rRegL scratch) %{
21940   predicate(UseAVX > 0 && n->as_Vector()->length() == 1 &&
21941             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
21942             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
21943   match(Set dst (VectorMaskCmp src1 src2));
21944   effect(TEMP scratch);
21945   format %{ "vpcmpeqq  $dst,$src1,$src2\n\t"
21946             "vpxor $dst,$dst,0xFFFFFFFF\t! cmpneq packed1L" %}
21947   ins_encode %{
21948     int vector_len = 0;
21949     __ vpcmpeqq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
21950     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
21951   %}
21952   ins_pipe( pipe_slow );
21953 %}
21954 
21955 instruct vcmpne2L(vecX dst, vecX src1, vecX src2, rRegL scratch) %{
21956   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
21957             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
21958             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
21959   match(Set dst (VectorMaskCmp src1 src2));
21960   effect(TEMP scratch);
21961   format %{ "vpcmpeqq  $dst,$src1,$src2\n\t"
21962             "vpxor $dst,$dst,0xFFFFFFFF\t! cmpneq packed2L" %}
21963   ins_encode %{
21964     int vector_len = 0;
21965     __ vpcmpeqq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
21966     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
21967   %}
21968   ins_pipe( pipe_slow );
21969 %}
21970 
21971 instruct vcmpne4L(vecY dst, vecY src1, vecY src2, rRegL scratch) %{
21972   predicate(UseAVX > 1 && n->as_Vector()->length() == 4 &&
21973             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
21974             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
21975   match(Set dst (VectorMaskCmp src1 src2));
21976   effect(TEMP scratch);
21977   format %{ "vpcmpeqq  $dst,$src1,$src2\n\t"
21978             "vpxor $dst,$dst,0xFFFFFFFF\t! cmpneq packed4L" %}
21979   ins_encode %{
21980     int vector_len = 1;
21981     __ vpcmpeqq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
21982     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
21983   %}
21984   ins_pipe( pipe_slow );
21985 %}
21986 
21987 instruct vcmpne8L(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
21988   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 &&
21989             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
21990             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
21991   match(Set dst (VectorMaskCmp src1 src2));
21992   effect(TEMP dst, TEMP scratch);
21993   format %{ "vpcmpneqq  k2,$src1,$src2\n\t"
21994             "vmovdqu64   $dst, k2{z}, 0xFFFFFFFFFF \t! cmpneq packed8L" %}
21995   ins_encode %{
21996     int vector_len = 2;
21997     Assembler::ComparisonPredicate cmp = Assembler::neq;
21998     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
21999     KRegister mask = k0; // The comparison itself is not being masked.
22000     __ evpcmpq(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
22001     __ evmovdquq($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
22002   %}
22003   ins_pipe( pipe_slow );
22004 %}
22005 
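      // Vector blends.  The legacy SSE4.1 blendvps encoding takes its mask
      // implicitly in xmm0, so the no-AVX rules below reserve xmm0 (rxmm0)
      // and copy the mask there first when it lives in another register.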
22006 instruct blendvps2F(vecD dst, vecD src, vecD mask, rxmm0 xmm_0) %{
22007   predicate(UseAVX == 0 && UseSSE > 3 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
22008   match(Set dst (VectorBlend (Binary dst src) mask));
22009   effect(TEMP xmm_0);
22010   format %{ "blendvps  $dst,$src,$mask\t! packed2F" %}
22011   ins_encode %{
22012     if ($mask$$XMMRegister != $xmm_0$$XMMRegister) {
22013       __ movdqu($xmm_0$$XMMRegister, $mask$$XMMRegister);
22014     }
22015     __ blendvps($dst$$XMMRegister, $src$$XMMRegister);
22016   %}
22017   ins_pipe( pipe_slow );
22018 %}
22019 
22020 instruct vblendvps2F(vecD dst, vecD src1, vecD src2, vecD mask) %{
22021   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
22022   match(Set dst (VectorBlend (Binary src1 src2) mask));
22023   format %{ "vblendvps  $dst,$src1,$src2,$mask\t! packed2F" %}
22024   ins_encode %{
22025     int vector_len = 0;
22026     __ vblendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vector_len);
22027   %}
22028   ins_pipe( pipe_slow );
22029 %}
22030 
22031 instruct blendvps4F(vecX dst, vecX src, vecX mask, rxmm0 xmm_0) %{
22032   predicate(UseAVX == 0 && UseSSE > 3 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
22033   match(Set dst (VectorBlend (Binary dst src) mask));
22034   effect(TEMP xmm_0);
22035   format %{ "blendvps  $dst,$src,$mask\t! packed4F" %}
22036   ins_encode %{
22037     if ($mask$$XMMRegister != $xmm_0$$XMMRegister) {
22038       __ movdqu($xmm_0$$XMMRegister, $mask$$XMMRegister);
22039     }
22040     __ blendvps($dst$$XMMRegister, $src$$XMMRegister);
22041   %}
22042   ins_pipe( pipe_slow );
22043 %}
22044 
22045 instruct vblendvps4F(vecX dst, vecX src1, vecX src2, vecX mask) %{
22046   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
22047   match(Set dst (VectorBlend (Binary src1 src2) mask));
22048   format %{ "vblendvps  $dst,$src1,$src2,$mask\t! packed4F" %}
22049   ins_encode %{
22050     int vector_len = 0;
22051     __ vblendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vector_len);
22052   %}
22053   ins_pipe( pipe_slow );
22054 %}
22055 
22056 instruct vblendvps8F(vecY dst, vecY src1, vecY src2, vecY mask) %{
22057   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
22058   match(Set dst (VectorBlend (Binary src1 src2) mask));
22059   format %{ "vblendvps  $dst,$src1,$src2,$mask\t! packed8F" %}
22060   ins_encode %{
22061     int vector_len = 1;
22062     __ vblendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vector_len);
22063   %}
22064   ins_pipe( pipe_slow );
22065 %}
22066 
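      // EVEX blends take the mask in a k register: the vector mask is first
      // converted by comparing it against all-ones (evpcmp* into k2), then
      // the blend proper is done with the merge-masked vblendm*/vpblendm*
      // forms.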
22067 instruct vblendvps16F(vecZ dst, vecZ src1, vecZ src2, vecZ mask, rRegL scratch) %{
22068   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
22069   match(Set dst (VectorBlend (Binary src1 src2) mask));
22070   effect(TEMP scratch);
22071   format %{ "vpcmpeqd  k2,$mask,0xFFFFFFFF\n\t"
22072            "vblendmps $dst,k2,$src1,$src2\t! blend packed16F " %}
22073   ins_encode %{
22074     int vector_len = 2;
22075     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
22076     __ evpcmpeqd(ktmp, k0, $mask$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
22077     __ evblendmps($dst$$XMMRegister, ktmp, $src1$$XMMRegister, $src2$$XMMRegister, true, vector_len);
22078   %}
22079   ins_pipe( pipe_slow );
22080 %}
22081 
22082 instruct vblendvpd8D(vecZ dst, vecZ src1, vecZ src2, vecZ mask, rRegL scratch) %{
22083   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
22084   match(Set dst (VectorBlend (Binary src1 src2) mask));
22085   effect(TEMP scratch);
22086   format %{ "vpcmpeqq  k2,$mask,0xFFFFFFFF\n\t"
22087            "vblendmpd $dst,k2,$src1,$src2\t! blend packed8D " %}
22088   ins_encode %{
22089     int vector_len = 2;
22090     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
22091     __ evpcmpq(ktmp, k0, $mask$$XMMRegister, ExternalAddress(vector_all_bits_set()), Assembler::eq, vector_len, $scratch$$Register);
22092     __ evblendmpd($dst$$XMMRegister, ktmp, $src1$$XMMRegister, $src2$$XMMRegister, true, vector_len);
22093   %}
22094   ins_pipe( pipe_slow );
22095 %}
22096 
22097 instruct vpblendmb64B(vecZ dst, vecZ src1, vecZ src2, vecZ mask, rRegL scratch) %{
22098   predicate(UseAVX > 2 && n->as_Vector()->length() == 64 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE && VM_Version::supports_avx512bw());
22099   match(Set dst (VectorBlend (Binary src1 src2) mask));
22100   effect(TEMP scratch);
22101   format %{ "vpcmpeqb  k2,$mask,0xFFFFFFFF\n\t"
22102            "vpblendmb $dst,k2,$src1,$src2\t! blend packed64B " %}
22103   ins_encode %{
22104     int vector_len = 2;
22105     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
22106     __ evpcmpb(ktmp, k0, $mask$$XMMRegister, ExternalAddress(vector_all_bits_set()), Assembler::eq, vector_len, $scratch$$Register);
22107     __ evpblendmb($dst$$XMMRegister, ktmp, $src1$$XMMRegister, $src2$$XMMRegister, true, vector_len);
22108   %}
22109   ins_pipe( pipe_slow );
22110 %}
22111 
22112 instruct vpblendmw32S(vecZ dst, vecZ src1, vecZ src2, vecZ mask, rRegL scratch) %{
22113   predicate(UseAVX > 2 && n->as_Vector()->length() == 32 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT && VM_Version::supports_avx512bw());
22114   match(Set dst (VectorBlend (Binary src1 src2) mask));
22115   effect(TEMP scratch);
22116   format %{ "vpcmpeqw  k2,$mask,0xFFFFFFFF\n\t"
22117            "vpblendmw $dst,k2,$src1,$src2\t! blend packed32S " %}
22118   ins_encode %{
22119     int vector_len = 2;
22120     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
22121     __ evpcmpw(ktmp, k0, $mask$$XMMRegister, ExternalAddress(vector_all_bits_set()), Assembler::eq, vector_len, $scratch$$Register);
22122     __ evpblendmw($dst$$XMMRegister, ktmp, $src1$$XMMRegister, $src2$$XMMRegister, true, vector_len);
22123   %}
22124   ins_pipe( pipe_slow );
22125 %}
22126 
22127 instruct vpblendmd16I(vecZ dst, vecZ src1, vecZ src2, vecZ mask, rRegL scratch) %{
22128   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
22129   match(Set dst (VectorBlend (Binary src1 src2) mask));
22130   effect(TEMP scratch);
22131   format %{ "vpcmpeqd  k2,$mask,0xFFFFFFFF\n\t"
22132            "vpblendmd $dst,k2,$src1,$src2\t! blend packed16I " %}
22133   ins_encode %{
22134     int vector_len = 2;
22135     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
22136     __ evpcmpd(ktmp, k0, $mask$$XMMRegister, ExternalAddress(vector_all_bits_set()), Assembler::eq, vector_len, $scratch$$Register);
22137     __ evpblendmd($dst$$XMMRegister, ktmp, $src1$$XMMRegister, $src2$$XMMRegister, true, vector_len);
22138   %}
22139   ins_pipe( pipe_slow );
22140 %}
22141 
22142 instruct vpblendmq8L(vecZ dst, vecZ src1, vecZ src2, vecZ mask, rRegL scratch) %{
22143   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
22144   match(Set dst (VectorBlend (Binary src1 src2) mask));
22145   effect(TEMP scratch);
22146   format %{ "vpcmpeqq  k2,$mask,0xFFFFFFFF\n\t"
22147            "vpblendmq $dst,k2,$src1,$src2\t! blend packed8L " %}
22148   ins_encode %{
22149     int vector_len = 2;
22150     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
22151     __ evpcmpq(ktmp, k0, $mask$$XMMRegister, ExternalAddress(vector_all_bits_set()), Assembler::eq, vector_len, $scratch$$Register);
22152     __ evpblendmq($dst$$XMMRegister, ktmp, $src1$$XMMRegister, $src2$$XMMRegister, true, vector_len);
22153   %}
22154   ins_pipe( pipe_slow );
22155 %}
22156 
22157 
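// The integer blends below use (v)pblendvb regardless of element size:
// VectorBlend masks are all-ones or all-zeros per lane, so a byte-wise
// select produces the same result for 16-, 32- and 64-bit lanes.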
22158 instruct pblendvb2I(vecD dst, vecD src, vecD mask, rxmm0 xmm_0) %{
22159   predicate(UseAVX == 0 && UseSSE > 3 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
22160   match(Set dst (VectorBlend (Binary dst src) mask));
22161   effect(TEMP xmm_0);
  format %{ "pblendvb  $dst,$src,$mask\t! blend packed2I" %}
22163   ins_encode %{
22164     if ($mask$$XMMRegister != $xmm_0$$XMMRegister) {
22165       __ movdqu($xmm_0$$XMMRegister, $mask$$XMMRegister);
22166     }
22167     __ pblendvb($dst$$XMMRegister, $src$$XMMRegister);
22168   %}
22169   ins_pipe( pipe_slow );
22170 %}
22171 
22172 instruct vpblendvb2I(vecD dst, vecD src1, vecD src2, vecD mask) %{
22173   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
22174   match(Set dst (VectorBlend (Binary src1 src2) mask));
22175   format %{ "vpblendvb  $dst,$src1,$src2,$mask\t! blend packed2I" %}
22176   ins_encode %{
22177     int vector_len = 0;
22178     __ vpblendvb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vector_len);
22179   %}
22180   ins_pipe( pipe_slow );
22181 %}
22182 
22183 instruct pblendvb4I(vecX dst, vecX src, vecX mask, rxmm0 xmm_0) %{
22184   predicate(UseAVX == 0 && UseSSE > 3 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
22185   match(Set dst (VectorBlend (Binary dst src) mask));
22186   effect(TEMP xmm_0);
  format %{ "pblendvb  $dst,$src,$mask\t! blend packed4I" %}
22188   ins_encode %{
22189     if ($mask$$XMMRegister != $xmm_0$$XMMRegister) {
22190       __ movdqu($xmm_0$$XMMRegister, $mask$$XMMRegister);
22191     }
22192     __ pblendvb($dst$$XMMRegister, $src$$XMMRegister);
22193   %}
22194   ins_pipe( pipe_slow );
22195 %}
22196 
22197 instruct vpblendvb4I(vecX dst, vecX src1, vecX src2, vecX mask) %{
22198   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
22199   match(Set dst (VectorBlend (Binary src1 src2) mask));
22200   format %{ "vpblendvb  $dst,$src1,$src2,$mask\t! blend packed4I" %}
22201   ins_encode %{
22202     int vector_len = 0;
22203     __ vpblendvb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vector_len);
22204   %}
22205   ins_pipe( pipe_slow );
22206 %}
22207 
22208 instruct vpblendvb8I(vecY dst, vecY src1, vecY src2, vecY mask) %{
22209   predicate(UseAVX >= 2 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
22210   match(Set dst (VectorBlend (Binary src1 src2) mask));
22211   format %{ "vpblendvb  $dst,$src1,$src2,$mask\t! blend packed8I" %}
22212   ins_encode %{
22213     int vector_len = 1;
22214     __ vpblendvb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vector_len);
22215   %}
22216   ins_pipe( pipe_slow );
22217 %}
22218 
22219 instruct pblendvb8B(vecD dst, vecD src, vecD mask, rxmm0 xmm_0) %{
22220   predicate(UseAVX == 0 && UseSSE > 3 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
22221   match(Set dst (VectorBlend (Binary dst src) mask));
22222   effect(TEMP xmm_0);
22223   format %{ "pblendvb  $dst,$src,$mask\t! blend packed8B" %}
22224   ins_encode %{
22225     if ($mask$$XMMRegister != $xmm_0$$XMMRegister) {
22226       __ movdqu($xmm_0$$XMMRegister, $mask$$XMMRegister);
22227     }
22228     __ pblendvb($dst$$XMMRegister, $src$$XMMRegister);
22229   %}
22230   ins_pipe( pipe_slow );
22231 %}
22232 
22233 instruct vpblendvb8B(vecD dst, vecD src1, vecD src2, vecD mask) %{
22234   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
22235   match(Set dst (VectorBlend (Binary src1 src2) mask));
22236   format %{ "vpblendvb  $dst,$src1,$src2,$mask\t! blend packed8B" %}
22237   ins_encode %{
22238     int vector_len = 0;
22239     __ vpblendvb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vector_len);
22240   %}
22241   ins_pipe( pipe_slow );
22242 %}
22243 
22244 instruct pblendvb16B(vecX dst, vecX src, vecX mask, rxmm0 xmm_0) %{
22245   predicate(UseAVX == 0 && UseSSE > 3 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
22246   match(Set dst (VectorBlend (Binary dst src) mask));
22247   effect(TEMP xmm_0);
22248   format %{ "pblendvb  $dst,$src,$mask\t! blend packed16B" %}
22249   ins_encode %{
22250     if ($mask$$XMMRegister != $xmm_0$$XMMRegister) {
22251       __ movdqu($xmm_0$$XMMRegister, $mask$$XMMRegister);
22252     }
22253     __ pblendvb($dst$$XMMRegister, $src$$XMMRegister);
22254   %}
22255   ins_pipe( pipe_slow );
22256 %}
22257 
22258 instruct vpblendvb16B(vecX dst, vecX src1, vecX src2, vecX mask) %{
22259   predicate(UseAVX > 0 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
22260   match(Set dst (VectorBlend (Binary src1 src2) mask));
22261   format %{ "vpblendvb  $dst,$src1,$src2,$mask\t! blend packed16B" %}
22262   ins_encode %{
22263     int vector_len = 0;
22264     __ vpblendvb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vector_len);
22265   %}
22266   ins_pipe( pipe_slow );
22267 %}
22268 
22269 instruct vpblendvb32B(vecY dst, vecY src1, vecY src2, vecY mask) %{
22270   predicate(UseAVX >= 2 && n->as_Vector()->length() == 32 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
22271   match(Set dst (VectorBlend (Binary src1 src2) mask));
22272   format %{ "vpblendvb  $dst,$src1,$src2,$mask\t! blend packed32B" %}
22273   ins_encode %{
22274     int vector_len = 1;
22275     __ vpblendvb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vector_len);
22276   %}
22277   ins_pipe( pipe_slow );
22278 %}
22279 
22280 instruct pblendvb4S(vecD dst, vecD src, vecD mask, rxmm0 xmm_0) %{
22281   predicate(UseAVX == 0 && UseSSE > 3 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
22282   match(Set dst (VectorBlend (Binary dst src) mask));
22283   effect(TEMP xmm_0);
22284   format %{ "pblendvb  $dst,$src,$mask\t! blend packed4S" %}
22285   ins_encode %{
22286     if ($mask$$XMMRegister != $xmm_0$$XMMRegister) {
22287       __ movdqu($xmm_0$$XMMRegister, $mask$$XMMRegister);
22288     }
22289     __ pblendvb($dst$$XMMRegister, $src$$XMMRegister);
22290   %}
22291   ins_pipe( pipe_slow );
22292 %}
22293 
22294 instruct vpblendvb4S(vecD dst, vecD src1, vecD src2, vecD mask) %{
22295   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
22296   match(Set dst (VectorBlend (Binary src1 src2) mask));
22297   format %{ "vpblendvb  $dst,$src1,$src2,$mask\t! blend packed4S" %}
22298   ins_encode %{
22299     int vector_len = 0;
22300     __ vpblendvb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vector_len);
22301   %}
22302   ins_pipe( pipe_slow );
22303 %}
22304 
22305 instruct pblendvb8S(vecX dst, vecX src, vecX mask, rxmm0 xmm_0) %{
22306   predicate(UseAVX == 0 && UseSSE > 3 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
22307   match(Set dst (VectorBlend (Binary dst src) mask));
22308   effect(TEMP xmm_0);
22309   format %{ "pblendvb  $dst,$src,$mask\t! blend packed8S" %}
22310   ins_encode %{
22311     if ($mask$$XMMRegister != $xmm_0$$XMMRegister) {
22312       __ movdqu($xmm_0$$XMMRegister, $mask$$XMMRegister);
22313     }
22314     __ pblendvb($dst$$XMMRegister, $src$$XMMRegister);
22315   %}
22316   ins_pipe( pipe_slow );
22317 %}
22318 
22319 instruct vpblendvb8S(vecX dst, vecX src1, vecX src2, vecX mask) %{
22320   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
22321   match(Set dst (VectorBlend (Binary src1 src2) mask));
22322   format %{ "vpblendvb  $dst,$src1,$src2,$mask\t! blend packed8S" %}
22323   ins_encode %{
22324     int vector_len = 0;
22325     __ vpblendvb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vector_len);
22326   %}
22327   ins_pipe( pipe_slow );
22328 %}
22329 
22330 instruct vpblendvb16S(vecY dst, vecY src1, vecY src2, vecY mask) %{
22331   predicate(UseAVX >= 2 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
22332   match(Set dst (VectorBlend (Binary src1 src2) mask));
22333   format %{ "vpblendvb  $dst,$src1,$src2,$mask\t! blend packed16S" %}
22334   ins_encode %{
22335     int vector_len = 1;
22336     __ vpblendvb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vector_len);
22337   %}
22338   ins_pipe( pipe_slow );
22339 %}
22340 
22341 instruct pblendvb1L(vecD dst, vecD src, vecD mask, rxmm0 xmm_0) %{
22342   predicate(UseAVX == 0 && UseSSE > 3 && n->as_Vector()->length() == 1 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
22343   match(Set dst (VectorBlend (Binary dst src) mask));
22344   effect(TEMP xmm_0);
22345   format %{ "pblendvb  $dst,$src,$mask\t! blend packed1L" %}
22346   ins_encode %{
22347     if ($mask$$XMMRegister != $xmm_0$$XMMRegister) {
22348       __ movdqu($xmm_0$$XMMRegister, $mask$$XMMRegister);
22349     }
22350     __ pblendvb($dst$$XMMRegister, $src$$XMMRegister);
22351   %}
22352   ins_pipe( pipe_slow );
22353 %}
22354 
22355 instruct vpblendvb1L(vecD dst, vecD src1, vecD src2, vecD mask) %{
22356   predicate(UseAVX > 0 && n->as_Vector()->length() == 1 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
22357   match(Set dst (VectorBlend (Binary src1 src2) mask));
22358   format %{ "vpblendvb  $dst,$src1,$src2,$mask\t! blend packed1L" %}
22359   ins_encode %{
22360     int vector_len = 0;
22361     __ vpblendvb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vector_len);
22362   %}
22363   ins_pipe( pipe_slow );
22364 %}
22365 
22366 instruct pblendvb2L(vecX dst, vecX src, vecX mask, rxmm0 xmm_0) %{
22367   predicate(UseAVX == 0 && UseSSE > 3 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
22368   match(Set dst (VectorBlend (Binary dst src) mask));
22369   effect(TEMP xmm_0);
22370   format %{ "pblendvb  $dst,$src,$mask\t! blend packed2L" %}
22371   ins_encode %{
22372     if ($mask$$XMMRegister != $xmm_0$$XMMRegister) {
22373       __ movdqu($xmm_0$$XMMRegister, $mask$$XMMRegister);
22374     }
22375     __ pblendvb($dst$$XMMRegister, $src$$XMMRegister);
22376   %}
22377   ins_pipe( pipe_slow );
22378 %}
22379 
22380 instruct vpblendvb2L(vecX dst, vecX src1, vecX src2, vecX mask) %{
22381   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
22382   match(Set dst (VectorBlend (Binary src1 src2) mask));
22383   format %{ "vpblendvb  $dst,$src1,$src2,$mask\t! blend packed2L" %}
22384   ins_encode %{
22385     int vector_len = 0;
22386     __ vpblendvb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vector_len);
22387   %}
22388   ins_pipe( pipe_slow );
22389 %}
22390 
22391 instruct vpblendvb4L(vecY dst, vecY src1, vecY src2, vecY mask) %{
22392   predicate(UseAVX >= 2 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
22393   match(Set dst (VectorBlend (Binary src1 src2) mask));
22394   format %{ "vpblendvb  $dst,$src1,$src2,$mask\t! blend packed4L" %}
22395   ins_encode %{
22396     int vector_len = 1;
22397     __ vpblendvb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vector_len);
22398   %}
22399   ins_pipe( pipe_slow );
22400 %}
22401 
22402 instruct blendvpd1D(vecD dst, vecD src, vecD mask, rxmm0 xmm_0) %{
22403   predicate(UseAVX == 0 && UseSSE > 3 && n->as_Vector()->length() == 1 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
22404   match(Set dst (VectorBlend (Binary dst src) mask));
22405   effect(TEMP xmm_0);
22406   format %{ "blendvpd  $dst,$src,$mask\t! packed1D" %}
22407   ins_encode %{
22408     if ($mask$$XMMRegister != $xmm_0$$XMMRegister) {
22409       __ movdqu($xmm_0$$XMMRegister, $mask$$XMMRegister);
22410     }
22411     __ blendvpd($dst$$XMMRegister, $src$$XMMRegister);
22412   %}
22413   ins_pipe( pipe_slow );
22414 %}
22415 
22416 instruct vblendvpd1D(vecD dst, vecD src1, vecD src2, vecD mask) %{
22417   predicate(UseAVX > 0 && n->as_Vector()->length() == 1 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
22418   match(Set dst (VectorBlend (Binary src1 src2) mask));
22419   format %{ "vblendvpd  $dst,$src1,$src2,$mask\t! packed1D" %}
22420   ins_encode %{
22421     int vector_len = 0;
22422     __ vblendvpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vector_len);
22423   %}
22424   ins_pipe( pipe_slow );
22425 %}
22426 
22427 instruct blendvpd2D(vecX dst, vecX src, vecX mask, rxmm0 xmm_0) %{
22428   predicate(UseAVX == 0 && UseSSE > 3 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
22429   match(Set dst (VectorBlend (Binary dst src) mask));
22430   effect(TEMP xmm_0);
22431   format %{ "blendvpd  $dst,$src,$mask\t! packed2D" %}
22432   ins_encode %{
22433     if ($mask$$XMMRegister != $xmm_0$$XMMRegister) {
22434       __ movdqu($xmm_0$$XMMRegister, $mask$$XMMRegister);
22435     }
22436     __ blendvpd($dst$$XMMRegister, $src$$XMMRegister);
22437   %}
22438   ins_pipe( pipe_slow );
22439 %}
22440 
22441 instruct vblendvpd2D(vecX dst, vecX src1, vecX src2, vecX mask) %{
22442   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
22443   match(Set dst (VectorBlend (Binary src1 src2) mask));
22444   format %{ "vblendvpd  $dst,$src1,$src2,$mask\t! packed2D" %}
22445   ins_encode %{
22446     int vector_len = 0;
22447     __ vblendvpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vector_len);
22448   %}
22449   ins_pipe( pipe_slow );
22450 %}
22451 
22452 instruct vblendvpd4D(vecY dst, vecY src1, vecY src2, vecY mask) %{
22453   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
22454   match(Set dst (VectorBlend (Binary src1 src2) mask));
22455   format %{ "vblendvpd  $dst,$src1,$src2,$mask\t! packed4D" %}
22456   ins_encode %{
22457     int vector_len = 1;
22458     __ vblendvpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vector_len);
22459   %}
22460   ins_pipe( pipe_slow );
22461 %}
22462 
22463 // --------------------------------- NEG --------------------------------------
22464 // a = -a
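// SSE/AVX have no packed integer negate, so NegVI is lowered as a
// subtraction from zero, e.g. for four ints (illustrative):
//
//   pxor  $dst, $dst           // dst = 0
//   psubd $dst, $src           // dst = 0 - src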
22465 instruct vneg2I_reg(vecD dst, vecD src) %{
22466   predicate(UseSSE > 1 && n->as_Vector()->length() == 2);
22467   match(Set dst (NegVI  src));
22468   effect(TEMP dst);
22469   format %{ "pxor   $dst,$dst\n\t"
22470             "psubd  $dst, $src\t! neg packed2I" %}
22471   ins_cost(150);
22472   ins_encode %{
22473     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
22474     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
22475   %}
22476   ins_pipe( pipe_slow );
22477 %}
22478 
22479 instruct vneg4I_reg(vecX dst, vecX src) %{
22480   predicate(UseSSE > 1 && n->as_Vector()->length() == 4);
22481   match(Set dst (NegVI  src));
22482   effect(TEMP dst);
22483   format %{ "pxor   $dst,$dst\n\t"
22484             "psubd  $dst, $src\t! neg packed4I" %}
22485   ins_cost(150);
22486   ins_encode %{
22487     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
22488     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
22489   %}
22490   ins_pipe( pipe_slow );
22491 %}
22492 
22493 instruct vneg8I_reg(vecY dst, vecY src, vecY tmp) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
22495   match(Set dst (NegVI  src));
22496   effect(TEMP tmp);
22497   format %{ "vpxor   $tmp,$tmp,$tmp\n\t"
22498             "vpsubd  $dst,$tmp,$src\t! neg packed8I" %}
22499   ins_cost(150);
22500   ins_encode %{
22501     int vector_len = 1;
22502     __ vpxor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
22503     __ vpsubd($dst$$XMMRegister, $tmp$$XMMRegister, $src$$XMMRegister, vector_len);
22504   %}
22505   ins_pipe( pipe_slow );
22506 %}
22507 
22508 instruct vneg16I_reg(vecZ dst, vecZ src, vecZ tmp) %{
22509   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
22510   match(Set dst (NegVI  src));
22511   effect(TEMP tmp);
22512   format %{ "vpxor   $tmp,$tmp,$tmp\n\t"
22513             "vpsubd  $dst,$tmp,$src\t! neg packed16I" %}
22514   ins_cost(150);
22515   ins_encode %{
22516     int vector_len = 2;
22517     __ vpxor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
22518     __ vpsubd($dst$$XMMRegister, $tmp$$XMMRegister, $src$$XMMRegister, vector_len);
22519   %}
22520   ins_pipe( pipe_slow );
22521 %}
22522 
22523 instruct vneg1D(regD dst) %{
22524   predicate((UseSSE>=2) && (UseAVX == 0));
22525   match(Set dst (NegVD dst));
22526   ins_cost(150);
22527   format %{ "xorpd $dst,[0x8000000000000000] \t# $dst = -$dst neg packed1D" %}
22528   ins_encode %{
22529     __ xorpd($dst$$XMMRegister, ExternalAddress(double_signflip()));
22530   %}
22531   ins_pipe(pipe_slow);
22532 %}
22533 
22534 instruct vneg1D_reg(vecX dst, vecX src) %{
22535   predicate(UseAVX > 0 && n->as_Vector()->length() == 1);
22536   match(Set dst (NegVD  src));
22537   format %{ "vxorpd $dst,$src\t# $dst = -$src neg packed1D" %}
22538   ins_cost(150);
22539   ins_encode %{
22540     __ vnegatesd($dst$$XMMRegister, $src$$XMMRegister,
22541                  ExternalAddress(double_signflip()));
22542   %}
22543   ins_pipe( pipe_slow );
22544 %}
22545 
22546 instruct vneg2D_reg(vecX dst) %{
22547   predicate((UseSSE>=2));
22548   match(Set dst (NegVD dst));
22549   ins_cost(150);
22550   format %{ "xorpd $dst,[0x8000000000000000]\t# $dst = -$dst neg packed2D" %}
22551   ins_encode %{
22552     __ xorpd($dst$$XMMRegister, ExternalAddress(vector_double_signflip()));
22553   %}
22554   ins_pipe(pipe_slow);
22555 %}
22556 
22557 
22558 instruct vneg4D_reg(vecY dst, vecY src) %{
22559   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
22560   match(Set dst (NegVD  src));
22561   format %{ "vxorpd $dst,$src\t# $dst = -$src neg packed4D" %}
22562   ins_cost(150);
22563   ins_encode %{
22564     int vector_len = 1;
22565     __ vxorpd($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_double_signflip()), vector_len);
22566   %}
22567   ins_pipe( pipe_slow );
22568 %}
22569 
22570 instruct vneg8D_reg(vecZ dst, vecZ src) %{
22571   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
22572   match(Set dst (NegVD  src));
22573   format %{ "vxorpd $dst,$src\t# $dst = -$src neg packed8D" %}
22574   ins_cost(150);
22575   ins_encode %{
22576     int vector_len = 2;
22577     __ vxorpd($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_double_signflip()), vector_len);
22578   %}
22579   ins_pipe( pipe_slow );
22580 %}
22581 
22582 instruct vneg2F_reg(vecD dst) %{
22583   predicate(UseSSE > 0 && n->as_Vector()->length() == 2);
22584   match(Set dst (NegVF dst));
22585   format %{ "xorps $dst,[0x80000000]\t# $dst = -$dst neg packed2F" %}
22586   ins_cost(150);
22587   ins_encode %{
22588     __ xorps($dst$$XMMRegister, ExternalAddress(vector_float_signflip()));
22589   %}
22590   ins_pipe( pipe_slow );
22591 %}
22592 
22593 instruct vneg4F_reg(vecX dst) %{
22594   predicate(UseSSE > 0 && n->as_Vector()->length() == 4);
22595   match(Set dst (NegVF dst));
22596   format %{ "xorps $dst,[0x80000000]\t# $dst = -$dst neg packed4F" %}
22597   ins_cost(150);
22598   ins_encode %{
22599     __ xorps($dst$$XMMRegister, ExternalAddress(vector_float_signflip()));
22600   %}
22601   ins_pipe( pipe_slow );
22602 %}
22603 
22604 instruct vneg8F_reg(vecY dst, vecY src) %{
22605   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
22606   match(Set dst (NegVF  src));
22607   format %{ "vxorps $dst,$src\t# $dst = -$src neg packed8F" %}
22608   ins_cost(150);
22609   ins_encode %{
22610     int vector_len = 1;
22611     __ vxorps($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_float_signflip()), vector_len);
22612   %}
22613   ins_pipe( pipe_slow );
22614 %}
22615 
22616 instruct vneg16F_reg(vecZ dst, vecZ src) %{
22617   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
22618   match(Set dst (NegVF  src));
22619   format %{ "vxorps $dst,$src\t# $dst = -$src neg packed16F" %}
22620   ins_cost(150);
22621   ins_encode %{
22622     int vector_len = 2;
22623     __ vxorps($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_float_signflip()), vector_len);
22624   %}
22625   ins_pipe( pipe_slow );
22626 %}
22627 
22628 // --------------------------------- ABS --------------------------------------
22629 // a = |a|
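// Integer abs maps directly onto the SSSE3/AVX pabs* family, while
// floating-point abs clears the sign bit by ANDing with a sign-mask
// constant, e.g. for floats (illustrative):
//
//   andps $dst, [vector_float_signmask]   // clear the sign bit of each lane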
22630 instruct vabs8B_reg(vecD dst, vecD src) %{
22631   predicate(UseSSE > 2 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
22632   match(Set dst (AbsV  src));
22633   format %{ "pabsb $dst,$src\t# $dst = |$src| abs packed8B" %}
22634   ins_cost(150);
22635   ins_encode %{
22636     __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
22637   %}
22638   ins_pipe( pipe_slow );
22639 %}
22640 
22641 instruct vabs16B_reg(vecX dst, vecX src) %{
22642   predicate(UseSSE > 2 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
22643   match(Set dst (AbsV  src));
22644   format %{ "pabsb $dst,$src\t# $dst = |$src| abs packed16B" %}
22645   ins_cost(150);
22646   ins_encode %{
22647     __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
22648   %}
22649   ins_pipe( pipe_slow );
22650 %}
22651 
22652 instruct vabs32B_reg(vecY dst, vecY src) %{
22653   predicate(UseAVX > 1 && n->as_Vector()->length() == 32 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
22654   match(Set dst (AbsV  src));
22655   format %{ "vpabsb $dst,$src\t# $dst = |$src| abs packed32B" %}
22656   ins_cost(150);
22657   ins_encode %{
22658     int vector_len = 1;
22659     __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, vector_len);
22660   %}
22661   ins_pipe( pipe_slow );
22662 %}
22663 
22664 instruct vabs64B_reg(vecZ dst, vecZ src) %{
22665   predicate(UseAVX > 2 && n->as_Vector()->length() == 64 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
22666   match(Set dst (AbsV  src));
22667   format %{ "evpabsb $dst,$src\t# $dst = |$src| abs packed64B" %}
22668   ins_cost(150);
22669   ins_encode %{
22670     int vector_len = 2;
22671     __ evpabsb($dst$$XMMRegister, $src$$XMMRegister, vector_len);
22672   %}
22673   ins_pipe( pipe_slow );
22674 %}
22675 
22676 instruct vabs4S_reg(vecD dst, vecD src) %{
22677   predicate(UseSSE > 2 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
22678   match(Set dst (AbsV  src));
22679   format %{ "pabsw $dst,$src\t# $dst = |$src| abs packed4S" %}
22680   ins_cost(150);
22681   ins_encode %{
22682     __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
22683   %}
22684   ins_pipe( pipe_slow );
22685 %}
22686 
22687 instruct vabs8S_reg(vecX dst, vecX src) %{
22688   predicate(UseSSE > 2 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
22689   match(Set dst (AbsV  src));
22690   format %{ "pabsw $dst,$src\t# $dst = |$src| abs packed8S" %}
22691   ins_cost(150);
22692   ins_encode %{
22693     __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
22694   %}
22695   ins_pipe( pipe_slow );
22696 %}
22697 
22698 instruct vabs16S_reg(vecY dst, vecY src) %{
22699   predicate(UseAVX > 1 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
22700   match(Set dst (AbsV  src));
22701   format %{ "vpabsw $dst,$src\t# $dst = |$src| abs packed16S" %}
22702   ins_cost(150);
22703   ins_encode %{
22704     int vector_len = 1;
22705     __ vpabsw($dst$$XMMRegister, $src$$XMMRegister, vector_len);
22706   %}
22707   ins_pipe( pipe_slow );
22708 %}
22709 
22710 instruct vabs32S_reg(vecZ dst, vecZ src) %{
22711   predicate(UseAVX > 2 && n->as_Vector()->length() == 32 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
22712   match(Set dst (AbsV  src));
22713   format %{ "evpabsw $dst,$src\t# $dst = |$src| abs packed32S" %}
22714   ins_cost(150);
22715   ins_encode %{
22716     int vector_len = 2;
22717     __ evpabsw($dst$$XMMRegister, $src$$XMMRegister, vector_len);
22718   %}
22719   ins_pipe( pipe_slow );
22720 %}
22721 
22722 instruct vabs2I_reg(vecD dst, vecD src) %{
22723   predicate(UseSSE > 2 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
22724   match(Set dst (AbsV  src));
22725   format %{ "pabsd $dst,$src\t# $dst = |$src| abs packed2I" %}
22726   ins_cost(150);
22727   ins_encode %{
22728     __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
22729   %}
22730   ins_pipe( pipe_slow );
22731 %}
22732 
22733 instruct vabs4I_reg(vecX dst, vecX src) %{
22734   predicate(UseSSE > 2 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
22735   match(Set dst (AbsV  src));
22736   format %{ "pabsd $dst,$src\t# $dst = |$src| abs packed4I" %}
22737   ins_cost(150);
22738   ins_encode %{
22739     __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
22740   %}
22741   ins_pipe( pipe_slow );
22742 %}
22743 
22744 instruct vabs8I_reg(vecY dst, vecY src) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
22746   match(Set dst (AbsV  src));
22747   format %{ "vpabsd $dst,$src\t# $dst = |$src| abs packed8I" %}
22748   ins_cost(150);
22749   ins_encode %{
22750     int vector_len = 1;
22751     __ vpabsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
22752   %}
22753   ins_pipe( pipe_slow );
22754 %}
22755 
22756 instruct vabs16I_reg(vecZ dst, vecZ src) %{
22757   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
22758   match(Set dst (AbsV  src));
22759   format %{ "evpabsd $dst,$src\t# $dst = |$src| abs packed16I" %}
22760   ins_cost(150);
22761   ins_encode %{
22762     int vector_len = 2;
22763     __ evpabsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
22764   %}
22765   ins_pipe( pipe_slow );
22766 %}
22767 
22768 instruct vabs2L_reg(vecX dst, vecX src) %{
22769   predicate(UseAVX > 2 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
22770   match(Set dst (AbsV  src));
22771   format %{ "evpabsq $dst,$src\t# $dst = |$src| abs packed2L" %}
22772   ins_cost(150);
22773   ins_encode %{
22774     int vector_len = 0;
22775     __ evpabsq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
22776   %}
22777   ins_pipe( pipe_slow );
22778 %}
22779 
22780 instruct vabs4L_reg(vecY dst, vecY src) %{
22781   predicate(UseAVX > 2 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
22782   match(Set dst (AbsV  src));
22783   format %{ "evpabsq $dst,$src\t# $dst = |$src| abs packed4L" %}
22784   ins_cost(150);
22785   ins_encode %{
22786     int vector_len = 1;
22787     __ evpabsq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
22788   %}
22789   ins_pipe( pipe_slow );
22790 %}
22791 
22792 instruct vabs8L_reg(vecZ dst, vecZ src) %{
22793   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
22794   match(Set dst (AbsV  src));
22795   format %{ "evpabsq $dst,$src\t# $dst = |$src| abs packed8L" %}
22796   ins_cost(150);
22797   ins_encode %{
22798     int vector_len = 2;
22799     __ evpabsq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
22800   %}
22801   ins_pipe( pipe_slow );
22802 %}
22803 
22804 instruct vabs1D_reg(vecD dst) %{
22805   predicate(UseSSE > 0 && n->as_Vector()->length() == 1);
22806   match(Set dst (AbsVD  dst));
22807   format %{ "andpd $dst,[0x7FFFFFFFFFFFFFFF]\t# $dst = |$dst| abs packed1D" %}
22808   ins_cost(150);
22809   ins_encode %{
22810     __ andpd($dst$$XMMRegister, ExternalAddress(vector_double_signmask()));
22811   %}
22812   ins_pipe( pipe_slow );
22813 %}
22814 
22815 instruct vabs2D_reg(vecX dst) %{
22816   predicate(UseSSE > 0 && n->as_Vector()->length() == 2);
22817   match(Set dst (AbsVD  dst));
22818   format %{ "andpd $dst,[0x7FFFFFFFFFFFFFFF]\t# $dst = |$dst| abs packed2D" %}
22819   ins_cost(150);
22820   ins_encode %{
22821     __ andpd($dst$$XMMRegister, ExternalAddress(vector_double_signmask()));
22822   %}
22823   ins_pipe( pipe_slow );
22824 %}
22825 
22826 instruct vabs4D_reg(vecY dst, vecY src) %{
22827   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
22828   match(Set dst (AbsVD  src));
22829   format %{ "vandpd $dst,$src\t# $dst = |$src| abs packed4D" %}
22830   ins_cost(150);
22831   ins_encode %{
22832     int vector_len = 1;
22833     __ vandpd($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_double_signmask()), vector_len);
22834   %}
22835   ins_pipe( pipe_slow );
22836 %}
22837 
22838 instruct vabs8D_reg(vecZ dst, vecZ src) %{
22839   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
22840   match(Set dst (AbsVD  src));
22841   format %{ "vandpd $dst,$src\t# $dst = |$src| abs packed8D" %}
22842   ins_cost(150);
22843   ins_encode %{
22844     int vector_len = 2;
22845     __ vandpd($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_double_signmask()), vector_len);
22846   %}
22847   ins_pipe( pipe_slow );
22848 %}
22849 
22850 instruct vabs2F_reg(vecD dst) %{
22851   predicate(UseSSE > 0 && n->as_Vector()->length() == 2);
22852   match(Set dst (AbsVF  dst));
22853   format %{ "andps $dst,[0x7FFFFFFF]\t# $dst = |$dst| abs packed2F" %}
22854   ins_cost(150);
22855   ins_encode %{
22856     __ andps($dst$$XMMRegister, ExternalAddress(vector_float_signmask()));
22857   %}
22858   ins_pipe( pipe_slow );
22859 %}
22860 
22861 instruct vabs4F_reg(vecX dst) %{
22862   predicate(UseSSE > 0 && n->as_Vector()->length() == 4);
22863   match(Set dst (AbsVF  dst));
  format %{ "andps $dst,[0x7FFFFFFF]\t# $dst = |$dst| abs packed4F" %}
22865   ins_cost(150);
22866   ins_encode %{
22867     __ andps($dst$$XMMRegister, ExternalAddress(vector_float_signmask()));
22868   %}
22869   ins_pipe( pipe_slow );
22870 %}
22871 
22872 instruct vabs8F_reg(vecY dst, vecY src) %{
22873   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
22874   match(Set dst (AbsVF  src));
22875   format %{ "vandps $dst,$src\t# $dst = |$src| abs packed8F" %}
22876   ins_cost(150);
22877   ins_encode %{
22878     int vector_len = 1;
22879     __ vandps($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_float_signmask()), vector_len);
22880   %}
22881   ins_pipe( pipe_slow );
22882 %}
22883 
22884 instruct vabs16F_reg(vecZ dst, vecZ src) %{
22885   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
22886   match(Set dst (AbsVF  src));
22887   format %{ "vandps $dst,$src\t# $dst = |$src| abs packed16F" %}
22888   ins_cost(150);
22889   ins_encode %{
22890     int vector_len = 2;
22891     __ vandps($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_float_signmask()), vector_len);
22892   %}
22893   ins_pipe( pipe_slow );
22894 %}
22895 
22896 //------------------------------------- NOT --------------------------------------------
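// NotV is lowered as an XOR with an all-ones constant. The SSE forms are
// destructive two-operand instructions, so the constant is loaded into
// $dst first and then xored with $src; the AVX forms xor $src with the
// constant directly from memory, with $scratch available to materialize
// the constant's address when it is out of RIP-relative range. Sketch of
// the 16-byte SSE case (illustrative):
//
//   movdqu $dst, [vector_all_bits_set]   // dst = ~0
//   pxor   $dst, $src                    // dst = ~src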
22897 instruct vnot4B(vecS dst, vecS src) %{
22898   predicate(UseSSE > 1 && n->as_Vector()->length_in_bytes() == 4);
22899   match(Set dst (NotV src));
22900   effect(TEMP dst);
  format %{ "movdl   $dst,[0xFFFFFFFF]\n\t"
            "pxor    $dst,$src\t! not vectors (4 bytes)" %}
22902   ins_encode %{
22903     __ movdl($dst$$XMMRegister, ExternalAddress(vector_all_bits_set()));
22904     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
22905   %}
22906   ins_pipe( pipe_slow );
22907 %}
22908 
22909 instruct vnot4B_reg(vecS dst, vecS src, rRegL scratch) %{
22910   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
22911   match(Set dst (NotV src));
22912   effect(TEMP scratch);
  format %{ "vpxor   $dst,$src,0xFFFFFFFF \t! not vectors (4 bytes)" %}
22914   ins_encode %{
22915     int vector_len = 0;
22916     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
22917   %}
22918   ins_pipe( pipe_slow );
22919 %}
22920 
22921 instruct vnot8B(vecD dst, vecD src) %{
22922   predicate(UseSSE > 1 && n->as_Vector()->length_in_bytes() == 8);
22923   match(Set dst (NotV src));
22924   effect(TEMP dst);
  format %{ "movq    $dst,[0xFFFFFFFF]\n\t"
            "pxor    $dst,$src\t! not vectors (8 bytes)" %}
22926   ins_encode %{
22927     __ movq($dst$$XMMRegister, ExternalAddress(vector_all_bits_set()));
22928     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
22929   %}
22930   ins_pipe( pipe_slow );
22931 %}
22932 
22933 instruct vnot8B_reg(vecD dst, vecD src, rRegL scratch) %{
22934   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
22935   match(Set dst (NotV src));
22936   effect(TEMP scratch);
22937   format %{ "vpxor   $dst,$src,0xFFFFFFFF \t! not vectors (8 bytes)" %}
22938   ins_encode %{
22939     int vector_len = 0;
22940     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
22941   %}
22942   ins_pipe( pipe_slow );
22943 %}
22944 
22945 instruct vnot16B(vecX dst, vecX src) %{
22946   predicate(UseSSE > 1 && n->as_Vector()->length_in_bytes() == 16);
22947   match(Set dst (NotV src));
22948   effect(TEMP dst);
  format %{ "movdqu  $dst,[0xFFFFFFFF]\n\t"
            "pxor    $dst,$src\t! not vectors (16 bytes)" %}
22950   ins_encode %{
22951     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_all_bits_set()));
22952     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
22953   %}
22954   ins_pipe( pipe_slow );
22955 %}
22956 
22957 instruct vnot16B_reg(vecX dst, vecX src, rRegL scratch) %{
22958   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
22959   match(Set dst (NotV src));
22960   effect(TEMP scratch);
22961   format %{ "vpxor   $dst,$src,0xFFFFFFFF \t! not vectors (16 bytes)" %}
22962   ins_encode %{
22963     int vector_len = 0;
22964     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
22965   %}
22966   ins_pipe( pipe_slow );
22967 %}
22968 
22969 instruct vnot32B_reg(vecY dst, vecY src, rRegL scratch) %{
  predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
22971   match(Set dst (NotV  src));
22972   effect(TEMP scratch);
22973   format %{ "vpxor   $dst,$src,0xFFFFFFFF \t! not vectors (32 bytes)" %}
22974   ins_encode %{
22975     int vector_len = 1;
22976     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
22977   %}
22978   ins_pipe( pipe_slow );
22979 %}
22980 
22981 instruct vnot64B_reg(vecZ dst, vecZ src, rRegL scratch) %{
22982   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
22983   match(Set dst (NotV src));
22984   effect(TEMP scratch);
22985   format %{ "vpxor   $dst,$src,0xFFFFFFFF \t! not vectors (64 bytes)" %}
22986   ins_encode %{
22987     int vector_len = 2;
22988     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
22989   %}
22990   ins_pipe( pipe_slow );
22991 %}
22992 
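// VectorTest is lowered onto (v)ptest, which sets ZF when
// (src1 AND src2) == 0 and CF when (src2 AND NOT src1) == 0, i.e. when
// every bit of src2 is also set in src1. Assuming the matcher supplies an
// all-ones vector as src2, carrySet answers "all lanes true" and notZero
// answers "any lane true"; the setcc result is zero-extended with movzbl
// so $dst holds a clean 0/1.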
22993 instruct vptest4inae(rRegI dst, vecX src1, vecX src2) %{
22994   predicate(UseAVX > 0 && static_cast<const VectorTestNode*>(n)->get_predicate() == Assembler::carrySet);
22995   match(Set dst (VectorTest src1 src2 ));
  format %{ "vptest  $src1,$src2\n\t"
            "setb  $dst\t! alltrue vector test (packed4I)" %}
22998   ins_encode %{
22999     int vector_len = 0;
23000     __ vptest($src1$$XMMRegister, $src2$$XMMRegister, vector_len);
23001     __ setb(Assembler::carrySet, $dst$$Register);
23002     __ movzbl($dst$$Register, $dst$$Register);
23003   %}
23004   ins_pipe( pipe_slow );
23005 %}
23006 
23007 instruct vptest4ieq(rRegI dst, vecX src1, vecX src2) %{
23008   predicate(UseAVX > 0 && static_cast<const VectorTestNode*>(n)->get_predicate() == Assembler::notZero);
23009   match(Set dst (VectorTest src1 src2 ));
  format %{ "vptest  $src1,$src2\n\t"
            "setne  $dst\t! anytrue vector test (packed4I)" %}
23012   ins_encode %{
23013     int vector_len = 0;
23014     __ vptest($src1$$XMMRegister, $src2$$XMMRegister, vector_len);
23015     __ setb(Assembler::notZero, $dst$$Register);
23016     __ movzbl($dst$$Register, $dst$$Register);
23017   %}
23018   ins_pipe( pipe_slow );
23019 %}
23020 
23021 instruct vptest8inae(rRegI dst, vecY src1, vecY src2) %{
23022   predicate(UseAVX > 0 && static_cast<const VectorTestNode*>(n)->get_predicate() == Assembler::carrySet);
23023   match(Set dst (VectorTest src1 src2 ));
  format %{ "vptest  $src1,$src2\n\t"
            "setb  $dst\t! alltrue vector test (packed8I)" %}
23026   ins_encode %{
23027     int vector_len = 1;
23028     __ vptest($src1$$XMMRegister, $src2$$XMMRegister, vector_len);
23029     __ setb(Assembler::carrySet, $dst$$Register);
23030     __ movzbl($dst$$Register, $dst$$Register);
23031   %}
23032   ins_pipe( pipe_slow );
23033 %}
23034 
23035 instruct vptest8ieq(rRegI dst, vecY src1, vecY src2) %{
23036   predicate(UseAVX > 0 && static_cast<const VectorTestNode*>(n)->get_predicate() == Assembler::notZero);
23037   match(Set dst (VectorTest src1 src2 ));
  format %{ "vptest  $src1,$src2\n\t"
            "setne  $dst\t! anytrue vector test (packed8I)" %}
23040   ins_encode %{
23041     int vector_len = 1;
23042     __ vptest($src1$$XMMRegister, $src2$$XMMRegister, vector_len);
23043     __ setb(Assembler::notZero, $dst$$Register);
23044     __ movzbl($dst$$Register, $dst$$Register);
23045   %}
23046   ins_pipe( pipe_slow );
23047 %}
23048 
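// VectorLoadMask converts a boolean vector (one 0/1 byte per lane) into
// a lane-sized mask of all-zeros/all-ones: each byte b becomes 0 - b
// (0x00 or 0xFF) and is then sign-extended to the element width, e.g.
// for four ints (illustrative):
//
//   pxor     $dst, $dst        // dst = 0
//   psubb    $dst, $src        // 0x00/0xFF per byte
//   pmovsxbd $dst, $dst        // widen bytes to 32-bit lanes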
23049 instruct loadmask8b(vecD dst, vecD src) %{
23050   predicate(UseSSE >= 2 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
23051   match(Set dst (VectorLoadMask src));
23052   effect(TEMP dst);
23053   format %{ "pxor  $dst,$dst\n\t"
23054            "psubb $dst,$src\t! load mask (8B to 8B)" %}
23055   ins_encode %{
23056     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
23057     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
23058   %}
23059   ins_pipe( pipe_slow );
23060 %}
23061 
23062 instruct loadmask16b(vecX dst, vecX src) %{
23063   predicate(UseSSE >= 2 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
23064   match(Set dst (VectorLoadMask src));
23065   effect(TEMP dst);
  format %{ "pxor  $dst,$dst\n\t"
           "psubb $dst,$src\t! load mask (16B to 16B)" %}
23068   ins_encode %{
23069     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
23070     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
23071   %}
23072   ins_pipe( pipe_slow );
23073 %}
23074 
23075 instruct loadmask32b(vecY dst, vecY src) %{
23076   predicate(UseAVX >= 2 && n->as_Vector()->length() == 32 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
23077   match(Set dst (VectorLoadMask src));
23078   effect(TEMP dst);
23079   format %{ "vpxor  $dst,$dst\n\t"
23080            "vpsubb $dst,$src\t! load mask (32B to 32B)" %}
23081   ins_encode %{
23082     int vector_len = 1;
23083     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
23084     __ vpsubb($dst$$XMMRegister, $dst$$XMMRegister, $src$$XMMRegister, vector_len);
23085   %}
23086   ins_pipe( pipe_slow );
23087 %}
23088 
23089 instruct loadmask64b(vecZ dst, vecZ src) %{
23090   predicate(UseAVX > 0 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
23091   match(Set dst (VectorLoadMask src));
23092   effect(TEMP dst);
23093   format %{ "vpxor  $dst,$dst\n\t"
23094            "vpsubb $dst,$src\t! load mask (64B to 64B)" %}
23095   ins_encode %{
23096     int vector_len = 2;
23097     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
23098     __ vpsubb($dst$$XMMRegister, $dst$$XMMRegister, $src$$XMMRegister, vector_len);
23099   %}
23100   ins_pipe( pipe_slow );
23101 %}
23102 
23103 instruct loadmask4s(vecD dst, vecS src) %{
23104   predicate(UseSSE >= 4 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
23105   match(Set dst (VectorLoadMask src));
23106   effect(TEMP dst);
23107   format %{ "pxor  $dst,$dst\n\t"
23108            "psubb $dst,$src\n\t"
23109            "pmovsxbw $dst\t! load mask (4B to 4S)" %}
23110   ins_encode %{
23111     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
23112     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
23113     __ pmovsxbw($dst$$XMMRegister, $dst$$XMMRegister);
23114   %}
23115   ins_pipe( pipe_slow );
23116 %}
23117 
23118 instruct loadmask8s(vecX dst, vecD src) %{
23119   predicate(UseSSE >= 4 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
23120   match(Set dst (VectorLoadMask src));
23121   effect(TEMP dst);
23122   format %{ "pxor  $dst,$dst\n\t"
23123            "psubb $dst,$src\n\t"
23124            "pmovsxbw $dst\t! load mask (8B to 8S)" %}
23125   ins_encode %{
23126     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
23127     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
23128     __ pmovsxbw($dst$$XMMRegister, $dst$$XMMRegister);
23129   %}
23130   ins_pipe( pipe_slow );
23131 %}
23132 
23133 instruct loadmask16s(vecY dst, vecX src) %{
23134   predicate(UseAVX >= 2 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
23135   match(Set dst (VectorLoadMask src));
23136   effect(TEMP dst);
23137   format %{ "vpxor  $dst,$dst\n\t"
23138            "vpsubb $dst,$src\n\t"
23139            "vpmovsxbw $dst\t! load mask (16B to 16S)" %}
23140   ins_encode %{
23141     int vector_len = 1;
23142     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
23143     __ vpsubb($dst$$XMMRegister, $dst$$XMMRegister, $src$$XMMRegister, 0);
23144     __ vpmovsxbw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
23145   %}
23146   ins_pipe( pipe_slow );
23147 %}
23148 
23149 instruct loadmask32s(vecZ dst, vecY src) %{
23150   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
23151   match(Set dst (VectorLoadMask src));
23152   effect(TEMP dst);
23153   format %{ "vpxor  $dst,$dst\n\t"
23154            "vpsubb $dst,$src\n\t"
23155            "vpmovsxbw $dst\t! load mask (32B to 32S)" %}
23156   ins_encode %{
23157     int vector_len = 2;
23158     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 1);
23159     __ vpsubb($dst$$XMMRegister, $dst$$XMMRegister, $src$$XMMRegister, 1);
23160     __ vpmovsxbw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
23161   %}
23162   ins_pipe( pipe_slow );
23163 %}
23164 
23165 instruct loadmask2i(vecD dst, vecS src) %{
23166   predicate(UseSSE >= 4 && n->as_Vector()->length() == 2 &&
23167             (n->bottom_type()->is_vect()->element_basic_type() == T_INT ||
23168              n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT));
23169   match(Set dst (VectorLoadMask src));
23170   effect(TEMP dst);
23171   format %{ "pxor  $dst,$dst\n\t"
23172            "psubb $dst,$src\n\t"
23173            "pmovsxbd $dst\t! load mask (2B to 2I)" %}
23174   ins_encode %{
23175     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
23176     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
23177     __ pmovsxbd($dst$$XMMRegister, $dst$$XMMRegister);
23178   %}
23179   ins_pipe( pipe_slow );
23180 %}
23181 
23182 instruct loadmask4i(vecX dst, vecS src) %{
23183   predicate(UseSSE >= 4 && n->as_Vector()->length() == 4 &&
23184             (n->bottom_type()->is_vect()->element_basic_type() == T_INT ||
23185              n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT));
23186   match(Set dst (VectorLoadMask src));
23187   effect(TEMP dst);
23188   format %{ "pxor  $dst,$dst\n\t"
23189            "psubb $dst,$src\n\t"
23190            "pmovsxbd $dst\t! load mask (4B to 4I)" %}
23191   ins_encode %{
23192     int vector_len = 0;
23193     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
23194     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
23195     __ pmovsxbd($dst$$XMMRegister, $dst$$XMMRegister);
23196   %}
23197   ins_pipe( pipe_slow );
23198 %}
23199 
23200 instruct loadmask8i(vecY dst, vecD src) %{
23201   predicate(UseAVX >= 2 && n->as_Vector()->length() == 8 &&
23202             (n->bottom_type()->is_vect()->element_basic_type() == T_INT ||
23203              n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT));
23204   match(Set dst (VectorLoadMask src));
23205   effect(TEMP dst);
23206   format %{ "vpxor  $dst,$dst\n\t"
23207            "vpsubb $dst,$src\n\t"
23208            "vpmovsxbd $dst\t! load mask (8B to 8I)" %}
23209   ins_encode %{
23210     int vector_len = 1;
23211     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
23212     __ vpsubb($dst$$XMMRegister, $dst$$XMMRegister, $src$$XMMRegister, 0);
23213     __ vpmovsxbd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
23214   %}
23215   ins_pipe( pipe_slow );
23216 %}
23217 
23218 instruct loadmask16i(vecZ dst, vecX src, vecZ tmp) %{
23219   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 &&
23220             (n->bottom_type()->is_vect()->element_basic_type() == T_INT ||
23221              n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT));
23222   match(Set dst (VectorLoadMask src));
23223   effect(TEMP dst, TEMP tmp);
23224   format %{ "vpxor  $dst,$dst\n\t"
23225            "vpmovzxbd $tmp,$src\n\t"
23226            "vpsubd $dst,$tmp\t! load mask (16B to 16I)" %}
23227   ins_encode %{
23228     int vector_len = 2;
23229     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
23230     __ vpmovzxbd($tmp$$XMMRegister, $src$$XMMRegister, vector_len);
23231     __ vpsubd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
23232   %}
23233   ins_pipe( pipe_slow );
23234 %}
23235 
23236 instruct loadmask1l(vecD dst, vecS src) %{
23237   predicate(UseSSE >= 4 && n->as_Vector()->length() == 1 &&
23238             (n->bottom_type()->is_vect()->element_basic_type() == T_LONG ||
23239              n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE));
23240   match(Set dst (VectorLoadMask src));
23241   effect(TEMP dst);
23242   format %{ "pxor  $dst,$dst\n\t"
23243            "psubb $dst,$src\n\t"
23244            "pmovsxbq $dst\t! load mask (1B to 1L)" %}
23245   ins_encode %{
23246     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
23247     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
23248     __ pmovsxbq($dst$$XMMRegister, $dst$$XMMRegister);
23249   %}
23250   ins_pipe( pipe_slow );
23251 %}
23252 
23253 instruct loadmask2l(vecX dst, vecS src) %{
23254   predicate(UseSSE >= 4 && n->as_Vector()->length() == 2 &&
23255             (n->bottom_type()->is_vect()->element_basic_type() == T_LONG ||
23256              n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE));
23257   match(Set dst (VectorLoadMask src));
23258   effect(TEMP dst);
23259   format %{ "pxor  $dst,$dst\n\t"
23260            "psubb $dst,$src\n\t"
23261            "pmovsxbq $dst\t! load mask (2B to 2L)" %}
23262   ins_encode %{
23263     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
23264     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
23265     __ pmovsxbq($dst$$XMMRegister, $dst$$XMMRegister);
23266   %}
23267   ins_pipe( pipe_slow );
23268 %}
23269 
23270 instruct loadmask4l(vecY dst, vecS src) %{
23271   predicate(UseAVX >= 2 && n->as_Vector()->length() == 4 &&
23272             (n->bottom_type()->is_vect()->element_basic_type() == T_LONG ||
23273              n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE));
23274   match(Set dst (VectorLoadMask src));
23275   effect(TEMP dst);
23276   format %{ "vpxor  $dst,$dst\n\t"
23277            "vpsubb $dst,$src\n\t"
23278            "vpmovsxbq $dst\t! load mask (4B to 4L)" %}
23279   ins_encode %{
23280     int vector_len = 1;
23281     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
23282     __ vpsubb($dst$$XMMRegister, $dst$$XMMRegister, $src$$XMMRegister, 0);
23283     __ vpmovsxbq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
23284   %}
23285   ins_pipe( pipe_slow );
23286 %}
23287 
23288 instruct loadmask8l(vecZ dst, vecD src, vecZ tmp) %{
23289   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 &&
23290             (n->bottom_type()->is_vect()->element_basic_type() == T_LONG ||
23291              n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE));
23292   match(Set dst (VectorLoadMask src));
23293   effect(TEMP dst, TEMP tmp);
23294   format %{ "vpxor  $dst,$dst\n\t"
23295            "vpmovzxbq $tmp,$src\n\t"
23296            "vpsubq $dst,$tmp\t! load mask (8B to 8L)" %}
23297   ins_encode %{
23298     int vector_len = 2;
23299     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
23300     __ vpmovzxbq($tmp$$XMMRegister, $src$$XMMRegister, vector_len);
23301     __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
23302   %}
23303   ins_pipe( pipe_slow );
23304 %}
23305 
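// VectorStoreMask is the inverse of VectorLoadMask: it narrows a
// lane-sized 0/-1 mask back to one 0/1 byte per lane. The smaller cases
// use (v)pabs* (|-1| = 1) followed by unsigned packing; because the
// 256-bit pack instructions operate within 128-bit lanes, the 16S case
// needs a vextracti128 fix-up, and the 512-bit cases avoid packing
// entirely by comparing into a k register and doing a masked load of a
// 0x01 byte pattern.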
23306 instruct storemask8b(vecD dst, vecD src) %{
23307   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 && static_cast<const VectorStoreMaskNode*>(n)->GetInputMaskSize() == 1);
23308   match(Set dst (VectorStoreMask src));
23309   format %{ "vpabsb $dst,$src\t! store mask (8B to 8B)" %}
23310   ins_encode %{
23311     int vector_len = 0;
23312     __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, vector_len);
23313   %}
23314   ins_pipe( pipe_slow );
23315 %}
23316 
23317 instruct storemask16b(vecX dst, vecX src) %{
23318   predicate(UseAVX > 0 && n->as_Vector()->length() == 16 && static_cast<const VectorStoreMaskNode*>(n)->GetInputMaskSize() == 1);
23319   match(Set dst (VectorStoreMask src));
23320   format %{ "vpabsb $dst,$src\t! store mask (16B to 16B)" %}
23321   ins_encode %{
23322     int vector_len = 0;
23323     __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, vector_len);
23324   %}
23325   ins_pipe( pipe_slow );
23326 %}
23327 
23328 instruct storemask32b(vecY dst, vecY src) %{
23329   predicate(UseAVX > 1 && n->as_Vector()->length() == 32 && static_cast<const VectorStoreMaskNode*>(n)->GetInputMaskSize() == 1);
23330   match(Set dst (VectorStoreMask src));
23331   format %{ "vpabsb $dst,$src\t! store mask (32B to 32B)" %}
23332   ins_encode %{
23333     int vector_len = 1;
23334     __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, vector_len);
23335   %}
23336   ins_pipe( pipe_slow );
23337 %}
23338 
23339 instruct storemask64b(vecZ dst, vecZ src, rRegL scratch) %{
23340   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64 && static_cast<const VectorStoreMaskNode*>(n)->GetInputMaskSize() == 1);
23341   match(Set dst (VectorStoreMask src));
23342   effect(TEMP scratch);
23343   format %{ "vpcmpeqb k2,$src,0xFFFFFFFF\n\t"
23344            "vmovdqub $dst,k2,0x01010101\t! store mask (64B to 64B)" %}
23345   ins_encode %{
23346     int vector_len = 2;
23347     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
23348     Assembler::ComparisonPredicate cp = Assembler::eq;
23349     __ evpcmpb(ktmp, k0, $src$$XMMRegister, ExternalAddress(vector_all_bits_set()), cp, vector_len, $scratch$$Register);
23350     __ evmovdqub($dst$$XMMRegister, ktmp, ExternalAddress(vector_byte_bitset()), false, vector_len, $scratch$$Register);
23351   %}
23352   ins_pipe( pipe_slow );
23353 %}
23354 
instruct storemask4s(vecS dst, vecD src) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && static_cast<const VectorStoreMaskNode*>(n)->GetInputMaskSize() == 2);
  match(Set dst (VectorStoreMask src));
  format %{ "vpabsw $dst,$src\n\t"
            "vpackuswb $dst,$dst,$dst\t! store mask (4S to 4B)" %}
  ins_encode %{
    int vector_len = 0;
    __ vpabsw($dst$$XMMRegister, $src$$XMMRegister, vector_len);
    __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct storemask8s(vecD dst, vecX src) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 8 && static_cast<const VectorStoreMaskNode*>(n)->GetInputMaskSize() == 2);
  match(Set dst (VectorStoreMask src));
  format %{ "vpabsw $dst,$src\n\t"
            "vpackuswb $dst,$dst,$dst\t! store mask (8S to 8B)" %}
  ins_encode %{
    int vector_len = 0;
    __ vpabsw($dst$$XMMRegister, $src$$XMMRegister, vector_len);
    __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct storemask16s(vecX dst, vecY src, vecY tmp) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 16 && static_cast<const VectorStoreMaskNode*>(n)->GetInputMaskSize() == 2);
  match(Set dst (VectorStoreMask src));
  effect(TEMP dst, TEMP tmp);
  format %{ "vpabsw $dst,$src\n\t"
            "vextracti128 $tmp,$dst\n\t"
            "vpackuswb $dst,$dst,$tmp\t! store mask (16S to 16B)" %}
  ins_encode %{
    int vector_len = 1;
    __ vpabsw($dst$$XMMRegister, $src$$XMMRegister, vector_len);
    __ vextracti128($tmp$$XMMRegister, $dst$$XMMRegister, 0x1);
    __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct storemask32s(vecY dst, vecZ src, rRegL scratch) %{
  predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32 && static_cast<const VectorStoreMaskNode*>(n)->GetInputMaskSize() == 2);
  match(Set dst (VectorStoreMask src));
  effect(TEMP scratch);
  format %{ "vpcmpeqw k2,$src,0xFFFFFFFF\n\t"
            "vmovdqub $dst,k2,0x01010101\t! store mask (32S to 32B)" %}
  ins_encode %{
    int vector_len = 2;
    KRegister ktmp = k2; // Use a hardcoded temp because k registers are not allocated.
    Assembler::ComparisonPredicate cp = Assembler::eq;
    __ evpcmpw(ktmp, k0, $src$$XMMRegister, ExternalAddress(vector_all_bits_set()), cp, vector_len, $scratch$$Register);
    // The dst is only 256-bit, so we can use a smaller move.
    __ evmovdqub($dst$$XMMRegister, ktmp, ExternalAddress(vector_byte_bitset()), false, 1, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct storemask2i(vecS dst, vecD src) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && static_cast<const VectorStoreMaskNode*>(n)->GetInputMaskSize() == 4);
  match(Set dst (VectorStoreMask src));
  format %{ "vpabsd $dst,$src\n\t"
            "vpackusdw $dst,$dst,$dst\n\t"
            "vpackuswb $dst,$dst,$dst\t! store mask (2I to 2B)" %}
  ins_encode %{
    int vector_len = 0;
    __ vpabsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
    __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
    __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct storemask4i(vecS dst, vecX src) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && static_cast<const VectorStoreMaskNode*>(n)->GetInputMaskSize() == 4);
  match(Set dst (VectorStoreMask src));
  format %{ "vpabsd $dst,$src\n\t"
            "vpackusdw $dst,$dst,$dst\n\t"
            "vpackuswb $dst,$dst,$dst\t! store mask (4I to 4B)" %}
  ins_encode %{
    int vector_len = 0;
    __ vpabsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
    __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
    __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct storemask8i(vecD dst, vecY src, vecY tmp) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 8 && static_cast<const VectorStoreMaskNode*>(n)->GetInputMaskSize() == 4);
  match(Set dst (VectorStoreMask src));
  effect(TEMP dst, TEMP tmp);
  format %{ "vpxor  $dst,$dst\n\t"
            "vpsubd $dst,$src\n\t"
            "vextracti128 $tmp,$dst\n\t"
            "vpackusdw $dst,$dst,$tmp\n\t"
            "vpackuswb $dst,$dst,$dst\t! store mask (8I to 8B)" %}
  ins_encode %{
    int vector_len = 1;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
    __ vpsubd($dst$$XMMRegister, $dst$$XMMRegister, $src$$XMMRegister, vector_len);
    __ vextracti128($tmp$$XMMRegister, $dst$$XMMRegister, 0x1);
    __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
    __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
  %}
  ins_pipe( pipe_slow );
%}

instruct storemask16i(vecX dst, vecZ src, rRegL scratch) %{
  predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16 && static_cast<const VectorStoreMaskNode*>(n)->GetInputMaskSize() == 4);
  match(Set dst (VectorStoreMask src));
  effect(TEMP scratch);
  format %{ "vpcmpeqd k2,$src,0xFFFFFFFF\n\t"
            "vmovdqub $dst,k2,0x01010101\t! store mask (16I to 16B)" %}
  ins_encode %{
    int vector_len = 2;
    KRegister ktmp = k2; // Use a hardcoded temp because k registers are not allocated.
    __ evpcmpeqd(ktmp, k0, $src$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
    // The dst is only 128-bit, so we can use a smaller move.
    __ evmovdqub($dst$$XMMRegister, ktmp, ExternalAddress(vector_byte_bitset()), false, 0, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct storemask1l(vecS dst, vecD src) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 1 && static_cast<const VectorStoreMaskNode*>(n)->GetInputMaskSize() == 8);
  match(Set dst (VectorStoreMask src));
  format %{ "vpabsd $dst,$src\n\t"
            "vpackusdw $dst,$dst,$dst\n\t"
            "vpackuswb $dst,$dst,$dst\t! store mask (1L to 1B)" %}
  ins_encode %{
    int vector_len = 0;
    __ vpabsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
    __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
    __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct storemask2l(vecS dst, vecX src) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 2 && static_cast<const VectorStoreMaskNode*>(n)->GetInputMaskSize() == 8);
  match(Set dst (VectorStoreMask src));
  format %{ "vpshufd $dst,$src,0x8\n\t"
            "vpabsd $dst,$dst\n\t"
            "vpackusdw $dst,$dst,$dst\n\t"
            "vpackuswb $dst,$dst,$dst\t! store mask (2L to 2B)" %}
  ins_encode %{
    int vector_len = 0;
    __ vpshufd($dst$$XMMRegister, $src$$XMMRegister, 0x8, vector_len);
    __ vpabsd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
    __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
    __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

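// The 4L variant permutes the dwords with the pattern [0,2,4,6,1,3,5,7] so
// the low dword of every long lands in the lower 128 bits; the remaining
// abs/pack steps can then run as 128-bit instructions.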
instruct storemask4l(vecS dst, vecY src, rRegL scratch) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 4 && static_cast<const VectorStoreMaskNode*>(n)->GetInputMaskSize() == 8);
  match(Set dst (VectorStoreMask src));
  effect(TEMP scratch, TEMP dst);
  format %{ "vmovdqu $dst,[0,2,4,6,1,3,5,7]\n\t"
            "vpermd $dst,$dst,$src\n\t"
            "vpabsd $dst,$dst\n\t"
            "vpackusdw $dst,$dst,$dst\n\t"
            "vpackuswb $dst,$dst,$dst\t! store mask (4L to 4B)" %}
  ins_encode %{
    // vpermd and load are 256-bit, but all others are 128-bit instructions.
    int vector_len = 0;
    __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_long_perm_mask()), $scratch$$Register);
    __ vpermd($dst$$XMMRegister, $dst$$XMMRegister, $src$$XMMRegister);
    __ vpabsd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
    __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
    __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct storemask8l(vecD dst, vecZ src, rRegL scratch) %{
  predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8 && static_cast<const VectorStoreMaskNode*>(n)->GetInputMaskSize() == 8);
  match(Set dst (VectorStoreMask src));
  effect(TEMP scratch);
  format %{ "vpcmpeqq k2,$src,0xFFFFFFFF\n\t"
            "vmovdqub $dst,k2,0x01010101\t! store mask (8L to 8B)" %}
  ins_encode %{
    int vector_len = 2;
    KRegister ktmp = k2; // Use a hardcoded temp because k registers are not allocated.
    Assembler::ComparisonPredicate cp = Assembler::eq;
    __ evpcmpq(ktmp, k0, $src$$XMMRegister, ExternalAddress(vector_all_bits_set()), cp, vector_len, $scratch$$Register);
    // The dst is only 128-bit, so we can use a smaller move.
    __ evmovdqub($dst$$XMMRegister, ktmp, ExternalAddress(vector_byte_bitset()), false, 0, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

//-------------------------------- LOAD_SHUFFLE ----------------------------------

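// VectorLoadShuffle converts a vector of byte indices into the index format
// the matching rearrange instruction consumes.  pshufb addresses individual
// bytes, so for an element size of k bytes each index i must be widened to
// the byte indices (k*i, k*i+1, ..., k*i+k-1).  The SSE sequences below do
// exactly that: multiply the sign-extended indices by k, replicate the
// product into every byte of the element (shift by 8 and add, repeated),
// then add the 0x...03020100 numbering constant.  For example, the 4S
// shuffle [2,0,3,1] expands to the byte indices [4,5, 0,1, 6,7, 2,3].
// The AVX forms that end in vpermb/vpermw/vpermd take whole-element
// indices, so a plain copy or sign-extending move suffices; long shuffles
// are expanded to dword pairs (2*i, 2*i+1) because their rearrange is done
// with vpermd.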
instruct loadshuffle8b(vecD dst, vecD src) %{
  predicate(UseSSE > 1 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (VectorLoadShuffle src));
  format %{ "movdqu $dst, $src\t! load shuffle (load 8B for 8BRearrange)" %}
  ins_encode %{
    __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct loadshuffle16b(vecX dst, vecX src) %{
  predicate(UseSSE > 1 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (VectorLoadShuffle src));
  format %{ "movdqu $dst, $src\t! load shuffle (load 16B for 16BRearrange)" %}
  ins_encode %{
    __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct loadshuffle32b(vecY dst, vecY src) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 32 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (VectorLoadShuffle src));
  format %{ "vmovdqu $dst, $src\t! load shuffle (load 32B for 32BRearrange)" %}
  ins_encode %{
    __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct loadshuffle64b(vecZ dst, vecZ src) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 64 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (VectorLoadShuffle src));
  format %{ "vmovdqu $dst, $src\t! load shuffle (load 64B for 64BRearrange)" %}
  ins_encode %{
    __ evmovdqul($dst$$XMMRegister, $src$$XMMRegister, 2);
  %}
  ins_pipe( pipe_slow );
%}

instruct loadshuffle4s(vecD dst, vecS src, vecD tmp, vecD tmp2, rRegI scratch) %{
  predicate(UseSSE > 3 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorLoadShuffle src));
  effect(TEMP dst, TEMP tmp, TEMP tmp2, TEMP scratch);
  format %{ "pmovsxbw    $tmp, $src \n\t"
            "movdqu      $tmp2,0x0002000200020002\n\t"
            "pmullw      $tmp,$tmp2\n\t"
            "movdqu      $tmp2,$tmp\n\t"
            "psllw       $tmp2,0x8\n\t"
            "paddb       $tmp2,$tmp\n\t"
            "movdqu      $tmp, 0x0100010001000100 \n\t"
            "paddb       $tmp2,$tmp\n\t"
            "movdqu      $dst, $tmp2\t! load shuffle (load 4B for 4SRearrange)" %}
  ins_encode %{
    __ pmovsxbw($tmp$$XMMRegister, $src$$XMMRegister);
    __ movdqu($tmp2$$XMMRegister, ExternalAddress(vector_short_sizemask()), $scratch$$Register);
    __ pmullw($tmp$$XMMRegister, $tmp2$$XMMRegister);
    __ movdqu($tmp2$$XMMRegister, $tmp$$XMMRegister);
    __ psllw($tmp2$$XMMRegister, 0x8);
    __ paddb($tmp2$$XMMRegister, $tmp$$XMMRegister);
    __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_short_shufflemask()), $scratch$$Register);
    __ paddb($tmp2$$XMMRegister, $tmp$$XMMRegister);
    __ movdqu($dst$$XMMRegister, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct loadshuffle8s(vecX dst, vecD src, vecX tmp, vecX tmp2, rRegI scratch) %{
  predicate(UseSSE > 3 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorLoadShuffle src));
  effect(TEMP dst, TEMP tmp, TEMP tmp2, TEMP scratch);
  format %{ "pmovsxbw    $tmp, $src \n\t"
            "movdqu      $tmp2,0x0002000200020002\n\t"
            "pmullw      $tmp,$tmp2\n\t"
            "movdqu      $tmp2,$tmp\n\t"
            "psllw       $tmp2,0x8\n\t"
            "paddb       $tmp2,$tmp\n\t"
            "movdqu      $tmp, 0x0100010001000100 \n\t"
            "paddb       $tmp2,$tmp\n\t"
            "movdqu      $dst, $tmp2\t! load shuffle (load 8B for 8SRearrange)" %}
  ins_encode %{
    __ pmovsxbw($tmp$$XMMRegister, $src$$XMMRegister);
    __ movdqu($tmp2$$XMMRegister, ExternalAddress(vector_short_sizemask()), $scratch$$Register);
    __ pmullw($tmp$$XMMRegister, $tmp2$$XMMRegister);
    __ movdqu($tmp2$$XMMRegister, $tmp$$XMMRegister);
    __ psllw($tmp2$$XMMRegister, 0x8);
    __ paddb($tmp2$$XMMRegister, $tmp$$XMMRegister);
    __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_short_shufflemask()), $scratch$$Register);
    __ paddb($tmp2$$XMMRegister, $tmp$$XMMRegister);
    __ movdqu($dst$$XMMRegister, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct loadshuffle16s(vecY dst, vecX src) %{
  predicate(UseAVX >= 2 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorLoadShuffle src));
  format %{ "vpmovsxbw   $dst,$src\t! load shuffle (load 16B for 16SRearrange)" %}
  ins_encode %{
    int vector_len = 1;
    __ vpmovsxbw($dst$$XMMRegister, $src$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct loadshuffle32s(vecZ dst, vecY src) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 32 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorLoadShuffle src));
  format %{ "vpmovsxbw   $dst,$src\t! load shuffle (load 32B for 32SRearrange)" %}
  ins_encode %{
    int vector_len = 2;
    __ vpmovsxbw($dst$$XMMRegister, $src$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct loadshuffle4i(vecX dst, vecS src, vecX tmp, vecX tmp2, rRegI scratch) %{
  predicate(UseSSE > 3 && n->as_Vector()->length() == 4 &&
            (n->bottom_type()->is_vect()->element_basic_type() == T_INT ||
             n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT));
  match(Set dst (VectorLoadShuffle src));
  effect(TEMP dst, TEMP tmp, TEMP tmp2, TEMP scratch);
  format %{ "vpmovsxbd   $tmp, $src \n\t"
            "movdqu      $tmp2, 0x0000000400000004 \n\t"
            "pmulld      $tmp2, $tmp \n\t"
            "movdqu      $tmp,$tmp2\n\t"
            "pslld       $tmp2,0x8\n\t"
            "paddb       $tmp2,$tmp\n\t"
            "pslld       $tmp2,0x8\n\t"
            "paddb       $tmp2,$tmp\n\t"
            "pslld       $tmp2,0x8\n\t"
            "paddb       $tmp2,$tmp\n\t"
            "movdqu      $tmp, 0x0302010003020100 \n\t"
            "paddb       $tmp2,$tmp\n\t"
            "movdqu      $dst, $tmp2\t! load shuffle (load 4B for 4IRearrange)" %}
  ins_encode %{
    __ vpmovsxbd($tmp$$XMMRegister, $src$$XMMRegister, 0);
    __ movdqu($tmp2$$XMMRegister, ExternalAddress(vector_int_sizemask()), $scratch$$Register);
    __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
    __ movdqu($tmp$$XMMRegister, $tmp2$$XMMRegister);
    __ pslld($tmp2$$XMMRegister, 0x8);
    __ paddb($tmp2$$XMMRegister, $tmp$$XMMRegister);
    __ pslld($tmp2$$XMMRegister, 0x8);
    __ paddb($tmp2$$XMMRegister, $tmp$$XMMRegister);
    __ pslld($tmp2$$XMMRegister, 0x8);
    __ paddb($tmp2$$XMMRegister, $tmp$$XMMRegister);
    __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_int_shufflemask()), $scratch$$Register);
    __ paddb($tmp2$$XMMRegister, $tmp$$XMMRegister);
    __ movdqu($dst$$XMMRegister, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct loadshuffle8i(vecY dst, vecD src) %{
  predicate(UseAVX >= 1 && n->as_Vector()->length() == 8 &&
            (n->bottom_type()->is_vect()->element_basic_type() == T_INT ||
             n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT));
  match(Set dst (VectorLoadShuffle src));
  format %{ "vpmovsxbd $dst, $src\t! load shuffle (load 8B for 8IRearrange)" %}
  ins_encode %{
    int vector_len = 1;
    __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct loadshuffle16i(vecZ dst, vecX src) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 16 &&
            (n->bottom_type()->is_vect()->element_basic_type() == T_INT ||
             n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT));
  match(Set dst (VectorLoadShuffle src));
  format %{ "vpmovsxbd $dst, $src\t! load shuffle (load 16B for 16IRearrange)" %}
  ins_encode %{
    int vector_len = 2;
    __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct loadshuffle4l(vecY dst, vecS src, vecY tmp, vecY tmp2, rRegI scratch) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 4 &&
            (n->bottom_type()->is_vect()->element_basic_type() == T_LONG ||
             n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE));
  match(Set dst (VectorLoadShuffle src));
  effect(TEMP dst, TEMP tmp, TEMP tmp2, TEMP scratch);
  format %{ "vpmovsxbd   $tmp2, $src \n\t"
            "movdqu     $tmp, 0x0000000200000002 \n\t"
            "pmulld     $tmp, $tmp2 \n\t"
            "vpmovsxdq  $tmp2,$tmp\n\t"
            "vpsllq     $tmp2,0x20\n\t"
            "vpaddd     $tmp2,$tmp\n\t"
            "vmovdqu    $tmp, 0x0000000100000000 \n\t"
            "vpaddd     $tmp2,$tmp\n\t"
            "vmovdqu    $dst, $tmp2\t! load shuffle (load 4L for 4LRearrange)" %}
  ins_encode %{
    int vector_len = 1;
    __ vpmovsxbd($tmp2$$XMMRegister, $src$$XMMRegister, 0);
    __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sizemask()), $scratch$$Register);
    __ pmulld($tmp$$XMMRegister, $tmp2$$XMMRegister);
    __ vpmovsxdq($tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
    __ vpsllq($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x20, vector_len);
    __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
    __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_shufflemask()), $scratch$$Register);
    __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
    __ vmovdqu($dst$$XMMRegister, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct loadshuffle8l(vecZ dst, vecD src, vecZ tmp, vecZ tmp2, rRegI scratch) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 8 &&
            (n->bottom_type()->is_vect()->element_basic_type() == T_LONG ||
             n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE));
  match(Set dst (VectorLoadShuffle src));
  effect(TEMP dst, TEMP tmp, TEMP tmp2, TEMP scratch);
  format %{ "vpmovsxbd  $tmp2, $src \n\t"
            "movdqu     $tmp, 0x0000000200000002 \n\t"
            "pmulld     $tmp, $tmp2\n\t"
            "vpmovsxdq  $tmp2,$tmp\n\t"
            "vpsllq     $tmp2,0x20\n\t"
            "vpaddd     $tmp2,$tmp\n\t"
            "vmovdqu    $tmp, 0x0000000100000000 \n\t"
            "vpaddd     $tmp2,$tmp\n\t"
            "vmovdqu    $dst, $tmp2\t! load shuffle (load 8L for 8LRearrange)" %}
  ins_encode %{
    int vector_len = 2;
    __ vpmovsxbd($tmp2$$XMMRegister, $src$$XMMRegister, 1);
    __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sizemask()), $scratch$$Register);
    __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 1);
    __ vpmovsxdq($tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
    __ vpsllq($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x20, vector_len);
    __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
    __ evmovdqul($tmp$$XMMRegister, k1, ExternalAddress(vector_long_shufflemask()), false, vector_len, $scratch$$Register);
    __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
    __ evmovdqul($dst$$XMMRegister, $tmp2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
//-------------------------------- Rearrange -------------------------------------

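// VectorRearrange permutes the elements of src according to the prepared
// shuffle.  The 128-bit byte/short/int forms use pshufb in place (dst is
// both input and output), while the wider forms use the cross-lane AVX2 and
// AVX-512 permutes (vpermd, vpermw, vpermb) with the shuffle as the index
// operand.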
instruct rearrange8b(vecD dst, vecD shuffle) %{
  predicate(UseSSE > 2 && n->as_Vector()->length() == 8 &&
            n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (VectorRearrange dst shuffle));
  effect(TEMP dst);
  format %{ "pshufb $dst, $shuffle\t! rearrange (8BRearrange)" %}
  ins_encode %{
    __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rearrange16b(vecX dst, vecX shuffle) %{
  predicate(UseSSE > 2 && n->as_Vector()->length() == 16 &&
            n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (VectorRearrange dst shuffle));
  effect(TEMP dst);
  format %{ "pshufb $dst, $shuffle\t! rearrange (16BRearrange)" %}
  ins_encode %{
    __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rearrange32b(vecY dst, vecY src, vecY shuffle) %{
  predicate(UseAVX > 2 && VM_Version::supports_avx512vbmi() && n->as_Vector()->length() == 32 &&
            n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (VectorRearrange src shuffle));
  effect(TEMP dst);
  format %{ "vpermb $dst, $shuffle\t! rearrange (32BRearrange)" %}
  ins_encode %{
    int vector_len = 1;
    __ vpermb($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct rearrange64b(vecZ dst, vecZ src, vecZ shuffle) %{
  predicate(UseAVX > 2 && VM_Version::supports_avx512vbmi() && n->as_Vector()->length() == 64 &&
            n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (VectorRearrange src shuffle));
  effect(TEMP dst);
  format %{ "vpermb $dst, $shuffle\t! rearrange (64BRearrange)" %}
  ins_encode %{
    int vector_len = 2;
    __ vpermb($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct rearrange4s(vecD dst, vecD shuffle) %{
  predicate(UseSSE > 2 && n->as_Vector()->length() == 4 &&
            n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorRearrange dst shuffle));
  effect(TEMP dst);
  format %{ "pshufb $dst, $shuffle\t! rearrange (4SRearrange)" %}
  ins_encode %{
    __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rearrange8s(vecX dst, vecX shuffle) %{
  predicate(UseSSE > 2 && n->as_Vector()->length() == 8 &&
            n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorRearrange dst shuffle));
  effect(TEMP dst);
  format %{ "pshufb $dst, $shuffle\t! rearrange (8SRearrange)" %}
  ins_encode %{
    __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rearrange16s(vecY dst, vecY src, vecY shuffle) %{
  predicate(UseAVX > 2 && VM_Version::supports_avx512vlbw() && n->as_Vector()->length() == 16 &&
            n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorRearrange src shuffle));
  effect(TEMP dst);
  format %{ "vpermw $dst, $shuffle\t! rearrange (16SRearrange)" %}
  ins_encode %{
    int vector_len = 1;
    __ vpermw($dst$$XMMRegister, k0, $shuffle$$XMMRegister, $src$$XMMRegister, false, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct rearrange32s(vecZ dst, vecZ src, vecZ shuffle) %{
  predicate(UseAVX > 2 && VM_Version::supports_avx512vlbw() && n->as_Vector()->length() == 32 &&
            n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorRearrange src shuffle));
  effect(TEMP dst);
  format %{ "vpermw $dst, $shuffle\t! rearrange (32SRearrange)" %}
  ins_encode %{
    int vector_len = 2;
    __ vpermw($dst$$XMMRegister, k0, $shuffle$$XMMRegister, $src$$XMMRegister, false, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct rearrange4i(vecX dst, vecX shuffle) %{
  predicate(UseSSE > 2 && n->as_Vector()->length() == 4 &&
            (n->bottom_type()->is_vect()->element_basic_type() == T_INT ||
             n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT));
  match(Set dst (VectorRearrange dst shuffle));
  effect(TEMP dst);
  format %{ "pshufb $dst, $shuffle\t! rearrange (4IRearrange)" %}
  ins_encode %{
    __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rearrange8i(vecY dst, vecY src, vecY shuffle) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 8 &&
            (n->bottom_type()->is_vect()->element_basic_type() == T_INT ||
             n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT));
  match(Set dst (VectorRearrange src shuffle));
  effect(TEMP dst);
  format %{ "vpermd $dst, $src, $shuffle\t! rearrange (8IRearrange)" %}
  ins_encode %{
    __ vpermd($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rearrange16i(vecZ dst, vecZ src, vecZ shuffle) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 16 &&
            (n->bottom_type()->is_vect()->element_basic_type() == T_INT ||
             n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT));
  match(Set dst (VectorRearrange src shuffle));
  effect(TEMP dst);
  format %{ "vpermd $dst, $src, $shuffle\t! rearrange (16IRearrange)" %}
  ins_encode %{
    int vector_len = 2;
    __ evpermd($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct rearrange4l(vecY dst, vecY src, vecY shuffle) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 4 &&
            (n->bottom_type()->is_vect()->element_basic_type() == T_LONG ||
             n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE));
  match(Set dst (VectorRearrange src shuffle));
  effect(TEMP dst);
  format %{ "vpermd $dst, $src, $shuffle\t! rearrange (4LRearrange)" %}
  ins_encode %{
    __ vpermd($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rearrange8l(vecZ dst, vecZ src, vecZ shuffle) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 8 &&
            (n->bottom_type()->is_vect()->element_basic_type() == T_LONG ||
             n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE));
  match(Set dst (VectorRearrange src shuffle));
  effect(TEMP dst);
  format %{ "vpermd $dst, $src, $shuffle\t! rearrange (8LRearrange)" %}
  ins_encode %{
    int vector_len = 2;
    __ evpermd($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
// --------------------------------- FMA --------------------------------------

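// FmaVF/FmaVD match the fused form c = a * b + c: the product is not
// rounded before the addition, so the result is rounded only once.  These
// are generated only when UseFMA is set.  Each width comes in a
// register-register and a register-memory flavor so that a LoadVector
// feeding the multiply can be folded into the instruction.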
// a * b + c
instruct vfma2D_reg(vecX a, vecX b, vecX c) %{
  predicate(UseFMA && n->as_Vector()->length() == 2);
  match(Set c (FmaVD  c (Binary a b)));
  format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed2D" %}
  ins_cost(150);
  ins_encode %{
    int vector_len = 0;
    __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// a * b + c
instruct vfma2D_mem(vecX a, memory b, vecX c) %{
  predicate(UseFMA && n->as_Vector()->length() == 2);
  match(Set c (FmaVD  c (Binary a (LoadVector b))));
  format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed2D" %}
  ins_cost(150);
  ins_encode %{
    int vector_len = 0;
    __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// a * b + c
instruct vfma4D_reg(vecY a, vecY b, vecY c) %{
  predicate(UseFMA && n->as_Vector()->length() == 4);
  match(Set c (FmaVD  c (Binary a b)));
  format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed4D" %}
  ins_cost(150);
  ins_encode %{
    int vector_len = 1;
    __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// a * b + c
instruct vfma4D_mem(vecY a, memory b, vecY c) %{
  predicate(UseFMA && n->as_Vector()->length() == 4);
  match(Set c (FmaVD  c (Binary a (LoadVector b))));
  format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed4D" %}
  ins_cost(150);
  ins_encode %{
    int vector_len = 1;
    __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// a * b + c
instruct vfma8D_reg(vecZ a, vecZ b, vecZ c) %{
  predicate(UseFMA && n->as_Vector()->length() == 8);
  match(Set c (FmaVD  c (Binary a b)));
  format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed8D" %}
  ins_cost(150);
  ins_encode %{
    int vector_len = 2;
    __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// a * b + c
instruct vfma8D_mem(vecZ a, memory b, vecZ c) %{
  predicate(UseFMA && n->as_Vector()->length() == 8);
  match(Set c (FmaVD  c (Binary a (LoadVector b))));
  format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed8D" %}
  ins_cost(150);
  ins_encode %{
    int vector_len = 2;
    __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// a * b + c
instruct vfma2F_reg(vecD a, vecD b, vecD c) %{
  predicate(UseFMA && n->as_Vector()->length() == 2);
  match(Set c (FmaVF  c (Binary a b)));
  format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed2F" %}
  ins_cost(150);
  ins_encode %{
    int vector_len = 0;
    __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// a * b + c
instruct vfma2F_mem(vecD a, memory b, vecD c) %{
  predicate(UseFMA && n->as_Vector()->length() == 2);
  match(Set c (FmaVF  c (Binary a (LoadVector b))));
  format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed2F" %}
  ins_cost(150);
  ins_encode %{
    int vector_len = 0;
    __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// a * b + c
instruct vfma4F_reg(vecX a, vecX b, vecX c) %{
  predicate(UseFMA && n->as_Vector()->length() == 4);
  match(Set c (FmaVF  c (Binary a b)));
  format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed4F" %}
  ins_cost(150);
  ins_encode %{
    int vector_len = 0;
    __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// a * b + c
instruct vfma4F_mem(vecX a, memory b, vecX c) %{
  predicate(UseFMA && n->as_Vector()->length() == 4);
  match(Set c (FmaVF  c (Binary a (LoadVector b))));
  format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed4F" %}
  ins_cost(150);
  ins_encode %{
    int vector_len = 0;
    __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// a * b + c
instruct vfma8F_reg(vecY a, vecY b, vecY c) %{
  predicate(UseFMA && n->as_Vector()->length() == 8);
  match(Set c (FmaVF  c (Binary a b)));
  format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed8F" %}
  ins_cost(150);
  ins_encode %{
    int vector_len = 1;
    __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// a * b + c
instruct vfma8F_mem(vecY a, memory b, vecY c) %{
  predicate(UseFMA && n->as_Vector()->length() == 8);
  match(Set c (FmaVF  c (Binary a (LoadVector b))));
  format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed8F" %}
  ins_cost(150);
  ins_encode %{
    int vector_len = 1;
    __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// a * b + c
instruct vfma16F_reg(vecZ a, vecZ b, vecZ c) %{
  predicate(UseFMA && n->as_Vector()->length() == 16);
  match(Set c (FmaVF  c (Binary a b)));
  format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed16F" %}
  ins_cost(150);
  ins_encode %{
    int vector_len = 2;
    __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// a * b + c
instruct vfma16F_mem(vecZ a, memory b, vecZ c) %{
  predicate(UseFMA && n->as_Vector()->length() == 16);
  match(Set c (FmaVF  c (Binary a (LoadVector b))));
  format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed16F" %}
  ins_cost(150);
  ins_encode %{
    int vector_len = 2;
    __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// --------------------------------- PopCount --------------------------------------

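// PopCountVI counts the set bits of every 32-bit lane independently.  It
// requires the AVX512_VPOPCNTDQ extension, which provides vpopcntd.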
instruct vpopcount2I(vecD dst, vecD src) %{
  predicate(VM_Version::supports_vpopcntdq() && UsePopCountInstruction && n->as_Vector()->length() == 2);
  match(Set dst (PopCountVI src));
  format %{ "vpopcntd  $dst,$src\t! vector popcount packed2I" %}
  ins_encode %{
    int vector_len = 0;
    __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vpopcount4I(vecX dst, vecX src) %{
  predicate(VM_Version::supports_vpopcntdq() && UsePopCountInstruction && n->as_Vector()->length() == 4);
  match(Set dst (PopCountVI src));
  format %{ "vpopcntd  $dst,$src\t! vector popcount packed4I" %}
  ins_encode %{
    int vector_len = 0;
    __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vpopcount8I(vecY dst, vecY src) %{
  predicate(VM_Version::supports_vpopcntdq() && UsePopCountInstruction && n->as_Vector()->length() == 8);
  match(Set dst (PopCountVI src));
  format %{ "vpopcntd  $dst,$src\t! vector popcount packed8I" %}
  ins_encode %{
    int vector_len = 1;
    __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vpopcount16I(vecZ dst, vecZ src) %{
  predicate(VM_Version::supports_vpopcntdq() && UsePopCountInstruction && n->as_Vector()->length() == 16);
  match(Set dst (PopCountVI src));
  format %{ "vpopcntd  $dst,$src\t! vector popcount packed16I" %}
  ins_encode %{
    int vector_len = 2;
    __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}