1 //
   2 // Copyright (c) 2011, 2017, Oracle and/or its affiliates. All rights reserved.
   3 // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4 //
   5 // This code is free software; you can redistribute it and/or modify it
   6 // under the terms of the GNU General Public License version 2 only, as
   7 // published by the Free Software Foundation.
   8 //
   9 // This code is distributed in the hope that it will be useful, but WITHOUT
  10 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11 // FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12 // version 2 for more details (a copy is included in the LICENSE file that
  13 // accompanied this code).
  14 //
  15 // You should have received a copy of the GNU General Public License version
  16 // 2 along with this work; if not, write to the Free Software Foundation,
  17 // Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18 //
  19 // Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20 // or visit www.oracle.com if you need additional information or have any
  21 // questions.
  22 //
  23 //
  24 
  25 // X86 Common Architecture Description File
  26 
  27 //----------REGISTER DEFINITION BLOCK------------------------------------------
  28 // This information is used by the matcher and the register allocator to
  29 // describe individual registers and classes of registers within the target
  30 // archtecture.
  31 
  32 register %{
  33 //----------Architecture Description Register Definitions----------------------
  34 // General Registers
  35 // "reg_def"  name ( register save type, C convention save type,
  36 //                   ideal register type, encoding );
  37 // Register Save Types:
  38 //
  39 // NS  = No-Save:       The register allocator assumes that these registers
  40 //                      can be used without saving upon entry to the method, &
  41 //                      that they do not need to be saved at call sites.
  42 //
  43 // SOC = Save-On-Call:  The register allocator assumes that these registers
  44 //                      can be used without saving upon entry to the method,
  45 //                      but that they must be saved at call sites.
  46 //
  47 // SOE = Save-On-Entry: The register allocator assumes that these registers
  48 //                      must be saved before using them upon entry to the
  49 //                      method, but they do not need to be saved at call
  50 //                      sites.
  51 //
  52 // AS  = Always-Save:   The register allocator assumes that these registers
  53 //                      must be saved before using them upon entry to the
  54 //                      method, & that they must be saved at call sites.
  55 //
  56 // Ideal Register Type is used to determine how to save & restore a
  57 // register.  Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get
  58 // spilled with LoadP/StoreP.  If the register supports both, use Op_RegI.
  59 //
  60 // The encoding number is the actual bit-pattern placed into the opcodes.
  61 
  62 // XMM registers.  512-bit registers or 8 words each, labeled (a)-p.
  63 // Word a in each register holds a Float, words ab hold a Double.
  64 // The whole registers are used in SSE4.2 version intrinsics,
  65 // array copy stubs and superword operations (see UseSSE42Intrinsics,
  66 // UseXMMForArrayCopy and UseSuperword flags).
  67 // For pre EVEX enabled architectures:
  68 //      XMM8-XMM15 must be encoded with REX (VEX for UseAVX)
  69 // For EVEX enabled architectures:
  70 //      XMM8-XMM31 must be encoded with REX (EVEX for UseAVX).
  71 //
  72 // Linux ABI:   No register preserved across function calls
  73 //              XMM0-XMM7 might hold parameters
  74 // Windows ABI: XMM6-XMM31 preserved across function calls
  75 //              XMM0-XMM3 might hold parameters
  76 
  77 reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
  78 reg_def XMM0b( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(1));
  79 reg_def XMM0c( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(2));
  80 reg_def XMM0d( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(3));
  81 reg_def XMM0e( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(4));
  82 reg_def XMM0f( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(5));
  83 reg_def XMM0g( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(6));
  84 reg_def XMM0h( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(7));
  85 reg_def XMM0i( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(8));
  86 reg_def XMM0j( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(9));
  87 reg_def XMM0k( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(10));
  88 reg_def XMM0l( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(11));
  89 reg_def XMM0m( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(12));
  90 reg_def XMM0n( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(13));
  91 reg_def XMM0o( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(14));
  92 reg_def XMM0p( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(15));
  93 
  94 reg_def XMM1 ( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg());
  95 reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(1));
  96 reg_def XMM1c( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(2));
  97 reg_def XMM1d( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(3));
  98 reg_def XMM1e( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(4));
  99 reg_def XMM1f( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(5));
 100 reg_def XMM1g( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(6));
 101 reg_def XMM1h( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(7));
 102 reg_def XMM1i( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(8));
 103 reg_def XMM1j( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(9));
 104 reg_def XMM1k( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(10));
 105 reg_def XMM1l( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(11));
 106 reg_def XMM1m( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(12));
 107 reg_def XMM1n( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(13));
 108 reg_def XMM1o( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(14));
 109 reg_def XMM1p( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(15));
 110 
 111 reg_def XMM2 ( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg());
 112 reg_def XMM2b( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(1));
 113 reg_def XMM2c( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(2));
 114 reg_def XMM2d( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(3));
 115 reg_def XMM2e( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(4));
 116 reg_def XMM2f( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(5));
 117 reg_def XMM2g( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(6));
 118 reg_def XMM2h( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(7));
 119 reg_def XMM2i( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(8));
 120 reg_def XMM2j( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(9));
 121 reg_def XMM2k( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(10));
 122 reg_def XMM2l( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(11));
 123 reg_def XMM2m( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(12));
 124 reg_def XMM2n( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(13));
 125 reg_def XMM2o( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(14));
 126 reg_def XMM2p( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(15));
 127 
 128 reg_def XMM3 ( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg());
 129 reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(1));
 130 reg_def XMM3c( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(2));
 131 reg_def XMM3d( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(3));
 132 reg_def XMM3e( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(4));
 133 reg_def XMM3f( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(5));
 134 reg_def XMM3g( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(6));
 135 reg_def XMM3h( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(7));
 136 reg_def XMM3i( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(8));
 137 reg_def XMM3j( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(9));
 138 reg_def XMM3k( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(10));
 139 reg_def XMM3l( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(11));
 140 reg_def XMM3m( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(12));
 141 reg_def XMM3n( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(13));
 142 reg_def XMM3o( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(14));
 143 reg_def XMM3p( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(15));
 144 
 145 reg_def XMM4 ( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg());
 146 reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(1));
 147 reg_def XMM4c( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(2));
 148 reg_def XMM4d( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(3));
 149 reg_def XMM4e( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(4));
 150 reg_def XMM4f( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(5));
 151 reg_def XMM4g( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(6));
 152 reg_def XMM4h( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(7));
 153 reg_def XMM4i( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(8));
 154 reg_def XMM4j( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(9));
 155 reg_def XMM4k( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(10));
 156 reg_def XMM4l( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(11));
 157 reg_def XMM4m( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(12));
 158 reg_def XMM4n( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(13));
 159 reg_def XMM4o( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(14));
 160 reg_def XMM4p( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(15));
 161 
 162 reg_def XMM5 ( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg());
 163 reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(1));
 164 reg_def XMM5c( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(2));
 165 reg_def XMM5d( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(3));
 166 reg_def XMM5e( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(4));
 167 reg_def XMM5f( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(5));
 168 reg_def XMM5g( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(6));
 169 reg_def XMM5h( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(7));
 170 reg_def XMM5i( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(8));
 171 reg_def XMM5j( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(9));
 172 reg_def XMM5k( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(10));
 173 reg_def XMM5l( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(11));
 174 reg_def XMM5m( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(12));
 175 reg_def XMM5n( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(13));
 176 reg_def XMM5o( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(14));
 177 reg_def XMM5p( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(15));
 178 
 179 reg_def XMM6 ( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg());
 180 reg_def XMM6b( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(1));
 181 reg_def XMM6c( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(2));
 182 reg_def XMM6d( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(3));
 183 reg_def XMM6e( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(4));
 184 reg_def XMM6f( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(5));
 185 reg_def XMM6g( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(6));
 186 reg_def XMM6h( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(7));
 187 reg_def XMM6i( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(8));
 188 reg_def XMM6j( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(9));
 189 reg_def XMM6k( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(10));
 190 reg_def XMM6l( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(11));
 191 reg_def XMM6m( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(12));
 192 reg_def XMM6n( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(13));
 193 reg_def XMM6o( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(14));
 194 reg_def XMM6p( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(15));
 195 
 196 reg_def XMM7 ( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg());
 197 reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(1));
 198 reg_def XMM7c( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(2));
 199 reg_def XMM7d( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(3));
 200 reg_def XMM7e( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(4));
 201 reg_def XMM7f( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(5));
 202 reg_def XMM7g( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(6));
 203 reg_def XMM7h( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(7));
 204 reg_def XMM7i( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(8));
 205 reg_def XMM7j( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(9));
 206 reg_def XMM7k( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(10));
 207 reg_def XMM7l( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(11));
 208 reg_def XMM7m( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(12));
 209 reg_def XMM7n( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(13));
 210 reg_def XMM7o( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(14));
 211 reg_def XMM7p( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(15));
 212 
 213 #ifdef _LP64
 214 
 215 reg_def XMM8 ( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg());
 216 reg_def XMM8b( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(1));
 217 reg_def XMM8c( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(2));
 218 reg_def XMM8d( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(3));
 219 reg_def XMM8e( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(4));
 220 reg_def XMM8f( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(5));
 221 reg_def XMM8g( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(6));
 222 reg_def XMM8h( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(7));
 223 reg_def XMM8i( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(8));
 224 reg_def XMM8j( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(9));
 225 reg_def XMM8k( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(10));
 226 reg_def XMM8l( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(11));
 227 reg_def XMM8m( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(12));
 228 reg_def XMM8n( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(13));
 229 reg_def XMM8o( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(14));
 230 reg_def XMM8p( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(15));
 231 
 232 reg_def XMM9 ( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg());
 233 reg_def XMM9b( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(1));
 234 reg_def XMM9c( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(2));
 235 reg_def XMM9d( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(3));
 236 reg_def XMM9e( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(4));
 237 reg_def XMM9f( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(5));
 238 reg_def XMM9g( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(6));
 239 reg_def XMM9h( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(7));
 240 reg_def XMM9i( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(8));
 241 reg_def XMM9j( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(9));
 242 reg_def XMM9k( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(10));
 243 reg_def XMM9l( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(11));
 244 reg_def XMM9m( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(12));
 245 reg_def XMM9n( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(13));
 246 reg_def XMM9o( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(14));
 247 reg_def XMM9p( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(15));
 248 
 249 reg_def XMM10 ( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg());
 250 reg_def XMM10b( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(1));
 251 reg_def XMM10c( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(2));
 252 reg_def XMM10d( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(3));
 253 reg_def XMM10e( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(4));
 254 reg_def XMM10f( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(5));
 255 reg_def XMM10g( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(6));
 256 reg_def XMM10h( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(7));
 257 reg_def XMM10i( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(8));
 258 reg_def XMM10j( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(9));
 259 reg_def XMM10k( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(10));
 260 reg_def XMM10l( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(11));
 261 reg_def XMM10m( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(12));
 262 reg_def XMM10n( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(13));
 263 reg_def XMM10o( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(14));
 264 reg_def XMM10p( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(15));
 265 
 266 reg_def XMM11 ( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg());
 267 reg_def XMM11b( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(1));
 268 reg_def XMM11c( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(2));
 269 reg_def XMM11d( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(3));
 270 reg_def XMM11e( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(4));
 271 reg_def XMM11f( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(5));
 272 reg_def XMM11g( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(6));
 273 reg_def XMM11h( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(7));
 274 reg_def XMM11i( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(8));
 275 reg_def XMM11j( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(9));
 276 reg_def XMM11k( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(10));
 277 reg_def XMM11l( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(11));
 278 reg_def XMM11m( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(12));
 279 reg_def XMM11n( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(13));
 280 reg_def XMM11o( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(14));
 281 reg_def XMM11p( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(15));
 282 
 283 reg_def XMM12 ( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg());
 284 reg_def XMM12b( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(1));
 285 reg_def XMM12c( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(2));
 286 reg_def XMM12d( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(3));
 287 reg_def XMM12e( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(4));
 288 reg_def XMM12f( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(5));
 289 reg_def XMM12g( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(6));
 290 reg_def XMM12h( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(7));
 291 reg_def XMM12i( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(8));
 292 reg_def XMM12j( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(9));
 293 reg_def XMM12k( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(10));
 294 reg_def XMM12l( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(11));
 295 reg_def XMM12m( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(12));
 296 reg_def XMM12n( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(13));
 297 reg_def XMM12o( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(14));
 298 reg_def XMM12p( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(15));
 299 
 300 reg_def XMM13 ( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg());
 301 reg_def XMM13b( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(1));
 302 reg_def XMM13c( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(2));
 303 reg_def XMM13d( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(3));
 304 reg_def XMM13e( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(4));
 305 reg_def XMM13f( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(5));
 306 reg_def XMM13g( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(6));
 307 reg_def XMM13h( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(7));
 308 reg_def XMM13i( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(8));
 309 reg_def XMM13j( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(9));
 310 reg_def XMM13k( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(10));
 311 reg_def XMM13l( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(11));
 312 reg_def XMM13m( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(12));
 313 reg_def XMM13n( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(13));
 314 reg_def XMM13o( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(14));
 315 reg_def XMM13p( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(15));
 316 
 317 reg_def XMM14 ( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg());
 318 reg_def XMM14b( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(1));
 319 reg_def XMM14c( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(2));
 320 reg_def XMM14d( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(3));
 321 reg_def XMM14e( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(4));
 322 reg_def XMM14f( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(5));
 323 reg_def XMM14g( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(6));
 324 reg_def XMM14h( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(7));
 325 reg_def XMM14i( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(8));
 326 reg_def XMM14j( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(9));
 327 reg_def XMM14k( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(10));
 328 reg_def XMM14l( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(11));
 329 reg_def XMM14m( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(12));
 330 reg_def XMM14n( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(13));
 331 reg_def XMM14o( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(14));
 332 reg_def XMM14p( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(15));
 333 
 334 reg_def XMM15 ( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg());
 335 reg_def XMM15b( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(1));
 336 reg_def XMM15c( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(2));
 337 reg_def XMM15d( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(3));
 338 reg_def XMM15e( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(4));
 339 reg_def XMM15f( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(5));
 340 reg_def XMM15g( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(6));
 341 reg_def XMM15h( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(7));
 342 reg_def XMM15i( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(8));
 343 reg_def XMM15j( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(9));
 344 reg_def XMM15k( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(10));
 345 reg_def XMM15l( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(11));
 346 reg_def XMM15m( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(12));
 347 reg_def XMM15n( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(13));
 348 reg_def XMM15o( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(14));
 349 reg_def XMM15p( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(15));
 350 
 351 reg_def XMM16 ( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg());
 352 reg_def XMM16b( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(1));
 353 reg_def XMM16c( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(2));
 354 reg_def XMM16d( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(3));
 355 reg_def XMM16e( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(4));
 356 reg_def XMM16f( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(5));
 357 reg_def XMM16g( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(6));
 358 reg_def XMM16h( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(7));
 359 reg_def XMM16i( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(8));
 360 reg_def XMM16j( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(9));
 361 reg_def XMM16k( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(10));
 362 reg_def XMM16l( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(11));
 363 reg_def XMM16m( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(12));
 364 reg_def XMM16n( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(13));
 365 reg_def XMM16o( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(14));
 366 reg_def XMM16p( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(15));
 367 
 368 reg_def XMM17 ( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg());
 369 reg_def XMM17b( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(1));
 370 reg_def XMM17c( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(2));
 371 reg_def XMM17d( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(3));
 372 reg_def XMM17e( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(4));
 373 reg_def XMM17f( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(5));
 374 reg_def XMM17g( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(6));
 375 reg_def XMM17h( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(7));
 376 reg_def XMM17i( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(8));
 377 reg_def XMM17j( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(9));
 378 reg_def XMM17k( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(10));
 379 reg_def XMM17l( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(11));
 380 reg_def XMM17m( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(12));
 381 reg_def XMM17n( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(13));
 382 reg_def XMM17o( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(14));
 383 reg_def XMM17p( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(15));
 384 
 385 reg_def XMM18 ( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg());
 386 reg_def XMM18b( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(1));
 387 reg_def XMM18c( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(2));
 388 reg_def XMM18d( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(3));
 389 reg_def XMM18e( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(4));
 390 reg_def XMM18f( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(5));
 391 reg_def XMM18g( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(6));
 392 reg_def XMM18h( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(7));
 393 reg_def XMM18i( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(8));
 394 reg_def XMM18j( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(9));
 395 reg_def XMM18k( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(10));
 396 reg_def XMM18l( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(11));
 397 reg_def XMM18m( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(12));
 398 reg_def XMM18n( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(13));
 399 reg_def XMM18o( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(14));
 400 reg_def XMM18p( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(15));
 401 
 402 reg_def XMM19 ( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg());
 403 reg_def XMM19b( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(1));
 404 reg_def XMM19c( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(2));
 405 reg_def XMM19d( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(3));
 406 reg_def XMM19e( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(4));
 407 reg_def XMM19f( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(5));
 408 reg_def XMM19g( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(6));
 409 reg_def XMM19h( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(7));
 410 reg_def XMM19i( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(8));
 411 reg_def XMM19j( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(9));
 412 reg_def XMM19k( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(10));
 413 reg_def XMM19l( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(11));
 414 reg_def XMM19m( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(12));
 415 reg_def XMM19n( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(13));
 416 reg_def XMM19o( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(14));
 417 reg_def XMM19p( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(15));
 418 
 419 reg_def XMM20 ( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg());
 420 reg_def XMM20b( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(1));
 421 reg_def XMM20c( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(2));
 422 reg_def XMM20d( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(3));
 423 reg_def XMM20e( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(4));
 424 reg_def XMM20f( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(5));
 425 reg_def XMM20g( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(6));
 426 reg_def XMM20h( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(7));
 427 reg_def XMM20i( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(8));
 428 reg_def XMM20j( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(9));
 429 reg_def XMM20k( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(10));
 430 reg_def XMM20l( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(11));
 431 reg_def XMM20m( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(12));
 432 reg_def XMM20n( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(13));
 433 reg_def XMM20o( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(14));
 434 reg_def XMM20p( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(15));
 435 
 436 reg_def XMM21 ( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg());
 437 reg_def XMM21b( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(1));
 438 reg_def XMM21c( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(2));
 439 reg_def XMM21d( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(3));
 440 reg_def XMM21e( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(4));
 441 reg_def XMM21f( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(5));
 442 reg_def XMM21g( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(6));
 443 reg_def XMM21h( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(7));
 444 reg_def XMM21i( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(8));
 445 reg_def XMM21j( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(9));
 446 reg_def XMM21k( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(10));
 447 reg_def XMM21l( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(11));
 448 reg_def XMM21m( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(12));
 449 reg_def XMM21n( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(13));
 450 reg_def XMM21o( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(14));
 451 reg_def XMM21p( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(15));
 452 
 453 reg_def XMM22 ( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg());
 454 reg_def XMM22b( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(1));
 455 reg_def XMM22c( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(2));
 456 reg_def XMM22d( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(3));
 457 reg_def XMM22e( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(4));
 458 reg_def XMM22f( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(5));
 459 reg_def XMM22g( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(6));
 460 reg_def XMM22h( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(7));
 461 reg_def XMM22i( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(8));
 462 reg_def XMM22j( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(9));
 463 reg_def XMM22k( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(10));
 464 reg_def XMM22l( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(11));
 465 reg_def XMM22m( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(12));
 466 reg_def XMM22n( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(13));
 467 reg_def XMM22o( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(14));
 468 reg_def XMM22p( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(15));
 469 
 470 reg_def XMM23 ( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg());
 471 reg_def XMM23b( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(1));
 472 reg_def XMM23c( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(2));
 473 reg_def XMM23d( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(3));
 474 reg_def XMM23e( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(4));
 475 reg_def XMM23f( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(5));
 476 reg_def XMM23g( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(6));
 477 reg_def XMM23h( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(7));
 478 reg_def XMM23i( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(8));
 479 reg_def XMM23j( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(9));
 480 reg_def XMM23k( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(10));
 481 reg_def XMM23l( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(11));
 482 reg_def XMM23m( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(12));
 483 reg_def XMM23n( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(13));
 484 reg_def XMM23o( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(14));
 485 reg_def XMM23p( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(15));
 486 
 487 reg_def XMM24 ( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg());
 488 reg_def XMM24b( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(1));
 489 reg_def XMM24c( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(2));
 490 reg_def XMM24d( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(3));
 491 reg_def XMM24e( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(4));
 492 reg_def XMM24f( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(5));
 493 reg_def XMM24g( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(6));
 494 reg_def XMM24h( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(7));
 495 reg_def XMM24i( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(8));
 496 reg_def XMM24j( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(9));
 497 reg_def XMM24k( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(10));
 498 reg_def XMM24l( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(11));
 499 reg_def XMM24m( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(12));
 500 reg_def XMM24n( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(13));
 501 reg_def XMM24o( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(14));
 502 reg_def XMM24p( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(15));
 503 
 504 reg_def XMM25 ( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg());
 505 reg_def XMM25b( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(1));
 506 reg_def XMM25c( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(2));
 507 reg_def XMM25d( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(3));
 508 reg_def XMM25e( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(4));
 509 reg_def XMM25f( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(5));
 510 reg_def XMM25g( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(6));
 511 reg_def XMM25h( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(7));
 512 reg_def XMM25i( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(8));
 513 reg_def XMM25j( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(9));
 514 reg_def XMM25k( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(10));
 515 reg_def XMM25l( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(11));
 516 reg_def XMM25m( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(12));
 517 reg_def XMM25n( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(13));
 518 reg_def XMM25o( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(14));
 519 reg_def XMM25p( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(15));
 520 
 521 reg_def XMM26 ( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg());
 522 reg_def XMM26b( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(1));
 523 reg_def XMM26c( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(2));
 524 reg_def XMM26d( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(3));
 525 reg_def XMM26e( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(4));
 526 reg_def XMM26f( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(5));
 527 reg_def XMM26g( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(6));
 528 reg_def XMM26h( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(7));
 529 reg_def XMM26i( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(8));
 530 reg_def XMM26j( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(9));
 531 reg_def XMM26k( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(10));
 532 reg_def XMM26l( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(11));
 533 reg_def XMM26m( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(12));
 534 reg_def XMM26n( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(13));
 535 reg_def XMM26o( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(14));
 536 reg_def XMM26p( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(15));
 537 
 538 reg_def XMM27 ( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg());
 539 reg_def XMM27b( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(1));
 540 reg_def XMM27c( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(2));
 541 reg_def XMM27d( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(3));
 542 reg_def XMM27e( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(4));
 543 reg_def XMM27f( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(5));
 544 reg_def XMM27g( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(6));
 545 reg_def XMM27h( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(7));
 546 reg_def XMM27i( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(8));
 547 reg_def XMM27j( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(9));
 548 reg_def XMM27k( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(10));
 549 reg_def XMM27l( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(11));
 550 reg_def XMM27m( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(12));
 551 reg_def XMM27n( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(13));
 552 reg_def XMM27o( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(14));
 553 reg_def XMM27p( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(15));
 554 
 555 reg_def XMM28 ( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg());
 556 reg_def XMM28b( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(1));
 557 reg_def XMM28c( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(2));
 558 reg_def XMM28d( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(3));
 559 reg_def XMM28e( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(4));
 560 reg_def XMM28f( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(5));
 561 reg_def XMM28g( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(6));
 562 reg_def XMM28h( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(7));
 563 reg_def XMM28i( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(8));
 564 reg_def XMM28j( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(9));
 565 reg_def XMM28k( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(10));
 566 reg_def XMM28l( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(11));
 567 reg_def XMM28m( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(12));
 568 reg_def XMM28n( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(13));
 569 reg_def XMM28o( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(14));
 570 reg_def XMM28p( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(15));
 571 
 572 reg_def XMM29 ( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg());
 573 reg_def XMM29b( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(1));
 574 reg_def XMM29c( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(2));
 575 reg_def XMM29d( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(3));
 576 reg_def XMM29e( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(4));
 577 reg_def XMM29f( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(5));
 578 reg_def XMM29g( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(6));
 579 reg_def XMM29h( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(7));
 580 reg_def XMM29i( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(8));
 581 reg_def XMM29j( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(9));
 582 reg_def XMM29k( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(10));
 583 reg_def XMM29l( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(11));
 584 reg_def XMM29m( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(12));
 585 reg_def XMM29n( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(13));
 586 reg_def XMM29o( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(14));
 587 reg_def XMM29p( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(15));
 588 
 589 reg_def XMM30 ( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg());
 590 reg_def XMM30b( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(1));
 591 reg_def XMM30c( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(2));
 592 reg_def XMM30d( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(3));
 593 reg_def XMM30e( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(4));
 594 reg_def XMM30f( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(5));
 595 reg_def XMM30g( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(6));
 596 reg_def XMM30h( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(7));
 597 reg_def XMM30i( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(8));
 598 reg_def XMM30j( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(9));
 599 reg_def XMM30k( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(10));
 600 reg_def XMM30l( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(11));
 601 reg_def XMM30m( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(12));
 602 reg_def XMM30n( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(13));
 603 reg_def XMM30o( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(14));
 604 reg_def XMM30p( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(15));
 605 
 606 reg_def XMM31 ( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg());
 607 reg_def XMM31b( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(1));
 608 reg_def XMM31c( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(2));
 609 reg_def XMM31d( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(3));
 610 reg_def XMM31e( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(4));
 611 reg_def XMM31f( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(5));
 612 reg_def XMM31g( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(6));
 613 reg_def XMM31h( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(7));
 614 reg_def XMM31i( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(8));
 615 reg_def XMM31j( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(9));
 616 reg_def XMM31k( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(10));
 617 reg_def XMM31l( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(11));
 618 reg_def XMM31m( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(12));
 619 reg_def XMM31n( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(13));
 620 reg_def XMM31o( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(14));
 621 reg_def XMM31p( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(15));
 622 
 623 #endif // _LP64
 624 
 625 #ifdef _LP64
 626 reg_def RFLAGS(SOC, SOC, 0, 16, VMRegImpl::Bad());
 627 #else
 628 reg_def RFLAGS(SOC, SOC, 0, 8, VMRegImpl::Bad());
 629 #endif // _LP64
 630 
 631 alloc_class chunk1(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
 632                    XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
 633                    XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
 634                    XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
 635                    XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
 636                    XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
 637                    XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
 638                    XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
 639 #ifdef _LP64
 640                   ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
 641                    XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
 642                    XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
 643                    XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
 644                    XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
 645                    XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
 646                    XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
 647                    XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
 648                   ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
 649                    XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
 650                    XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
 651                    XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
 652                    XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
 653                    XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
 654                    XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
 655                    XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
 656                    XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
 657                    XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
 658                    XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
 659                    XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
 660                    XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
 661                    XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
 662                    XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
 663                    XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
 664 #endif
 665                       );
 666 
 667 // flags allocation class should be last.
 668 alloc_class chunk2(RFLAGS);
 669 
 670 // Singleton class for condition codes
 671 reg_class int_flags(RFLAGS);
 672 
 673 // Class for pre evex float registers
 674 reg_class float_reg_legacy(XMM0,
 675                     XMM1,
 676                     XMM2,
 677                     XMM3,
 678                     XMM4,
 679                     XMM5,
 680                     XMM6,
 681                     XMM7
 682 #ifdef _LP64
 683                    ,XMM8,
 684                     XMM9,
 685                     XMM10,
 686                     XMM11,
 687                     XMM12,
 688                     XMM13,
 689                     XMM14,
 690                     XMM15
 691 #endif
 692                     );
 693 
 694 // Class for evex float registers
 695 reg_class float_reg_evex(XMM0,
 696                     XMM1,
 697                     XMM2,
 698                     XMM3,
 699                     XMM4,
 700                     XMM5,
 701                     XMM6,
 702                     XMM7
 703 #ifdef _LP64
 704                    ,XMM8,
 705                     XMM9,
 706                     XMM10,
 707                     XMM11,
 708                     XMM12,
 709                     XMM13,
 710                     XMM14,
 711                     XMM15,
 712                     XMM16,
 713                     XMM17,
 714                     XMM18,
 715                     XMM19,
 716                     XMM20,
 717                     XMM21,
 718                     XMM22,
 719                     XMM23,
 720                     XMM24,
 721                     XMM25,
 722                     XMM26,
 723                     XMM27,
 724                     XMM28,
 725                     XMM29,
 726                     XMM30,
 727                     XMM31
 728 #endif
 729                     );
 730 
 731 reg_class_dynamic float_reg(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() %} );
 732 
 733 // Class for pre evex double registers
 734 reg_class double_reg_legacy(XMM0,  XMM0b,
 735                      XMM1,  XMM1b,
 736                      XMM2,  XMM2b,
 737                      XMM3,  XMM3b,
 738                      XMM4,  XMM4b,
 739                      XMM5,  XMM5b,
 740                      XMM6,  XMM6b,
 741                      XMM7,  XMM7b
 742 #ifdef _LP64
 743                     ,XMM8,  XMM8b,
 744                      XMM9,  XMM9b,
 745                      XMM10, XMM10b,
 746                      XMM11, XMM11b,
 747                      XMM12, XMM12b,
 748                      XMM13, XMM13b,
 749                      XMM14, XMM14b,
 750                      XMM15, XMM15b
 751 #endif
 752                      );
 753 
 754 // Class for evex double registers
 755 reg_class double_reg_evex(XMM0,  XMM0b,
 756                      XMM1,  XMM1b,
 757                      XMM2,  XMM2b,
 758                      XMM3,  XMM3b,
 759                      XMM4,  XMM4b,
 760                      XMM5,  XMM5b,
 761                      XMM6,  XMM6b,
 762                      XMM7,  XMM7b
 763 #ifdef _LP64
 764                     ,XMM8,  XMM8b,
 765                      XMM9,  XMM9b,
 766                      XMM10, XMM10b,
 767                      XMM11, XMM11b,
 768                      XMM12, XMM12b,
 769                      XMM13, XMM13b,
 770                      XMM14, XMM14b,
 771                      XMM15, XMM15b,
 772                      XMM16, XMM16b,
 773                      XMM17, XMM17b,
 774                      XMM18, XMM18b,
 775                      XMM19, XMM19b,
 776                      XMM20, XMM20b,
 777                      XMM21, XMM21b,
 778                      XMM22, XMM22b,
 779                      XMM23, XMM23b,
 780                      XMM24, XMM24b,
 781                      XMM25, XMM25b,
 782                      XMM26, XMM26b,
 783                      XMM27, XMM27b,
 784                      XMM28, XMM28b,
 785                      XMM29, XMM29b,
 786                      XMM30, XMM30b,
 787                      XMM31, XMM31b
 788 #endif
 789                      );
 790 
 791 reg_class_dynamic double_reg(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() %} );
 792 
 793 // Class for pre evex 32bit vector registers
 794 reg_class vectors_reg_legacy(XMM0,
 795                       XMM1,
 796                       XMM2,
 797                       XMM3,
 798                       XMM4,
 799                       XMM5,
 800                       XMM6,
 801                       XMM7
 802 #ifdef _LP64
 803                      ,XMM8,
 804                       XMM9,
 805                       XMM10,
 806                       XMM11,
 807                       XMM12,
 808                       XMM13,
 809                       XMM14,
 810                       XMM15
 811 #endif
 812                       );
 813 
 814 // Class for evex 32bit vector registers
 815 reg_class vectors_reg_evex(XMM0,
 816                       XMM1,
 817                       XMM2,
 818                       XMM3,
 819                       XMM4,
 820                       XMM5,
 821                       XMM6,
 822                       XMM7
 823 #ifdef _LP64
 824                      ,XMM8,
 825                       XMM9,
 826                       XMM10,
 827                       XMM11,
 828                       XMM12,
 829                       XMM13,
 830                       XMM14,
 831                       XMM15,
 832                       XMM16,
 833                       XMM17,
 834                       XMM18,
 835                       XMM19,
 836                       XMM20,
 837                       XMM21,
 838                       XMM22,
 839                       XMM23,
 840                       XMM24,
 841                       XMM25,
 842                       XMM26,
 843                       XMM27,
 844                       XMM28,
 845                       XMM29,
 846                       XMM30,
 847                       XMM31
 848 #endif
 849                       );
 850 
 851 reg_class_dynamic vectors_reg(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_evex() %} );
 852 
 853 // Class for all 64bit vector registers
 854 reg_class vectord_reg_legacy(XMM0,  XMM0b,
 855                       XMM1,  XMM1b,
 856                       XMM2,  XMM2b,
 857                       XMM3,  XMM3b,
 858                       XMM4,  XMM4b,
 859                       XMM5,  XMM5b,
 860                       XMM6,  XMM6b,
 861                       XMM7,  XMM7b
 862 #ifdef _LP64
 863                      ,XMM8,  XMM8b,
 864                       XMM9,  XMM9b,
 865                       XMM10, XMM10b,
 866                       XMM11, XMM11b,
 867                       XMM12, XMM12b,
 868                       XMM13, XMM13b,
 869                       XMM14, XMM14b,
 870                       XMM15, XMM15b
 871 #endif
 872                       );
 873 
 874 // Class for all 64bit vector registers
 875 reg_class vectord_reg_evex(XMM0,  XMM0b,
 876                       XMM1,  XMM1b,
 877                       XMM2,  XMM2b,
 878                       XMM3,  XMM3b,
 879                       XMM4,  XMM4b,
 880                       XMM5,  XMM5b,
 881                       XMM6,  XMM6b,
 882                       XMM7,  XMM7b
 883 #ifdef _LP64
 884                      ,XMM8,  XMM8b,
 885                       XMM9,  XMM9b,
 886                       XMM10, XMM10b,
 887                       XMM11, XMM11b,
 888                       XMM12, XMM12b,
 889                       XMM13, XMM13b,
 890                       XMM14, XMM14b,
 891                       XMM15, XMM15b,
 892                       XMM16, XMM16b,
 893                       XMM17, XMM17b,
 894                       XMM18, XMM18b,
 895                       XMM19, XMM19b,
 896                       XMM20, XMM20b,
 897                       XMM21, XMM21b,
 898                       XMM22, XMM22b,
 899                       XMM23, XMM23b,
 900                       XMM24, XMM24b,
 901                       XMM25, XMM25b,
 902                       XMM26, XMM26b,
 903                       XMM27, XMM27b,
 904                       XMM28, XMM28b,
 905                       XMM29, XMM29b,
 906                       XMM30, XMM30b,
 907                       XMM31, XMM31b
 908 #endif
 909                       );
 910 
 911 reg_class_dynamic vectord_reg(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_evex() %} );
 912 
 913 // Class for all 128bit vector registers
 914 reg_class vectorx_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,
 915                       XMM1,  XMM1b,  XMM1c,  XMM1d,
 916                       XMM2,  XMM2b,  XMM2c,  XMM2d,
 917                       XMM3,  XMM3b,  XMM3c,  XMM3d,
 918                       XMM4,  XMM4b,  XMM4c,  XMM4d,
 919                       XMM5,  XMM5b,  XMM5c,  XMM5d,
 920                       XMM6,  XMM6b,  XMM6c,  XMM6d,
 921                       XMM7,  XMM7b,  XMM7c,  XMM7d
 922 #ifdef _LP64
 923                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,
 924                       XMM9,  XMM9b,  XMM9c,  XMM9d,
 925                       XMM10, XMM10b, XMM10c, XMM10d,
 926                       XMM11, XMM11b, XMM11c, XMM11d,
 927                       XMM12, XMM12b, XMM12c, XMM12d,
 928                       XMM13, XMM13b, XMM13c, XMM13d,
 929                       XMM14, XMM14b, XMM14c, XMM14d,
 930                       XMM15, XMM15b, XMM15c, XMM15d
 931 #endif
 932                       );
 933 
 934 // Class for all 128bit vector registers
 935 reg_class vectorx_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,
 936                       XMM1,  XMM1b,  XMM1c,  XMM1d,
 937                       XMM2,  XMM2b,  XMM2c,  XMM2d,
 938                       XMM3,  XMM3b,  XMM3c,  XMM3d,
 939                       XMM4,  XMM4b,  XMM4c,  XMM4d,
 940                       XMM5,  XMM5b,  XMM5c,  XMM5d,
 941                       XMM6,  XMM6b,  XMM6c,  XMM6d,
 942                       XMM7,  XMM7b,  XMM7c,  XMM7d
 943 #ifdef _LP64
 944                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,
 945                       XMM9,  XMM9b,  XMM9c,  XMM9d,
 946                       XMM10, XMM10b, XMM10c, XMM10d,
 947                       XMM11, XMM11b, XMM11c, XMM11d,
 948                       XMM12, XMM12b, XMM12c, XMM12d,
 949                       XMM13, XMM13b, XMM13c, XMM13d,
 950                       XMM14, XMM14b, XMM14c, XMM14d,
 951                       XMM15, XMM15b, XMM15c, XMM15d,
 952                       XMM16, XMM16b, XMM16c, XMM16d,
 953                       XMM17, XMM17b, XMM17c, XMM17d,
 954                       XMM18, XMM18b, XMM18c, XMM18d,
 955                       XMM19, XMM19b, XMM19c, XMM19d,
 956                       XMM20, XMM20b, XMM20c, XMM20d,
 957                       XMM21, XMM21b, XMM21c, XMM21d,
 958                       XMM22, XMM22b, XMM22c, XMM22d,
 959                       XMM23, XMM23b, XMM23c, XMM23d,
 960                       XMM24, XMM24b, XMM24c, XMM24d,
 961                       XMM25, XMM25b, XMM25c, XMM25d,
 962                       XMM26, XMM26b, XMM26c, XMM26d,
 963                       XMM27, XMM27b, XMM27c, XMM27d,
 964                       XMM28, XMM28b, XMM28c, XMM28d,
 965                       XMM29, XMM29b, XMM29c, XMM29d,
 966                       XMM30, XMM30b, XMM30c, XMM30d,
 967                       XMM31, XMM31b, XMM31c, XMM31d
 968 #endif
 969                       );
 970 
 971 reg_class_dynamic vectorx_reg(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_evex() %} );
 972 
 973 // Class for all 256bit vector registers
 974 reg_class vectory_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 975                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 976                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 977                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 978                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
 979                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
 980                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
 981                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
 982 #ifdef _LP64
 983                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
 984                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
 985                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
 986                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
 987                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
 988                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 989                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 990                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h
 991 #endif
 992                       );
 993 
 994 // Class for all 256bit vector registers
 995 reg_class vectory_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 996                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 997                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 998                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 999                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
1000                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
1001                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
1002                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
1003 #ifdef _LP64
1004                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
1005                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
1006                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
1007                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
1008                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
1009                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
1010                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
1011                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h,
1012                       XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h,
1013                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h,
1014                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h,
1015                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h,
1016                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h,
1017                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h,
1018                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h,
1019                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h,
1020                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h,
1021                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h,
1022                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h,
1023                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h,
1024                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h,
1025                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h,
1026                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h,
1027                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h
1028 #endif
1029                       );
1030 
1031 reg_class_dynamic vectory_reg(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_evex() %} );
1032 
1033 // Class for all 512bit vector registers
1034 reg_class vectorz_reg(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
1035                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
1036                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
1037                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
1038                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
1039                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
1040                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
1041                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
1042 #ifdef _LP64
1043                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
1044                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
1045                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
1046                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
1047                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
1048                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
1049                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
1050                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
1051                      ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
1052                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
1053                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
1054                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
1055                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
1056                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
1057                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
1058                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
1059                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
1060                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
1061                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
1062                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
1063                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
1064                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
1065                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
1066                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
1067 #endif
1068                       );
1069 
1070 reg_class xmm0_reg(XMM0, XMM0b, XMM0c, XMM0d);
1071 reg_class ymm0_reg(XMM0, XMM0b, XMM0c, XMM0d, XMM0e, XMM0f, XMM0g, XMM0h);
1072 reg_class zmm0_reg(XMM0, XMM0b, XMM0c, XMM0d, XMM0e, XMM0f, XMM0g, XMM0h, XMM0i, XMM0j, XMM0k, XMM0l, XMM0m, XMM0n, XMM0o, XMM0p);
1073 
1074 %}
1075 
1076 
1077 //----------SOURCE BLOCK-------------------------------------------------------
1078 // This is a block of C++ code which provides values, functions, and
1079 // definitions necessary in the rest of the architecture description
1080 
1081 source_hpp %{
1082 // Header information of the source block.
1083 // Method declarations/definitions which are used outside
1084 // the ad-scope can conveniently be defined here.
1085 //
1086 // To keep related declarations/definitions/uses close together,
1087 // we switch between source %{ }% and source_hpp %{ }% freely as needed.
1088 
1089 class NativeJump;
1090 
1091 class CallStubImpl {
1092 
1093   //--------------------------------------------------------------
1094   //---<  Used for optimization in Compile::shorten_branches  >---
1095   //--------------------------------------------------------------
1096 
1097  public:
1098   // Size of call trampoline stub.
1099   static uint size_call_trampoline() {
1100     return 0; // no call trampolines on this platform
1101   }
1102 
1103   // number of relocations needed by a call trampoline stub
1104   static uint reloc_call_trampoline() {
1105     return 0; // no call trampolines on this platform
1106   }
1107 };
1108 
1109 class HandlerImpl {
1110 
1111  public:
1112 
1113   static int emit_exception_handler(CodeBuffer &cbuf);
1114   static int emit_deopt_handler(CodeBuffer& cbuf);
1115 
1116   static uint size_exception_handler() {
1117     // NativeCall instruction size is the same as NativeJump.
    // The exception handler starts out as a jump and can be patched to
    // a call by deoptimization.  (4932387)
1120     // Note that this value is also credited (in output.cpp) to
1121     // the size of the code section.
1122     return NativeJump::instruction_size;
1123   }
1124 
1125 #ifdef _LP64
1126   static uint size_deopt_handler() {
    // three 5-byte instructions: call, sub, and jmp (see emit_deopt_handler)
1128     return 15;
1129   }
1130 #else
1131   static uint size_deopt_handler() {
1132     // NativeCall instruction size is the same as NativeJump.
    // The exception handler starts out as a jump and can be patched to
    // a call by deoptimization.  (4932387)
1135     // Note that this value is also credited (in output.cpp) to
1136     // the size of the code section.
1137     return 5 + NativeJump::instruction_size; // pushl(); jmp;
1138   }
1139 #endif
1140 };
1141 
1142 %} // end source_hpp
1143 
1144 source %{
1145 
1146 #include "opto/addnode.hpp"
1147 
1148 // Emit exception handler code.
1149 // Stuff framesize into a register and call a VM stub routine.
1150 int HandlerImpl::emit_exception_handler(CodeBuffer& cbuf) {
1151 
1152   // Note that the code buffer's insts_mark is always relative to insts.
1153   // That's why we must use the macroassembler to generate a handler.
1154   MacroAssembler _masm(&cbuf);
1155   address base = __ start_a_stub(size_exception_handler());
1156   if (base == NULL) {
1157     ciEnv::current()->record_failure("CodeCache is full");
1158     return 0;  // CodeBuffer::expand failed
1159   }
1160   int offset = __ offset();
1161   __ jump(RuntimeAddress(OptoRuntime::exception_blob()->entry_point()));
1162   assert(__ offset() - offset <= (int) size_exception_handler(), "overflow");
1163   __ end_a_stub();
1164   return offset;
1165 }
1166 
1167 // Emit deopt handler code.
1168 int HandlerImpl::emit_deopt_handler(CodeBuffer& cbuf) {
1169 
1170   // Note that the code buffer's insts_mark is always relative to insts.
1171   // That's why we must use the macroassembler to generate a handler.
1172   MacroAssembler _masm(&cbuf);
1173   address base = __ start_a_stub(size_deopt_handler());
1174   if (base == NULL) {
1175     ciEnv::current()->record_failure("CodeCache is full");
1176     return 0;  // CodeBuffer::expand failed
1177   }
1178   int offset = __ offset();
1179 
1180 #ifdef _LP64
1181   address the_pc = (address) __ pc();
1182   Label next;
1183   // push a "the_pc" on the stack without destroying any registers
1184   // as they all may be live.
1185 
1186   // push address of "next"
1187   __ call(next, relocInfo::none); // reloc none is fine since it is a disp32
1188   __ bind(next);
1189   // adjust it so it matches "the_pc"
1190   __ subptr(Address(rsp, 0), __ offset() - offset);
1191 #else
1192   InternalAddress here(__ pc());
1193   __ pushptr(here.addr());
1194 #endif
1195 
1196   __ jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
1197   assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow");
1198   __ end_a_stub();
1199   return offset;
1200 }
1201 
1202 
1203 //=============================================================================
1204 
1205   // Float masks come from different places depending on platform.
1206 #ifdef _LP64
1207   static address float_signmask()  { return StubRoutines::x86::float_sign_mask(); }
1208   static address float_signflip()  { return StubRoutines::x86::float_sign_flip(); }
1209   static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
1210   static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
1211   static address vector_float_signmask() { return StubRoutines::x86::vector_float_sign_mask(); }
1212   static address vector_float_signflip() { return StubRoutines::x86::vector_float_sign_flip(); }
1213   static address vector_double_signmask() { return StubRoutines::x86::vector_double_sign_mask(); }
1214   static address vector_double_signflip() { return StubRoutines::x86::vector_double_sign_flip(); }
1215   static address vector_all_bits_set() { return StubRoutines::x86::vector_all_bits_set(); }
1216   static address vector_byte_bitset() { return StubRoutines::x86::vector_byte_bitset(); }
1217   static address vector_long_perm_mask() { return StubRoutines::x86::vector_long_perm_mask(); }
1218   static address vector_byte_saturationmask() { return StubRoutines::x86::vector_byte_saturation_mask(); }
1219 #else
1220   static address float_signmask()  { return (address)float_signmask_pool; }
1221   static address float_signflip()  { return (address)float_signflip_pool; }
1222   static address double_signmask() { return (address)double_signmask_pool; }
1223   static address double_signflip() { return (address)double_signflip_pool; }
1224 #endif
1225 
1226 
1227 const bool Matcher::match_rule_supported(int opcode) {
1228   if (!has_match_rule(opcode))
1229     return false;
1230 
1231   bool ret_value = true;
1232   switch (opcode) {
1233     case Op_PopCountI:
1234     case Op_PopCountL:
1235       if (!UsePopCountInstruction)
1236         ret_value = false;
1237       break;
1238     case Op_MulVB:
1239     case Op_MulVI:
1240     case Op_MulVL:
1241       if ((UseSSE < 4) && (UseAVX < 1)) // only with SSE4_1 or AVX
1242         ret_value = false;
1243       break;
1244     case Op_MulReductionVL:
1245       if (VM_Version::supports_avx512dq() == false)
1246         ret_value = false;
1247       break;
1248     case Op_AddReductionVL:
      if (UseAVX < 3) // requires EVEX (AVX-512); cross-lane element access becomes an issue otherwise
1250         ret_value = false;
1251       break;
1252     case Op_AddReductionVI:
1253       if (UseSSE < 3) // requires at least SSE3
1254         ret_value = false;
1255       break;
1256     case Op_MulReductionVI:
1257       if (UseSSE < 4) // requires at least SSE4
1258         ret_value = false;
1259       break;
1260     case Op_AddReductionVF:
1261     case Op_AddReductionVD:
1262     case Op_MulReductionVF:
1263     case Op_MulReductionVD:
1264       if (UseSSE < 1) // requires at least SSE
1265         ret_value = false;
1266       break;
1267     case Op_SqrtVD:
1268     case Op_SqrtVF:
1269       if (UseAVX < 1) // enabled for AVX only
1270         ret_value = false;
1271       break;
1272     case Op_CompareAndSwapL:
1273 #ifdef _LP64
1274     case Op_CompareAndSwapP:
1275 #endif
1276       if (!VM_Version::supports_cx8())
1277         ret_value = false;
1278       break;
1279     case Op_CMoveVF:
1280     case Op_CMoveVD:
1281       if (UseAVX < 1 || UseAVX > 2)
1282         ret_value = false;
1283       break;
1284     case Op_StrIndexOf:
1285       if (!UseSSE42Intrinsics)
1286         ret_value = false;
1287       break;
1288     case Op_StrIndexOfChar:
1289       if (!UseSSE42Intrinsics)
1290         ret_value = false;
1291       break;
1292     case Op_OnSpinWait:
1293       if (VM_Version::supports_on_spin_wait() == false)
1294         ret_value = false;
1295       break;
1296   }
1297 
  return ret_value;  // By default, match rules are supported.
1299 }
1300 
1301 const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) {
  // Identify extra cases that we might want to provide match rules for,
  // e.g. Op_* vector nodes and other intrinsics, while guarding with vlen.
1304   bool ret_value = match_rule_supported(opcode);
1305   if (ret_value) {
1306     int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
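    // For example, vlen == 8 elements of T_INT gives 8 * 4 * 8 == 256 bits.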
1307     if (!vector_size_supported(bt, vlen)) {
1308       ret_value = false;
1309     } else if (size_in_bits > 256 && UseAVX <= 2) {
1310       // Only AVX512 supports 512-bit vectors
1311       ret_value = false;
1312     } else if (UseAVX == 0 && size_in_bits > 128) {
1313       // Only AVX supports 256-bit vectors
1314       ret_value = false;
1315     } else if (is_subword_type(bt) && size_in_bits == 512 && VM_Version::supports_avx512bw() == false) {
      // Byte and short vectors are not supported at 512 bits unless AVX512BW is available.
1317       ret_value = false;
1318     } else {
1319         switch (opcode) {
1320         case Op_AddVB:
1321         case Op_SubVB:
1322           if ((vlen == 64) && (VM_Version::supports_avx512bw() == false))
1323             ret_value = false;
1324           break;
1325         case Op_URShiftVS:
1326         case Op_RShiftVS:
1327         case Op_LShiftVS:
1328         case Op_MulVS:
1329         case Op_AddVS:
1330         case Op_SubVS:
1331           if ((vlen == 32) && (VM_Version::supports_avx512bw() == false))
1332             ret_value = false;
1333           break;
1334         case Op_CMoveVF:
1335           if (vlen != 8)
1336             ret_value  = false;
1337           break;
1338         case Op_CMoveVD:
1339           if (vlen != 4)
1340             ret_value  = false;
1341           break;
1342         case Op_VectorMaskCmp:
1343           if (UseAVX <= 0) { ret_value = false; }
1344           else if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) { ret_value = false; }
1345           break;
1346         case Op_VectorBlend:
1347           if (UseSSE <= 3 && UseAVX == 0) { ret_value = false; }
1348           else if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) { ret_value = false; }
1349           break;
1350         case Op_VectorTest:
1351           if (UseAVX <= 0) { ret_value = false; }
1352           else if (size_in_bits != 128 && size_in_bits != 256) { ret_value = false; } // Implementation limitation
1353           break;
1354         case Op_VectorLoadMask:
1355           if (UseSSE <= 3) { ret_value = false; }
1356           else if (vlen == 1 || vlen == 2) { ret_value = false; } // Implementation limitation
1357           break;
1358         case Op_VectorStoreMask:
1359           if (UseAVX < 2) { ret_value = false; } // Implementation limitation
1360           else if (vlen == 1 || vlen == 2) { ret_value = false; } // Implementation limitation
1361           else if (size_in_bits == 512 && !VM_Version::supports_avx512bw()) { ret_value = false; } // Implementation limitation
1362           break;
1363         default:
1364           break;
1365       }
1366     }
1367   }
1368   if (ret_value) {
1369     assert(is_java_primitive(bt) && (vlen > 0) && is_power_of_2(vlen) &&
1370            vector_size_supported(bt, vlen), "must be supported");
1371   }
1372 
  return ret_value;  // By default, match rules are supported.
1374 }
1375 
1376 const bool Matcher::has_predicated_vectors(void) {
1377   bool ret_value = false;
1378   if (UseAVX > 2) {
1379     ret_value = VM_Version::supports_avx512vl();
1380   }
1381 
1382   return ret_value;
1383 }
1384 
1385 const int Matcher::float_pressure(int default_pressure_threshold) {
1386   int float_pressure_threshold = default_pressure_threshold;
1387 #ifdef _LP64
1388   if (UseAVX > 2) {
    // Increase pressure threshold on machines with AVX3, which have
    // twice as many XMM registers.
1391     float_pressure_threshold = default_pressure_threshold * 2;
1392   }
1393 #endif
1394   return float_pressure_threshold;
1395 }
1396 
1397 // Max vector size in bytes. 0 if not supported.
1398 const int Matcher::vector_width_in_bytes(BasicType bt) {
1399   assert(is_java_primitive(bt), "only primitive type vectors");
1400   if (UseSSE < 2) return 0;
1401   // SSE2 supports 128bit vectors for all types.
1402   // AVX2 supports 256bit vectors for all types.
  // AVX512/EVEX supports 512bit vectors for all types.
1404   int size = (UseAVX > 1) ? (1 << UseAVX) * 8 : 16;
1405   // AVX1 supports 256bit vectors only for FLOAT and DOUBLE.
1406   if (UseAVX > 0 && (bt == T_FLOAT || bt == T_DOUBLE))
1407     size = (UseAVX > 2) ? 64 : 32;
1408   // Use flag to limit vector size.
1409   size = MIN2(size,(int)MaxVectorSize);
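  // Illustrative example (assuming MaxVectorSize >= 32): with UseAVX == 2,
  // T_INT vectors are 32 bytes wide (8 elements); with UseAVX == 0 and
  // UseSSE >= 2 they are 16 bytes (4 elements).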
1410   // Minimum 2 values in vector (or 4 for bytes).
1411   switch (bt) {
1412   case T_DOUBLE:
1413   case T_LONG:
1414     if (size < 16) return 0;
1415     break;
1416   case T_FLOAT:
1417   case T_INT:
1418     if (size < 8) return 0;
1419     break;
1420   case T_BOOLEAN:
1421     if (size < 4) return 0;
1422     break;
1423   case T_CHAR:
1424     if (size < 4) return 0;
1425     break;
1426   case T_BYTE:
1427     if (size < 4) return 0;
1428     break;
1429   case T_SHORT:
1430     if (size < 4) return 0;
1431     break;
1432   default:
1433     ShouldNotReachHere();
1434   }
1435   return size;
1436 }
1437 
1438 // Limits on vector size (number of elements) loaded into vector.
1439 const int Matcher::max_vector_size(const BasicType bt) {
1440   return vector_width_in_bytes(bt)/type2aelembytes(bt);
1441 }
1442 const int Matcher::min_vector_size(const BasicType bt) {
1443   int max_size = max_vector_size(bt);
1444   // Min size which can be loaded into vector is 4 bytes.
1445   int size = (type2aelembytes(bt) == 1) ? 4 : 2;
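  // For example, T_BYTE vectors must have at least 4 elements (4 bytes),
  // while T_INT vectors need only 2 elements (8 bytes).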
1446   return MIN2(size,max_size);
1447 }
1448 
// Vector ideal register corresponding to specified size in bytes
1450 const uint Matcher::vector_ideal_reg(int size) {
1451   assert(MaxVectorSize >= size, "");
1452   switch(size) {
1453     case  4: return Op_VecS;
1454     case  8: return Op_VecD;
1455     case 16: return Op_VecX;
1456     case 32: return Op_VecY;
1457     case 64: return Op_VecZ;
1458   }
1459   ShouldNotReachHere();
1460   return 0;
1461 }
1462 
1463 // Only lowest bits of xmm reg are used for vector shift count.
1464 const uint Matcher::vector_shift_count_ideal_reg(int size) {
1465   return Op_VecS;
1466 }
1467 
// x86 supports misaligned vector loads and stores.
1469 const bool Matcher::misaligned_vectors_ok() {
1470   return !AlignVector; // can be changed by flag
1471 }
1472 
1473 // x86 AES instructions are compatible with SunJCE expanded
1474 // keys, hence we do not need to pass the original key to stubs
1475 const bool Matcher::pass_original_key_for_aes() {
1476   return false;
1477 }
1478 
1479 
1480 const bool Matcher::convi2l_type_required = true;
1481 
1482 // Check for shift by small constant as well
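// A shift by 0..3 maps onto the x86 scale factor (1, 2, 4 or 8), so an address
// such as base + (index << 2) + 12 can typically be folded into the addressing
// mode (scale of 4) instead of being computed into a register.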
1483 static bool clone_shift(Node* shift, Matcher* matcher, Matcher::MStack& mstack, VectorSet& address_visited) {
1484   if (shift->Opcode() == Op_LShiftX && shift->in(2)->is_Con() &&
1485       shift->in(2)->get_int() <= 3 &&
1486       // Are there other uses besides address expressions?
1487       !matcher->is_visited(shift)) {
1488     address_visited.set(shift->_idx); // Flag as address_visited
1489     mstack.push(shift->in(2), Matcher::Visit);
1490     Node *conv = shift->in(1);
1491 #ifdef _LP64
    // Allow the Matcher to match the rule which bypasses the
    // ConvI2L operation for an array index on LP64
1494     // if the index value is positive.
1495     if (conv->Opcode() == Op_ConvI2L &&
1496         conv->as_Type()->type()->is_long()->_lo >= 0 &&
1497         // Are there other uses besides address expressions?
1498         !matcher->is_visited(conv)) {
1499       address_visited.set(conv->_idx); // Flag as address_visited
1500       mstack.push(conv->in(1), Matcher::Pre_Visit);
1501     } else
1502 #endif
1503       mstack.push(conv, Matcher::Pre_Visit);
1504     return true;
1505   }
1506   return false;
1507 }
1508 
1509 // Should the Matcher clone shifts on addressing modes, expecting them
1510 // to be subsumed into complex addressing expressions or compute them
1511 // into registers?
1512 bool Matcher::clone_address_expressions(AddPNode* m, Matcher::MStack& mstack, VectorSet& address_visited) {
1513   Node *off = m->in(AddPNode::Offset);
1514   if (off->is_Con()) {
1515     address_visited.test_set(m->_idx); // Flag as address_visited
1516     Node *adr = m->in(AddPNode::Address);
1517 
1518     // Intel can handle 2 adds in addressing mode
1519     // AtomicAdd is not an addressing expression.
1520     // Cheap to find it by looking for screwy base.
1521     if (adr->is_AddP() &&
1522         !adr->in(AddPNode::Base)->is_top() &&
1523         // Are there other uses besides address expressions?
1524         !is_visited(adr)) {
1525       address_visited.set(adr->_idx); // Flag as address_visited
1526       Node *shift = adr->in(AddPNode::Offset);
1527       if (!clone_shift(shift, this, mstack, address_visited)) {
1528         mstack.push(shift, Pre_Visit);
1529       }
1530       mstack.push(adr->in(AddPNode::Address), Pre_Visit);
1531       mstack.push(adr->in(AddPNode::Base), Pre_Visit);
1532     } else {
1533       mstack.push(adr, Pre_Visit);
1534     }
1535 
1536     // Clone X+offset as it also folds into most addressing expressions
1537     mstack.push(off, Visit);
1538     mstack.push(m->in(AddPNode::Base), Pre_Visit);
1539     return true;
1540   } else if (clone_shift(off, this, mstack, address_visited)) {
1541     address_visited.test_set(m->_idx); // Flag as address_visited
1542     mstack.push(m->in(AddPNode::Address), Pre_Visit);
1543     mstack.push(m->in(AddPNode::Base), Pre_Visit);
1544     return true;
1545   }
1546   return false;
1547 }
1548 
1549 void Compile::reshape_address(AddPNode* addp) {
1550 }
1551 
1552 // Helper methods for MachSpillCopyNode::implementation().
1553 static int vec_mov_helper(CodeBuffer *cbuf, bool do_size, int src_lo, int dst_lo,
1554                           int src_hi, int dst_hi, uint ireg, outputStream* st) {
  // In the 64-bit VM, size calculation is very complex, so instructions
  // are emitted into a scratch buffer to determine their size.
1557   LP64_ONLY( assert(!do_size, "this method calculates size only for 32-bit VM"); )
1558   assert(ireg == Op_VecS || // 32bit vector
1559          (src_lo & 1) == 0 && (src_lo + 1) == src_hi &&
1560          (dst_lo & 1) == 0 && (dst_lo + 1) == dst_hi,
1561          "no non-adjacent vector moves" );
1562   if (cbuf) {
1563     MacroAssembler _masm(cbuf);
1564     int offset = __ offset();
1565     switch (ireg) {
1566     case Op_VecS: // copy whole register
1567     case Op_VecD:
1568     case Op_VecX:
1569       __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
1570       break;
1571     case Op_VecY:
1572       __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
1573       break;
1574     case Op_VecZ:
1575       __ evmovdquq(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2);
1576       break;
1577     default:
1578       ShouldNotReachHere();
1579     }
1580     int size = __ offset() - offset;
1581 #ifdef ASSERT
1582     // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
    assert(!do_size || size == 4, "incorrect size calculation");
1584 #endif
1585     return size;
1586 #ifndef PRODUCT
1587   } else if (!do_size) {
1588     switch (ireg) {
1589     case Op_VecS:
1590     case Op_VecD:
1591     case Op_VecX:
1592       st->print("movdqu  %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
1593       break;
1594     case Op_VecY:
1595     case Op_VecZ:
1596       st->print("vmovdqu %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
1597       break;
1598     default:
1599       ShouldNotReachHere();
1600     }
1601 #endif
1602   }
1603   // VEX_2bytes prefix is used if UseAVX > 0, and it takes the same 2 bytes as SIMD prefix.
1604   return (UseAVX > 2) ? 6 : 4;
1605 }
1606 
1607 static int vec_spill_helper(CodeBuffer *cbuf, bool do_size, bool is_load,
1608                             int stack_offset, int reg, uint ireg, outputStream* st) {
  // In the 64-bit VM, size calculation is very complex, so instructions
  // are emitted into a scratch buffer to determine their size.
1611   LP64_ONLY( assert(!do_size, "this method calculates size only for 32-bit VM"); )
1612   if (cbuf) {
1613     MacroAssembler _masm(cbuf);
1614     int offset = __ offset();
1615     if (is_load) {
1616       switch (ireg) {
1617       case Op_VecS:
1618         __ movdl(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1619         break;
1620       case Op_VecD:
1621         __ movq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1622         break;
1623       case Op_VecX:
1624         __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1625         break;
1626       case Op_VecY:
1627         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1628         break;
1629       case Op_VecZ:
1630         __ evmovdquq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2);
1631         break;
1632       default:
1633         ShouldNotReachHere();
1634       }
1635     } else { // store
1636       switch (ireg) {
1637       case Op_VecS:
1638         __ movdl(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1639         break;
1640       case Op_VecD:
1641         __ movq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1642         break;
1643       case Op_VecX:
1644         __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1645         break;
1646       case Op_VecY:
1647         __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1648         break;
1649       case Op_VecZ:
1650         __ evmovdquq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2);
1651         break;
1652       default:
1653         ShouldNotReachHere();
1654       }
1655     }
1656     int size = __ offset() - offset;
1657 #ifdef ASSERT
1658     int offset_size = (stack_offset == 0) ? 0 : ((stack_offset < 0x80) ? 1 : (UseAVX > 2) ? 6 : 4);
1659     // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
    assert(!do_size || size == (5+offset_size), "incorrect size calculation");
1661 #endif
1662     return size;
1663 #ifndef PRODUCT
1664   } else if (!do_size) {
1665     if (is_load) {
1666       switch (ireg) {
1667       case Op_VecS:
1668         st->print("movd    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
1669         break;
1670       case Op_VecD:
1671         st->print("movq    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
1672         break;
1673        case Op_VecX:
1674         st->print("movdqu  %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
1675         break;
1676       case Op_VecY:
1677       case Op_VecZ:
1678         st->print("vmovdqu %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
1679         break;
1680       default:
1681         ShouldNotReachHere();
1682       }
1683     } else { // store
1684       switch (ireg) {
1685       case Op_VecS:
1686         st->print("movd    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1687         break;
1688       case Op_VecD:
1689         st->print("movq    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1690         break;
1691        case Op_VecX:
1692         st->print("movdqu  [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1693         break;
1694       case Op_VecY:
1695       case Op_VecZ:
1696         st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1697         break;
1698       default:
1699         ShouldNotReachHere();
1700       }
1701     }
1702 #endif
1703   }
1704   bool is_single_byte = false;
1705   int vec_len = 0;
1706   if ((UseAVX > 2) && (stack_offset != 0)) {
1707     int tuple_type = Assembler::EVEX_FVM;
1708     int input_size = Assembler::EVEX_32bit;
1709     switch (ireg) {
1710     case Op_VecS:
1711       tuple_type = Assembler::EVEX_T1S;
1712       break;
1713     case Op_VecD:
1714       tuple_type = Assembler::EVEX_T1S;
1715       input_size = Assembler::EVEX_64bit;
1716       break;
1717     case Op_VecX:
1718       break;
1719     case Op_VecY:
1720       vec_len = 1;
1721       break;
1722     case Op_VecZ:
1723       vec_len = 2;
1724       break;
1725     }
1726     is_single_byte = Assembler::query_compressed_disp_byte(stack_offset, true, vec_len, tuple_type, input_size, 0);
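    // Under EVEX, an 8-bit displacement is scaled by the operand tuple size
    // (disp8*N); e.g. for Op_VecZ (full 512-bit vector memory) a stack_offset
    // of 64 still encodes as a one-byte displacement.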
1727   }
1728   int offset_size = 0;
1729   int size = 5;
1730   if (UseAVX > 2 ) {
1731     if (VM_Version::supports_avx512novl() && (vec_len == 2)) {
1732       offset_size = (stack_offset == 0) ? 0 : ((is_single_byte) ? 1 : 4);
1733       size += 2; // Need an additional two bytes for EVEX encoding
1734     } else if (VM_Version::supports_avx512novl() && (vec_len < 2)) {
1735       offset_size = (stack_offset == 0) ? 0 : ((stack_offset <= 127) ? 1 : 4);
1736     } else {
1737       offset_size = (stack_offset == 0) ? 0 : ((is_single_byte) ? 1 : 4);
      size += 2; // Need an additional two bytes for EVEX encoding
1739     }
1740   } else {
1741     offset_size = (stack_offset == 0) ? 0 : ((stack_offset <= 127) ? 1 : 4);
1742   }
1743   // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
1744   return size+offset_size;
1745 }
1746 
1747 static inline jint replicate4_imm(int con, int width) {
  // Load a constant of "width" (in bytes) and replicate it to fill 32 bits.
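  // For example, replicate4_imm(0x8F, 1) == 0x8F8F8F8F and
  // replicate4_imm(0x1234, 2) == 0x12341234.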
1749   assert(width == 1 || width == 2, "only byte or short types here");
1750   int bit_width = width * 8;
1751   jint val = con;
1752   val &= (1 << bit_width) - 1;  // mask off sign bits
1753   while(bit_width < 32) {
1754     val |= (val << bit_width);
1755     bit_width <<= 1;
1756   }
1757   return val;
1758 }
1759 
1760 static inline jlong replicate8_imm(int con, int width) {
  // Load a constant of "width" (in bytes) and replicate it to fill 64 bits.
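  // For example, replicate8_imm(0x8F, 1) == 0x8F8F8F8F8F8F8F8F and
  // replicate8_imm(0x12345678, 4) == 0x1234567812345678.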
1762   assert(width == 1 || width == 2 || width == 4, "only byte, short or int types here");
1763   int bit_width = width * 8;
1764   jlong val = con;
1765   val &= (((jlong) 1) << bit_width) - 1;  // mask off sign bits
1766   while(bit_width < 64) {
1767     val |= (val << bit_width);
1768     bit_width <<= 1;
1769   }
1770   return val;
1771 }
1772 
1773 #ifndef PRODUCT
1774   void MachNopNode::format(PhaseRegAlloc*, outputStream* st) const {
1775     st->print("nop \t# %d bytes pad for loops and calls", _count);
1776   }
1777 #endif
1778 
1779   void MachNopNode::emit(CodeBuffer &cbuf, PhaseRegAlloc*) const {
1780     MacroAssembler _masm(&cbuf);
1781     __ nop(_count);
1782   }
1783 
1784   uint MachNopNode::size(PhaseRegAlloc*) const {
1785     return _count;
1786   }
1787 
1788 #ifndef PRODUCT
1789   void MachBreakpointNode::format(PhaseRegAlloc*, outputStream* st) const {
1790     st->print("# breakpoint");
1791   }
1792 #endif
1793 
1794   void MachBreakpointNode::emit(CodeBuffer &cbuf, PhaseRegAlloc* ra_) const {
1795     MacroAssembler _masm(&cbuf);
1796     __ int3();
1797   }
1798 
1799   uint MachBreakpointNode::size(PhaseRegAlloc* ra_) const {
1800     return MachNode::size(ra_);
1801   }
1802 
1803 %}
1804 
1805 encode %{
1806 
1807   enc_class call_epilog %{
1808     if (VerifyStackAtCalls) {
1809       // Check that stack depth is unchanged: find majik cookie on stack
1810       int framesize = ra_->reg2offset_unchecked(OptoReg::add(ra_->_matcher._old_SP, -3*VMRegImpl::slots_per_word));
1811       MacroAssembler _masm(&cbuf);
1812       Label L;
1813       __ cmpptr(Address(rsp, framesize), (int32_t)0xbadb100d);
1814       __ jccb(Assembler::equal, L);
1815       // Die if stack mismatch
1816       __ int3();
1817       __ bind(L);
1818     }
1819   %}
1820 
1821 %}
1822 
1823 
1824 //----------OPERANDS-----------------------------------------------------------
1825 // Operand definitions must precede instruction definitions for correct parsing
1826 // in the ADLC because operands constitute user defined types which are used in
1827 // instruction definitions.
1828 
// This operand applies generically only to EVEX, so there is only one version
1830 operand vecZ() %{
1831   constraint(ALLOC_IN_RC(vectorz_reg));
1832   match(VecZ);
1833 
1834   format %{ %}
1835   interface(REG_INTER);
1836 %}
1837 
operand rxmm0() %{
  constraint(ALLOC_IN_RC(xmm0_reg));
  match(VecX);
  predicate((UseSSE > 0) && (UseAVX == 0));
  format %{ %}
  interface(REG_INTER);
%}
1842 
1843 // Comparison Code for FP conditional move
1844 operand cmpOp_vcmppd() %{
1845   match(Bool);
1846 
1847   predicate(n->as_Bool()->_test._test != BoolTest::overflow &&
1848             n->as_Bool()->_test._test != BoolTest::no_overflow);
1849   format %{ "" %}
1850   interface(COND_INTER) %{
1851     equal        (0x0, "eq");
1852     less         (0x1, "lt");
1853     less_equal   (0x2, "le");
1854     not_equal    (0xC, "ne");
1855     greater_equal(0xD, "ge");
1856     greater      (0xE, "gt");
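    // The encodings above correspond to the vcmppd imm8 comparison predicates
    // (0x0 EQ_OQ, 0x1 LT_OS, 0x2 LE_OS, 0xC NEQ_OQ, 0xD GE_OS, 0xE GT_OS).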
1857     //TODO cannot compile (adlc breaks) without two next lines with error:
1858     // x86_64.ad(13987) Syntax Error: :In operand cmpOp_vcmppd: Do not support this encode constant: ' %{
1859     // equal' for overflow.
1860     overflow     (0x20, "o");  // not really supported by the instruction
1861     no_overflow  (0x21, "no"); // not really supported by the instruction
1862   %}
1863 %}
1864 
1865 
1866 // INSTRUCTIONS -- Platform independent definitions (same for 32- and 64-bit)
1867 
1868 // ============================================================================
1869 
1870 instruct ShouldNotReachHere() %{
1871   match(Halt);
1872   format %{ "ud2\t# ShouldNotReachHere" %}
1873   ins_encode %{
1874     __ ud2();
1875   %}
1876   ins_pipe(pipe_slow);
1877 %}
1878 
1879 // =================================EVEX special===============================
1880 
1881 instruct setMask(rRegI dst, rRegI src) %{
1882   predicate(Matcher::has_predicated_vectors());
1883   match(Set dst (SetVectMaskI  src));
1884   effect(TEMP dst);
1885   format %{ "setvectmask   $dst, $src" %}
1886   ins_encode %{
1887     __ setvectmask($dst$$Register, $src$$Register);
1888   %}
1889   ins_pipe(pipe_slow);
1890 %}
1891 
1892 // ============================================================================
1893 
1894 instruct addF_reg(regF dst, regF src) %{
1895   predicate((UseSSE>=1) && (UseAVX == 0));
1896   match(Set dst (AddF dst src));
1897 
1898   format %{ "addss   $dst, $src" %}
1899   ins_cost(150);
1900   ins_encode %{
1901     __ addss($dst$$XMMRegister, $src$$XMMRegister);
1902   %}
1903   ins_pipe(pipe_slow);
1904 %}
1905 
1906 instruct addF_mem(regF dst, memory src) %{
1907   predicate((UseSSE>=1) && (UseAVX == 0));
1908   match(Set dst (AddF dst (LoadF src)));
1909 
1910   format %{ "addss   $dst, $src" %}
1911   ins_cost(150);
1912   ins_encode %{
1913     __ addss($dst$$XMMRegister, $src$$Address);
1914   %}
1915   ins_pipe(pipe_slow);
1916 %}
1917 
1918 instruct addF_imm(regF dst, immF con) %{
1919   predicate((UseSSE>=1) && (UseAVX == 0));
1920   match(Set dst (AddF dst con));
1921   format %{ "addss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
1922   ins_cost(150);
1923   ins_encode %{
1924     __ addss($dst$$XMMRegister, $constantaddress($con));
1925   %}
1926   ins_pipe(pipe_slow);
1927 %}
1928 
1929 instruct addF_reg_reg(regF dst, regF src1, regF src2) %{
1930   predicate(UseAVX > 0);
1931   match(Set dst (AddF src1 src2));
1932 
1933   format %{ "vaddss  $dst, $src1, $src2" %}
1934   ins_cost(150);
1935   ins_encode %{
1936     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
1937   %}
1938   ins_pipe(pipe_slow);
1939 %}
1940 
1941 instruct addF_reg_mem(regF dst, regF src1, memory src2) %{
1942   predicate(UseAVX > 0);
1943   match(Set dst (AddF src1 (LoadF src2)));
1944 
1945   format %{ "vaddss  $dst, $src1, $src2" %}
1946   ins_cost(150);
1947   ins_encode %{
1948     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
1949   %}
1950   ins_pipe(pipe_slow);
1951 %}
1952 
1953 instruct addF_reg_imm(regF dst, regF src, immF con) %{
1954   predicate(UseAVX > 0);
1955   match(Set dst (AddF src con));
1956 
1957   format %{ "vaddss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
1958   ins_cost(150);
1959   ins_encode %{
1960     __ vaddss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
1961   %}
1962   ins_pipe(pipe_slow);
1963 %}
1964 
1965 instruct addD_reg(regD dst, regD src) %{
1966   predicate((UseSSE>=2) && (UseAVX == 0));
1967   match(Set dst (AddD dst src));
1968 
1969   format %{ "addsd   $dst, $src" %}
1970   ins_cost(150);
1971   ins_encode %{
1972     __ addsd($dst$$XMMRegister, $src$$XMMRegister);
1973   %}
1974   ins_pipe(pipe_slow);
1975 %}
1976 
1977 instruct addD_mem(regD dst, memory src) %{
1978   predicate((UseSSE>=2) && (UseAVX == 0));
1979   match(Set dst (AddD dst (LoadD src)));
1980 
1981   format %{ "addsd   $dst, $src" %}
1982   ins_cost(150);
1983   ins_encode %{
1984     __ addsd($dst$$XMMRegister, $src$$Address);
1985   %}
1986   ins_pipe(pipe_slow);
1987 %}
1988 
1989 instruct addD_imm(regD dst, immD con) %{
1990   predicate((UseSSE>=2) && (UseAVX == 0));
1991   match(Set dst (AddD dst con));
1992   format %{ "addsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
1993   ins_cost(150);
1994   ins_encode %{
1995     __ addsd($dst$$XMMRegister, $constantaddress($con));
1996   %}
1997   ins_pipe(pipe_slow);
1998 %}
1999 
2000 instruct addD_reg_reg(regD dst, regD src1, regD src2) %{
2001   predicate(UseAVX > 0);
2002   match(Set dst (AddD src1 src2));
2003 
2004   format %{ "vaddsd  $dst, $src1, $src2" %}
2005   ins_cost(150);
2006   ins_encode %{
2007     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2008   %}
2009   ins_pipe(pipe_slow);
2010 %}
2011 
2012 instruct addD_reg_mem(regD dst, regD src1, memory src2) %{
2013   predicate(UseAVX > 0);
2014   match(Set dst (AddD src1 (LoadD src2)));
2015 
2016   format %{ "vaddsd  $dst, $src1, $src2" %}
2017   ins_cost(150);
2018   ins_encode %{
2019     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2020   %}
2021   ins_pipe(pipe_slow);
2022 %}
2023 
2024 instruct addD_reg_imm(regD dst, regD src, immD con) %{
2025   predicate(UseAVX > 0);
2026   match(Set dst (AddD src con));
2027 
2028   format %{ "vaddsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2029   ins_cost(150);
2030   ins_encode %{
2031     __ vaddsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2032   %}
2033   ins_pipe(pipe_slow);
2034 %}
2035 
2036 instruct subF_reg(regF dst, regF src) %{
2037   predicate((UseSSE>=1) && (UseAVX == 0));
2038   match(Set dst (SubF dst src));
2039 
2040   format %{ "subss   $dst, $src" %}
2041   ins_cost(150);
2042   ins_encode %{
2043     __ subss($dst$$XMMRegister, $src$$XMMRegister);
2044   %}
2045   ins_pipe(pipe_slow);
2046 %}
2047 
2048 instruct subF_mem(regF dst, memory src) %{
2049   predicate((UseSSE>=1) && (UseAVX == 0));
2050   match(Set dst (SubF dst (LoadF src)));
2051 
2052   format %{ "subss   $dst, $src" %}
2053   ins_cost(150);
2054   ins_encode %{
2055     __ subss($dst$$XMMRegister, $src$$Address);
2056   %}
2057   ins_pipe(pipe_slow);
2058 %}
2059 
2060 instruct subF_imm(regF dst, immF con) %{
2061   predicate((UseSSE>=1) && (UseAVX == 0));
2062   match(Set dst (SubF dst con));
2063   format %{ "subss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2064   ins_cost(150);
2065   ins_encode %{
2066     __ subss($dst$$XMMRegister, $constantaddress($con));
2067   %}
2068   ins_pipe(pipe_slow);
2069 %}
2070 
2071 instruct subF_reg_reg(regF dst, regF src1, regF src2) %{
2072   predicate(UseAVX > 0);
2073   match(Set dst (SubF src1 src2));
2074 
2075   format %{ "vsubss  $dst, $src1, $src2" %}
2076   ins_cost(150);
2077   ins_encode %{
2078     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2079   %}
2080   ins_pipe(pipe_slow);
2081 %}
2082 
2083 instruct subF_reg_mem(regF dst, regF src1, memory src2) %{
2084   predicate(UseAVX > 0);
2085   match(Set dst (SubF src1 (LoadF src2)));
2086 
2087   format %{ "vsubss  $dst, $src1, $src2" %}
2088   ins_cost(150);
2089   ins_encode %{
2090     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2091   %}
2092   ins_pipe(pipe_slow);
2093 %}
2094 
2095 instruct subF_reg_imm(regF dst, regF src, immF con) %{
2096   predicate(UseAVX > 0);
2097   match(Set dst (SubF src con));
2098 
2099   format %{ "vsubss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2100   ins_cost(150);
2101   ins_encode %{
2102     __ vsubss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2103   %}
2104   ins_pipe(pipe_slow);
2105 %}
2106 
2107 instruct subD_reg(regD dst, regD src) %{
2108   predicate((UseSSE>=2) && (UseAVX == 0));
2109   match(Set dst (SubD dst src));
2110 
2111   format %{ "subsd   $dst, $src" %}
2112   ins_cost(150);
2113   ins_encode %{
2114     __ subsd($dst$$XMMRegister, $src$$XMMRegister);
2115   %}
2116   ins_pipe(pipe_slow);
2117 %}
2118 
2119 instruct subD_mem(regD dst, memory src) %{
2120   predicate((UseSSE>=2) && (UseAVX == 0));
2121   match(Set dst (SubD dst (LoadD src)));
2122 
2123   format %{ "subsd   $dst, $src" %}
2124   ins_cost(150);
2125   ins_encode %{
2126     __ subsd($dst$$XMMRegister, $src$$Address);
2127   %}
2128   ins_pipe(pipe_slow);
2129 %}
2130 
2131 instruct subD_imm(regD dst, immD con) %{
2132   predicate((UseSSE>=2) && (UseAVX == 0));
2133   match(Set dst (SubD dst con));
2134   format %{ "subsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2135   ins_cost(150);
2136   ins_encode %{
2137     __ subsd($dst$$XMMRegister, $constantaddress($con));
2138   %}
2139   ins_pipe(pipe_slow);
2140 %}
2141 
2142 instruct subD_reg_reg(regD dst, regD src1, regD src2) %{
2143   predicate(UseAVX > 0);
2144   match(Set dst (SubD src1 src2));
2145 
2146   format %{ "vsubsd  $dst, $src1, $src2" %}
2147   ins_cost(150);
2148   ins_encode %{
2149     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2150   %}
2151   ins_pipe(pipe_slow);
2152 %}
2153 
2154 instruct subD_reg_mem(regD dst, regD src1, memory src2) %{
2155   predicate(UseAVX > 0);
2156   match(Set dst (SubD src1 (LoadD src2)));
2157 
2158   format %{ "vsubsd  $dst, $src1, $src2" %}
2159   ins_cost(150);
2160   ins_encode %{
2161     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2162   %}
2163   ins_pipe(pipe_slow);
2164 %}
2165 
2166 instruct subD_reg_imm(regD dst, regD src, immD con) %{
2167   predicate(UseAVX > 0);
2168   match(Set dst (SubD src con));
2169 
2170   format %{ "vsubsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2171   ins_cost(150);
2172   ins_encode %{
2173     __ vsubsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2174   %}
2175   ins_pipe(pipe_slow);
2176 %}
2177 
2178 instruct mulF_reg(regF dst, regF src) %{
2179   predicate((UseSSE>=1) && (UseAVX == 0));
2180   match(Set dst (MulF dst src));
2181 
2182   format %{ "mulss   $dst, $src" %}
2183   ins_cost(150);
2184   ins_encode %{
2185     __ mulss($dst$$XMMRegister, $src$$XMMRegister);
2186   %}
2187   ins_pipe(pipe_slow);
2188 %}
2189 
2190 instruct mulF_mem(regF dst, memory src) %{
2191   predicate((UseSSE>=1) && (UseAVX == 0));
2192   match(Set dst (MulF dst (LoadF src)));
2193 
2194   format %{ "mulss   $dst, $src" %}
2195   ins_cost(150);
2196   ins_encode %{
2197     __ mulss($dst$$XMMRegister, $src$$Address);
2198   %}
2199   ins_pipe(pipe_slow);
2200 %}
2201 
2202 instruct mulF_imm(regF dst, immF con) %{
2203   predicate((UseSSE>=1) && (UseAVX == 0));
2204   match(Set dst (MulF dst con));
2205   format %{ "mulss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2206   ins_cost(150);
2207   ins_encode %{
2208     __ mulss($dst$$XMMRegister, $constantaddress($con));
2209   %}
2210   ins_pipe(pipe_slow);
2211 %}
2212 
2213 instruct mulF_reg_reg(regF dst, regF src1, regF src2) %{
2214   predicate(UseAVX > 0);
2215   match(Set dst (MulF src1 src2));
2216 
2217   format %{ "vmulss  $dst, $src1, $src2" %}
2218   ins_cost(150);
2219   ins_encode %{
2220     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2221   %}
2222   ins_pipe(pipe_slow);
2223 %}
2224 
2225 instruct mulF_reg_mem(regF dst, regF src1, memory src2) %{
2226   predicate(UseAVX > 0);
2227   match(Set dst (MulF src1 (LoadF src2)));
2228 
2229   format %{ "vmulss  $dst, $src1, $src2" %}
2230   ins_cost(150);
2231   ins_encode %{
2232     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2233   %}
2234   ins_pipe(pipe_slow);
2235 %}
2236 
2237 instruct mulF_reg_imm(regF dst, regF src, immF con) %{
2238   predicate(UseAVX > 0);
2239   match(Set dst (MulF src con));
2240 
2241   format %{ "vmulss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2242   ins_cost(150);
2243   ins_encode %{
2244     __ vmulss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2245   %}
2246   ins_pipe(pipe_slow);
2247 %}
2248 
2249 instruct mulD_reg(regD dst, regD src) %{
2250   predicate((UseSSE>=2) && (UseAVX == 0));
2251   match(Set dst (MulD dst src));
2252 
2253   format %{ "mulsd   $dst, $src" %}
2254   ins_cost(150);
2255   ins_encode %{
2256     __ mulsd($dst$$XMMRegister, $src$$XMMRegister);
2257   %}
2258   ins_pipe(pipe_slow);
2259 %}
2260 
2261 instruct mulD_mem(regD dst, memory src) %{
2262   predicate((UseSSE>=2) && (UseAVX == 0));
2263   match(Set dst (MulD dst (LoadD src)));
2264 
2265   format %{ "mulsd   $dst, $src" %}
2266   ins_cost(150);
2267   ins_encode %{
2268     __ mulsd($dst$$XMMRegister, $src$$Address);
2269   %}
2270   ins_pipe(pipe_slow);
2271 %}
2272 
2273 instruct mulD_imm(regD dst, immD con) %{
2274   predicate((UseSSE>=2) && (UseAVX == 0));
2275   match(Set dst (MulD dst con));
2276   format %{ "mulsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2277   ins_cost(150);
2278   ins_encode %{
2279     __ mulsd($dst$$XMMRegister, $constantaddress($con));
2280   %}
2281   ins_pipe(pipe_slow);
2282 %}
2283 
2284 instruct mulD_reg_reg(regD dst, regD src1, regD src2) %{
2285   predicate(UseAVX > 0);
2286   match(Set dst (MulD src1 src2));
2287 
2288   format %{ "vmulsd  $dst, $src1, $src2" %}
2289   ins_cost(150);
2290   ins_encode %{
2291     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2292   %}
2293   ins_pipe(pipe_slow);
2294 %}
2295 
2296 instruct mulD_reg_mem(regD dst, regD src1, memory src2) %{
2297   predicate(UseAVX > 0);
2298   match(Set dst (MulD src1 (LoadD src2)));
2299 
2300   format %{ "vmulsd  $dst, $src1, $src2" %}
2301   ins_cost(150);
2302   ins_encode %{
2303     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2304   %}
2305   ins_pipe(pipe_slow);
2306 %}
2307 
2308 instruct mulD_reg_imm(regD dst, regD src, immD con) %{
2309   predicate(UseAVX > 0);
2310   match(Set dst (MulD src con));
2311 
2312   format %{ "vmulsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2313   ins_cost(150);
2314   ins_encode %{
2315     __ vmulsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2316   %}
2317   ins_pipe(pipe_slow);
2318 %}
2319 
2320 instruct divF_reg(regF dst, regF src) %{
2321   predicate((UseSSE>=1) && (UseAVX == 0));
2322   match(Set dst (DivF dst src));
2323 
2324   format %{ "divss   $dst, $src" %}
2325   ins_cost(150);
2326   ins_encode %{
2327     __ divss($dst$$XMMRegister, $src$$XMMRegister);
2328   %}
2329   ins_pipe(pipe_slow);
2330 %}
2331 
2332 instruct divF_mem(regF dst, memory src) %{
2333   predicate((UseSSE>=1) && (UseAVX == 0));
2334   match(Set dst (DivF dst (LoadF src)));
2335 
2336   format %{ "divss   $dst, $src" %}
2337   ins_cost(150);
2338   ins_encode %{
2339     __ divss($dst$$XMMRegister, $src$$Address);
2340   %}
2341   ins_pipe(pipe_slow);
2342 %}
2343 
2344 instruct divF_imm(regF dst, immF con) %{
2345   predicate((UseSSE>=1) && (UseAVX == 0));
2346   match(Set dst (DivF dst con));
2347   format %{ "divss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2348   ins_cost(150);
2349   ins_encode %{
2350     __ divss($dst$$XMMRegister, $constantaddress($con));
2351   %}
2352   ins_pipe(pipe_slow);
2353 %}
2354 
2355 instruct divF_reg_reg(regF dst, regF src1, regF src2) %{
2356   predicate(UseAVX > 0);
2357   match(Set dst (DivF src1 src2));
2358 
2359   format %{ "vdivss  $dst, $src1, $src2" %}
2360   ins_cost(150);
2361   ins_encode %{
2362     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2363   %}
2364   ins_pipe(pipe_slow);
2365 %}
2366 
2367 instruct divF_reg_mem(regF dst, regF src1, memory src2) %{
2368   predicate(UseAVX > 0);
2369   match(Set dst (DivF src1 (LoadF src2)));
2370 
2371   format %{ "vdivss  $dst, $src1, $src2" %}
2372   ins_cost(150);
2373   ins_encode %{
2374     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2375   %}
2376   ins_pipe(pipe_slow);
2377 %}
2378 
2379 instruct divF_reg_imm(regF dst, regF src, immF con) %{
2380   predicate(UseAVX > 0);
2381   match(Set dst (DivF src con));
2382 
2383   format %{ "vdivss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2384   ins_cost(150);
2385   ins_encode %{
2386     __ vdivss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2387   %}
2388   ins_pipe(pipe_slow);
2389 %}
2390 
2391 instruct divD_reg(regD dst, regD src) %{
2392   predicate((UseSSE>=2) && (UseAVX == 0));
2393   match(Set dst (DivD dst src));
2394 
2395   format %{ "divsd   $dst, $src" %}
2396   ins_cost(150);
2397   ins_encode %{
2398     __ divsd($dst$$XMMRegister, $src$$XMMRegister);
2399   %}
2400   ins_pipe(pipe_slow);
2401 %}
2402 
2403 instruct divD_mem(regD dst, memory src) %{
2404   predicate((UseSSE>=2) && (UseAVX == 0));
2405   match(Set dst (DivD dst (LoadD src)));
2406 
2407   format %{ "divsd   $dst, $src" %}
2408   ins_cost(150);
2409   ins_encode %{
2410     __ divsd($dst$$XMMRegister, $src$$Address);
2411   %}
2412   ins_pipe(pipe_slow);
2413 %}
2414 
2415 instruct divD_imm(regD dst, immD con) %{
2416   predicate((UseSSE>=2) && (UseAVX == 0));
2417   match(Set dst (DivD dst con));
2418   format %{ "divsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2419   ins_cost(150);
2420   ins_encode %{
2421     __ divsd($dst$$XMMRegister, $constantaddress($con));
2422   %}
2423   ins_pipe(pipe_slow);
2424 %}
2425 
2426 instruct divD_reg_reg(regD dst, regD src1, regD src2) %{
2427   predicate(UseAVX > 0);
2428   match(Set dst (DivD src1 src2));
2429 
2430   format %{ "vdivsd  $dst, $src1, $src2" %}
2431   ins_cost(150);
2432   ins_encode %{
2433     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2434   %}
2435   ins_pipe(pipe_slow);
2436 %}
2437 
2438 instruct divD_reg_mem(regD dst, regD src1, memory src2) %{
2439   predicate(UseAVX > 0);
2440   match(Set dst (DivD src1 (LoadD src2)));
2441 
2442   format %{ "vdivsd  $dst, $src1, $src2" %}
2443   ins_cost(150);
2444   ins_encode %{
2445     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2446   %}
2447   ins_pipe(pipe_slow);
2448 %}
2449 
2450 instruct divD_reg_imm(regD dst, regD src, immD con) %{
2451   predicate(UseAVX > 0);
2452   match(Set dst (DivD src con));
2453 
2454   format %{ "vdivsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2455   ins_cost(150);
2456   ins_encode %{
2457     __ vdivsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2458   %}
2459   ins_pipe(pipe_slow);
2460 %}
2461 
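// Absolute value and negation of floats/doubles below are implemented by
// masking or flipping the sign bit with a packed constant loaded from memory
// (float_signmask/float_signflip and their double counterparts).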
2462 instruct absF_reg(regF dst) %{
2463   predicate((UseSSE>=1) && (UseAVX == 0));
2464   match(Set dst (AbsF dst));
2465   ins_cost(150);
2466   format %{ "andps   $dst, [0x7fffffff]\t# abs float by sign masking" %}
2467   ins_encode %{
2468     __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
2469   %}
2470   ins_pipe(pipe_slow);
2471 %}
2472 
2473 instruct absF_reg_reg(regF dst, regF src) %{
2474   predicate(VM_Version::supports_avxonly());
2475   match(Set dst (AbsF src));
2476   ins_cost(150);
2477   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
2478   ins_encode %{
2479     int vector_len = 0;
2480     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
2481               ExternalAddress(float_signmask()), vector_len);
2482   %}
2483   ins_pipe(pipe_slow);
2484 %}
2485 
2486 #ifdef _LP64
2487 instruct absF_reg_reg_evex(regF dst, regF src) %{
2488   predicate(UseAVX > 2 && VM_Version::supports_avx512vl());
2489   match(Set dst (AbsF src));
2490   ins_cost(150);
2491   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
2492   ins_encode %{
2493     int vector_len = 0;
2494     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
2495               ExternalAddress(float_signmask()), vector_len);
2496   %}
2497   ins_pipe(pipe_slow);
2498 %}
2499 
2500 instruct absF_reg_reg_evex_special(regF dst, regF src1, regF src2) %{
2501   predicate(VM_Version::supports_avx512novl());
2502   match(Set dst (AbsF src1));
2503   effect(TEMP src2);
2504   ins_cost(150);
2505   format %{ "vabsss  $dst, $src1, $src2, [0x7fffffff]\t# abs float by sign masking" %}
2506   ins_encode %{
2507     int vector_len = 0;
2508     __ vabsss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister,
2509               ExternalAddress(float_signmask()), vector_len);
2510   %}
2511   ins_pipe(pipe_slow);
2512 %}
2513 #else // _LP64
2514 instruct absF_reg_reg_evex(regF dst, regF src) %{
2515   predicate(UseAVX > 2);
2516   match(Set dst (AbsF src));
2517   ins_cost(150);
2518   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
2519   ins_encode %{
2520     int vector_len = 0;
2521     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
2522               ExternalAddress(float_signmask()), vector_len);
2523   %}
2524   ins_pipe(pipe_slow);
2525 %}
2526 #endif
2527 
2528 instruct absD_reg(regD dst) %{
2529   predicate((UseSSE>=2) && (UseAVX == 0));
2530   match(Set dst (AbsD dst));
2531   ins_cost(150);
2532   format %{ "andpd   $dst, [0x7fffffffffffffff]\t"
2533             "# abs double by sign masking" %}
2534   ins_encode %{
2535     __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
2536   %}
2537   ins_pipe(pipe_slow);
2538 %}
2539 
2540 instruct absD_reg_reg(regD dst, regD src) %{
2541   predicate(VM_Version::supports_avxonly());
2542   match(Set dst (AbsD src));
2543   ins_cost(150);
2544   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
2545             "# abs double by sign masking" %}
2546   ins_encode %{
2547     int vector_len = 0;
2548     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
2549               ExternalAddress(double_signmask()), vector_len);
2550   %}
2551   ins_pipe(pipe_slow);
2552 %}
2553 
2554 #ifdef _LP64
2555 instruct absD_reg_reg_evex(regD dst, regD src) %{
2556   predicate(UseAVX > 2 && VM_Version::supports_avx512vl());
2557   match(Set dst (AbsD src));
2558   ins_cost(150);
2559   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
2560             "# abs double by sign masking" %}
2561   ins_encode %{
2562     int vector_len = 0;
2563     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
2564               ExternalAddress(double_signmask()), vector_len);
2565   %}
2566   ins_pipe(pipe_slow);
2567 %}
2568 
2569 instruct absD_reg_reg_evex_special(regD dst, regD src1, regD src2) %{
2570   predicate(VM_Version::supports_avx512novl());
2571   match(Set dst (AbsD src1));
2572   effect(TEMP src2);
2573   ins_cost(150);
2574   format %{ "vabssd  $dst, $src1, $src2, [0x7fffffffffffffff]\t# abs float by sign masking" %}
2575   ins_encode %{
2576     int vector_len = 0;
2577     __ vabssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister,
2578               ExternalAddress(double_signmask()), vector_len);
2579   %}
2580   ins_pipe(pipe_slow);
2581 %}
2582 #else // _LP64
2583 instruct absD_reg_reg_evex(regD dst, regD src) %{
2584   predicate(UseAVX > 2);
2585   match(Set dst (AbsD src));
2586   ins_cost(150);
2587   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
2588             "# abs double by sign masking" %}
2589   ins_encode %{
2590     int vector_len = 0;
2591     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
2592               ExternalAddress(double_signmask()), vector_len);
2593   %}
2594   ins_pipe(pipe_slow);
2595 %}
2596 #endif
2597 
2598 instruct negF_reg(regF dst) %{
2599   predicate((UseSSE>=1) && (UseAVX == 0));
2600   match(Set dst (NegF dst));
2601   ins_cost(150);
2602   format %{ "xorps   $dst, [0x80000000]\t# neg float by sign flipping" %}
2603   ins_encode %{
2604     __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
2605   %}
2606   ins_pipe(pipe_slow);
2607 %}
2608 
2609 instruct negF_reg_reg(regF dst, regF src) %{
2610   predicate(UseAVX > 0);
2611   match(Set dst (NegF src));
2612   ins_cost(150);
2613   format %{ "vnegatess  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
2614   ins_encode %{
2615     __ vnegatess($dst$$XMMRegister, $src$$XMMRegister,
2616                  ExternalAddress(float_signflip()));
2617   %}
2618   ins_pipe(pipe_slow);
2619 %}
2620 
2621 instruct negD_reg(regD dst) %{
2622   predicate((UseSSE>=2) && (UseAVX == 0));
2623   match(Set dst (NegD dst));
2624   ins_cost(150);
2625   format %{ "xorpd   $dst, [0x8000000000000000]\t"
2626             "# neg double by sign flipping" %}
2627   ins_encode %{
2628     __ xorpd($dst$$XMMRegister, ExternalAddress(double_signflip()));
2629   %}
2630   ins_pipe(pipe_slow);
2631 %}
2632 
2633 instruct negD_reg_reg(regD dst, regD src) %{
2634   predicate(UseAVX > 0);
2635   match(Set dst (NegD src));
2636   ins_cost(150);
2637   format %{ "vnegatess  $dst, $src, [0x8000000000000000]\t"
2638             "# neg double by sign flipping" %}
2639   ins_encode %{
2640     __ vnegatesd($dst$$XMMRegister, $src$$XMMRegister,
2641                  ExternalAddress(double_signflip()));
2642   %}
2643   ins_pipe(pipe_slow);
2644 %}
2645 
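// Square root: sqrtss/sqrtsd with a register, memory, or constant-table operand.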
2646 instruct sqrtF_reg(regF dst, regF src) %{
2647   predicate(UseSSE>=1);
2648   match(Set dst (SqrtF src));
2649 
2650   format %{ "sqrtss  $dst, $src" %}
2651   ins_cost(150);
2652   ins_encode %{
2653     __ sqrtss($dst$$XMMRegister, $src$$XMMRegister);
2654   %}
2655   ins_pipe(pipe_slow);
2656 %}
2657 
2658 instruct sqrtF_mem(regF dst, memory src) %{
2659   predicate(UseSSE>=1);
2660   match(Set dst (SqrtF (LoadF src)));
2661 
2662   format %{ "sqrtss  $dst, $src" %}
2663   ins_cost(150);
2664   ins_encode %{
2665     __ sqrtss($dst$$XMMRegister, $src$$Address);
2666   %}
2667   ins_pipe(pipe_slow);
2668 %}
2669 
2670 instruct sqrtF_imm(regF dst, immF con) %{
2671   predicate(UseSSE>=1);
2672   match(Set dst (SqrtF con));
2673 
2674   format %{ "sqrtss  $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2675   ins_cost(150);
2676   ins_encode %{
2677     __ sqrtss($dst$$XMMRegister, $constantaddress($con));
2678   %}
2679   ins_pipe(pipe_slow);
2680 %}
2681 
2682 instruct sqrtD_reg(regD dst, regD src) %{
2683   predicate(UseSSE>=2);
2684   match(Set dst (SqrtD src));
2685 
2686   format %{ "sqrtsd  $dst, $src" %}
2687   ins_cost(150);
2688   ins_encode %{
2689     __ sqrtsd($dst$$XMMRegister, $src$$XMMRegister);
2690   %}
2691   ins_pipe(pipe_slow);
2692 %}
2693 
2694 instruct sqrtD_mem(regD dst, memory src) %{
2695   predicate(UseSSE>=2);
2696   match(Set dst (SqrtD (LoadD src)));
2697 
2698   format %{ "sqrtsd  $dst, $src" %}
2699   ins_cost(150);
2700   ins_encode %{
2701     __ sqrtsd($dst$$XMMRegister, $src$$Address);
2702   %}
2703   ins_pipe(pipe_slow);
2704 %}
2705 
2706 instruct sqrtD_imm(regD dst, immD con) %{
2707   predicate(UseSSE>=2);
2708   match(Set dst (SqrtD con));
2709   format %{ "sqrtsd  $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2710   ins_cost(150);
2711   ins_encode %{
2712     __ sqrtsd($dst$$XMMRegister, $constantaddress($con));
2713   %}
2714   ins_pipe(pipe_slow);
2715 %}
2716 
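// OnSpinWait intrinsic: emit a pause hint for spin-wait loops.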
2717 instruct onspinwait() %{
2718   match(OnSpinWait);
2719   ins_cost(200);
2720 
2721   format %{
2722     $$template
2723     if (os::is_MP()) {
2724       $$emit$$"pause\t! membar_onspinwait"
2725     } else {
2726       $$emit$$"MEMBAR-onspinwait ! (empty encoding)"
2727     }
2728   %}
2729   ins_encode %{
2730     __ pause();
2731   %}
2732   ins_pipe(pipe_slow);
2733 %}
2734 
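// Fused multiply-add. These forms match only when UseFMA is enabled and
// accumulate into the addend register $c in place.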
2735 // a * b + c
2736 instruct fmaD_reg(regD a, regD b, regD c) %{
2737   predicate(UseFMA);
2738   match(Set c (FmaD  c (Binary a b)));
2739   format %{ "fmasd $a,$b,$c\t# $c = $a * $b + $c" %}
2740   ins_cost(150);
2741   ins_encode %{
2742     __ fmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
2743   %}
2744   ins_pipe( pipe_slow );
2745 %}
2746 
2747 // a * b + c
2748 instruct fmaF_reg(regF a, regF b, regF c) %{
2749   predicate(UseFMA);
2750   match(Set c (FmaF  c (Binary a b)));
2751   format %{ "fmass $a,$b,$c\t# $c = $a * $b + $c" %}
2752   ins_cost(150);
2753   ins_encode %{
2754     __ fmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
2755   %}
2756   ins_pipe( pipe_slow );
2757 %}
2758 
2759 // ====================VECTOR INSTRUCTIONS=====================================
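// VectorReinterpret: view a vector value at a different size. The same-size
// forms emit no code; the other forms copy the value into the destination,
// clearing it first when widening.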
2760 
2761 instruct reinterpretS(vecS dst) %{
2762   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 4 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 4);
2763   match(Set dst (VectorReinterpret dst));
2764   ins_cost(125);
2765   format %{ " # reinterpret $dst" %}
2766   ins_encode %{
2767     // empty
2768   %}
2769   ins_pipe( pipe_slow );
2770 %}
2771 
2772 instruct reinterpretS2D(vecD dst, vecS src) %{
2773   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 8 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 4);
2774   match(Set dst (VectorReinterpret src));
2775   ins_cost(125);
2776   effect(TEMP dst);
2777   format %{ " # reinterpret $dst,$src" %}
2778   ins_encode %{
2779     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
2780     __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
2781   %}
2782   ins_pipe( pipe_slow );
2783 %}
2784 
2785 instruct reinterpretS2X(vecX dst, vecS src) %{
2786   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 16 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 4);
2787   match(Set dst (VectorReinterpret src));
2788   ins_cost(125);
2789   effect(TEMP dst);
2790   format %{ " # reinterpret $dst,$src" %}
2791   ins_encode %{
2792     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
2793     __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
2794   %}
2795   ins_pipe( pipe_slow );
2796 %}
2797 
2798 instruct reinterpretS2Y(vecY dst, vecS src) %{
2799   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 32 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 4);
2800   match(Set dst (VectorReinterpret src));
2801   ins_cost(125);
2802   effect(TEMP dst);
2803   format %{ " # reinterpret $dst,$src" %}
2804   ins_encode %{
2805     int vector_len = 1;
2806     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
2807     __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
2808   %}
2809   ins_pipe( pipe_slow );
2810 %}
2811 
2812 instruct reinterpretS2Z(vecZ dst, vecS src) %{
2813   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 64 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 4);
2814   match(Set dst (VectorReinterpret src));
2815   ins_cost(125);
2816   effect(TEMP dst);
2817   format %{ " # reinterpret $dst,$src" %}
2818   ins_encode %{
2819     int vector_len = 2;
2820     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
2821     __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
2822   %}
2823   ins_pipe( pipe_slow );
2824 %}
2825 
2826 instruct reinterpretD2S(vecS dst, vecD src) %{
2827   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 4 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 8);
2828   match(Set dst (VectorReinterpret src));
2829   ins_cost(125);
2830   format %{ " # reinterpret $dst,$src" %}
2831   ins_encode %{
2832     __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
2833   %}
2834   ins_pipe( pipe_slow );
2835 %}
2836 
2837 instruct reinterpretD(vecD dst) %{
2838   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 8 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 8);
2839   match(Set dst (VectorReinterpret dst));
2840   ins_cost(125);
2841   format %{ " # reinterpret $dst" %}
2842   ins_encode %{
2843     // empty
2844   %}
2845   ins_pipe( pipe_slow );
2846 %}
2847 
2848 instruct reinterpretD2X(vecX dst, vecD src) %{
2849   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 16 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 8);
2850   match(Set dst (VectorReinterpret src));
2851   ins_cost(125);
2852   effect(TEMP dst);
2853   format %{ " # reinterpret $dst,$src" %}
2854   ins_encode %{
2855     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
2856     __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
2857   %}
2858   ins_pipe( pipe_slow );
2859 %}
2860 
2861 instruct reinterpretD2Y(vecY dst, vecD src) %{
2862   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 32 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 8);
2863   match(Set dst (VectorReinterpret src));
2864   ins_cost(125);
2865   effect(TEMP dst);
2866   format %{ " # reinterpret $dst,$src" %}
2867   ins_encode %{
2868     int vector_len = 1;
2869     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
2870     __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
2871   %}
2872   ins_pipe( pipe_slow );
2873 %}
2874 
2875 instruct reinterpretD2Z(vecZ dst, vecD src) %{
2876   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 64 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 8);
2877   match(Set dst (VectorReinterpret src));
2878   ins_cost(125);
2879   effect(TEMP dst);
2880   format %{ " # reinterpret $dst,$src" %}
2881   ins_encode %{
2882     int vector_len = 2;
2883     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
2884     __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
2885   %}
2886   ins_pipe( pipe_slow );
2887 %}
2888 
2889 instruct reinterpretX2S(vecS dst, vecX src) %{
2890   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 4 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 16);
2891   match(Set dst (VectorReinterpret src));
2892   ins_cost(125);
2893   format %{ " # reinterpret $dst,$src" %}
2894   ins_encode %{
2895     __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
2896   %}
2897   ins_pipe( pipe_slow );
2898 %}
2899 
2900 instruct reinterpretX2D(vecD dst, vecX src) %{
2901   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 8 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 16);
2902   match(Set dst (VectorReinterpret src));
2903   ins_cost(125);
2904   format %{ " # reinterpret $dst,$src" %}
2905   ins_encode %{
2906     __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
2907   %}
2908   ins_pipe( pipe_slow );
2909 %}
2910 
2911 instruct reinterpretX(vecX dst) %{
2912   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 16 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 16);
2913   match(Set dst (VectorReinterpret dst));
2914   ins_cost(125);
2915   format %{ " # reinterpret $dst" %}
2916   ins_encode %{
2917     // empty
2918   %}
2919   ins_pipe( pipe_slow );
2920 %}
2921 
2922 instruct reinterpretX2Y(vecY dst, vecX src) %{
2923   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 32 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 16);
2924   match(Set dst (VectorReinterpret src));
2925   ins_cost(125);
2926   effect(TEMP dst);
2927   format %{ " # reinterpret $dst,$src" %}
2928   ins_encode %{
2929     int vector_len = 1;
2930     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
2931     __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
2932   %}
2933   ins_pipe( pipe_slow );
2934 %}
2935 
2936 instruct reinterpretX2Z(vecZ dst, vecX src) %{
2937   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 64 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 16);
2938   match(Set dst (VectorReinterpret src));
2939   ins_cost(125);
2940   effect(TEMP dst);
2941   format %{ " # reinterpret $dst,$src" %}
2942   ins_encode %{
2943     int vector_len = 2;
2944     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
2945     __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
2946   %}
2947   ins_pipe( pipe_slow );
2948 %}
2949 
2950 instruct reinterpretY2S(vecS dst, vecY src) %{
2951   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 4 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 32);
2952   match(Set dst (VectorReinterpret src));
2953   ins_cost(125);
2954   format %{ " # reinterpret $dst,$src" %}
2955   ins_encode %{
2956     __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
2957   %}
2958   ins_pipe( pipe_slow );
2959 %}
2960 
2961 instruct reinterpretY2D(vecD dst, vecY src) %{
2962   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 8 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 32);
2963   match(Set dst (VectorReinterpret src));
2964   ins_cost(125);
2965   format %{ " # reinterpret $dst,$src" %}
2966   ins_encode %{
2967     __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
2968   %}
2969   ins_pipe( pipe_slow );
2970 %}
2971 
2972 instruct reinterpretY2X(vecX dst, vecY src) %{
2973   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 16 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 32);
2974   match(Set dst (VectorReinterpret src));
2975   ins_cost(125);
2976   format %{ " # reinterpret $dst,$src" %}
2977   ins_encode %{
2978     __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
2979   %}
2980   ins_pipe( pipe_slow );
2981 %}
2982 
2983 instruct reinterpretY(vecY dst) %{
2984   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 32 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 32);
2985   match(Set dst (VectorReinterpret dst));
2986   ins_cost(125);
2987   format %{ " # reinterpret $dst" %}
2988   ins_encode %{
2989     // empty
2990   %}
2991   ins_pipe( pipe_slow );
2992 %}
2993 
2994 instruct reinterpretY2Z(vecZ dst, vecY src) %{
2995   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 64 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 32);
2996   match(Set dst (VectorReinterpret src));
2997   ins_cost(125);
2998   effect(TEMP dst);
2999   format %{ " # reinterpret $dst,$src" %}
3000   ins_encode %{
3001     int vector_len = 2;
3002     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3003     __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
3004   %}
3005   ins_pipe( pipe_slow );
3006 %}
3007 
3008 instruct reinterpretZ2S(vecS dst, vecZ src) %{
3009   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 4 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 64);
3010   match(Set dst (VectorReinterpret src));
3011   ins_cost(125);
3012   format %{ " # reinterpret $dst,$src" %}
3013   ins_encode %{
3014     __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
3015   %}
3016   ins_pipe( pipe_slow );
3017 %}
3018 
3019 instruct reinterpretZ2D(vecD dst, vecZ src) %{
3020   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 8 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 64);
3021   match(Set dst (VectorReinterpret src));
3022   ins_cost(125);
3023   format %{ " # reinterpret $dst,$src" %}
3024   ins_encode %{
3025     __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
3026   %}
3027   ins_pipe( pipe_slow );
3028 %}
3029 
3030 instruct reinterpretZ2X(vecX dst, vecZ src) %{
3031   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 16 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 64);
3032   match(Set dst (VectorReinterpret src));
3033   ins_cost(125);
3034   format %{ " # reinterpret $dst,$src" %}
3035   ins_encode %{
3036     __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
3037   %}
3038   ins_pipe( pipe_slow );
3039 %}
3040 
3041 instruct reinterpretZ2Y(vecY dst, vecZ src) %{
3042   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 32 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 64);
3043   match(Set dst (VectorReinterpret src));
3044   ins_cost(125);
3045   format %{ " # reinterpret $dst,$src" %}
3046   ins_encode %{
3047     __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
3048   %}
3049   ins_pipe( pipe_slow );
3050 %}
3051 
3052 instruct reinterpretZ(vecZ dst) %{
3053   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 64 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 64);
3054   match(Set dst (VectorReinterpret dst));
3055   ins_cost(125);
3056   format %{ " # reinterpret $dst" %}
3057   ins_encode %{
3058     // empty
3059   %}
3060   ins_pipe( pipe_slow );
3061 %}
3062 
3063 // ==========
3064 
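// Load/store vector instructions are selected by the memory size of the
// LoadVector/StoreVector node: 1- and 2-byte vectors go through a scalar
// register, larger ones use movd/movq/movdqu directly, vmovdqu for 32 bytes
// and evmovdqul/evmovdquq for 64 bytes.
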
3065 // Load vectors (1 byte long)
3066 instruct loadV1(vecS dst, memory mem, rRegI tmp) %{
3067   predicate(n->as_LoadVector()->memory_size() == 1);
3068   match(Set dst (LoadVector mem));
3069   ins_cost(125);
3070   effect(TEMP tmp);
3071   format %{ "movzbl $tmp,$mem\n\t"
3072             "movd $dst,$tmp\t! load vector (1 byte)" %}
3073   ins_encode %{
3074     __ movzbl($tmp$$Register, $mem$$Address);
3075     __ movdl($dst$$XMMRegister, $tmp$$Register);
3076   %}
3077   ins_pipe( pipe_slow );
3078 %}
3079 
3080 // Load vectors (2 bytes long)
3081 instruct loadV2(vecS dst, memory mem, rRegI tmp) %{
3082   predicate(n->as_LoadVector()->memory_size() == 2);
3083   match(Set dst (LoadVector mem));
3084   ins_cost(125);
3085   effect(TEMP tmp);
3086   format %{ "movzwl $tmp,$mem\n\t"
3087             "movd $dst,$tmp\t! load vector (2 bytes)" %}
3088   ins_encode %{
3089     __ movzwl($tmp$$Register, $mem$$Address);
3090     __ movdl($dst$$XMMRegister, $tmp$$Register);
3091   %}
3092   ins_pipe( pipe_slow );
3093 %}
3094 
3095 // Load vectors (4 bytes long)
3096 instruct loadV4(vecS dst, memory mem) %{
3097   predicate(n->as_LoadVector()->memory_size() == 4);
3098   match(Set dst (LoadVector mem));
3099   ins_cost(125);
3100   format %{ "movd    $dst,$mem\t! load vector (4 bytes)" %}
3101   ins_encode %{
3102     __ movdl($dst$$XMMRegister, $mem$$Address);
3103   %}
3104   ins_pipe( pipe_slow );
3105 %}
3106 
3107 // Load vectors (8 bytes long)
3108 instruct loadV8(vecD dst, memory mem) %{
3109   predicate(n->as_LoadVector()->memory_size() == 8);
3110   match(Set dst (LoadVector mem));
3111   ins_cost(125);
3112   format %{ "movq    $dst,$mem\t! load vector (8 bytes)" %}
3113   ins_encode %{
3114     __ movq($dst$$XMMRegister, $mem$$Address);
3115   %}
3116   ins_pipe( pipe_slow );
3117 %}
3118 
3119 // Load vectors (16 bytes long)
3120 instruct loadV16(vecX dst, memory mem) %{
3121   predicate(n->as_LoadVector()->memory_size() == 16);
3122   match(Set dst (LoadVector mem));
3123   ins_cost(125);
3124   format %{ "movdqu  $dst,$mem\t! load vector (16 bytes)" %}
3125   ins_encode %{
3126     __ movdqu($dst$$XMMRegister, $mem$$Address);
3127   %}
3128   ins_pipe( pipe_slow );
3129 %}
3130 
3131 // Load vectors (32 bytes long)
3132 instruct loadV32(vecY dst, memory mem) %{
3133   predicate(n->as_LoadVector()->memory_size() == 32);
3134   match(Set dst (LoadVector mem));
3135   ins_cost(125);
3136   format %{ "vmovdqu $dst,$mem\t! load vector (32 bytes)" %}
3137   ins_encode %{
3138     __ vmovdqu($dst$$XMMRegister, $mem$$Address);
3139   %}
3140   ins_pipe( pipe_slow );
3141 %}
3142 
3143 // Load vectors (64 bytes long)
3144 instruct loadV64_dword(vecZ dst, memory mem) %{
3145   predicate(n->as_LoadVector()->memory_size() == 64 && n->as_LoadVector()->element_size() <= 4);
3146   match(Set dst (LoadVector mem));
3147   ins_cost(125);
3148   format %{ "vmovdqul $dst k0,$mem\t! load vector (64 bytes)" %}
3149   ins_encode %{
3150     int vector_len = 2;
3151     __ evmovdqul($dst$$XMMRegister, $mem$$Address, vector_len);
3152   %}
3153   ins_pipe( pipe_slow );
3154 %}
3155 
3156 // Load vectors (64 bytes long)
3157 instruct loadV64_qword(vecZ dst, memory mem) %{
3158   predicate(n->as_LoadVector()->memory_size() == 64 && n->as_LoadVector()->element_size() > 4);
3159   match(Set dst (LoadVector mem));
3160   ins_cost(125);
3161   format %{ "vmovdquq $dst k0,$mem\t! load vector (64 bytes)" %}
3162   ins_encode %{
3163     int vector_len = 2;
3164     __ evmovdquq($dst$$XMMRegister, $mem$$Address, vector_len);
3165   %}
3166   ins_pipe( pipe_slow );
3167 %}
3168 
3169 // Store vectors
3170 instruct storeV1(memory mem, vecS src, rRegI tmp) %{
3171   predicate(n->as_StoreVector()->memory_size() == 1);
3172   match(Set mem (StoreVector mem src));
3173   ins_cost(145);
3174   effect(TEMP tmp);
3175   format %{ "movd $tmp,$src\n\t"
3176             "movb $mem,$tmp\t! store vector (1 byte)" %}
3177   ins_encode %{
3178     __ movdl($tmp$$Register, $src$$XMMRegister);
3179     __ movb($mem$$Address, $tmp$$Register);
3180   %}
3181   ins_pipe( pipe_slow );
3182 %}
3183 
3184 instruct storeV2(memory mem, vecS src, rRegI tmp) %{
3185   predicate(n->as_StoreVector()->memory_size() == 2);
3186   match(Set mem (StoreVector mem src));
3187   ins_cost(145);
3188   effect(TEMP tmp);
3189   format %{ "movd $tmp,$src\n\t"
3190             "movw $mem,$tmp\t! store vector (2 bytes)" %}
3191   ins_encode %{
3192     __ movdl($tmp$$Register, $src$$XMMRegister);
3193     __ movw($mem$$Address, $tmp$$Register);
3194   %}
3195   ins_pipe( pipe_slow );
3196 %}
3197 
3198 instruct storeV4(memory mem, vecS src) %{
3199   predicate(n->as_StoreVector()->memory_size() == 4);
3200   match(Set mem (StoreVector mem src));
3201   ins_cost(145);
3202   format %{ "movd    $mem,$src\t! store vector (4 bytes)" %}
3203   ins_encode %{
3204     __ movdl($mem$$Address, $src$$XMMRegister);
3205   %}
3206   ins_pipe( pipe_slow );
3207 %}
3208 
3209 instruct storeV8(memory mem, vecD src) %{
3210   predicate(n->as_StoreVector()->memory_size() == 8);
3211   match(Set mem (StoreVector mem src));
3212   ins_cost(145);
3213   format %{ "movq    $mem,$src\t! store vector (8 bytes)" %}
3214   ins_encode %{
3215     __ movq($mem$$Address, $src$$XMMRegister);
3216   %}
3217   ins_pipe( pipe_slow );
3218 %}
3219 
3220 instruct storeV16(memory mem, vecX src) %{
3221   predicate(n->as_StoreVector()->memory_size() == 16);
3222   match(Set mem (StoreVector mem src));
3223   ins_cost(145);
3224   format %{ "movdqu  $mem,$src\t! store vector (16 bytes)" %}
3225   ins_encode %{
3226     __ movdqu($mem$$Address, $src$$XMMRegister);
3227   %}
3228   ins_pipe( pipe_slow );
3229 %}
3230 
3231 instruct storeV32(memory mem, vecY src) %{
3232   predicate(n->as_StoreVector()->memory_size() == 32);
3233   match(Set mem (StoreVector mem src));
3234   ins_cost(145);
3235   format %{ "vmovdqu $mem,$src\t! store vector (32 bytes)" %}
3236   ins_encode %{
3237     __ vmovdqu($mem$$Address, $src$$XMMRegister);
3238   %}
3239   ins_pipe( pipe_slow );
3240 %}
3241 
3242 instruct storeV64_dword(memory mem, vecZ src) %{
3243   predicate(n->as_StoreVector()->memory_size() == 64 && n->as_StoreVector()->element_size() <= 4);
3244   match(Set mem (StoreVector mem src));
3245   ins_cost(145);
3246   format %{ "vmovdqul $mem k0,$src\t! store vector (64 bytes)" %}
3247   ins_encode %{
3248     int vector_len = 2;
3249     __ evmovdqul($mem$$Address, $src$$XMMRegister, vector_len);
3250   %}
3251   ins_pipe( pipe_slow );
3252 %}
3253 
3254 instruct storeV64_qword(memory mem, vecZ src) %{
3255   predicate(n->as_StoreVector()->memory_size() == 64 && n->as_StoreVector()->element_size() > 4);
3256   match(Set mem (StoreVector mem src));
3257   ins_cost(145);
3258   format %{ "vmovdquq $mem k0,$src\t! store vector (64 bytes)" %}
3259   ins_encode %{
3260     int vector_len = 2;
3261     __ evmovdquq($mem$$Address, $src$$XMMRegister, vector_len);
3262   %}
3263   ins_pipe( pipe_slow );
3264 %}
3265 
3266 // ====================LEGACY REPLICATE=======================================
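// Replicate (broadcast) patterns used when the AVX-512 VL/BW forms are not
// available; note the !supports_avx512vlbw()/!supports_avx512vl() checks in
// the predicates below.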
3267 
3268 instruct Repl4B_mem(vecS dst, memory mem) %{
3269   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3270   match(Set dst (ReplicateB (LoadB mem)));
3271   format %{ "punpcklbw $dst,$mem\n\t"
3272             "pshuflw $dst,$dst,0x00\t! replicate4B" %}
3273   ins_encode %{
3274     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
3275     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3276   %}
3277   ins_pipe( pipe_slow );
3278 %}
3279 
3280 instruct Repl8B_mem(vecD dst, memory mem) %{
3281   predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3282   match(Set dst (ReplicateB (LoadB mem)));
3283   format %{ "punpcklbw $dst,$mem\n\t"
3284             "pshuflw $dst,$dst,0x00\t! replicate8B" %}
3285   ins_encode %{
3286     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
3287     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3288   %}
3289   ins_pipe( pipe_slow );
3290 %}
3291 
3292 instruct Repl16B(vecX dst, rRegI src) %{
3293   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3294   match(Set dst (ReplicateB src));
3295   format %{ "movd    $dst,$src\n\t"
3296             "punpcklbw $dst,$dst\n\t"
3297             "pshuflw $dst,$dst,0x00\n\t"
3298             "punpcklqdq $dst,$dst\t! replicate16B" %}
3299   ins_encode %{
3300     __ movdl($dst$$XMMRegister, $src$$Register);
3301     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3302     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3303     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3304   %}
3305   ins_pipe( pipe_slow );
3306 %}
3307 
3308 instruct Repl16B_mem(vecX dst, memory mem) %{
3309   predicate(n->as_Vector()->length() == 16 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3310   match(Set dst (ReplicateB (LoadB mem)));
3311   format %{ "punpcklbw $dst,$mem\n\t"
3312             "pshuflw $dst,$dst,0x00\n\t"
3313             "punpcklqdq $dst,$dst\t! replicate16B" %}
3314   ins_encode %{
3315     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
3316     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3317     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3318   %}
3319   ins_pipe( pipe_slow );
3320 %}
3321 
3322 instruct Repl32B(vecY dst, rRegI src) %{
3323   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3324   match(Set dst (ReplicateB src));
3325   format %{ "movd    $dst,$src\n\t"
3326             "punpcklbw $dst,$dst\n\t"
3327             "pshuflw $dst,$dst,0x00\n\t"
3328             "punpcklqdq $dst,$dst\n\t"
3329             "vinserti128_high $dst,$dst\t! replicate32B" %}
3330   ins_encode %{
3331     __ movdl($dst$$XMMRegister, $src$$Register);
3332     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3333     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3334     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3335     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3336   %}
3337   ins_pipe( pipe_slow );
3338 %}
3339 
3340 instruct Repl32B_mem(vecY dst, memory mem) %{
3341   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3342   match(Set dst (ReplicateB (LoadB mem)));
3343   format %{ "punpcklbw $dst,$mem\n\t"
3344             "pshuflw $dst,$dst,0x00\n\t"
3345             "punpcklqdq $dst,$dst\n\t"
3346             "vinserti128_high $dst,$dst\t! replicate32B" %}
3347   ins_encode %{
3348     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
3349     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3350     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3351     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3352   %}
3353   ins_pipe( pipe_slow );
3354 %}
3355 
3356 instruct Repl16B_imm(vecX dst, immI con) %{
3357   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3358   match(Set dst (ReplicateB con));
3359   format %{ "movq    $dst,[$constantaddress]\n\t"
3360             "punpcklqdq $dst,$dst\t! replicate16B($con)" %}
3361   ins_encode %{
3362     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3363     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3364   %}
3365   ins_pipe( pipe_slow );
3366 %}
3367 
3368 instruct Repl32B_imm(vecY dst, immI con) %{
3369   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3370   match(Set dst (ReplicateB con));
3371   format %{ "movq    $dst,[$constantaddress]\n\t"
3372             "punpcklqdq $dst,$dst\n\t"
3373             "vinserti128_high $dst,$dst\t! replicate32B($con)" %}
3374   ins_encode %{
3375     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3376     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3377     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3378   %}
3379   ins_pipe( pipe_slow );
3380 %}
3381 
3382 instruct Repl4S(vecD dst, rRegI src) %{
3383   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vlbw());
3384   match(Set dst (ReplicateS src));
3385   format %{ "movd    $dst,$src\n\t"
3386             "pshuflw $dst,$dst,0x00\t! replicate4S" %}
3387   ins_encode %{
3388     __ movdl($dst$$XMMRegister, $src$$Register);
3389     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3390   %}
3391   ins_pipe( pipe_slow );
3392 %}
3393 
3394 instruct Repl4S_mem(vecD dst, memory mem) %{
3395   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3396   match(Set dst (ReplicateS (LoadS mem)));
3397   format %{ "pshuflw $dst,$mem,0x00\t! replicate4S" %}
3398   ins_encode %{
3399     __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
3400   %}
3401   ins_pipe( pipe_slow );
3402 %}
3403 
3404 instruct Repl8S(vecX dst, rRegI src) %{
3405   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vlbw());
3406   match(Set dst (ReplicateS src));
3407   format %{ "movd    $dst,$src\n\t"
3408             "pshuflw $dst,$dst,0x00\n\t"
3409             "punpcklqdq $dst,$dst\t! replicate8S" %}
3410   ins_encode %{
3411     __ movdl($dst$$XMMRegister, $src$$Register);
3412     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3413     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3414   %}
3415   ins_pipe( pipe_slow );
3416 %}
3417 
3418 instruct Repl8S_mem(vecX dst, memory mem) %{
3419   predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3420   match(Set dst (ReplicateS (LoadS mem)));
3421   format %{ "pshuflw $dst,$mem,0x00\n\t"
3422             "punpcklqdq $dst,$dst\t! replicate8S" %}
3423   ins_encode %{
3424     __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
3425     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3426   %}
3427   ins_pipe( pipe_slow );
3428 %}
3429 
3430 instruct Repl8S_imm(vecX dst, immI con) %{
3431   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vlbw());
3432   match(Set dst (ReplicateS con));
3433   format %{ "movq    $dst,[$constantaddress]\n\t"
3434             "punpcklqdq $dst,$dst\t! replicate8S($con)" %}
3435   ins_encode %{
3436     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3437     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3438   %}
3439   ins_pipe( pipe_slow );
3440 %}
3441 
3442 instruct Repl16S(vecY dst, rRegI src) %{
3443   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3444   match(Set dst (ReplicateS src));
3445   format %{ "movd    $dst,$src\n\t"
3446             "pshuflw $dst,$dst,0x00\n\t"
3447             "punpcklqdq $dst,$dst\n\t"
3448             "vinserti128_high $dst,$dst\t! replicate16S" %}
3449   ins_encode %{
3450     __ movdl($dst$$XMMRegister, $src$$Register);
3451     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3452     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3453     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3454   %}
3455   ins_pipe( pipe_slow );
3456 %}
3457 
3458 instruct Repl16S_mem(vecY dst, memory mem) %{
3459   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3460   match(Set dst (ReplicateS (LoadS mem)));
3461   format %{ "pshuflw $dst,$mem,0x00\n\t"
3462             "punpcklqdq $dst,$dst\n\t"
3463             "vinserti128_high $dst,$dst\t! replicate16S" %}
3464   ins_encode %{
3465     __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
3466     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3467     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3468   %}
3469   ins_pipe( pipe_slow );
3470 %}
3471 
3472 instruct Repl16S_imm(vecY dst, immI con) %{
3473   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3474   match(Set dst (ReplicateS con));
3475   format %{ "movq    $dst,[$constantaddress]\n\t"
3476             "punpcklqdq $dst,$dst\n\t"
3477             "vinserti128_high $dst,$dst\t! replicate16S($con)" %}
3478   ins_encode %{
3479     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3480     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3481     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3482   %}
3483   ins_pipe( pipe_slow );
3484 %}
3485 
3486 instruct Repl4I(vecX dst, rRegI src) %{
3487   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3488   match(Set dst (ReplicateI src));
3489   format %{ "movd    $dst,$src\n\t"
3490             "pshufd  $dst,$dst,0x00\t! replicate4I" %}
3491   ins_encode %{
3492     __ movdl($dst$$XMMRegister, $src$$Register);
3493     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3494   %}
3495   ins_pipe( pipe_slow );
3496 %}
3497 
3498 instruct Repl4I_mem(vecX dst, memory mem) %{
3499   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3500   match(Set dst (ReplicateI (LoadI mem)));
3501   format %{ "pshufd  $dst,$mem,0x00\t! replicate4I" %}
3502   ins_encode %{
3503     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3504   %}
3505   ins_pipe( pipe_slow );
3506 %}
3507 
3508 instruct Repl8I(vecY dst, rRegI src) %{
3509   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3510   match(Set dst (ReplicateI src));
3511   format %{ "movd    $dst,$src\n\t"
3512             "pshufd  $dst,$dst,0x00\n\t"
3513             "vinserti128_high $dst,$dst\t! replicate8I" %}
3514   ins_encode %{
3515     __ movdl($dst$$XMMRegister, $src$$Register);
3516     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3517     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3518   %}
3519   ins_pipe( pipe_slow );
3520 %}
3521 
3522 instruct Repl8I_mem(vecY dst, memory mem) %{
3523   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3524   match(Set dst (ReplicateI (LoadI mem)));
3525   format %{ "pshufd  $dst,$mem,0x00\n\t"
3526             "vinserti128_high $dst,$dst\t! replicate8I" %}
3527   ins_encode %{
3528     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3529     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3530   %}
3531   ins_pipe( pipe_slow );
3532 %}
3533 
3534 instruct Repl4I_imm(vecX dst, immI con) %{
3535   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3536   match(Set dst (ReplicateI con));
3537   format %{ "movq    $dst,[$constantaddress]\t! replicate4I($con)\n\t"
3538             "punpcklqdq $dst,$dst" %}
3539   ins_encode %{
3540     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
3541     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3542   %}
3543   ins_pipe( pipe_slow );
3544 %}
3545 
3546 instruct Repl8I_imm(vecY dst, immI con) %{
3547   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3548   match(Set dst (ReplicateI con));
3549   format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
3550             "punpcklqdq $dst,$dst\n\t"
3551             "vinserti128_high $dst,$dst" %}
3552   ins_encode %{
3553     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
3554     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3555     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3556   %}
3557   ins_pipe( pipe_slow );
3558 %}
3559 
3560 // A long can be loaded into an XMM register directly from memory.
3561 instruct Repl2L_mem(vecX dst, memory mem) %{
3562   predicate(n->as_Vector()->length() == 2 && !VM_Version::supports_avx512vlbw());
3563   match(Set dst (ReplicateL (LoadL mem)));
3564   format %{ "movq    $dst,$mem\n\t"
3565             "punpcklqdq $dst,$dst\t! replicate2L" %}
3566   ins_encode %{
3567     __ movq($dst$$XMMRegister, $mem$$Address);
3568     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3569   %}
3570   ins_pipe( pipe_slow );
3571 %}
3572 
3573 // Replicate long (8 byte) scalar to be vector
3574 #ifdef _LP64
3575 instruct Repl4L(vecY dst, rRegL src) %{
3576   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3577   match(Set dst (ReplicateL src));
3578   format %{ "movdq   $dst,$src\n\t"
3579             "punpcklqdq $dst,$dst\n\t"
3580             "vinserti128_high $dst,$dst\t! replicate4L" %}
3581   ins_encode %{
3582     __ movdq($dst$$XMMRegister, $src$$Register);
3583     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3584     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3585   %}
3586   ins_pipe( pipe_slow );
3587 %}
3588 #else // _LP64
3589 instruct Repl4L(vecY dst, eRegL src, regD tmp) %{
3590   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3591   match(Set dst (ReplicateL src));
3592   effect(TEMP dst, USE src, TEMP tmp);
3593   format %{ "movdl   $dst,$src.lo\n\t"
3594             "movdl   $tmp,$src.hi\n\t"
3595             "punpckldq $dst,$tmp\n\t"
3596             "punpcklqdq $dst,$dst\n\t"
3597             "vinserti128_high $dst,$dst\t! replicate4L" %}
3598   ins_encode %{
3599     __ movdl($dst$$XMMRegister, $src$$Register);
3600     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3601     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
3602     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3603     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3604   %}
3605   ins_pipe( pipe_slow );
3606 %}
3607 #endif // _LP64
3608 
3609 instruct Repl4L_imm(vecY dst, immL con) %{
3610   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3611   match(Set dst (ReplicateL con));
3612   format %{ "movq    $dst,[$constantaddress]\n\t"
3613             "punpcklqdq $dst,$dst\n\t"
3614             "vinserti128_high $dst,$dst\t! replicate4L($con)" %}
3615   ins_encode %{
3616     __ movq($dst$$XMMRegister, $constantaddress($con));
3617     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3618     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3619   %}
3620   ins_pipe( pipe_slow );
3621 %}
3622 
3623 instruct Repl4L_mem(vecY dst, memory mem) %{
3624   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3625   match(Set dst (ReplicateL (LoadL mem)));
3626   format %{ "movq    $dst,$mem\n\t"
3627             "punpcklqdq $dst,$dst\n\t"
3628             "vinserti128_high $dst,$dst\t! replicate4L" %}
3629   ins_encode %{
3630     __ movq($dst$$XMMRegister, $mem$$Address);
3631     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3632     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3633   %}
3634   ins_pipe( pipe_slow );
3635 %}
3636 
3637 instruct Repl2F_mem(vecD dst, memory mem) %{
3638   predicate(n->as_Vector()->length() == 2 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3639   match(Set dst (ReplicateF (LoadF mem)));
3640   format %{ "pshufd  $dst,$mem,0x00\t! replicate2F" %}
3641   ins_encode %{
3642     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3643   %}
3644   ins_pipe( pipe_slow );
3645 %}
3646 
3647 instruct Repl4F_mem(vecX dst, memory mem) %{
3648   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3649   match(Set dst (ReplicateF (LoadF mem)));
3650   format %{ "pshufd  $dst,$mem,0x00\t! replicate4F" %}
3651   ins_encode %{
3652     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3653   %}
3654   ins_pipe( pipe_slow );
3655 %}
3656 
3657 instruct Repl8F(vecY dst, regF src) %{
3658   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3659   match(Set dst (ReplicateF src));
3660   format %{ "pshufd  $dst,$src,0x00\n\t"
3661             "vinsertf128_high $dst,$dst\t! replicate8F" %}
3662   ins_encode %{
3663     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
3664     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3665   %}
3666   ins_pipe( pipe_slow );
3667 %}
3668 
3669 instruct Repl8F_mem(vecY dst, memory mem) %{
3670   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3671   match(Set dst (ReplicateF (LoadF mem)));
3672   format %{ "pshufd  $dst,$mem,0x00\n\t"
3673             "vinsertf128_high $dst,$dst\t! replicate8F" %}
3674   ins_encode %{
3675     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3676     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3677   %}
3678   ins_pipe( pipe_slow );
3679 %}
3680 
3681 instruct Repl2F_zero(vecD dst, immF0 zero) %{
3682   predicate(n->as_Vector()->length() == 2 && UseAVX < 3);
3683   match(Set dst (ReplicateF zero));
3684   format %{ "xorps   $dst,$dst\t! replicate2F zero" %}
3685   ins_encode %{
3686     __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
3687   %}
3688   ins_pipe( fpu_reg_reg );
3689 %}
3690 
3691 instruct Repl4F_zero(vecX dst, immF0 zero) %{
3692   predicate(n->as_Vector()->length() == 4 && UseAVX < 3);
3693   match(Set dst (ReplicateF zero));
3694   format %{ "xorps   $dst,$dst\t! replicate4F zero" %}
3695   ins_encode %{
3696     __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
3697   %}
3698   ins_pipe( fpu_reg_reg );
3699 %}
3700 
3701 instruct Repl8F_zero(vecY dst, immF0 zero) %{
3702   predicate(n->as_Vector()->length() == 8 && UseAVX < 3);
3703   match(Set dst (ReplicateF zero));
3704   format %{ "vxorps  $dst,$dst,$dst\t! replicate8F zero" %}
3705   ins_encode %{
3706     int vector_len = 1;
3707     __ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3708   %}
3709   ins_pipe( fpu_reg_reg );
3710 %}
3711 
3712 instruct Repl2D_mem(vecX dst, memory mem) %{
3713   predicate(n->as_Vector()->length() == 2 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3714   match(Set dst (ReplicateD (LoadD mem)));
3715   format %{ "pshufd  $dst,$mem,0x44\t! replicate2D" %}
3716   ins_encode %{
3717     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
3718   %}
3719   ins_pipe( pipe_slow );
3720 %}
3721 
3722 instruct Repl4D(vecY dst, regD src) %{
3723   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3724   match(Set dst (ReplicateD src));
3725   format %{ "pshufd  $dst,$src,0x44\n\t"
3726             "vinsertf128_high $dst,$dst\t! replicate4D" %}
3727   ins_encode %{
3728     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
3729     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3730   %}
3731   ins_pipe( pipe_slow );
3732 %}
3733 
3734 instruct Repl4D_mem(vecY dst, memory mem) %{
3735   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3736   match(Set dst (ReplicateD (LoadD mem)));
3737   format %{ "pshufd  $dst,$mem,0x44\n\t"
3738             "vinsertf128_high $dst,$dst\t! replicate4D" %}
3739   ins_encode %{
3740     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
3741     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3742   %}
3743   ins_pipe( pipe_slow );
3744 %}
3745 
3746 // Replicate double (8 byte) scalar zero to be vector
3747 instruct Repl2D_zero(vecX dst, immD0 zero) %{
3748   predicate(n->as_Vector()->length() == 2 && UseAVX < 3);
3749   match(Set dst (ReplicateD zero));
3750   format %{ "xorpd   $dst,$dst\t! replicate2D zero" %}
3751   ins_encode %{
3752     __ xorpd($dst$$XMMRegister, $dst$$XMMRegister);
3753   %}
3754   ins_pipe( fpu_reg_reg );
3755 %}
3756 
3757 instruct Repl4D_zero(vecY dst, immD0 zero) %{
3758   predicate(n->as_Vector()->length() == 4 && UseAVX < 3);
3759   match(Set dst (ReplicateD zero));
3760   format %{ "vxorpd  $dst,$dst,$dst,vect256\t! replicate4D zero" %}
3761   ins_encode %{
3762     int vector_len = 1;
3763     __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3764   %}
3765   ins_pipe( fpu_reg_reg );
3766 %}
3767 
3768 // ====================GENERIC REPLICATE==========================================
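// Replicate patterns that apply regardless of AVX-512 support: small vectors
// are broadcast from a scalar register, from an immediate via the constant
// table, or zeroed with pxor/vpxor.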
3769 
3770 // Replicate byte scalar to be vector
3771 instruct Repl4B(vecS dst, rRegI src) %{
3772   predicate(n->as_Vector()->length() == 4);
3773   match(Set dst (ReplicateB src));
3774   format %{ "movd    $dst,$src\n\t"
3775             "punpcklbw $dst,$dst\n\t"
3776             "pshuflw $dst,$dst,0x00\t! replicate4B" %}
3777   ins_encode %{
3778     __ movdl($dst$$XMMRegister, $src$$Register);
3779     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3780     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3781   %}
3782   ins_pipe( pipe_slow );
3783 %}
3784 
3785 instruct Repl8B(vecD dst, rRegI src) %{
3786   predicate(n->as_Vector()->length() == 8);
3787   match(Set dst (ReplicateB src));
3788   format %{ "movd    $dst,$src\n\t"
3789             "punpcklbw $dst,$dst\n\t"
3790             "pshuflw $dst,$dst,0x00\t! replicate8B" %}
3791   ins_encode %{
3792     __ movdl($dst$$XMMRegister, $src$$Register);
3793     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3794     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3795   %}
3796   ins_pipe( pipe_slow );
3797 %}
3798 
3799 // Replicate byte scalar immediate to be vector by loading from const table.
3800 instruct Repl4B_imm(vecS dst, immI con) %{
3801   predicate(n->as_Vector()->length() == 4);
3802   match(Set dst (ReplicateB con));
3803   format %{ "movdl   $dst,[$constantaddress]\t! replicate4B($con)" %}
3804   ins_encode %{
3805     __ movdl($dst$$XMMRegister, $constantaddress(replicate4_imm($con$$constant, 1)));
3806   %}
3807   ins_pipe( pipe_slow );
3808 %}
3809 
3810 instruct Repl8B_imm(vecD dst, immI con) %{
3811   predicate(n->as_Vector()->length() == 8);
3812   match(Set dst (ReplicateB con));
3813   format %{ "movq    $dst,[$constantaddress]\t! replicate8B($con)" %}
3814   ins_encode %{
3815     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3816   %}
3817   ins_pipe( pipe_slow );
3818 %}
3819 
3820 // Replicate byte scalar zero to be vector
3821 instruct Repl4B_zero(vecS dst, immI0 zero) %{
3822   predicate(n->as_Vector()->length() == 4);
3823   match(Set dst (ReplicateB zero));
3824   format %{ "pxor    $dst,$dst\t! replicate4B zero" %}
3825   ins_encode %{
3826     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3827   %}
3828   ins_pipe( fpu_reg_reg );
3829 %}
3830 
3831 instruct Repl8B_zero(vecD dst, immI0 zero) %{
3832   predicate(n->as_Vector()->length() == 8);
3833   match(Set dst (ReplicateB zero));
3834   format %{ "pxor    $dst,$dst\t! replicate8B zero" %}
3835   ins_encode %{
3836     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3837   %}
3838   ins_pipe( fpu_reg_reg );
3839 %}
3840 
3841 instruct Repl16B_zero(vecX dst, immI0 zero) %{
3842   predicate(n->as_Vector()->length() == 16);
3843   match(Set dst (ReplicateB zero));
3844   format %{ "pxor    $dst,$dst\t! replicate16B zero" %}
3845   ins_encode %{
3846     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3847   %}
3848   ins_pipe( fpu_reg_reg );
3849 %}
3850 
3851 instruct Repl32B_zero(vecY dst, immI0 zero) %{
3852   predicate(n->as_Vector()->length() == 32);
3853   match(Set dst (ReplicateB zero));
3854   format %{ "vpxor   $dst,$dst,$dst\t! replicate32B zero" %}
3855   ins_encode %{
3856     // 256-bit vpxor requires AVX2 (AVX1 only provides vxorps/vxorpd at 256 bits).
3857     int vector_len = 1;
3858     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3859   %}
3860   ins_pipe( fpu_reg_reg );
3861 %}
3862 
3863 // Replicate char/short (2 byte) scalar to be vector
3864 instruct Repl2S(vecS dst, rRegI src) %{
3865   predicate(n->as_Vector()->length() == 2);
3866   match(Set dst (ReplicateS src));
3867   format %{ "movd    $dst,$src\n\t"
3868             "pshuflw $dst,$dst,0x00\t! replicate2S" %}
3869   ins_encode %{
3870     __ movdl($dst$$XMMRegister, $src$$Register);
3871     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3872   %}
3873   ins_pipe( fpu_reg_reg );
3874 %}
3875 
3876 // Replicate char/short (2 byte) scalar immediate to be vector by loading from const table.
3877 instruct Repl2S_imm(vecS dst, immI con) %{
3878   predicate(n->as_Vector()->length() == 2);
3879   match(Set dst (ReplicateS con));
3880   format %{ "movdl   $dst,[$constantaddress]\t! replicate2S($con)" %}
3881   ins_encode %{
3882     __ movdl($dst$$XMMRegister, $constantaddress(replicate4_imm($con$$constant, 2)));
3883   %}
3884   ins_pipe( fpu_reg_reg );
3885 %}
3886 
3887 instruct Repl4S_imm(vecD dst, immI con) %{
3888   predicate(n->as_Vector()->length() == 4);
3889   match(Set dst (ReplicateS con));
3890   format %{ "movq    $dst,[$constantaddress]\t! replicate4S($con)" %}
3891   ins_encode %{
3892     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3893   %}
3894   ins_pipe( fpu_reg_reg );
3895 %}
3896 
3897 // Replicate char/short (2 byte) scalar zero to be vector
3898 instruct Repl2S_zero(vecS dst, immI0 zero) %{
3899   predicate(n->as_Vector()->length() == 2);
3900   match(Set dst (ReplicateS zero));
3901   format %{ "pxor    $dst,$dst\t! replicate2S zero" %}
3902   ins_encode %{
3903     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3904   %}
3905   ins_pipe( fpu_reg_reg );
3906 %}
3907 
3908 instruct Repl4S_zero(vecD dst, immI0 zero) %{
3909   predicate(n->as_Vector()->length() == 4);
3910   match(Set dst (ReplicateS zero));
3911   format %{ "pxor    $dst,$dst\t! replicate4S zero" %}
3912   ins_encode %{
3913     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3914   %}
3915   ins_pipe( fpu_reg_reg );
3916 %}
3917 
3918 instruct Repl8S_zero(vecX dst, immI0 zero) %{
3919   predicate(n->as_Vector()->length() == 8);
3920   match(Set dst (ReplicateS zero));
3921   format %{ "pxor    $dst,$dst\t! replicate8S zero" %}
3922   ins_encode %{
3923     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3924   %}
3925   ins_pipe( fpu_reg_reg );
3926 %}
3927 
3928 instruct Repl16S_zero(vecY dst, immI0 zero) %{
3929   predicate(n->as_Vector()->length() == 16);
3930   match(Set dst (ReplicateS zero));
3931   format %{ "vpxor   $dst,$dst,$dst\t! replicate16S zero" %}
3932   ins_encode %{
3933     // 256-bit vpxor is an AVX2 instruction; plain AVX only provides the 256-bit FP XOR forms (vxorps/vxorpd).
3934     int vector_len = 1;
3935     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3936   %}
3937   ins_pipe( fpu_reg_reg );
3938 %}
3939 
3940 // Replicate integer (4 byte) scalar to be vector
3941 instruct Repl2I(vecD dst, rRegI src) %{
3942   predicate(n->as_Vector()->length() == 2);
3943   match(Set dst (ReplicateI src));
3944   format %{ "movd    $dst,$src\n\t"
3945             "pshufd  $dst,$dst,0x00\t! replicate2I" %}
3946   ins_encode %{
3947     __ movdl($dst$$XMMRegister, $src$$Register);
3948     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3949   %}
3950   ins_pipe( fpu_reg_reg );
3951 %}
3952 
3953 // The integer can be loaded into the XMM register directly from memory.
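// Matching the (LoadI mem) subtree lets the replicate consume the memory
// operand directly, so the scalar never has to pass through a general-purpose
// register first.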
3954 instruct Repl2I_mem(vecD dst, memory mem) %{
3955   predicate(n->as_Vector()->length() == 2);
3956   match(Set dst (ReplicateI (LoadI mem)));
3957   format %{ "movd    $dst,$mem\n\t"
3958             "pshufd  $dst,$dst,0x00\t! replicate2I" %}
3959   ins_encode %{
3960     __ movdl($dst$$XMMRegister, $mem$$Address);
3961     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3962   %}
3963   ins_pipe( fpu_reg_reg );
3964 %}
3965 
3966 // Replicate integer (4 byte) scalar immediate to be vector by loading from const table.
3967 instruct Repl2I_imm(vecD dst, immI con) %{
3968   predicate(n->as_Vector()->length() == 2);
3969   match(Set dst (ReplicateI con));
3970   format %{ "movq    $dst,[$constantaddress]\t! replicate2I($con)" %}
3971   ins_encode %{
3972     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
3973   %}
3974   ins_pipe( fpu_reg_reg );
3975 %}
3976 
3977 // Replicate integer (4 byte) scalar zero to be vector
3978 instruct Repl2I_zero(vecD dst, immI0 zero) %{
3979   predicate(n->as_Vector()->length() == 2);
3980   match(Set dst (ReplicateI zero));
3981   format %{ "pxor    $dst,$dst\t! replicate2I zero" %}
3982   ins_encode %{
3983     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3984   %}
3985   ins_pipe( fpu_reg_reg );
3986 %}
3987 
3988 instruct Repl4I_zero(vecX dst, immI0 zero) %{
3989   predicate(n->as_Vector()->length() == 4);
3990   match(Set dst (ReplicateI zero));
3991   format %{ "pxor    $dst,$dst\t! replicate4I zero" %}
3992   ins_encode %{
3993     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3994   %}
3995   ins_pipe( fpu_reg_reg );
3996 %}
3997 
3998 instruct Repl8I_zero(vecY dst, immI0 zero) %{
3999   predicate(n->as_Vector()->length() == 8);
4000   match(Set dst (ReplicateI zero));
4001   format %{ "vpxor   $dst,$dst,$dst\t! replicate8I zero" %}
4002   ins_encode %{
4003     // 256-bit vpxor is an AVX2 instruction; plain AVX only provides the 256-bit FP XOR forms (vxorps/vxorpd).
4004     int vector_len = 1;
4005     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4006   %}
4007   ins_pipe( fpu_reg_reg );
4008 %}
4009 
4010 // Replicate long (8 byte) scalar to be vector
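// On LP64 the 64-bit scalar is moved into the XMM register with a single
// movdq; the 32-bit build below instead assembles it from the lo/hi halves of
// the register pair (movdl + punpckldq) before duplicating the lane.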
4011 #ifdef _LP64
4012 instruct Repl2L(vecX dst, rRegL src) %{
4013   predicate(n->as_Vector()->length() == 2);
4014   match(Set dst (ReplicateL src));
4015   format %{ "movdq   $dst,$src\n\t"
4016             "punpcklqdq $dst,$dst\t! replicate2L" %}
4017   ins_encode %{
4018     __ movdq($dst$$XMMRegister, $src$$Register);
4019     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4020   %}
4021   ins_pipe( pipe_slow );
4022 %}
4023 #else // _LP64
4024 instruct Repl2L(vecX dst, eRegL src, regD tmp) %{
4025   predicate(n->as_Vector()->length() == 2);
4026   match(Set dst (ReplicateL src));
4027   effect(TEMP dst, USE src, TEMP tmp);
4028   format %{ "movdl   $dst,$src.lo\n\t"
4029             "movdl   $tmp,$src.hi\n\t"
4030             "punpckldq $dst,$tmp\n\t"
4031             "punpcklqdq $dst,$dst\t! replicate2L" %}
4032   ins_encode %{
4033     __ movdl($dst$$XMMRegister, $src$$Register);
4034     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4035     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4036     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4037   %}
4038   ins_pipe( pipe_slow );
4039 %}
4040 #endif // _LP64
4041 
4042 // Replicate long (8 byte) scalar immediate to be vector by loading from const table.
4043 instruct Repl2L_imm(vecX dst, immL con) %{
4044   predicate(n->as_Vector()->length() == 2);
4045   match(Set dst (ReplicateL con));
4046   format %{ "movq    $dst,[$constantaddress]\n\t"
4047             "punpcklqdq $dst,$dst\t! replicate2L($con)" %}
4048   ins_encode %{
4049     __ movq($dst$$XMMRegister, $constantaddress($con));
4050     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4051   %}
4052   ins_pipe( pipe_slow );
4053 %}
4054 
4055 // Replicate long (8 byte) scalar zero to be vector
4056 instruct Repl2L_zero(vecX dst, immL0 zero) %{
4057   predicate(n->as_Vector()->length() == 2);
4058   match(Set dst (ReplicateL zero));
4059   format %{ "pxor    $dst,$dst\t! replicate2L zero" %}
4060   ins_encode %{
4061     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4062   %}
4063   ins_pipe( fpu_reg_reg );
4064 %}
4065 
4066 instruct Repl4L_zero(vecY dst, immL0 zero) %{
4067   predicate(n->as_Vector()->length() == 4);
4068   match(Set dst (ReplicateL zero));
4069   format %{ "vpxor   $dst,$dst,$dst\t! replicate4L zero" %}
4070   ins_encode %{
4071     // 256-bit vpxor is an AVX2 instruction; plain AVX only provides the 256-bit FP XOR forms (vxorps/vxorpd).
4072     int vector_len = 1;
4073     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4074   %}
4075   ins_pipe( fpu_reg_reg );
4076 %}
4077 
4078 // Replicate float (4 byte) scalar to be vector
4079 instruct Repl2F(vecD dst, regF src) %{
4080   predicate(n->as_Vector()->length() == 2);
4081   match(Set dst (ReplicateF src));
4082   format %{ "pshufd  $dst,$src,0x00\t! replicate2F" %}
4083   ins_encode %{
4084     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
4085   %}
4086   ins_pipe( fpu_reg_reg );
4087 %}
4088 
4089 instruct Repl4F(vecX dst, regF src) %{
4090   predicate(n->as_Vector()->length() == 4);
4091   match(Set dst (ReplicateF src));
4092   format %{ "pshufd  $dst,$src,0x00\t! replicate4F" %}
4093   ins_encode %{
4094     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
4095   %}
4096   ins_pipe( pipe_slow );
4097 %}
4098 
4099 // Replicate double (8 bytes) scalar to be vector
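// pshufd with immediate 0x44 selects dwords {0,1,0,1}, i.e. it copies the low
// 64-bit lane of $src into both halves of $dst.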
4100 instruct Repl2D(vecX dst, regD src) %{
4101   predicate(n->as_Vector()->length() == 2);
4102   match(Set dst (ReplicateD src));
4103   format %{ "pshufd  $dst,$src,0x44\t! replicate2D" %}
4104   ins_encode %{
4105     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
4106   %}
4107   ins_pipe( pipe_slow );
4108 %}
4109 
4110 // ====================EVEX REPLICATE=============================================
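// The vector_len argument passed to the EVEX broadcast forms below selects the
// operation width; values 0, 1 and 2 are assumed to map to the assembler's
// 128-, 256- and 512-bit encodings. The avx512vlbw/avx512vl predicates gate
// the sub-512-bit byte/word and dword/qword forms, while the full 512-bit
// forms only need avx512bw (bytes/words) or UseAVX > 2.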
4111 
4112 instruct Repl4B_mem_evex(vecS dst, memory mem) %{
4113   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vlbw());
4114   match(Set dst (ReplicateB (LoadB mem)));
4115   format %{ "vpbroadcastb  $dst,$mem\t! replicate4B" %}
4116   ins_encode %{
4117     int vector_len = 0;
4118     __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
4119   %}
4120   ins_pipe( pipe_slow );
4121 %}
4122 
4123 instruct Repl8B_mem_evex(vecD dst, memory mem) %{
4124   predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vlbw());
4125   match(Set dst (ReplicateB (LoadB mem)));
4126   format %{ "vpbroadcastb  $dst,$mem\t! replicate8B" %}
4127   ins_encode %{
4128     int vector_len = 0;
4129     __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
4130   %}
4131   ins_pipe( pipe_slow );
4132 %}
4133 
4134 instruct Repl16B_evex(vecX dst, rRegI src) %{
4135   predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
4136   match(Set dst (ReplicateB src));
4137   format %{ "vpbroadcastb $dst,$src\t! replicate16B" %}
4138   ins_encode %{
4139     int vector_len = 0;
4140     __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
4141   %}
4142   ins_pipe( pipe_slow );
4143 %}
4144 
4145 instruct Repl16B_mem_evex(vecX dst, memory mem) %{
4146   predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
4147   match(Set dst (ReplicateB (LoadB mem)));
4148   format %{ "vpbroadcastb  $dst,$mem\t! replicate16B" %}
4149   ins_encode %{
4150     int vector_len = 0;
4151     __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
4152   %}
4153   ins_pipe( pipe_slow );
4154 %}
4155 
4156 instruct Repl32B_evex(vecY dst, rRegI src) %{
4157   predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512vlbw());
4158   match(Set dst (ReplicateB src));
4159   format %{ "vpbroadcastb $dst,$src\t! replicate32B" %}
4160   ins_encode %{
4161     int vector_len = 1;
4162     __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
4163   %}
4164   ins_pipe( pipe_slow );
4165 %}
4166 
4167 instruct Repl32B_mem_evex(vecY dst, memory mem) %{
4168   predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512vlbw());
4169   match(Set dst (ReplicateB (LoadB mem)));
4170   format %{ "vpbroadcastb  $dst,$mem\t! replicate32B" %}
4171   ins_encode %{
4172     int vector_len = 1;
4173     __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
4174   %}
4175   ins_pipe( pipe_slow );
4176 %}
4177 
4178 instruct Repl64B_evex(vecZ dst, rRegI src) %{
4179   predicate(n->as_Vector()->length() == 64 && VM_Version::supports_avx512bw());
4180   match(Set dst (ReplicateB src));
4181   format %{ "vpbroadcastb $dst,$src\t! replicate64B" %}
4182   ins_encode %{
4183     int vector_len = 2;
4184     __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
4185   %}
4186   ins_pipe( pipe_slow );
4187 %}
4188 
4189 instruct Repl64B_mem_evex(vecZ dst, memory mem) %{
4190   predicate(n->as_Vector()->length() == 64 && VM_Version::supports_avx512bw());
4191   match(Set dst (ReplicateB (LoadB mem)));
4192   format %{ "vpbroadcastb  $dst,$mem\t! replicate64B" %}
4193   ins_encode %{
4194     int vector_len = 2;
4195     __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
4196   %}
4197   ins_pipe( pipe_slow );
4198 %}
4199 
4200 instruct Repl16B_imm_evex(vecX dst, immI con) %{
4201   predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
4202   match(Set dst (ReplicateB con));
4203   format %{ "movq    $dst,[$constantaddress]\n\t"
4204             "vpbroadcastb $dst,$dst\t! replicate16B" %}
4205   ins_encode %{
4206     int vector_len = 0;
4207     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
4208     __ evpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4209   %}
4210   ins_pipe( pipe_slow );
4211 %}
4212 
4213 instruct Repl32B_imm_evex(vecY dst, immI con) %{
4214   predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512vlbw());
4215   match(Set dst (ReplicateB con));
4216   format %{ "movq    $dst,[$constantaddress]\n\t"
4217             "vpbroadcastb $dst,$dst\t! replicate32B" %}
4218   ins_encode %{
4219     int vector_len = 1;
4220     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
4221     __ evpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4222   %}
4223   ins_pipe( pipe_slow );
4224 %}
4225 
4226 instruct Repl64B_imm_evex(vecZ dst, immI con) %{
4227   predicate(n->as_Vector()->length() == 64 && VM_Version::supports_avx512bw());
4228   match(Set dst (ReplicateB con));
4229   format %{ "movq    $dst,[$constantaddress]\n\t"
4230             "vpbroadcastb $dst,$dst\t! replicate64B" %}
4231   ins_encode %{
4232     int vector_len = 2;
4233     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
4234     __ evpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4235   %}
4236   ins_pipe( pipe_slow );
4237 %}
4238 
4239 instruct Repl64B_zero_evex(vecZ dst, immI0 zero) %{
4240   predicate(n->as_Vector()->length() == 64 && UseAVX > 2);
4241   match(Set dst (ReplicateB zero));
4242   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate64B zero" %}
4243   ins_encode %{
4244     // 512-bit vpxor requires EVEX (AVX-512); the UseAVX > 2 predicate guarantees it here.
4245     int vector_len = 2;
4246     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4247   %}
4248   ins_pipe( fpu_reg_reg );
4249 %}
4250 
4251 instruct Repl4S_evex(vecD dst, rRegI src) %{
4252   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vlbw());
4253   match(Set dst (ReplicateS src));
4254   format %{ "vpbroadcastw $dst,$src\t! replicate4S" %}
4255   ins_encode %{
4256     int vector_len = 0;
4257     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
4258   %}
4259   ins_pipe( pipe_slow );
4260 %}
4261 
4262 instruct Repl4S_mem_evex(vecD dst, memory mem) %{
4263   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vlbw());
4264   match(Set dst (ReplicateS (LoadS mem)));
4265   format %{ "vpbroadcastw  $dst,$mem\t! replicate4S" %}
4266   ins_encode %{
4267     int vector_len = 0;
4268     __ evpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
4269   %}
4270   ins_pipe( pipe_slow );
4271 %}
4272 
4273 instruct Repl8S_evex(vecX dst, rRegI src) %{
4274   predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vlbw());
4275   match(Set dst (ReplicateS src));
4276   format %{ "vpbroadcastw $dst,$src\t! replicate8S" %}
4277   ins_encode %{
4278     int vector_len = 0;
4279     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
4280   %}
4281   ins_pipe( pipe_slow );
4282 %}
4283 
4284 instruct Repl8S_mem_evex(vecX dst, memory mem) %{
4285   predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vlbw());
4286   match(Set dst (ReplicateS (LoadS mem)));
4287   format %{ "vpbroadcastw  $dst,$mem\t! replicate8S" %}
4288   ins_encode %{
4289     int vector_len = 0;
4290     __ evpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
4291   %}
4292   ins_pipe( pipe_slow );
4293 %}
4294 
4295 instruct Repl16S_evex(vecY dst, rRegI src) %{
4296   predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
4297   match(Set dst (ReplicateS src));
4298   format %{ "vpbroadcastw $dst,$src\t! replicate16S" %}
4299   ins_encode %{
4300     int vector_len = 1;
4301     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
4302   %}
4303   ins_pipe( pipe_slow );
4304 %}
4305 
4306 instruct Repl16S_mem_evex(vecY dst, memory mem) %{
4307   predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
4308   match(Set dst (ReplicateS (LoadS mem)));
4309   format %{ "vpbroadcastw  $dst,$mem\t! replicate16S" %}
4310   ins_encode %{
4311     int vector_len = 1;
4312     __ evpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
4313   %}
4314   ins_pipe( pipe_slow );
4315 %}
4316 
4317 instruct Repl32S_evex(vecZ dst, rRegI src) %{
4318   predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512bw());
4319   match(Set dst (ReplicateS src));
4320   format %{ "vpbroadcastw $dst,$src\t! replicate32S" %}
4321   ins_encode %{
4322     int vector_len = 2;
4323     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
4324   %}
4325   ins_pipe( pipe_slow );
4326 %}
4327 
4328 instruct Repl32S_mem_evex(vecZ dst, memory mem) %{
4329   predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512bw());
4330   match(Set dst (ReplicateS (LoadS mem)));
4331   format %{ "vpbroadcastw  $dst,$mem\t! replicate32S" %}
4332   ins_encode %{
4333     int vector_len = 2;
4334     __ evpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
4335   %}
4336   ins_pipe( pipe_slow );
4337 %}
4338 
4339 instruct Repl8S_imm_evex(vecX dst, immI con) %{
4340   predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vlbw());
4341   match(Set dst (ReplicateS con));
4342   format %{ "movq    $dst,[$constantaddress]\n\t"
4343             "vpbroadcastw $dst,$dst\t! replicate8S" %}
4344   ins_encode %{
4345     int vector_len = 0;
4346     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
4347     __ evpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4348   %}
4349   ins_pipe( pipe_slow );
4350 %}
4351 
4352 instruct Repl16S_imm_evex(vecY dst, immI con) %{
4353   predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
4354   match(Set dst (ReplicateS con));
4355   format %{ "movq    $dst,[$constantaddress]\n\t"
4356             "vpbroadcastw $dst,$dst\t! replicate16S" %}
4357   ins_encode %{
4358     int vector_len = 1;
4359     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
4360     __ evpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4361   %}
4362   ins_pipe( pipe_slow );
4363 %}
4364 
4365 instruct Repl32S_imm_evex(vecZ dst, immI con) %{
4366   predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512bw());
4367   match(Set dst (ReplicateS con));
4368   format %{ "movq    $dst,[$constantaddress]\n\t"
4369             "vpbroadcastw $dst,$dst\t! replicate32S" %}
4370   ins_encode %{
4371     int vector_len = 2;
4372     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
4373     __ evpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4374   %}
4375   ins_pipe( pipe_slow );
4376 %}
4377 
4378 instruct Repl32S_zero_evex(vecZ dst, immI0 zero) %{
4379   predicate(n->as_Vector()->length() == 32 && UseAVX > 2);
4380   match(Set dst (ReplicateS zero));
4381   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate32S zero" %}
4382   ins_encode %{
4383     // 512-bit vpxor requires EVEX (AVX-512); the UseAVX > 2 predicate guarantees it here.
4384     int vector_len = 2;
4385     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4386   %}
4387   ins_pipe( fpu_reg_reg );
4388 %}
4389 
4390 instruct Repl4I_evex(vecX dst, rRegI src) %{
4391   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
4392   match(Set dst (ReplicateI src));
4393   format %{ "vpbroadcastd  $dst,$src\t! replicate4I" %}
4394   ins_encode %{
4395     int vector_len = 0;
4396     __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
4397   %}
4398   ins_pipe( pipe_slow );
4399 %}
4400 
4401 instruct Repl4I_mem_evex(vecX dst, memory mem) %{
4402   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
4403   match(Set dst (ReplicateI (LoadI mem)));
4404   format %{ "vpbroadcastd  $dst,$mem\t! replicate4I" %}
4405   ins_encode %{
4406     int vector_len = 0;
4407     __ evpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
4408   %}
4409   ins_pipe( pipe_slow );
4410 %}
4411 
4412 instruct Repl8I_evex(vecY dst, rRegI src) %{
4413   predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
4414   match(Set dst (ReplicateI src));
4415   format %{ "vpbroadcastd  $dst,$src\t! replicate8I" %}
4416   ins_encode %{
4417     int vector_len = 1;
4418     __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
4419   %}
4420   ins_pipe( pipe_slow );
4421 %}
4422 
4423 instruct Repl8I_mem_evex(vecY dst, memory mem) %{
4424   predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
4425   match(Set dst (ReplicateI (LoadI mem)));
4426   format %{ "vpbroadcastd  $dst,$mem\t! replicate8I" %}
4427   ins_encode %{
4428     int vector_len = 1;
4429     __ evpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
4430   %}
4431   ins_pipe( pipe_slow );
4432 %}
4433 
4434 instruct Repl16I_evex(vecZ dst, rRegI src) %{
4435   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4436   match(Set dst (ReplicateI src));
4437   format %{ "vpbroadcastd  $dst,$src\t! replicate16I" %}
4438   ins_encode %{
4439     int vector_len = 2;
4440     __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
4441   %}
4442   ins_pipe( pipe_slow );
4443 %}
4444 
4445 instruct Repl16I_mem_evex(vecZ dst, memory mem) %{
4446   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4447   match(Set dst (ReplicateI (LoadI mem)));
4448   format %{ "vpbroadcastd  $dst,$mem\t! replicate16I" %}
4449   ins_encode %{
4450     int vector_len = 2;
4451     __ evpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
4452   %}
4453   ins_pipe( pipe_slow );
4454 %}
4455 
4456 instruct Repl4I_imm_evex(vecX dst, immI con) %{
4457   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
4458   match(Set dst (ReplicateI con));
4459   format %{ "movq    $dst,[$constantaddress]\t! replicate4I($con)\n\t"
4460             "vpbroadcastd  $dst,$dst\t! replicate4I" %}
4461   ins_encode %{
4462     int vector_len = 0;
4463     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
4464     __ evpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4465   %}
4466   ins_pipe( pipe_slow );
4467 %}
4468 
4469 instruct Repl8I_imm_evex(vecY dst, immI con) %{
4470   predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
4471   match(Set dst (ReplicateI con));
4472   format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
4473             "vpbroadcastd  $dst,$dst\t! replicate8I" %}
4474   ins_encode %{
4475     int vector_len = 1;
4476     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
4477     __ evpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4478   %}
4479   ins_pipe( pipe_slow );
4480 %}
4481 
4482 instruct Repl16I_imm_evex(vecZ dst, immI con) %{
4483   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4484   match(Set dst (ReplicateI con));
4485   format %{ "movq    $dst,[$constantaddress]\t! replicate16I($con)\n\t"
4486             "vpbroadcastd  $dst,$dst\t! replicate16I" %}
4487   ins_encode %{
4488     int vector_len = 2;
4489     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
4490     __ evpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4491   %}
4492   ins_pipe( pipe_slow );
4493 %}
4494 
4495 instruct Repl16I_zero_evex(vecZ dst, immI0 zero) %{
4496   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4497   match(Set dst (ReplicateI zero));
4498   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate16I zero" %}
4499   ins_encode %{
4500     // 512-bit vpxor requires EVEX (AVX-512); the UseAVX > 2 predicate guarantees it here.
4501     int vector_len = 2;
4502     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4503   %}
4504   ins_pipe( fpu_reg_reg );
4505 %}
4506 
4507 // Replicate long (8 byte) scalar to be vector
4508 #ifdef _LP64
4509 instruct Repl4L_evex(vecY dst, rRegL src) %{
4510   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
4511   match(Set dst (ReplicateL src));
4512   format %{ "vpbroadcastq  $dst,$src\t! replicate4L" %}
4513   ins_encode %{
4514     int vector_len = 1;
4515     __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vector_len);
4516   %}
4517   ins_pipe( pipe_slow );
4518 %}
4519 
4520 instruct Repl8L_evex(vecZ dst, rRegL src) %{
4521   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4522   match(Set dst (ReplicateL src));
4523   format %{ "vpbroadcastq  $dst,$src\t! replicate8L" %}
4524   ins_encode %{
4525     int vector_len = 2;
4526     __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vector_len);
4527   %}
4528   ins_pipe( pipe_slow );
4529 %}
4530 #else // _LP64
4531 instruct Repl4L_evex(vecY dst, eRegL src, regD tmp) %{
4532   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
4533   match(Set dst (ReplicateL src));
4534   effect(TEMP dst, USE src, TEMP tmp);
4535   format %{ "movdl   $dst,$src.lo\n\t"
4536             "movdl   $tmp,$src.hi\n\t"
4537             "punpckldq $dst,$tmp\n\t"
4538             "vpbroadcastq  $dst,$dst\t! replicate4L" %}
4539   ins_encode %{
4540     int vector_len = 1;
4541     __ movdl($dst$$XMMRegister, $src$$Register);
4542     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4543     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4544     __ evpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4545   %}
4546   ins_pipe( pipe_slow );
4547 %}
4548 
4549 instruct Repl8L_evex(vecZ dst, eRegL src, regD tmp) %{
4550   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4551   match(Set dst (ReplicateL src));
4552   effect(TEMP dst, USE src, TEMP tmp);
4553   format %{ "movdl   $dst,$src.lo\n\t"
4554             "movdl   $tmp,$src.hi\n\t"
4555             "punpckldq $dst,$tmp\n\t"
4556             "vpbroadcastq  $dst,$dst\t! replicate8L" %}
4557   ins_encode %{
4558     int vector_len = 2;
4559     __ movdl($dst$$XMMRegister, $src$$Register);
4560     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4561     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4562     __ evpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4563   %}
4564   ins_pipe( pipe_slow );
4565 %}
4566 #endif // _LP64
4567 
4568 instruct Repl4L_imm_evex(vecY dst, immL con) %{
4569   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
4570   match(Set dst (ReplicateL con));
4571   format %{ "movq    $dst,[$constantaddress]\n\t"
4572             "vpbroadcastq  $dst,$dst\t! replicate4L" %}
4573   ins_encode %{
4574     int vector_len = 1;
4575     __ movq($dst$$XMMRegister, $constantaddress($con));
4576     __ evpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4577   %}
4578   ins_pipe( pipe_slow );
4579 %}
4580 
4581 instruct Repl8L_imm_evex(vecZ dst, immL con) %{
4582   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4583   match(Set dst (ReplicateL con));
4584   format %{ "movq    $dst,[$constantaddress]\n\t"
4585             "vpbroadcastq  $dst,$dst\t! replicate8L" %}
4586   ins_encode %{
4587     int vector_len = 2;
4588     __ movq($dst$$XMMRegister, $constantaddress($con));
4589     __ evpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4590   %}
4591   ins_pipe( pipe_slow );
4592 %}
4593 
4594 instruct Repl2L_mem_evex(vecX dst, memory mem) %{
4595   predicate(n->as_Vector()->length() == 2 && VM_Version::supports_avx512vl());
4596   match(Set dst (ReplicateL (LoadL mem)));
4597   format %{ "vpbroadcastq  $dst,$mem\t! replicate2L" %}
4598   ins_encode %{
4599     int vector_len = 0;
4600     __ evpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
4601   %}
4602   ins_pipe( pipe_slow );
4603 %}
4604 
4605 instruct Repl4L_mem_evex(vecY dst, memory mem) %{
4606   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
4607   match(Set dst (ReplicateL (LoadL mem)));
4608   format %{ "vpbroadcastq  $dst,$mem\t! replicate4L" %}
4609   ins_encode %{
4610     int vector_len = 1;
4611     __ evpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
4612   %}
4613   ins_pipe( pipe_slow );
4614 %}
4615 
4616 instruct Repl8L_mem_evex(vecZ dst, memory mem) %{
4617   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4618   match(Set dst (ReplicateL (LoadL mem)));
4619   format %{ "vpbroadcastq  $dst,$mem\t! replicate8L" %}
4620   ins_encode %{
4621     int vector_len = 2;
4622     __ evpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
4623   %}
4624   ins_pipe( pipe_slow );
4625 %}
4626 
4627 instruct Repl8L_zero_evex(vecZ dst, immL0 zero) %{
4628   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4629   match(Set dst (ReplicateL zero));
4630   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate8L zero" %}
4631   ins_encode %{
4632     // 512-bit vpxor requires EVEX (AVX-512); the UseAVX > 2 predicate guarantees it here.
4633     int vector_len = 2;
4634     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4635   %}
4636   ins_pipe( fpu_reg_reg );
4637 %}
4638 
4639 instruct Repl8F_evex(vecY dst, regF src) %{
4640   predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
4641   match(Set dst (ReplicateF src));
4642   format %{ "vbroadcastss $dst,$src\t! replicate8F" %}
4643   ins_encode %{
4644     int vector_len = 1;
4645     __ evpbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vector_len);
4646   %}
4647   ins_pipe( pipe_slow );
4648 %}
4649 
4650 instruct Repl8F_mem_evex(vecY dst, memory mem) %{
4651   predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
4652   match(Set dst (ReplicateF (LoadF mem)));
4653   format %{ "vbroadcastss  $dst,$mem\t! replicate8F" %}
4654   ins_encode %{
4655     int vector_len = 1;
4656     __ evpbroadcastss($dst$$XMMRegister, $mem$$Address, vector_len);
4657   %}
4658   ins_pipe( pipe_slow );
4659 %}
4660 
4661 instruct Repl16F_evex(vecZ dst, regF src) %{
4662   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4663   match(Set dst (ReplicateF src));
4664   format %{ "vbroadcastss $dst,$src\t! replicate16F" %}
4665   ins_encode %{
4666     int vector_len = 2;
4667     __ evpbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vector_len);
4668   %}
4669   ins_pipe( pipe_slow );
4670 %}
4671 
4672 instruct Repl16F_mem_evex(vecZ dst, memory mem) %{
4673   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4674   match(Set dst (ReplicateF (LoadF mem)));
4675   format %{ "vbroadcastss  $dst,$mem\t! replicate16F" %}
4676   ins_encode %{
4677     int vector_len = 2;
4678     __ evpbroadcastss($dst$$XMMRegister, $mem$$Address, vector_len);
4679   %}
4680   ins_pipe( pipe_slow );
4681 %}
4682 
4683 instruct Repl2F_zero_evex(vecD dst, immF0 zero) %{
4684   predicate(n->as_Vector()->length() == 2 && UseAVX > 2);
4685   match(Set dst (ReplicateF zero));
4686   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate2F zero" %}
4687   ins_encode %{
4688     // Use vpxor instead of vxorps: the EVEX 512-bit form of vxorps requires AVX512DQ, and this is a 512-bit operation.
4689     int vector_len = 2;
4690     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4691   %}
4692   ins_pipe( fpu_reg_reg );
4693 %}
4694 
4695 instruct Repl4F_zero_evex(vecX dst, immF0 zero) %{
4696   predicate(n->as_Vector()->length() == 4 && UseAVX > 2);
4697   match(Set dst (ReplicateF zero));
4698   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate4F zero" %}
4699   ins_encode %{
4700     // Use vpxor instead of vxorps: the EVEX 512-bit form of vxorps requires AVX512DQ, and this is a 512-bit operation.
4701     int vector_len = 2;
4702     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4703   %}
4704   ins_pipe( fpu_reg_reg );
4705 %}
4706 
4707 instruct Repl8F_zero_evex(vecY dst, immF0 zero) %{
4708   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4709   match(Set dst (ReplicateF zero));
4710   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate8F zero" %}
4711   ins_encode %{
4712     // Use vpxor instead of vxorps: the EVEX 512-bit form of vxorps requires AVX512DQ, and this is a 512-bit operation.
4713     int vector_len = 2;
4714     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4715   %}
4716   ins_pipe( fpu_reg_reg );
4717 %}
4718 
4719 instruct Repl16F_zero_evex(vecZ dst, immF0 zero) %{
4720   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4721   match(Set dst (ReplicateF zero));
4722   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate16F zero" %}
4723   ins_encode %{
4724     // Use vpxor instead of vxorps: the EVEX 512-bit form of vxorps requires AVX512DQ, and this is a 512-bit operation.
4725     int vector_len = 2;
4726     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4727   %}
4728   ins_pipe( fpu_reg_reg );
4729 %}
4730 
4731 instruct Repl4D_evex(vecY dst, regD src) %{
4732   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
4733   match(Set dst (ReplicateD src));
4734   format %{ "vbroadcastsd $dst,$src\t! replicate4D" %}
4735   ins_encode %{
4736     int vector_len = 1;
4737     __ evpbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
4738   %}
4739   ins_pipe( pipe_slow );
4740 %}
4741 
4742 instruct Repl4D_mem_evex(vecY dst, memory mem) %{
4743   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
4744   match(Set dst (ReplicateD (LoadD mem)));
4745   format %{ "vbroadcastsd  $dst,$mem\t! replicate4D" %}
4746   ins_encode %{
4747     int vector_len = 1;
4748     __ evpbroadcastsd($dst$$XMMRegister, $mem$$Address, vector_len);
4749   %}
4750   ins_pipe( pipe_slow );
4751 %}
4752 
4753 instruct Repl8D_evex(vecZ dst, regD src) %{
4754   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4755   match(Set dst (ReplicateD src));
4756   format %{ "vbroadcastsd $dst,$src\t! replicate8D" %}
4757   ins_encode %{
4758     int vector_len = 2;
4759     __ evpbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
4760   %}
4761   ins_pipe( pipe_slow );
4762 %}
4763 
4764 instruct Repl8D_mem_evex(vecZ dst, memory mem) %{
4765   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4766   match(Set dst (ReplicateD (LoadD mem)));
4767   format %{ "vbroadcastsd  $dst,$mem\t! replicate8D" %}
4768   ins_encode %{
4769     int vector_len = 2;
4770     __ evpbroadcastsd($dst$$XMMRegister, $mem$$Address, vector_len);
4771   %}
4772   ins_pipe( pipe_slow );
4773 %}
4774 
4775 instruct Repl2D_zero_evex(vecX dst, immD0 zero) %{
4776   predicate(n->as_Vector()->length() == 2 && UseAVX > 2);
4777   match(Set dst (ReplicateD zero));
4778   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate2D zero" %}
4779   ins_encode %{
4780     // Use vpxor instead of vxorpd: the EVEX 512-bit form of vxorpd requires AVX512DQ, and this is a 512-bit operation.
4781     int vector_len = 2;
4782     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4783   %}
4784   ins_pipe( fpu_reg_reg );
4785 %}
4786 
4787 instruct Repl4D_zero_evex(vecY dst, immD0 zero) %{
4788   predicate(n->as_Vector()->length() == 4 && UseAVX > 2);
4789   match(Set dst (ReplicateD zero));
4790   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate4D zero" %}
4791   ins_encode %{
4792     // Use vpxor instead of vxorpd: the EVEX 512-bit form of vxorpd requires AVX512DQ, and this is a 512-bit operation.
4793     int vector_len = 2;
4794     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4795   %}
4796   ins_pipe( fpu_reg_reg );
4797 %}
4798 
4799 instruct Repl8D_zero_evex(vecZ dst, immD0 zero) %{
4800   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4801   match(Set dst (ReplicateD zero));
4802   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate8D zero" %}
4803   ins_encode %{
4804     // Use vpxor instead of vxorpd: the EVEX 512-bit form of vxorpd requires AVX512DQ, and this is a 512-bit operation.
4805     int vector_len = 2;
4806     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4807   %}
4808   ins_pipe( fpu_reg_reg );
4809 %}
4810 
4811 // ====================REDUCTION ARITHMETIC=======================================
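// Each AddReduction rule below folds a vector into the incoming scalar
// accumulator. Conceptually (a sketch, assuming a 4-lane int vector):
//   int result = src1;
//   for (int i = 0; i < 4; i++) result += src2[i];
// The SSE rules fold lanes pairwise with phaddd; the AVX-512 rules use
// pshufd + vpaddd shuffles instead of the horizontal add; the 256/512-bit
// forms first pull the upper halves down with vextracti128_high /
// vextracti64x4_high.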
4812 
4813 instruct rsadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
4814   predicate(UseSSE > 2 && UseAVX == 0);
4815   match(Set dst (AddReductionVI src1 src2));
4816   effect(TEMP tmp2, TEMP tmp);
4817   format %{ "movdqu  $tmp2,$src2\n\t"
4818             "phaddd  $tmp2,$tmp2\n\t"
4819             "movd    $tmp,$src1\n\t"
4820             "paddd   $tmp,$tmp2\n\t"
4821             "movd    $dst,$tmp\t! add reduction2I" %}
4822   ins_encode %{
4823     __ movdqu($tmp2$$XMMRegister, $src2$$XMMRegister);
4824     __ phaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister);
4825     __ movdl($tmp$$XMMRegister, $src1$$Register);
4826     __ paddd($tmp$$XMMRegister, $tmp2$$XMMRegister);
4827     __ movdl($dst$$Register, $tmp$$XMMRegister);
4828   %}
4829   ins_pipe( pipe_slow );
4830 %}
4831 
4832 instruct rvadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
4833   predicate(VM_Version::supports_avxonly());
4834   match(Set dst (AddReductionVI src1 src2));
4835   effect(TEMP tmp, TEMP tmp2);
4836   format %{ "vphaddd  $tmp,$src2,$src2\n\t"
4837             "movd     $tmp2,$src1\n\t"
4838             "vpaddd   $tmp2,$tmp2,$tmp\n\t"
4839             "movd     $dst,$tmp2\t! add reduction2I" %}
4840   ins_encode %{
4841     int vector_len = 0;
4842     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
4843     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4844     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
4845     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4846   %}
4847   ins_pipe( pipe_slow );
4848 %}
4849 
4850 instruct rvadd2I_reduction_reg_evex(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
4851   predicate(UseAVX > 2);
4852   match(Set dst (AddReductionVI src1 src2));
4853   effect(TEMP tmp, TEMP tmp2);
4854   format %{ "pshufd  $tmp2,$src2,0x1\n\t"
4855             "vpaddd  $tmp,$src2,$tmp2\n\t"
4856             "movd    $tmp2,$src1\n\t"
4857             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
4858             "movd    $dst,$tmp2\t! add reduction2I" %}
4859   ins_encode %{
4860     int vector_len = 0;
4861     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
4862     __ vpaddd($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4863     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4864     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4865     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4866   %}
4867   ins_pipe( pipe_slow );
4868 %}
4869 
4870 instruct rsadd4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
4871   predicate(UseSSE > 2 && UseAVX == 0);
4872   match(Set dst (AddReductionVI src1 src2));
4873   effect(TEMP tmp, TEMP tmp2);
4874   format %{ "movdqu  $tmp,$src2\n\t"
4875             "phaddd  $tmp,$tmp\n\t"
4876             "phaddd  $tmp,$tmp\n\t"
4877             "movd    $tmp2,$src1\n\t"
4878             "paddd   $tmp2,$tmp\n\t"
4879             "movd    $dst,$tmp2\t! add reduction4I" %}
4880   ins_encode %{
4881     __ movdqu($tmp$$XMMRegister, $src2$$XMMRegister);
4882     __ phaddd($tmp$$XMMRegister, $tmp$$XMMRegister);
4883     __ phaddd($tmp$$XMMRegister, $tmp$$XMMRegister);
4884     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4885     __ paddd($tmp2$$XMMRegister, $tmp$$XMMRegister);
4886     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4887   %}
4888   ins_pipe( pipe_slow );
4889 %}
4890 
4891 instruct rvadd4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
4892   predicate(VM_Version::supports_avxonly());
4893   match(Set dst (AddReductionVI src1 src2));
4894   effect(TEMP tmp, TEMP tmp2);
4895   format %{ "vphaddd  $tmp,$src2,$src2\n\t"
4896             "vphaddd  $tmp,$tmp,$tmp\n\t"
4897             "movd     $tmp2,$src1\n\t"
4898             "vpaddd   $tmp2,$tmp2,$tmp\n\t"
4899             "movd     $dst,$tmp2\t! add reduction4I" %}
4900   ins_encode %{
4901     int vector_len = 0;
4902     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
4903     __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
4904     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4905     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
4906     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4907   %}
4908   ins_pipe( pipe_slow );
4909 %}
4910 
4911 instruct rvadd4I_reduction_reg_evex(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
4912   predicate(UseAVX > 2);
4913   match(Set dst (AddReductionVI src1 src2));
4914   effect(TEMP tmp, TEMP tmp2);
4915   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
4916             "vpaddd  $tmp,$src2,$tmp2\n\t"
4917             "pshufd  $tmp2,$tmp,0x1\n\t"
4918             "vpaddd  $tmp,$tmp,$tmp2\n\t"
4919             "movd    $tmp2,$src1\n\t"
4920             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
4921             "movd    $dst,$tmp2\t! add reduction4I" %}
4922   ins_encode %{
4923     int vector_len = 0;
4924     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
4925     __ vpaddd($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4926     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
4927     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4928     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4929     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4930     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4931   %}
4932   ins_pipe( pipe_slow );
4933 %}
4934 
4935 instruct rvadd8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
4936   predicate(VM_Version::supports_avxonly());
4937   match(Set dst (AddReductionVI src1 src2));
4938   effect(TEMP tmp, TEMP tmp2);
4939   format %{ "vphaddd  $tmp,$src2,$src2\n\t"
4940             "vphaddd  $tmp,$tmp,$tmp2\n\t"
4941             "vextracti128_high  $tmp2,$tmp\n\t"
4942             "vpaddd   $tmp,$tmp,$tmp2\n\t"
4943             "movd     $tmp2,$src1\n\t"
4944             "vpaddd   $tmp2,$tmp2,$tmp\n\t"
4945             "movd     $dst,$tmp2\t! add reduction8I" %}
4946   ins_encode %{
4947     int vector_len = 1;
4948     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
4949     __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4950     __ vextracti128_high($tmp2$$XMMRegister, $tmp$$XMMRegister);
4951     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
4952     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4953     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4954     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4955   %}
4956   ins_pipe( pipe_slow );
4957 %}
4958 
4959 instruct rvadd8I_reduction_reg_evex(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
4960   predicate(UseAVX > 2);
4961   match(Set dst (AddReductionVI src1 src2));
4962   effect(TEMP tmp, TEMP tmp2);
4963   format %{ "vextracti128_high  $tmp,$src2\n\t"
4964             "vpaddd  $tmp,$tmp,$src2\n\t"
4965             "pshufd  $tmp2,$tmp,0xE\n\t"
4966             "vpaddd  $tmp,$tmp,$tmp2\n\t"
4967             "pshufd  $tmp2,$tmp,0x1\n\t"
4968             "vpaddd  $tmp,$tmp,$tmp2\n\t"
4969             "movd    $tmp2,$src1\n\t"
4970             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
4971             "movd    $dst,$tmp2\t! add reduction8I" %}
4972   ins_encode %{
4973     int vector_len = 0;
4974     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
4975     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
4976     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
4977     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4978     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
4979     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4980     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4981     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4982     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4983   %}
4984   ins_pipe( pipe_slow );
4985 %}
4986 
4987 instruct rvadd16I_reduction_reg_evex(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, regF tmp3) %{
4988   predicate(UseAVX > 2);
4989   match(Set dst (AddReductionVI src1 src2));
4990   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
4991   format %{ "vextracti64x4_high  $tmp3,$src2\n\t"
4992             "vpaddd  $tmp3,$tmp3,$src2\n\t"
4993             "vextracti128_high  $tmp,$tmp3\n\t"
4994             "vpaddd  $tmp,$tmp,$tmp3\n\t"
4995             "pshufd  $tmp2,$tmp,0xE\n\t"
4996             "vpaddd  $tmp,$tmp,$tmp2\n\t"
4997             "pshufd  $tmp2,$tmp,0x1\n\t"
4998             "vpaddd  $tmp,$tmp,$tmp2\n\t"
4999             "movd    $tmp2,$src1\n\t"
5000             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
5001             "movd    $dst,$tmp2\t! add reduction16I" %}
5002   ins_encode %{
5003     __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
5004     __ vpaddd($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
5005     __ vextracti128_high($tmp$$XMMRegister, $tmp3$$XMMRegister);
5006     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
5007     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
5008     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5009     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5010     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5011     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5012     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5013     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5014   %}
5015   ins_pipe( pipe_slow );
5016 %}
5017 
5018 #ifdef _LP64
5019 instruct rvadd2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, regF tmp, regF tmp2) %{
5020   predicate(UseAVX > 2);
5021   match(Set dst (AddReductionVL src1 src2));
5022   effect(TEMP tmp, TEMP tmp2);
5023   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
5024             "vpaddq  $tmp,$src2,$tmp2\n\t"
5025             "movdq   $tmp2,$src1\n\t"
5026             "vpaddq  $tmp2,$tmp,$tmp2\n\t"
5027             "movdq   $dst,$tmp2\t! add reduction2L" %}
5028   ins_encode %{
5029     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5030     __ vpaddq($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, 0);
5031     __ movdq($tmp2$$XMMRegister, $src1$$Register);
5032     __ vpaddq($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5033     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5034   %}
5035   ins_pipe( pipe_slow );
5036 %}
5037 
5038 instruct rvadd4L_reduction_reg(rRegL dst, rRegL src1, vecY src2, regF tmp, regF tmp2) %{
5039   predicate(UseAVX > 2);
5040   match(Set dst (AddReductionVL src1 src2));
5041   effect(TEMP tmp, TEMP tmp2);
5042   format %{ "vextracti128_high  $tmp,$src2\n\t"
5043             "vpaddq  $tmp2,$tmp,$src2\n\t"
5044             "pshufd  $tmp,$tmp2,0xE\n\t"
5045             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
5046             "movdq   $tmp,$src1\n\t"
5047             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
5048             "movdq   $dst,$tmp2\t! add reduction4L" %}
5049   ins_encode %{
5050     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
5051     __ vpaddq($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
5052     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5053     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5054     __ movdq($tmp$$XMMRegister, $src1$$Register);
5055     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5056     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5057   %}
5058   ins_pipe( pipe_slow );
5059 %}
5060 
5061 instruct rvadd8L_reduction_reg(rRegL dst, rRegL src1, vecZ src2, regF tmp, regF tmp2) %{
5062   predicate(UseAVX > 2);
5063   match(Set dst (AddReductionVL src1 src2));
5064   effect(TEMP tmp, TEMP tmp2);
5065   format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
5066             "vpaddq  $tmp2,$tmp2,$src2\n\t"
5067             "vextracti128_high  $tmp,$tmp2\n\t"
5068             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
5069             "pshufd  $tmp,$tmp2,0xE\n\t"
5070             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
5071             "movdq   $tmp,$src1\n\t"
5072             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
5073             "movdq   $dst,$tmp2\t! add reduction8L" %}
5074   ins_encode %{
5075     __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
5076     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
5077     __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
5078     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5079     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5080     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5081     __ movdq($tmp$$XMMRegister, $src1$$Register);
5082     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5083     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5084   %}
5085   ins_pipe( pipe_slow );
5086 %}
5087 #endif
5088 
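// The FP add reductions below accumulate strictly one lane at a time
// (addss/addsd chains driven by pshufd lane selects) rather than using a
// horizontal add, presumably to preserve the sequential accumulation order of
// the original loop; note that $dst doubles as the incoming scalar
// accumulator (match(Set dst (AddReductionVF dst src2))).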
5089 instruct rsadd2F_reduction_reg(regF dst, vecD src2, regF tmp) %{
5090   predicate(UseSSE >= 1 && UseAVX == 0);
5091   match(Set dst (AddReductionVF dst src2));
5092   effect(TEMP dst, TEMP tmp);
5093   format %{ "addss   $dst,$src2\n\t"
5094             "pshufd  $tmp,$src2,0x01\n\t"
5095             "addss   $dst,$tmp\t! add reduction2F" %}
5096   ins_encode %{
5097     __ addss($dst$$XMMRegister, $src2$$XMMRegister);
5098     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5099     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
5100   %}
5101   ins_pipe( pipe_slow );
5102 %}
5103 
5104 instruct rvadd2F_reduction_reg(regF dst, vecD src2, regF tmp) %{
5105   predicate(UseAVX > 0);
5106   match(Set dst (AddReductionVF dst src2));
5107   effect(TEMP dst, TEMP tmp);
5108   format %{ "vaddss  $dst,$dst,$src2\n\t"
5109             "pshufd  $tmp,$src2,0x01\n\t"
5110             "vaddss  $dst,$dst,$tmp\t! add reduction2F" %}
5111   ins_encode %{
5112     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5113     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5114     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5115   %}
5116   ins_pipe( pipe_slow );
5117 %}
5118 
5119 instruct rsadd4F_reduction_reg(regF dst, vecX src2, regF tmp) %{
5120   predicate(UseSSE >= 1 && UseAVX == 0);
5121   match(Set dst (AddReductionVF dst src2));
5122   effect(TEMP dst, TEMP tmp);
5123   format %{ "addss   $dst,$src2\n\t"
5124             "pshufd  $tmp,$src2,0x01\n\t"
5125             "addss   $dst,$tmp\n\t"
5126             "pshufd  $tmp,$src2,0x02\n\t"
5127             "addss   $dst,$tmp\n\t"
5128             "pshufd  $tmp,$src2,0x03\n\t"
5129             "addss   $dst,$tmp\t! add reduction4F" %}
5130   ins_encode %{
5131     __ addss($dst$$XMMRegister, $src2$$XMMRegister);
5132     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5133     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
5134     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5135     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
5136     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5137     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
5138   %}
5139   ins_pipe( pipe_slow );
5140 %}
5141 
5142 instruct rvadd4F_reduction_reg(regF dst, vecX src2, regF tmp) %{
5143   predicate(UseAVX > 0);
5144   match(Set dst (AddReductionVF dst src2));
5145   effect(TEMP tmp, TEMP dst);
5146   format %{ "vaddss  $dst,$dst,$src2\n\t"
5147             "pshufd  $tmp,$src2,0x01\n\t"
5148             "vaddss  $dst,$dst,$tmp\n\t"
5149             "pshufd  $tmp,$src2,0x02\n\t"
5150             "vaddss  $dst,$dst,$tmp\n\t"
5151             "pshufd  $tmp,$src2,0x03\n\t"
5152             "vaddss  $dst,$dst,$tmp\t! add reduction4F" %}
5153   ins_encode %{
5154     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5155     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5156     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5157     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5158     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5159     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5160     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5161   %}
5162   ins_pipe( pipe_slow );
5163 %}
5164 
5165 instruct radd8F_reduction_reg(regF dst, vecY src2, regF tmp, regF tmp2) %{
5166   predicate(UseAVX > 0);
5167   match(Set dst (AddReductionVF dst src2));
5168   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5169   format %{ "vaddss  $dst,$dst,$src2\n\t"
5170             "pshufd  $tmp,$src2,0x01\n\t"
5171             "vaddss  $dst,$dst,$tmp\n\t"
5172             "pshufd  $tmp,$src2,0x02\n\t"
5173             "vaddss  $dst,$dst,$tmp\n\t"
5174             "pshufd  $tmp,$src2,0x03\n\t"
5175             "vaddss  $dst,$dst,$tmp\n\t"
5176             "vextractf128_high  $tmp2,$src2\n\t"
5177             "vaddss  $dst,$dst,$tmp2\n\t"
5178             "pshufd  $tmp,$tmp2,0x01\n\t"
5179             "vaddss  $dst,$dst,$tmp\n\t"
5180             "pshufd  $tmp,$tmp2,0x02\n\t"
5181             "vaddss  $dst,$dst,$tmp\n\t"
5182             "pshufd  $tmp,$tmp2,0x03\n\t"
5183             "vaddss  $dst,$dst,$tmp\t! add reduction8F" %}
5184   ins_encode %{
5185     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5186     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5187     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5188     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5189     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5190     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5191     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5192     __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
5193     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5194     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5195     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5196     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5197     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5198     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5199     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5200   %}
5201   ins_pipe( pipe_slow );
5202 %}
5203 
5204 instruct radd16F_reduction_reg(regF dst, vecZ src2, regF tmp, regF tmp2) %{
5205   predicate(UseAVX > 2);
5206   match(Set dst (AddReductionVF dst src2));
5207   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5208   format %{ "vaddss  $dst,$dst,$src2\n\t"
5209             "pshufd  $tmp,$src2,0x01\n\t"
5210             "vaddss  $dst,$dst,$tmp\n\t"
5211             "pshufd  $tmp,$src2,0x02\n\t"
5212             "vaddss  $dst,$dst,$tmp\n\t"
5213             "pshufd  $tmp,$src2,0x03\n\t"
5214             "vaddss  $dst,$dst,$tmp\n\t"
5215             "vextractf32x4  $tmp2,$src2,0x1\n\t"
5216             "vaddss  $dst,$dst,$tmp2\n\t"
5217             "pshufd  $tmp,$tmp2,0x01\n\t"
5218             "vaddss  $dst,$dst,$tmp\n\t"
5219             "pshufd  $tmp,$tmp2,0x02\n\t"
5220             "vaddss  $dst,$dst,$tmp\n\t"
5221             "pshufd  $tmp,$tmp2,0x03\n\t"
5222             "vaddss  $dst,$dst,$tmp\n\t"
5223             "vextractf32x4  $tmp2,$src2,0x2\n\t"
5224             "vaddss  $dst,$dst,$tmp2\n\t"
5225             "pshufd  $tmp,$tmp2,0x01\n\t"
5226             "vaddss  $dst,$dst,$tmp\n\t"
5227             "pshufd  $tmp,$tmp2,0x02\n\t"
5228             "vaddss  $dst,$dst,$tmp\n\t"
5229             "pshufd  $tmp,$tmp2,0x03\n\t"
5230             "vaddss  $dst,$dst,$tmp\n\t"
5231             "vextractf32x4  $tmp2,$src2,0x3\n\t"
5232             "vaddss  $dst,$dst,$tmp2\n\t"
5233             "pshufd  $tmp,$tmp2,0x01\n\t"
5234             "vaddss  $dst,$dst,$tmp\n\t"
5235             "pshufd  $tmp,$tmp2,0x02\n\t"
5236             "vaddss  $dst,$dst,$tmp\n\t"
5237             "pshufd  $tmp,$tmp2,0x03\n\t"
5238             "vaddss  $dst,$dst,$tmp\t! add reduction16F" %}
5239   ins_encode %{
5240     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5241     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5242     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5243     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5244     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5245     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5246     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5247     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5248     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5249     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5250     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5251     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5252     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5253     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5254     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5255     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5256     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5257     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5258     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5259     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5260     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5261     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5262     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5263     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5264     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5265     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5266     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5267     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5268     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5269     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5270     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5271   %}
5272   ins_pipe( pipe_slow );
5273 %}
5274 
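// Double add reductions follow the same pattern with addsd/vaddsd: pshufd with
// immediate 0xE moves the upper 64-bit lane into position 0, and the wider forms
// pull each upper 128-bit quarter of the YMM/ZMM source down with vextractf32x4
// before folding it in.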
5275 instruct rsadd2D_reduction_reg(regD dst, vecX src2, regD tmp) %{
5276   predicate(UseSSE >= 1 && UseAVX == 0);
5277   match(Set dst (AddReductionVD dst src2));
5278   effect(TEMP tmp, TEMP dst);
5279   format %{ "addsd   $dst,$src2\n\t"
5280             "pshufd  $tmp,$src2,0xE\n\t"
5281             "addsd   $dst,$tmp\t! add reduction2D" %}
5282   ins_encode %{
5283     __ addsd($dst$$XMMRegister, $src2$$XMMRegister);
5284     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5285     __ addsd($dst$$XMMRegister, $tmp$$XMMRegister);
5286   %}
5287   ins_pipe( pipe_slow );
5288 %}
5289 
5290 instruct rvadd2D_reduction_reg(regD dst, vecX src2, regD tmp) %{
5291   predicate(UseAVX > 0);
5292   match(Set dst (AddReductionVD dst src2));
5293   effect(TEMP tmp, TEMP dst);
5294   format %{ "vaddsd  $dst,$dst,$src2\n\t"
5295             "pshufd  $tmp,$src2,0xE\n\t"
5296             "vaddsd  $dst,$dst,$tmp\t! add reduction2D" %}
5297   ins_encode %{
5298     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5299     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5300     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5301   %}
5302   ins_pipe( pipe_slow );
5303 %}
5304 
5305 instruct rvadd4D_reduction_reg(regD dst, vecY src2, regD tmp, regD tmp2) %{
5306   predicate(UseAVX > 0);
5307   match(Set dst (AddReductionVD dst src2));
5308   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5309   format %{ "vaddsd  $dst,$dst,$src2\n\t"
5310             "pshufd  $tmp,$src2,0xE\n\t"
5311             "vaddsd  $dst,$dst,$tmp\n\t"
5312             "vextractf32x4  $tmp2,$src2,0x1\n\t"
5313             "vaddsd  $dst,$dst,$tmp2\n\t"
5314             "pshufd  $tmp,$tmp2,0xE\n\t"
5315             "vaddsd  $dst,$dst,$tmp\t! add reduction4D" %}
5316   ins_encode %{
5317     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5318     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5319     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5320     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5321     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5322     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5323     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5324   %}
5325   ins_pipe( pipe_slow );
5326 %}
5327 
5328 instruct rvadd8D_reduction_reg(regD dst, vecZ src2, regD tmp, regD tmp2) %{
5329   predicate(UseAVX > 2);
5330   match(Set dst (AddReductionVD dst src2));
5331   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5332   format %{ "vaddsd  $dst,$dst,$src2\n\t"
5333             "pshufd  $tmp,$src2,0xE\n\t"
5334             "vaddsd  $dst,$dst,$tmp\n\t"
5335             "vextractf32x4  $tmp2,$src2,0x1\n\t"
5336             "vaddsd  $dst,$dst,$tmp2\n\t"
5337             "pshufd  $tmp,$tmp2,0xE\n\t"
5338             "vaddsd  $dst,$dst,$tmp\n\t"
5339             "vextractf32x4  $tmp2,$src2,0x2\n\t"
5340             "vaddsd  $dst,$dst,$tmp2\n\t"
5341             "pshufd  $tmp,$tmp2,0xE\n\t"
5342             "vaddsd  $dst,$dst,$tmp\n\t"
5343             "vextractf32x4  $tmp2,$src2,0x3\n\t"
5344             "vaddsd  $dst,$dst,$tmp2\n\t"
5345             "pshufd  $tmp,$tmp2,0xE\n\t"
5346             "vaddsd  $dst,$dst,$tmp\t! add reduction8D" %}
5347   ins_encode %{
5348     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5349     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5350     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5351     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5352     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5353     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5354     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5355     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5356     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5357     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5358     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5359     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5360     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5361     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5362     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5363   %}
5364   ins_pipe( pipe_slow );
5365 %}
5366 
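// Integer multiply reductions: the vector lanes are folded pairwise with
// pmulld/vpmulld (wider sources are first narrowed with vextracti128_high /
// vextracti64x4_high), and the scalar src1 is multiplied in last through an XMM
// temporary loaded via movdl.  Roughly equivalent scalar form (illustrative
// sketch only, not emitted code):
//   int acc = src1;
//   for (int i = 0; i < n; i++) acc *= src2[i];
//   dst = acc;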
5367 instruct rsmul2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
5368   predicate(UseSSE > 3 && UseAVX == 0);
5369   match(Set dst (MulReductionVI src1 src2));
5370   effect(TEMP tmp, TEMP tmp2);
5371   format %{ "pshufd  $tmp2,$src2,0x1\n\t"
5372             "pmulld  $tmp2,$src2\n\t"
5373             "movd    $tmp,$src1\n\t"
5374             "pmulld  $tmp2,$tmp\n\t"
5375             "movd    $dst,$tmp2\t! mul reduction2I" %}
5376   ins_encode %{
5377     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5378     __ pmulld($tmp2$$XMMRegister, $src2$$XMMRegister);
5379     __ movdl($tmp$$XMMRegister, $src1$$Register);
5380     __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
5381     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5382   %}
5383   ins_pipe( pipe_slow );
5384 %}
5385 
5386 instruct rvmul2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
5387   predicate(UseAVX > 0);
5388   match(Set dst (MulReductionVI src1 src2));
5389   effect(TEMP tmp, TEMP tmp2);
5390   format %{ "pshufd   $tmp2,$src2,0x1\n\t"
5391             "vpmulld  $tmp,$src2,$tmp2\n\t"
5392             "movd     $tmp2,$src1\n\t"
5393             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
5394             "movd     $dst,$tmp2\t! mul reduction2I" %}
5395   ins_encode %{
5396     int vector_len = 0;
5397     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5398     __ vpmulld($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5399     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5400     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5401     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5402   %}
5403   ins_pipe( pipe_slow );
5404 %}
5405 
5406 instruct rsmul4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
5407   predicate(UseSSE > 3 && UseAVX == 0);
5408   match(Set dst (MulReductionVI src1 src2));
5409   effect(TEMP tmp, TEMP tmp2);
5410   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
5411             "pmulld  $tmp2,$src2\n\t"
5412             "pshufd  $tmp,$tmp2,0x1\n\t"
5413             "pmulld  $tmp2,$tmp\n\t"
5414             "movd    $tmp,$src1\n\t"
5415             "pmulld  $tmp2,$tmp\n\t"
5416             "movd    $dst,$tmp2\t! mul reduction4I" %}
5417   ins_encode %{
5418     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5419     __ pmulld($tmp2$$XMMRegister, $src2$$XMMRegister);
5420     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x1);
5421     __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
5422     __ movdl($tmp$$XMMRegister, $src1$$Register);
5423     __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
5424     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5425   %}
5426   ins_pipe( pipe_slow );
5427 %}
5428 
5429 instruct rvmul4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
5430   predicate(UseAVX > 0);
5431   match(Set dst (MulReductionVI src1 src2));
5432   effect(TEMP tmp, TEMP tmp2);
5433   format %{ "pshufd   $tmp2,$src2,0xE\n\t"
5434             "vpmulld  $tmp,$src2,$tmp2\n\t"
5435             "pshufd   $tmp2,$tmp,0x1\n\t"
5436             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5437             "movd     $tmp2,$src1\n\t"
5438             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
5439             "movd     $dst,$tmp2\t! mul reduction4I" %}
5440   ins_encode %{
5441     int vector_len = 0;
5442     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5443     __ vpmulld($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5444     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5445     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5446     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5447     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5448     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5449   %}
5450   ins_pipe( pipe_slow );
5451 %}
5452 
5453 instruct rvmul8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
5454   predicate(UseAVX > 0);
5455   match(Set dst (MulReductionVI src1 src2));
5456   effect(TEMP tmp, TEMP tmp2);
5457   format %{ "vextracti128_high  $tmp,$src2\n\t"
5458             "vpmulld  $tmp,$tmp,$src2\n\t"
5459             "pshufd   $tmp2,$tmp,0xE\n\t"
5460             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5461             "pshufd   $tmp2,$tmp,0x1\n\t"
5462             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5463             "movd     $tmp2,$src1\n\t"
5464             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
5465             "movd     $dst,$tmp2\t! mul reduction8I" %}
5466   ins_encode %{
5467     int vector_len = 0;
5468     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
5469     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
5470     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
5471     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5472     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5473     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5474     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5475     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5476     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5477   %}
5478   ins_pipe( pipe_slow );
5479 %}
5480 
5481 instruct rvmul16I_reduction_reg(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, regF tmp3) %{
5482   predicate(UseAVX > 2);
5483   match(Set dst (MulReductionVI src1 src2));
5484   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
5485   format %{ "vextracti64x4_high  $tmp3,$src2\n\t"
5486             "vpmulld  $tmp3,$tmp3,$src2\n\t"
5487             "vextracti128_high  $tmp,$tmp3\n\t"
5488             "vpmulld  $tmp,$tmp,$tmp3\n\t"
5489             "pshufd   $tmp2,$tmp,0xE\n\t"
5490             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5491             "pshufd   $tmp2,$tmp,0x1\n\t"
5492             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5493             "movd     $tmp2,$src1\n\t"
5494             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
5495             "movd     $dst,$tmp2\t! mul reduction16I" %}
5496   ins_encode %{
5497     __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
5498     __ vpmulld($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
5499     __ vextracti128_high($tmp$$XMMRegister, $tmp3$$XMMRegister);
5500     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
5501     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
5502     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5503     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5504     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5505     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5506     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5507     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5508   %}
5509   ins_pipe( pipe_slow );
5510 %}
5511 
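// Long multiply reductions need vpmullq, hence the AVX512DQ predicate, and a
// 64-bit general register for the scalar input and result, hence the _LP64 guard.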
5512 #ifdef _LP64
5513 instruct rvmul2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, regF tmp, regF tmp2) %{
5514   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
5515   match(Set dst (MulReductionVL src1 src2));
5516   effect(TEMP tmp, TEMP tmp2);
5517   format %{ "pshufd   $tmp2,$src2,0xE\n\t"
5518             "vpmullq  $tmp,$src2,$tmp2\n\t"
5519             "movdq    $tmp2,$src1\n\t"
5520             "vpmullq  $tmp2,$tmp,$tmp2\n\t"
5521             "movdq    $dst,$tmp2\t! mul reduction2L" %}
5522   ins_encode %{
5523     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5524     __ vpmullq($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, 0);
5525     __ movdq($tmp2$$XMMRegister, $src1$$Register);
5526     __ vpmullq($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5527     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5528   %}
5529   ins_pipe( pipe_slow );
5530 %}
5531 
5532 instruct rvmul4L_reduction_reg(rRegL dst, rRegL src1, vecY src2, regF tmp, regF tmp2) %{
5533   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
5534   match(Set dst (MulReductionVL src1 src2));
5535   effect(TEMP tmp, TEMP tmp2);
5536   format %{ "vextracti128_high  $tmp,$src2\n\t"
5537             "vpmullq  $tmp2,$tmp,$src2\n\t"
5538             "pshufd   $tmp,$tmp2,0xE\n\t"
5539             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5540             "movdq    $tmp,$src1\n\t"
5541             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5542             "movdq    $dst,$tmp2\t! mul reduction4L" %}
5543   ins_encode %{
5544     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
5545     __ vpmullq($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
5546     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5547     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5548     __ movdq($tmp$$XMMRegister, $src1$$Register);
5549     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5550     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5551   %}
5552   ins_pipe( pipe_slow );
5553 %}
5554 
5555 instruct rvmul8L_reduction_reg(rRegL dst, rRegL src1, vecZ src2, regF tmp, regF tmp2) %{
5556   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
5557   match(Set dst (MulReductionVL src1 src2));
5558   effect(TEMP tmp, TEMP tmp2);
5559   format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
5560             "vpmullq  $tmp2,$tmp2,$src2\n\t"
5561             "vextracti128_high  $tmp,$tmp2\n\t"
5562             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5563             "pshufd   $tmp,$tmp2,0xE\n\t"
5564             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5565             "movdq    $tmp,$src1\n\t"
5566             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5567             "movdq    $dst,$tmp2\t! mul reduction8L" %}
5568   ins_encode %{
5569     __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
5570     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
5571     __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
5572     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5573     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5574     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5575     __ movdq($tmp$$XMMRegister, $src1$$Register);
5576     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5577     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5578   %}
5579   ins_pipe( pipe_slow );
5580 %}
5581 #endif
5582 
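// Float and double multiply reductions mirror the add reductions above, using
// mulss/mulsd (vmulss/vmulsd under AVX) in place of the scalar adds.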
5583 instruct rsmul2F_reduction(regF dst, vecD src2, regF tmp) %{
5584   predicate(UseSSE >= 1 && UseAVX == 0);
5585   match(Set dst (MulReductionVF dst src2));
5586   effect(TEMP dst, TEMP tmp);
5587   format %{ "mulss   $dst,$src2\n\t"
5588             "pshufd  $tmp,$src2,0x01\n\t"
5589             "mulss   $dst,$tmp\t! mul reduction2F" %}
5590   ins_encode %{
5591     __ mulss($dst$$XMMRegister, $src2$$XMMRegister);
5592     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5593     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
5594   %}
5595   ins_pipe( pipe_slow );
5596 %}
5597 
5598 instruct rvmul2F_reduction_reg(regF dst, vecD src2, regF tmp) %{
5599   predicate(UseAVX > 0);
5600   match(Set dst (MulReductionVF dst src2));
5601   effect(TEMP tmp, TEMP dst);
5602   format %{ "vmulss  $dst,$dst,$src2\n\t"
5603             "pshufd  $tmp,$src2,0x01\n\t"
5604             "vmulss  $dst,$dst,$tmp\t! mul reduction2F" %}
5605   ins_encode %{
5606     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5607     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5608     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5609   %}
5610   ins_pipe( pipe_slow );
5611 %}
5612 
5613 instruct rsmul4F_reduction_reg(regF dst, vecX src2, regF tmp) %{
5614   predicate(UseSSE >= 1 && UseAVX == 0);
5615   match(Set dst (MulReductionVF dst src2));
5616   effect(TEMP dst, TEMP tmp);
5617   format %{ "mulss   $dst,$src2\n\t"
5618             "pshufd  $tmp,$src2,0x01\n\t"
5619             "mulss   $dst,$tmp\n\t"
5620             "pshufd  $tmp,$src2,0x02\n\t"
5621             "mulss   $dst,$tmp\n\t"
5622             "pshufd  $tmp,$src2,0x03\n\t"
5623             "mulss   $dst,$tmp\t! mul reduction4F" %}
5624   ins_encode %{
5625     __ mulss($dst$$XMMRegister, $src2$$XMMRegister);
5626     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5627     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
5628     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5629     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
5630     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5631     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
5632   %}
5633   ins_pipe( pipe_slow );
5634 %}
5635 
5636 instruct rvmul4F_reduction_reg(regF dst, vecX src2, regF tmp) %{
5637   predicate(UseAVX > 0);
5638   match(Set dst (MulReductionVF dst src2));
5639   effect(TEMP tmp, TEMP dst);
5640   format %{ "vmulss  $dst,$dst,$src2\n\t"
5641             "pshufd  $tmp,$src2,0x01\n\t"
5642             "vmulss  $dst,$dst,$tmp\n\t"
5643             "pshufd  $tmp,$src2,0x02\n\t"
5644             "vmulss  $dst,$dst,$tmp\n\t"
5645             "pshufd  $tmp,$src2,0x03\n\t"
5646             "vmulss  $dst,$dst,$tmp\t! mul reduction4F" %}
5647   ins_encode %{
5648     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5649     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5650     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5651     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5652     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5653     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5654     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5655   %}
5656   ins_pipe( pipe_slow );
5657 %}
5658 
5659 instruct rvmul8F_reduction_reg(regF dst, vecY src2, regF tmp, regF tmp2) %{
5660   predicate(UseAVX > 0);
5661   match(Set dst (MulReductionVF dst src2));
5662   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5663   format %{ "vmulss  $dst,$dst,$src2\n\t"
5664             "pshufd  $tmp,$src2,0x01\n\t"
5665             "vmulss  $dst,$dst,$tmp\n\t"
5666             "pshufd  $tmp,$src2,0x02\n\t"
5667             "vmulss  $dst,$dst,$tmp\n\t"
5668             "pshufd  $tmp,$src2,0x03\n\t"
5669             "vmulss  $dst,$dst,$tmp\n\t"
5670             "vextractf128_high  $tmp2,$src2\n\t"
5671             "vmulss  $dst,$dst,$tmp2\n\t"
5672             "pshufd  $tmp,$tmp2,0x01\n\t"
5673             "vmulss  $dst,$dst,$tmp\n\t"
5674             "pshufd  $tmp,$tmp2,0x02\n\t"
5675             "vmulss  $dst,$dst,$tmp\n\t"
5676             "pshufd  $tmp,$tmp2,0x03\n\t"
5677             "vmulss  $dst,$dst,$tmp\t! mul reduction8F" %}
5678   ins_encode %{
5679     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5680     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5681     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5682     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5683     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5684     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5685     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5686     __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
5687     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5688     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5689     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5690     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5691     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5692     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5693     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5694   %}
5695   ins_pipe( pipe_slow );
5696 %}
5697 
5698 instruct rvmul16F_reduction_reg(regF dst, vecZ src2, regF tmp, regF tmp2) %{
5699   predicate(UseAVX > 2);
5700   match(Set dst (MulReductionVF dst src2));
5701   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5702   format %{ "vmulss  $dst,$dst,$src2\n\t"
5703             "pshufd  $tmp,$src2,0x01\n\t"
5704             "vmulss  $dst,$dst,$tmp\n\t"
5705             "pshufd  $tmp,$src2,0x02\n\t"
5706             "vmulss  $dst,$dst,$tmp\n\t"
5707             "pshufd  $tmp,$src2,0x03\n\t"
5708             "vmulss  $dst,$dst,$tmp\n\t"
5709             "vextractf32x4  $tmp2,$src2,0x1\n\t"
5710             "vmulss  $dst,$dst,$tmp2\n\t"
5711             "pshufd  $tmp,$tmp2,0x01\n\t"
5712             "vmulss  $dst,$dst,$tmp\n\t"
5713             "pshufd  $tmp,$tmp2,0x02\n\t"
5714             "vmulss  $dst,$dst,$tmp\n\t"
5715             "pshufd  $tmp,$tmp2,0x03\n\t"
5716             "vmulss  $dst,$dst,$tmp\n\t"
5717             "vextractf32x4  $tmp2,$src2,0x2\n\t"
5718             "vmulss  $dst,$dst,$tmp2\n\t"
5719             "pshufd  $tmp,$tmp2,0x01\n\t"
5720             "vmulss  $dst,$dst,$tmp\n\t"
5721             "pshufd  $tmp,$tmp2,0x02\n\t"
5722             "vmulss  $dst,$dst,$tmp\n\t"
5723             "pshufd  $tmp,$tmp2,0x03\n\t"
5724             "vmulss  $dst,$dst,$tmp\n\t"
5725             "vextractf32x4  $tmp2,$src2,0x3\n\t"
5726             "vmulss  $dst,$dst,$tmp2\n\t"
5727             "pshufd  $tmp,$tmp2,0x01\n\t"
5728             "vmulss  $dst,$dst,$tmp\n\t"
5729             "pshufd  $tmp,$tmp2,0x02\n\t"
5730             "vmulss  $dst,$dst,$tmp\n\t"
5731             "pshufd  $tmp,$tmp2,0x03\n\t"
5732             "vmulss  $dst,$dst,$tmp\t! mul reduction16F" %}
5733   ins_encode %{
5734     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5735     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5736     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5737     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5738     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5739     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5740     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5741     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5742     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5743     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5744     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5745     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5746     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5747     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5748     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5749     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5750     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5751     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5752     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5753     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5754     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5755     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5756     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5757     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5758     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5759     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5760     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5761     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5762     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5763     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5764     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5765   %}
5766   ins_pipe( pipe_slow );
5767 %}
5768 
5769 instruct rsmul2D_reduction_reg(regD dst, vecX src2, regD tmp) %{
5770   predicate(UseSSE >= 1 && UseAVX == 0);
5771   match(Set dst (MulReductionVD dst src2));
5772   effect(TEMP dst, TEMP tmp);
5773   format %{ "mulsd   $dst,$src2\n\t"
5774             "pshufd  $tmp,$src2,0xE\n\t"
5775             "mulsd   $dst,$tmp\t! mul reduction2D" %}
5776   ins_encode %{
5777     __ mulsd($dst$$XMMRegister, $src2$$XMMRegister);
5778     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5779     __ mulsd($dst$$XMMRegister, $tmp$$XMMRegister);
5780   %}
5781   ins_pipe( pipe_slow );
5782 %}
5783 
5784 instruct rvmul2D_reduction_reg(regD dst, vecX src2, regD tmp) %{
5785   predicate(UseAVX > 0);
5786   match(Set dst (MulReductionVD dst src2));
5787   effect(TEMP tmp, TEMP dst);
5788   format %{ "vmulsd  $dst,$dst,$src2\n\t"
5789             "pshufd  $tmp,$src2,0xE\n\t"
5790             "vmulsd  $dst,$dst,$tmp\t! mul reduction2D" %}
5791   ins_encode %{
5792     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5793     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5794     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5795   %}
5796   ins_pipe( pipe_slow );
5797 %}
5798 
5799 instruct rvmul4D_reduction_reg(regD dst, vecY src2, regD tmp, regD tmp2) %{
5800   predicate(UseAVX > 0);
5801   match(Set dst (MulReductionVD dst src2));
5802   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5803   format %{ "vmulsd  $dst,$dst,$src2\n\t"
5804             "pshufd  $tmp,$src2,0xE\n\t"
5805             "vmulsd  $dst,$dst,$tmp\n\t"
5806             "vextractf128_high  $tmp2,$src2\n\t"
5807             "vmulsd  $dst,$dst,$tmp2\n\t"
5808             "pshufd  $tmp,$tmp2,0xE\n\t"
5809             "vmulsd  $dst,$dst,$tmp\t! mul reduction4D" %}
5810   ins_encode %{
5811     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5812     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5813     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5814     __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
5815     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5816     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5817     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5818   %}
5819   ins_pipe( pipe_slow );
5820 %}
5821 
5822 instruct rvmul8D_reduction_reg(regD dst, vecZ src2, regD tmp, regD tmp2) %{
5823   predicate(UseAVX > 2);
5824   match(Set dst (MulReductionVD dst src2));
5825   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5826   format %{ "vmulsd  $dst,$dst,$src2\n\t"
5827             "pshufd  $tmp,$src2,0xE\n\t"
5828             "vmulsd  $dst,$dst,$tmp\n\t"
5829             "vextractf32x4  $tmp2,$src2,0x1\n\t"
5830             "vmulsd  $dst,$dst,$tmp2\n\t"
5831             "pshufd  $tmp,$tmp2,0xE\n\t"
5832             "vmulsd  $dst,$dst,$tmp\n\t"
5833             "vextractf32x4  $tmp2,$src2,0x2\n\t"
5834             "vmulsd  $dst,$dst,$tmp2\n\t"
5835             "pshufd  $tmp,$tmp2,0xE\n\t"
5836             "vmulsd  $dst,$dst,$tmp\n\t"
5837             "vextractf32x4  $tmp2,$src2,0x3\n\t"
5838             "vmulsd  $dst,$dst,$tmp2\n\t"
5839             "pshufd  $tmp,$tmp2,0xE\n\t"
5840             "vmulsd  $dst,$dst,$tmp\t! mul reduction8D" %}
5841   ins_encode %{
5842     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5843     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5844     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5845     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5846     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5847     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5848     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5849     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5850     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5851     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5852     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5853     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5854     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5855     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5856     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5857   %}
5858   ins_pipe( pipe_slow );
5859 %}
5860 
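// AND reductions: lanes are folded with pand/vpand (vpandq for the 512-bit long
// case) and the scalar src1 is folded in last.  Roughly equivalent scalar form
// (illustrative sketch only, not emitted code):
//   acc = src1;
//   for (int i = 0; i < n; i++) acc &= src2[i];
//   dst = acc;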
5861 instruct rsand2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
5862   predicate(UseSSE > 1 && n->in(1)->bottom_type()->basic_type() == T_INT);
5863   match(Set dst (AndReductionV src1 src2));
5864   effect(TEMP tmp, TEMP tmp2);
5865   format %{ "pshufd  $tmp2,$src2,0x1\n\t"
5866             "pand    $tmp2,$src2\n\t"
5867             "movd    $tmp,$src1\n\t"
5868             "pand    $tmp2,$tmp\n\t"
5869             "movd    $dst,$tmp2\t! and reduction2I" %}
5870   ins_encode %{
5871     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5872     __ pand($tmp2$$XMMRegister, $src2$$XMMRegister);
5873     __ movdl($tmp$$XMMRegister, $src1$$Register);
5874     __ pand($tmp2$$XMMRegister, $tmp$$XMMRegister);
5875     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5876   %}
5877   ins_pipe( pipe_slow );
5878 %}
5879 
5880 instruct rsand4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
5881   predicate(UseSSE > 1 && n->in(1)->bottom_type()->basic_type() == T_INT);
5882   match(Set dst (AndReductionV src1 src2));
5883   effect(TEMP tmp, TEMP tmp2);
5884   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
5885             "pand    $tmp2,$src2\n\t"
5886             "pshufd  $tmp,$tmp2,0x1\n\t"
5887             "pand    $tmp2,$tmp\n\t"
5888             "movd    $tmp,$src1\n\t"
5889             "pand    $tmp2,$tmp\n\t"
5890             "movd    $dst,$tmp2\t! and reduction4I" %}
5891   ins_encode %{
5892     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5893     __ pand($tmp2$$XMMRegister, $src2$$XMMRegister);
5894     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x1);
5895     __ pand($tmp2$$XMMRegister, $tmp$$XMMRegister);
5896     __ movdl($tmp$$XMMRegister, $src1$$Register);
5897     __ pand($tmp2$$XMMRegister, $tmp$$XMMRegister);
5898     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5899   %}
5900   ins_pipe( pipe_slow );
5901 %}
5902 
5903 instruct rvand8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
5904   predicate(UseAVX > 0 && n->in(1)->bottom_type()->basic_type() == T_INT);
5905   match(Set dst (AndReductionV src1 src2));
5906   effect(TEMP tmp, TEMP tmp2);
5907   format %{ "vextracti128_high  $tmp,$src2\n\t"
5908             "vpand    $tmp,$tmp,$src2\n\t"
5909             "vpshufd   $tmp2,$tmp,0xE\n\t"
5910             "vpand    $tmp,$tmp,$tmp2\n\t"
5911             "vpshufd   $tmp2,$tmp,0x1\n\t"
5912             "vpand    $tmp,$tmp,$tmp2\n\t"
5913             "movd     $tmp2,$src1\n\t"
5914             "vpand    $tmp2,$tmp,$tmp2\n\t"
5915             "movd     $dst,$tmp2\t! and reduction8I" %}
5916   ins_encode %{
5917     int vector_len = 0;
5918     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
5919     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
5920     __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE, vector_len);
5921     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5922     __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1, vector_len);
5923     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5924     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5925     __ vpand($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5926     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5927   %}
5928   ins_pipe( pipe_slow );
5929 %}
5930 
5931 instruct rvand16I_reduction_reg(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, regF tmp3) %{
5932   predicate(UseAVX > 2 && n->in(1)->bottom_type()->basic_type() == T_INT);
5933   match(Set dst (AndReductionV src1 src2));
5934   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
5935   format %{ "vextracti64x4_high  $tmp3,$src2\n\t"
5936             "vpand  $tmp3,$tmp3,$src2\n\t"
5937             "vextracti128_high  $tmp,$tmp3\n\t"
5938             "vpand    $tmp,$tmp,$tmp3\n\t"
5939             "vpshufd   $tmp2,$tmp,0xE\n\t"
5940             "vpand    $tmp,$tmp,$tmp2\n\t"
5941             "vpshufd   $tmp2,$tmp,0x1\n\t"
5942             "vpand    $tmp,$tmp,$tmp2\n\t"
5943             "movd     $tmp2,$src1\n\t"
5944             "vpand    $tmp2,$tmp,$tmp2\n\t"
5945             "movd     $dst,$tmp2\t! and reduction16I" %}
5946   ins_encode %{
5947     __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
5948     __ vpand($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
5949     __ vextracti128_high($tmp$$XMMRegister, $tmp3$$XMMRegister);
5950     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
5951     __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE, 0);
5952     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5953     __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1, 0);
5954     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5955     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5956     __ vpand($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5957     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5958   %}
5959   ins_pipe( pipe_slow );
5960 %}
5961 
#ifdef _LP64
5962 instruct rsand2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, regF tmp, regF tmp2) %{
5963   predicate(UseSSE >= 2 && n->in(1)->bottom_type()->basic_type() == T_LONG);
5964   match(Set dst (AndReductionV src1 src2));
5965   effect(TEMP tmp, TEMP tmp2);
5966   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
5967             "pand    $tmp2,$src2\n\t"
5968             "movdq   $tmp,$src1\n\t"
5969             "pand    $tmp2,$tmp\n\t"
5970             "movq    $dst,$tmp2\t! and reduction2L" %}
5971   ins_encode %{
5972     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5973     __ pand($tmp2$$XMMRegister, $src2$$XMMRegister);
5974     __ movdq($tmp$$XMMRegister, $src1$$Register);
5975     __ pand($tmp2$$XMMRegister, $tmp$$XMMRegister);
5976     __ movq($dst$$Register, $tmp2$$XMMRegister);
5977   %}
5978   ins_pipe( pipe_slow );
5979 %}
5980 
5981 instruct rvand4L_reduction_reg_avx(rRegL dst, rRegL src1, vecY src2, regF tmp, regF tmp2) %{
5982   predicate(UseAVX > 0 && n->in(1)->bottom_type()->basic_type() == T_LONG);
5983   match(Set dst (AndReductionV src1 src2));
5984   effect(TEMP tmp, TEMP tmp2);
5985   format %{ "vextracti128_high  $tmp,$src2\n\t"
5986             "vpand  $tmp2,$tmp,$src2\n\t"
5987             "vpshufd  $tmp,$tmp2,0xE\n\t"
5988             "vpand  $tmp2,$tmp2,$tmp\n\t"
5989             "movq   $tmp,$src1\n\t"
5990             "vpand  $tmp2,$tmp2,$tmp\n\t"
5991             "movq   $dst,$tmp2\t! and reduction4L" %}
5992   ins_encode %{
5993     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
5994     __ vpand($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
5995     __ vpshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE, 0);
5996     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5997     __ movq($tmp$$XMMRegister, $src1$$Register);
5998     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5999     __ movq($dst$$Register, $tmp2$$XMMRegister);
6000   %}
6001   ins_pipe( pipe_slow );
6002 %}
6003 
6005 instruct rvand8L_reduction_reg(rRegL dst, rRegL src1, vecZ src2, regF tmp, regF tmp2) %{
6006   predicate(UseAVX > 2 && n->in(1)->bottom_type()->basic_type() == T_LONG);
6007   match(Set dst (AndReductionV src1 src2));
6008   effect(TEMP tmp, TEMP tmp2);
6009   format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
6010             "vpandq  $tmp2,$tmp2,$src2\n\t"
6011             "vextracti128_high  $tmp,$tmp2\n\t"
6012             "vpandq  $tmp2,$tmp2,$tmp\n\t"
6013             "vpshufd  $tmp,$tmp2,0xE\n\t"
6014             "vpandq  $tmp2,$tmp2,$tmp\n\t"
6015             "movdq   $tmp,$src1\n\t"
6016             "vpandq  $tmp2,$tmp2,$tmp\n\t"
6017             "movdq   $dst,$tmp2\t! and reduction8L" %}
6018   ins_encode %{
6019     __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
6020     __ vpandq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
6021     __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
6022     __ vpandq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
6023     __ vpshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE, 0);
6024     __ vpandq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
6025     __ movdq($tmp$$XMMRegister, $src1$$Register);
6026     __ vpandq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
6027     __ movdq($dst$$Register, $tmp2$$XMMRegister);
6028   %}
6029   ins_pipe( pipe_slow );
6030 %}
6031 #endif
6032 
6033 // ====================VECTOR ARITHMETIC=======================================
6034 
6035 // --------------------------------- ADD --------------------------------------
6036 
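// Most packed-add operations below come in several variants selected by
// predicate: a two-operand SSE form (UseAVX == 0), a three-operand *_reg_avx /
// *_mem_avx form for AVX-only CPUs, a *_evex form for AVX-512 with BW, and a
// *_evex_special form for AVX-512 CPUs without BW (byte/short vector
// instructions presumably cannot use the EVEX encoding there, so the rule keeps
// the destructive dst-is-also-source shape).  The *_mem forms fold a vector
// load into the instruction.
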
6037 // Bytes vector add
6038 instruct vadd4B(vecS dst, vecS src) %{
6039   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
6040   match(Set dst (AddVB dst src));
6041   format %{ "paddb   $dst,$src\t! add packed4B" %}
6042   ins_encode %{
6043     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
6044   %}
6045   ins_pipe( pipe_slow );
6046 %}
6047 
6048 instruct vadd4B_reg_avx(vecS dst, vecS src1, vecS src2) %{
6049   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
6050   match(Set dst (AddVB src1 src2));
6051   format %{ "vpaddb  $dst,$src1,$src2\t! add packed4B" %}
6052   ins_encode %{
6053     int vector_len = 0;
6054     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6055   %}
6056   ins_pipe( pipe_slow );
6057 %}
6058 
6059 instruct vadd4B_reg_evex(vecS dst, vecS src1, vecS src2) %{
6060   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
6061   match(Set dst (AddVB src1 src2));
6062   format %{ "vpaddb  $dst,$src1,$src2\t! add packed4B" %}
6063   ins_encode %{
6064     int vector_len = 0;
6065     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6066   %}
6067   ins_pipe( pipe_slow );
6068 %}
6069 
6070 instruct vadd4B_reg_evex_special(vecS dst, vecS src1, vecS src2) %{
6071   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
6072   match(Set dst (AddVB dst src2));
6073   effect(TEMP src1);
6074   format %{ "vpaddb  $dst,$dst,$src2\t! add packed4B" %}
6075   ins_encode %{
6076     int vector_len = 0;
6077     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6078   %}
6079   ins_pipe( pipe_slow );
6080 %}
6081 
6082 instruct vadd4B_mem_avx(vecS dst, vecS src, memory mem) %{
6083   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
6084   match(Set dst (AddVB src (LoadVector mem)));
6085   format %{ "vpaddb  $dst,$src,$mem\t! add packed4B" %}
6086   ins_encode %{
6087     int vector_len = 0;
6088     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6089   %}
6090   ins_pipe( pipe_slow );
6091 %}
6092 
6093 instruct vadd4B_mem_evex(vecS dst, vecS src, memory mem) %{
6094   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
6095   match(Set dst (AddVB src (LoadVector mem)));
6096   format %{ "vpaddb  $dst,$src,$mem\t! add packed4B" %}
6097   ins_encode %{
6098     int vector_len = 0;
6099     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6100   %}
6101   ins_pipe( pipe_slow );
6102 %}
6103 
6104 instruct vadd4B_mem_evex_special(vecS dst, vecS src, memory mem) %{
6105   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
6106   match(Set dst (AddVB dst (LoadVector mem)));
6107   effect(TEMP src);
6108   format %{ "vpaddb  $dst,$src,$mem\t! add packed4B" %}
6109   ins_encode %{
6110     int vector_len = 0;
6111     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6112   %}
6113   ins_pipe( pipe_slow );
6114 %}
6115 
6116 instruct vadd8B(vecD dst, vecD src) %{
6117   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
6118   match(Set dst (AddVB dst src));
6119   format %{ "paddb   $dst,$src\t! add packed8B" %}
6120   ins_encode %{
6121     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
6122   %}
6123   ins_pipe( pipe_slow );
6124 %}
6125 
6126 instruct vadd8B_reg_avx(vecD dst, vecD src1, vecD src2) %{
6127   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
6128   match(Set dst (AddVB src1 src2));
6129   format %{ "vpaddb  $dst,$src1,$src2\t! add packed8B" %}
6130   ins_encode %{
6131     int vector_len = 0;
6132     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6133   %}
6134   ins_pipe( pipe_slow );
6135 %}
6136 
6137 instruct vadd8B_reg_evex(vecD dst, vecD src1, vecD src2) %{
6138   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
6139   match(Set dst (AddVB src1 src2));
6140   format %{ "vpaddb  $dst,$src1,$src2\t! add packed8B" %}
6141   ins_encode %{
6142     int vector_len = 0;
6143     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6144   %}
6145   ins_pipe( pipe_slow );
6146 %}
6147 
6148 instruct vadd8B_reg_evex_special(vecD dst, vecD src1, vecD src2) %{
6149   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
6150   match(Set dst (AddVB dst src2));
6151   effect(TEMP src1);
6152   format %{ "vpaddb  $dst,$dst,$src2\t! add packed8B" %}
6153   ins_encode %{
6154     int vector_len = 0;
6155     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6156   %}
6157   ins_pipe( pipe_slow );
6158 %}
6159 
6160 instruct vadd8B_mem_avx(vecD dst, vecD src, memory mem) %{
6161   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
6162   match(Set dst (AddVB src (LoadVector mem)));
6163   format %{ "vpaddb  $dst,$src,$mem\t! add packed8B" %}
6164   ins_encode %{
6165     int vector_len = 0;
6166     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6167   %}
6168   ins_pipe( pipe_slow );
6169 %}
6170 
6171 instruct vadd8B_mem_evex(vecD dst, vecD src, memory mem) %{
6172   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
6173   match(Set dst (AddVB src (LoadVector mem)));
6174   format %{ "vpaddb  $dst,$src,$mem\t! add packed8B" %}
6175   ins_encode %{
6176     int vector_len = 0;
6177     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6178   %}
6179   ins_pipe( pipe_slow );
6180 %}
6181 
6182 instruct vadd8B_mem_evex_special(vecD dst, vecD src, memory mem) %{
6183   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
6184   match(Set dst (AddVB dst (LoadVector mem)));
6185   effect(TEMP src);
6186   format %{ "vpaddb  $dst,$src,$mem\t! add packed8B" %}
6187   ins_encode %{
6188     int vector_len = 0;
6189     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6190   %}
6191   ins_pipe( pipe_slow );
6192 %}
6193 
6194 instruct vadd16B(vecX dst, vecX src) %{
6195   predicate(UseAVX == 0 && n->as_Vector()->length() == 16);
6196   match(Set dst (AddVB dst src));
6197   format %{ "paddb   $dst,$src\t! add packed16B" %}
6198   ins_encode %{
6199     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
6200   %}
6201   ins_pipe( pipe_slow );
6202 %}
6203 
6204 instruct vadd16B_reg_avx(vecX dst, vecX src1, vecX src2) %{
6205   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 16);
6206   match(Set dst (AddVB src1 src2));
6207   format %{ "vpaddb  $dst,$src1,$src2\t! add packed16B" %}
6208   ins_encode %{
6209     int vector_len = 0;
6210     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6211   %}
6212   ins_pipe( pipe_slow );
6213 %}
6214 
6215 instruct vadd16B_reg_evex(vecX dst, vecX src1, vecX src2) %{
6216   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
6217   match(Set dst (AddVB src1 src2));
6218   format %{ "vpaddb  $dst,$src1,$src2\t! add packed16B" %}
6219   ins_encode %{
6220     int vector_len = 0;
6221     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6222   %}
6223   ins_pipe( pipe_slow );
6224 %}
6225 
6226 instruct vadd16B_reg_evex_special(vecX dst, vecX src1, vecX src2) %{
6227   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
6228   match(Set dst (AddVB dst src2));
6229   effect(TEMP src1);
6230   format %{ "vpaddb  $dst,$dst,$src2\t! add packed16B" %}
6231   ins_encode %{
6232     int vector_len = 0;
6233     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6234   %}
6235   ins_pipe( pipe_slow );
6236 %}
6237 
6238 instruct vadd16B_mem_avx(vecX dst, vecX src, memory mem) %{
6239   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 16);
6240   match(Set dst (AddVB src (LoadVector mem)));
6241   format %{ "vpaddb  $dst,$src,$mem\t! add packed16B" %}
6242   ins_encode %{
6243     int vector_len = 0;
6244     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6245   %}
6246   ins_pipe( pipe_slow );
6247 %}
6248 
6249 instruct vadd16B_mem_evex(vecX dst, vecX src, memory mem) %{
6250   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
6251   match(Set dst (AddVB src (LoadVector mem)));
6252   format %{ "vpaddb  $dst,$src,$mem\t! add packed16B" %}
6253   ins_encode %{
6254     int vector_len = 0;
6255     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6256   %}
6257   ins_pipe( pipe_slow );
6258 %}
6259 
6260 instruct vadd16B_mem_evex_special(vecX dst, vecX src, memory mem) %{
6261   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
6262   match(Set dst (AddVB dst (LoadVector mem)));
6263   effect(TEMP src);
6264   format %{ "vpaddb  $dst,$src,$mem\t! add packed16B" %}
6265   ins_encode %{
6266     int vector_len = 0;
6267     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6268   %}
6269   ins_pipe( pipe_slow );
6270 %}
6271 
6272 instruct vadd32B_reg_avx(vecY dst, vecY src1, vecY src2) %{
6273   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 32);
6274   match(Set dst (AddVB src1 src2));
6275   format %{ "vpaddb  $dst,$src1,$src2\t! add packed32B" %}
6276   ins_encode %{
6277     int vector_len = 1;
6278     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6279   %}
6280   ins_pipe( pipe_slow );
6281 %}
6282 
6283 instruct vadd32B_reg_evex(vecY dst, vecY src1, vecY src2) %{
6284   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
6285   match(Set dst (AddVB src1 src2));
6286   format %{ "vpaddb  $dst,$src1,$src2\t! add packed32B" %}
6287   ins_encode %{
6288     int vector_len = 1;
6289     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6290   %}
6291   ins_pipe( pipe_slow );
6292 %}
6293 
6294 instruct vadd32B_reg_evex_special(vecY dst, vecY src1, vecY src2) %{
6295   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 32);
6296   match(Set dst (AddVB dst src2));
6297   effect(TEMP src1);
6298   format %{ "vpaddb  $dst,$dst,$src2\t! add packed32B" %}
6299   ins_encode %{
6300     int vector_len = 1;
6301     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6302   %}
6303   ins_pipe( pipe_slow );
6304 %}
6305 
6306 instruct vadd32B_mem_avx(vecY dst, vecY src, memory mem) %{
6307   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 32);
6308   match(Set dst (AddVB src (LoadVector mem)));
6309   format %{ "vpaddb  $dst,$src,$mem\t! add packed32B" %}
6310   ins_encode %{
6311     int vector_len = 1;
6312     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6313   %}
6314   ins_pipe( pipe_slow );
6315 %}
6316 
6317 instruct vadd32B_mem_evex(vecY dst, vecY src, memory mem) %{
6318   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
6319   match(Set dst (AddVB src (LoadVector mem)));
6320   format %{ "vpaddb  $dst,$src,$mem\t! add packed32B" %}
6321   ins_encode %{
6322     int vector_len = 1;
6323     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6324   %}
6325   ins_pipe( pipe_slow );
6326 %}
6327 
6328 instruct vadd32B_mem_evex_special(vecY dst, vecY src, memory mem) %{
6329   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 32);
6330   match(Set dst (AddVB dst (LoadVector mem)));
6331   effect(TEMP src);
6332   format %{ "vpaddb  $dst,$src,$mem\t! add packed32B" %}
6333   ins_encode %{
6334     int vector_len = 1;
6335     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6336   %}
6337   ins_pipe( pipe_slow );
6338 %}
6339 
6340 instruct vadd64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
6341   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
6342   match(Set dst (AddVB src1 src2));
6343   format %{ "vpaddb  $dst,$src1,$src2\t! add packed64B" %}
6344   ins_encode %{
6345     int vector_len = 2;
6346     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6347   %}
6348   ins_pipe( pipe_slow );
6349 %}
6350 
6351 instruct vadd64B_mem(vecZ dst, vecZ src, memory mem) %{
6352   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
6353   match(Set dst (AddVB src (LoadVector mem)));
6354   format %{ "vpaddb  $dst,$src,$mem\t! add packed64B" %}
6355   ins_encode %{
6356     int vector_len = 2;
6357     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6358   %}
6359   ins_pipe( pipe_slow );
6360 %}
6361 
6362 // Shorts/Chars vector add
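// (Same SSE / AVX / EVEX-BW / EVEX-no-BW rule split as the byte adds above,
//  using paddw/vpaddw on 16-bit lanes.)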
6363 instruct vadd2S(vecS dst, vecS src) %{
6364   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
6365   match(Set dst (AddVS dst src));
6366   format %{ "paddw   $dst,$src\t! add packed2S" %}
6367   ins_encode %{
6368     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
6369   %}
6370   ins_pipe( pipe_slow );
6371 %}
6372 
6373 instruct vadd2S_reg_avx(vecS dst, vecS src1, vecS src2) %{
6374   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
6375   match(Set dst (AddVS src1 src2));
6376   format %{ "vpaddw  $dst,$src1,$src2\t! add packed2S" %}
6377   ins_encode %{
6378     int vector_len = 0;
6379     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6380   %}
6381   ins_pipe( pipe_slow );
6382 %}
6383 
6384 instruct vadd2S_reg_evex(vecS dst, vecS src1, vecS src2) %{
6385   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
6386   match(Set dst (AddVS src1 src2));
6387   format %{ "vpaddw  $dst,$src1,$src2\t! add packed2S" %}
6388   ins_encode %{
6389     int vector_len = 0;
6390     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6391   %}
6392   ins_pipe( pipe_slow );
6393 %}
6394 
6395 instruct vadd2S_reg_evex_special(vecS dst, vecS src1, vecS src2) %{
6396   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
6397   match(Set dst (AddVS dst src2));
6398   effect(TEMP src1);
6399   format %{ "vpaddw  $dst,$dst,$src2\t! add packed2S" %}
6400   ins_encode %{
6401     int vector_len = 0;
6402     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6403   %}
6404   ins_pipe( pipe_slow );
6405 %}
6406 
6407 instruct vadd2S_mem_avx(vecS dst, vecS src, memory mem) %{
6408   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
6409   match(Set dst (AddVS src (LoadVector mem)));
6410   format %{ "vpaddw  $dst,$src,$mem\t! add packed2S" %}
6411   ins_encode %{
6412     int vector_len = 0;
6413     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6414   %}
6415   ins_pipe( pipe_slow );
6416 %}
6417 
6418 instruct vadd2S_mem_evex(vecS dst, vecS src, memory mem) %{
6419   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
6420   match(Set dst (AddVS src (LoadVector mem)));
6421   format %{ "vpaddw  $dst,$src,$mem\t! add packed2S" %}
6422   ins_encode %{
6423     int vector_len = 0;
6424     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6425   %}
6426   ins_pipe( pipe_slow );
6427 %}
6428 
6429 instruct vadd2S_mem_evex_special(vecS dst, vecS src, memory mem) %{
6430   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
6431   match(Set dst (AddVS dst (LoadVector mem)));
6432   effect(TEMP src);
6433   format %{ "vpaddw  $dst,$src,$mem\t! add packed2S" %}
6434   ins_encode %{
6435     int vector_len = 0;
6436     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6437   %}
6438   ins_pipe( pipe_slow );
6439 %}
6440 
6441 instruct vadd4S(vecD dst, vecD src) %{
6442   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
6443   match(Set dst (AddVS dst src));
6444   format %{ "paddw   $dst,$src\t! add packed4S" %}
6445   ins_encode %{
6446     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
6447   %}
6448   ins_pipe( pipe_slow );
6449 %}
6450 
6451 instruct vadd4S_reg_avx(vecD dst, vecD src1, vecD src2) %{
6452   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
6453   match(Set dst (AddVS src1 src2));
6454   format %{ "vpaddw  $dst,$src1,$src2\t! add packed4S" %}
6455   ins_encode %{
6456     int vector_len = 0;
6457     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6458   %}
6459   ins_pipe( pipe_slow );
6460 %}
6461 
6462 instruct vadd4S_reg_evex(vecD dst, vecD src1, vecD src2) %{
6463   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
6464   match(Set dst (AddVS src1 src2));
6465   format %{ "vpaddw  $dst,$src1,$src2\t! add packed4S" %}
6466   ins_encode %{
6467     int vector_len = 0;
6468     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6469   %}
6470   ins_pipe( pipe_slow );
6471 %}
6472 
6473 instruct vadd4S_reg_evex_special(vecD dst, vecD src1, vecD src2) %{
6474   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
6475   match(Set dst (AddVS dst src2));
6476   effect(TEMP src1);
6477   format %{ "vpaddw  $dst,$dst,$src2\t! add packed4S" %}
6478   ins_encode %{
6479     int vector_len = 0;
6480     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6481   %}
6482   ins_pipe( pipe_slow );
6483 %}
6484 
6485 instruct vadd4S_mem_avx(vecD dst, vecD src, memory mem) %{
6486   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
6487   match(Set dst (AddVS src (LoadVector mem)));
6488   format %{ "vpaddw  $dst,$src,$mem\t! add packed4S" %}
6489   ins_encode %{
6490     int vector_len = 0;
6491     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6492   %}
6493   ins_pipe( pipe_slow );
6494 %}
6495 
6496 instruct vadd4S_mem_evex(vecD dst, vecD src, memory mem) %{
6497   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
6498   match(Set dst (AddVS src (LoadVector mem)));
6499   format %{ "vpaddw  $dst,$src,$mem\t! add packed4S" %}
6500   ins_encode %{
6501     int vector_len = 0;
6502     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6503   %}
6504   ins_pipe( pipe_slow );
6505 %}
6506 
6507 instruct vadd4S_mem_evex_special(vecD dst, vecD src, memory mem) %{
6508   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
6509   match(Set dst (AddVS dst (LoadVector mem)));
6510   effect(TEMP src);
6511   format %{ "vpaddw  $dst,$src,$mem\t! add packed4S" %}
6512   ins_encode %{
6513     int vector_len = 0;
6514     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6515   %}
6516   ins_pipe( pipe_slow );
6517 %}
6518 
6519 instruct vadd8S(vecX dst, vecX src) %{
6520   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
6521   match(Set dst (AddVS dst src));
6522   format %{ "paddw   $dst,$src\t! add packed8S" %}
6523   ins_encode %{
6524     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
6525   %}
6526   ins_pipe( pipe_slow );
6527 %}
6528 
6529 instruct vadd8S_reg_avx(vecX dst, vecX src1, vecX src2) %{
6530   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
6531   match(Set dst (AddVS src1 src2));
6532   format %{ "vpaddw  $dst,$src1,$src2\t! add packed8S" %}
6533   ins_encode %{
6534     int vector_len = 0;
6535     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6536   %}
6537   ins_pipe( pipe_slow );
6538 %}
6539 
6540 instruct vadd8S_reg_evex(vecX dst, vecX src1, vecX src2) %{
6541   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
6542   match(Set dst (AddVS src1 src2));
6543   format %{ "vpaddw  $dst,$src1,$src2\t! add packed8S" %}
6544   ins_encode %{
6545     int vector_len = 0;
6546     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6547   %}
6548   ins_pipe( pipe_slow );
6549 %}
6550 
6551 instruct vadd8S_reg_evex_special(vecX dst, vecX src1, vecX src2) %{
6552   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
6553   match(Set dst (AddVS dst src2));
6554   effect(TEMP src1);
6555   format %{ "vpaddw  $dst,$dst,$src2\t! add packed8S" %}
6556   ins_encode %{
6557     int vector_len = 0;
6558     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6559   %}
6560   ins_pipe( pipe_slow );
6561 %}
6562 
6563 instruct vadd8S_mem_avx(vecX dst, vecX src, memory mem) %{
6564   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
6565   match(Set dst (AddVS src (LoadVector mem)));
6566   format %{ "vpaddw  $dst,$src,$mem\t! add packed8S" %}
6567   ins_encode %{
6568     int vector_len = 0;
6569     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6570   %}
6571   ins_pipe( pipe_slow );
6572 %}
6573 
6574 instruct vadd8S_mem_evex(vecX dst, vecX src, memory mem) %{
6575   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
6576   match(Set dst (AddVS src (LoadVector mem)));
6577   format %{ "vpaddw  $dst,$src,$mem\t! add packed8S" %}
6578   ins_encode %{
6579     int vector_len = 0;
6580     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6581   %}
6582   ins_pipe( pipe_slow );
6583 %}
6584 
6585 instruct vadd8S_mem_evex_special(vecX dst, vecX src, memory mem) %{
6586   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
6587   match(Set dst (AddVS dst (LoadVector mem)));
6588   effect(TEMP src);
6589   format %{ "vpaddw  $dst,$src,$mem\t! add packed8S" %}
6590   ins_encode %{
6591     int vector_len = 0;
6592     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6593   %}
6594   ins_pipe( pipe_slow );
6595 %}
6596 
6597 instruct vadd16S_reg_avx(vecY dst, vecY src1, vecY src2) %{
6598   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
6599   match(Set dst (AddVS src1 src2));
6600   format %{ "vpaddw  $dst,$src1,$src2\t! add packed16S" %}
6601   ins_encode %{
6602     int vector_len = 1;
6603     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6604   %}
6605   ins_pipe( pipe_slow );
6606 %}
6607 
6608 instruct vadd16S_reg_evex(vecY dst, vecY src1, vecY src2) %{
6609   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
6610   match(Set dst (AddVS src1 src2));
6611   format %{ "vpaddw  $dst,$src1,$src2\t! add packed16S" %}
6612   ins_encode %{
6613     int vector_len = 1;
6614     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6615   %}
6616   ins_pipe( pipe_slow );
6617 %}
6618 
6619 instruct vadd16S_reg_evex_special(vecY dst, vecY src1, vecY src2) %{
6620   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
6621   match(Set dst (AddVS dst src2));
6622   effect(TEMP src1);
6623   format %{ "vpaddw  $dst,$dst,$src2\t! add packed16S" %}
6624   ins_encode %{
6625     int vector_len = 1;
6626     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6627   %}
6628   ins_pipe( pipe_slow );
6629 %}
6630 
6631 instruct vadd16S_mem_avx(vecY dst, vecY src, memory mem) %{
6632   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
6633   match(Set dst (AddVS src (LoadVector mem)));
6634   format %{ "vpaddw  $dst,$src,$mem\t! add packed16S" %}
6635   ins_encode %{
6636     int vector_len = 1;
6637     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6638   %}
6639   ins_pipe( pipe_slow );
6640 %}
6641 
6642 instruct vadd16S_mem_evex(vecY dst, vecY src, memory mem) %{
6643   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
6644   match(Set dst (AddVS src (LoadVector mem)));
6645   format %{ "vpaddw  $dst,$src,$mem\t! add packed16S" %}
6646   ins_encode %{
6647     int vector_len = 1;
6648     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6649   %}
6650   ins_pipe( pipe_slow );
6651 %}
6652 
6653 instruct vadd16S_mem_evex_special(vecY dst, vecY src, memory mem) %{
6654   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
6655   match(Set dst (AddVS dst (LoadVector mem)));
6656   effect(TEMP src);
6657   format %{ "vpaddw  $dst,$src,$mem\t! add packed16S" %}
6658   ins_encode %{
6659     int vector_len = 1;
6660     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6661   %}
6662   ins_pipe( pipe_slow );
6663 %}
6664 
6665 instruct vadd32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
6666   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
6667   match(Set dst (AddVS src1 src2));
6668   format %{ "vpaddw  $dst,$src1,$src2\t! add packed32S" %}
6669   ins_encode %{
6670     int vector_len = 2;
6671     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6672   %}
6673   ins_pipe( pipe_slow );
6674 %}
6675 
6676 instruct vadd32S_mem(vecZ dst, vecZ src, memory mem) %{
6677   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
6678   match(Set dst (AddVS src (LoadVector mem)));
6679   format %{ "vpaddw  $dst,$src,$mem\t! add packed32S" %}
6680   ins_encode %{
6681     int vector_len = 2;
6682     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6683   %}
6684   ins_pipe( pipe_slow );
6685 %}
6686 
6687 // Integers vector add
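// Dword adds need no BW-specific variants: the 128-bit forms require
// UseAVX > 0, the 256-bit (vecY) forms UseAVX > 1, and the 512-bit (vecZ)
// forms UseAVX > 2. A scalar loop such as
//   for (int i = 0; i < n; i++) { c[i] = a[i] + b[i]; }
// is a typical candidate that the SuperWord pass may vectorize into the
// AddVI nodes matched here.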
6688 instruct vadd2I(vecD dst, vecD src) %{
6689   predicate(n->as_Vector()->length() == 2);
6690   match(Set dst (AddVI dst src));
6691   format %{ "paddd   $dst,$src\t! add packed2I" %}
6692   ins_encode %{
6693     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
6694   %}
6695   ins_pipe( pipe_slow );
6696 %}
6697 
6698 instruct vadd2I_reg(vecD dst, vecD src1, vecD src2) %{
6699   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6700   match(Set dst (AddVI src1 src2));
6701   format %{ "vpaddd  $dst,$src1,$src2\t! add packed2I" %}
6702   ins_encode %{
6703     int vector_len = 0;
6704     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6705   %}
6706   ins_pipe( pipe_slow );
6707 %}
6708 
6709 instruct vadd2I_mem(vecD dst, vecD src, memory mem) %{
6710   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6711   match(Set dst (AddVI src (LoadVector mem)));
6712   format %{ "vpaddd  $dst,$src,$mem\t! add packed2I" %}
6713   ins_encode %{
6714     int vector_len = 0;
6715     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6716   %}
6717   ins_pipe( pipe_slow );
6718 %}
6719 
6720 instruct vadd4I(vecX dst, vecX src) %{
6721   predicate(n->as_Vector()->length() == 4);
6722   match(Set dst (AddVI dst src));
6723   format %{ "paddd   $dst,$src\t! add packed4I" %}
6724   ins_encode %{
6725     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
6726   %}
6727   ins_pipe( pipe_slow );
6728 %}
6729 
6730 instruct vadd4I_reg(vecX dst, vecX src1, vecX src2) %{
6731   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6732   match(Set dst (AddVI src1 src2));
6733   format %{ "vpaddd  $dst,$src1,$src2\t! add packed4I" %}
6734   ins_encode %{
6735     int vector_len = 0;
6736     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6737   %}
6738   ins_pipe( pipe_slow );
6739 %}
6740 
6741 instruct vadd4I_mem(vecX dst, vecX src, memory mem) %{
6742   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6743   match(Set dst (AddVI src (LoadVector mem)));
6744   format %{ "vpaddd  $dst,$src,$mem\t! add packed4I" %}
6745   ins_encode %{
6746     int vector_len = 0;
6747     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6748   %}
6749   ins_pipe( pipe_slow );
6750 %}
6751 
6752 instruct vadd8I_reg(vecY dst, vecY src1, vecY src2) %{
6753   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
6754   match(Set dst (AddVI src1 src2));
6755   format %{ "vpaddd  $dst,$src1,$src2\t! add packed8I" %}
6756   ins_encode %{
6757     int vector_len = 1;
6758     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6759   %}
6760   ins_pipe( pipe_slow );
6761 %}
6762 
6763 instruct vadd8I_mem(vecY dst, vecY src, memory mem) %{
6764   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
6765   match(Set dst (AddVI src (LoadVector mem)));
6766   format %{ "vpaddd  $dst,$src,$mem\t! add packed8I" %}
6767   ins_encode %{
6768     int vector_len = 1;
6769     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6770   %}
6771   ins_pipe( pipe_slow );
6772 %}
6773 
6774 instruct vadd16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
6775   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6776   match(Set dst (AddVI src1 src2));
6777   format %{ "vpaddd  $dst,$src1,$src2\t! add packed16I" %}
6778   ins_encode %{
6779     int vector_len = 2;
6780     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6781   %}
6782   ins_pipe( pipe_slow );
6783 %}
6784 
6785 instruct vadd16I_mem(vecZ dst, vecZ src, memory mem) %{
6786   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6787   match(Set dst (AddVI src (LoadVector mem)));
6788   format %{ "vpaddd  $dst,$src,$mem\t! add packed16I" %}
6789   ins_encode %{
6790     int vector_len = 2;
6791     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6792   %}
6793   ins_pipe( pipe_slow );
6794 %}
6795 
6796 // Longs vector add
6797 instruct vadd2L(vecX dst, vecX src) %{
6798   predicate(n->as_Vector()->length() == 2);
6799   match(Set dst (AddVL dst src));
6800   format %{ "paddq   $dst,$src\t! add packed2L" %}
6801   ins_encode %{
6802     __ paddq($dst$$XMMRegister, $src$$XMMRegister);
6803   %}
6804   ins_pipe( pipe_slow );
6805 %}
6806 
6807 instruct vadd2L_reg(vecX dst, vecX src1, vecX src2) %{
6808   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6809   match(Set dst (AddVL src1 src2));
6810   format %{ "vpaddq  $dst,$src1,$src2\t! add packed2L" %}
6811   ins_encode %{
6812     int vector_len = 0;
6813     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6814   %}
6815   ins_pipe( pipe_slow );
6816 %}
6817 
6818 instruct vadd2L_mem(vecX dst, vecX src, memory mem) %{
6819   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6820   match(Set dst (AddVL src (LoadVector mem)));
6821   format %{ "vpaddq  $dst,$src,$mem\t! add packed2L" %}
6822   ins_encode %{
6823     int vector_len = 0;
6824     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6825   %}
6826   ins_pipe( pipe_slow );
6827 %}
6828 
6829 instruct vadd4L_reg(vecY dst, vecY src1, vecY src2) %{
6830   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
6831   match(Set dst (AddVL src1 src2));
6832   format %{ "vpaddq  $dst,$src1,$src2\t! add packed4L" %}
6833   ins_encode %{
6834     int vector_len = 1;
6835     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6836   %}
6837   ins_pipe( pipe_slow );
6838 %}
6839 
6840 instruct vadd4L_mem(vecY dst, vecY src, memory mem) %{
6841   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
6842   match(Set dst (AddVL src (LoadVector mem)));
6843   format %{ "vpaddq  $dst,$src,$mem\t! add packed4L" %}
6844   ins_encode %{
6845     int vector_len = 1;
6846     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6847   %}
6848   ins_pipe( pipe_slow );
6849 %}
6850 
6851 instruct vadd8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
6852   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6853   match(Set dst (AddVL src1 src2));
6854   format %{ "vpaddq  $dst,$src1,$src2\t! add packed8L" %}
6855   ins_encode %{
6856     int vector_len = 2;
6857     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6858   %}
6859   ins_pipe( pipe_slow );
6860 %}
6861 
6862 instruct vadd8L_mem(vecZ dst, vecZ src, memory mem) %{
6863   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6864   match(Set dst (AddVL src (LoadVector mem)));
6865   format %{ "vpaddq  $dst,$src,$mem\t! add packed8L" %}
6866   ins_encode %{
6867     int vector_len = 2;
6868     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6869   %}
6870   ins_pipe( pipe_slow );
6871 %}
6872 
6873 // Floats vector add
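// 256-bit FP adds only need UseAVX > 0 (AVX1 already provides 256-bit
// packed-float arithmetic), while the 512-bit forms still require UseAVX > 2.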
6874 instruct vadd2F(vecD dst, vecD src) %{
6875   predicate(n->as_Vector()->length() == 2);
6876   match(Set dst (AddVF dst src));
6877   format %{ "addps   $dst,$src\t! add packed2F" %}
6878   ins_encode %{
6879     __ addps($dst$$XMMRegister, $src$$XMMRegister);
6880   %}
6881   ins_pipe( pipe_slow );
6882 %}
6883 
6884 instruct vadd2F_reg(vecD dst, vecD src1, vecD src2) %{
6885   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6886   match(Set dst (AddVF src1 src2));
6887   format %{ "vaddps  $dst,$src1,$src2\t! add packed2F" %}
6888   ins_encode %{
6889     int vector_len = 0;
6890     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6891   %}
6892   ins_pipe( pipe_slow );
6893 %}
6894 
6895 instruct vadd2F_mem(vecD dst, vecD src, memory mem) %{
6896   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6897   match(Set dst (AddVF src (LoadVector mem)));
6898   format %{ "vaddps  $dst,$src,$mem\t! add packed2F" %}
6899   ins_encode %{
6900     int vector_len = 0;
6901     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6902   %}
6903   ins_pipe( pipe_slow );
6904 %}
6905 
6906 instruct vadd4F(vecX dst, vecX src) %{
6907   predicate(n->as_Vector()->length() == 4);
6908   match(Set dst (AddVF dst src));
6909   format %{ "addps   $dst,$src\t! add packed4F" %}
6910   ins_encode %{
6911     __ addps($dst$$XMMRegister, $src$$XMMRegister);
6912   %}
6913   ins_pipe( pipe_slow );
6914 %}
6915 
6916 instruct vadd4F_reg(vecX dst, vecX src1, vecX src2) %{
6917   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6918   match(Set dst (AddVF src1 src2));
6919   format %{ "vaddps  $dst,$src1,$src2\t! add packed4F" %}
6920   ins_encode %{
6921     int vector_len = 0;
6922     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6923   %}
6924   ins_pipe( pipe_slow );
6925 %}
6926 
6927 instruct vadd4F_mem(vecX dst, vecX src, memory mem) %{
6928   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6929   match(Set dst (AddVF src (LoadVector mem)));
6930   format %{ "vaddps  $dst,$src,$mem\t! add packed4F" %}
6931   ins_encode %{
6932     int vector_len = 0;
6933     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6934   %}
6935   ins_pipe( pipe_slow );
6936 %}
6937 
6938 instruct vadd8F_reg(vecY dst, vecY src1, vecY src2) %{
6939   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6940   match(Set dst (AddVF src1 src2));
6941   format %{ "vaddps  $dst,$src1,$src2\t! add packed8F" %}
6942   ins_encode %{
6943     int vector_len = 1;
6944     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6945   %}
6946   ins_pipe( pipe_slow );
6947 %}
6948 
6949 instruct vadd8F_mem(vecY dst, vecY src, memory mem) %{
6950   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6951   match(Set dst (AddVF src (LoadVector mem)));
6952   format %{ "vaddps  $dst,$src,$mem\t! add packed8F" %}
6953   ins_encode %{
6954     int vector_len = 1;
6955     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6956   %}
6957   ins_pipe( pipe_slow );
6958 %}
6959 
6960 instruct vadd16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
6961   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6962   match(Set dst (AddVF src1 src2));
6963   format %{ "vaddps  $dst,$src1,$src2\t! add packed16F" %}
6964   ins_encode %{
6965     int vector_len = 2;
6966     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6967   %}
6968   ins_pipe( pipe_slow );
6969 %}
6970 
6971 instruct vadd16F_mem(vecZ dst, vecZ src, memory mem) %{
6972   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6973   match(Set dst (AddVF src (LoadVector mem)));
6974   format %{ "vaddps  $dst,$src,$mem\t! add packed16F" %}
6975   ins_encode %{
6976     int vector_len = 2;
6977     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6978   %}
6979   ins_pipe( pipe_slow );
6980 %}
6981 
6982 // Doubles vector add
6983 instruct vadd2D(vecX dst, vecX src) %{
6984   predicate(n->as_Vector()->length() == 2);
6985   match(Set dst (AddVD dst src));
6986   format %{ "addpd   $dst,$src\t! add packed2D" %}
6987   ins_encode %{
6988     __ addpd($dst$$XMMRegister, $src$$XMMRegister);
6989   %}
6990   ins_pipe( pipe_slow );
6991 %}
6992 
6993 instruct vadd2D_reg(vecX dst, vecX src1, vecX src2) %{
6994   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6995   match(Set dst (AddVD src1 src2));
6996   format %{ "vaddpd  $dst,$src1,$src2\t! add packed2D" %}
6997   ins_encode %{
6998     int vector_len = 0;
6999     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7000   %}
7001   ins_pipe( pipe_slow );
7002 %}
7003 
7004 instruct vadd2D_mem(vecX dst, vecX src, memory mem) %{
7005   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7006   match(Set dst (AddVD src (LoadVector mem)));
7007   format %{ "vaddpd  $dst,$src,$mem\t! add packed2D" %}
7008   ins_encode %{
7009     int vector_len = 0;
7010     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7011   %}
7012   ins_pipe( pipe_slow );
7013 %}
7014 
7015 instruct vadd4D_reg(vecY dst, vecY src1, vecY src2) %{
7016   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7017   match(Set dst (AddVD src1 src2));
7018   format %{ "vaddpd  $dst,$src1,$src2\t! add packed4D" %}
7019   ins_encode %{
7020     int vector_len = 1;
7021     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7022   %}
7023   ins_pipe( pipe_slow );
7024 %}
7025 
7026 instruct vadd4D_mem(vecY dst, vecY src, memory mem) %{
7027   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7028   match(Set dst (AddVD src (LoadVector mem)));
7029   format %{ "vaddpd  $dst,$src,$mem\t! add packed4D" %}
7030   ins_encode %{
7031     int vector_len = 1;
7032     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7033   %}
7034   ins_pipe( pipe_slow );
7035 %}
7036 
7037 instruct vadd8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
7038   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7039   match(Set dst (AddVD src1 src2));
7040   format %{ "vaddpd  $dst,$src1,$src2\t! add packed8D" %}
7041   ins_encode %{
7042     int vector_len = 2;
7043     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7044   %}
7045   ins_pipe( pipe_slow );
7046 %}
7047 
7048 instruct vadd8D_mem(vecZ dst, vecZ src, memory mem) %{
7049   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7050   match(Set dst (AddVD src (LoadVector mem)));
7051   format %{ "vaddpd  $dst,$src,$mem\t! add packed8D" %}
7052   ins_encode %{
7053     int vector_len = 2;
7054     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7055   %}
7056   ins_pipe( pipe_slow );
7057 %}
7058 
7059 // --------------------------------- SUB --------------------------------------
7060 
7061 // Bytes vector sub
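// The subtract rules mirror the add rules above: psubb/psubw/psubd/psubq for
// UseAVX == 0, and the same *_avx / *_evex / *_evex_special split for the
// VEX, AVX-512 BW and AVX-512 no-BW subword cases.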
7062 instruct vsub4B(vecS dst, vecS src) %{
7063   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
7064   match(Set dst (SubVB dst src));
7065   format %{ "psubb   $dst,$src\t! sub packed4B" %}
7066   ins_encode %{
7067     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
7068   %}
7069   ins_pipe( pipe_slow );
7070 %}
7071 
7072 instruct vsub4B_reg_avx(vecS dst, vecS src1, vecS src2) %{
7073   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
7074   match(Set dst (SubVB src1 src2));
7075   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed4B" %}
7076   ins_encode %{
7077     int vector_len = 0;
7078     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7079   %}
7080   ins_pipe( pipe_slow );
7081 %}
7082 
7083 instruct vsub4B_reg_evex(vecS dst, vecS src1, vecS src2) %{
7084   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
7085   match(Set dst (SubVB src1 src2));
7086   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed4B" %}
7087   ins_encode %{
7088     int vector_len = 0;
7089     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7090   %}
7091   ins_pipe( pipe_slow );
7092 %}
7093 
7094 instruct vsub4B_reg_evex_special(vecS dst, vecS src1, vecS src2) %{
7095   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
7096   match(Set dst (SubVB dst src2));
7097   effect(TEMP src1);
7098   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed4B" %}
7099   ins_encode %{
7100     int vector_len = 0;
7101     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7102   %}
7103   ins_pipe( pipe_slow );
7104 %}
7105 
7106 instruct vsub4B_mem_avx(vecS dst, vecS src, memory mem) %{
7107   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
7108   match(Set dst (SubVB src (LoadVector mem)));
7109   format %{ "vpsubb  $dst,$src,$mem\t! sub packed4B" %}
7110   ins_encode %{
7111     int vector_len = 0;
7112     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7113   %}
7114   ins_pipe( pipe_slow );
7115 %}
7116 
7117 instruct vsub4B_mem_evex(vecS dst, vecS src, memory mem) %{
7118   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
7119   match(Set dst (SubVB src (LoadVector mem)));
7120   format %{ "vpsubb  $dst,$src,$mem\t! sub packed4B" %}
7121   ins_encode %{
7122     int vector_len = 0;
7123     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7124   %}
7125   ins_pipe( pipe_slow );
7126 %}
7127 
7128 instruct vsub4B_mem_evex_special(vecS dst, vecS src, memory mem) %{
7129   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
7130   match(Set dst (SubVB dst (LoadVector mem)));
7131   effect(TEMP src);
7132   format %{ "vpsubb  $dst,$src,$mem\t! sub packed4B" %}
7133   ins_encode %{
7134     int vector_len = 0;
7135     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7136   %}
7137   ins_pipe( pipe_slow );
7138 %}
7139 
7140 instruct vsub8B(vecD dst, vecD src) %{
7141   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
7142   match(Set dst (SubVB dst src));
7143   format %{ "psubb   $dst,$src\t! sub packed8B" %}
7144   ins_encode %{
7145     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
7146   %}
7147   ins_pipe( pipe_slow );
7148 %}
7149 
7150 instruct vsub8B_reg_avx(vecD dst, vecD src1, vecD src2) %{
7151   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
7152   match(Set dst (SubVB src1 src2));
7153   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed8B" %}
7154   ins_encode %{
7155     int vector_len = 0;
7156     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7157   %}
7158   ins_pipe( pipe_slow );
7159 %}
7160 
7161 instruct vsub8B_reg_evex(vecD dst, vecD src1, vecD src2) %{
7162   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
7163   match(Set dst (SubVB src1 src2));
7164   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed8B" %}
7165   ins_encode %{
7166     int vector_len = 0;
7167     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7168   %}
7169   ins_pipe( pipe_slow );
7170 %}
7171 
7172 instruct vsub8B_reg_evex_special(vecD dst, vecD src1, vecD src2) %{
7173   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
7174   match(Set dst (SubVB dst src2));
7175   effect(TEMP src1);
7176   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed8B" %}
7177   ins_encode %{
7178     int vector_len = 0;
7179     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7180   %}
7181   ins_pipe( pipe_slow );
7182 %}
7183 
7184 instruct vsub8B_mem_avx(vecD dst, vecD src, memory mem) %{
7185   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
7186   match(Set dst (SubVB src (LoadVector mem)));
7187   format %{ "vpsubb  $dst,$src,$mem\t! sub packed8B" %}
7188   ins_encode %{
7189     int vector_len = 0;
7190     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7191   %}
7192   ins_pipe( pipe_slow );
7193 %}
7194 
7195 instruct vsub8B_mem_evex(vecD dst, vecD src, memory mem) %{
7196   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
7197   match(Set dst (SubVB src (LoadVector mem)));
7198   format %{ "vpsubb  $dst,$src,$mem\t! sub packed8B" %}
7199   ins_encode %{
7200     int vector_len = 0;
7201     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7202   %}
7203   ins_pipe( pipe_slow );
7204 %}
7205 
7206 instruct vsub8B_mem_evex_special(vecD dst, vecD src, memory mem) %{
7207   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
7208   match(Set dst (SubVB dst (LoadVector mem)));
7209   effect(TEMP src);
7210   format %{ "vpsubb  $dst,$src,$mem\t! sub packed8B" %}
7211   ins_encode %{
7212     int vector_len = 0;
7213     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7214   %}
7215   ins_pipe( pipe_slow );
7216 %}
7217 
7218 instruct vsub16B(vecX dst, vecX src) %{
7219   predicate(UseAVX == 0 && n->as_Vector()->length() == 16);
7220   match(Set dst (SubVB dst src));
7221   format %{ "psubb   $dst,$src\t! sub packed16B" %}
7222   ins_encode %{
7223     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
7224   %}
7225   ins_pipe( pipe_slow );
7226 %}
7227 
7228 instruct vsub16B_reg_avx(vecX dst, vecX src1, vecX src2) %{
7229   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 16);
7230   match(Set dst (SubVB src1 src2));
7231   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed16B" %}
7232   ins_encode %{
7233     int vector_len = 0;
7234     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7235   %}
7236   ins_pipe( pipe_slow );
7237 %}
7238 
7239 instruct vsub16B_reg_evex(vecX dst, vecX src1, vecX src2) %{
7240   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
7241   match(Set dst (SubVB src1 src2));
7242   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed16B" %}
7243   ins_encode %{
7244     int vector_len = 0;
7245     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7246   %}
7247   ins_pipe( pipe_slow );
7248 %}
7249 
7250 instruct vsub16B_reg_evex_special(vecX dst, vecX src1, vecX src2) %{
7251   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
7252   match(Set dst (SubVB dst src2));
7253   effect(TEMP src1);
7254   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed16B" %}
7255   ins_encode %{
7256     int vector_len = 0;
7257     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7258   %}
7259   ins_pipe( pipe_slow );
7260 %}
7261 
7262 instruct vsub16B_mem_avx(vecX dst, vecX src, memory mem) %{
7263   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 16);
7264   match(Set dst (SubVB src (LoadVector mem)));
7265   format %{ "vpsubb  $dst,$src,$mem\t! sub packed16B" %}
7266   ins_encode %{
7267     int vector_len = 0;
7268     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7269   %}
7270   ins_pipe( pipe_slow );
7271 %}
7272 
7273 instruct vsub16B_mem_evex(vecX dst, vecX src, memory mem) %{
7274   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
7275   match(Set dst (SubVB src (LoadVector mem)));
7276   format %{ "vpsubb  $dst,$src,$mem\t! sub packed16B" %}
7277   ins_encode %{
7278     int vector_len = 0;
7279     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7280   %}
7281   ins_pipe( pipe_slow );
7282 %}
7283 
7284 instruct vsub16B_mem_evex_special(vecX dst, vecX src, memory mem) %{
7285   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
7286   match(Set dst (SubVB dst (LoadVector mem)));
7287   effect(TEMP src);
7288   format %{ "vpsubb  $dst,$src,$mem\t! sub packed16B" %}
7289   ins_encode %{
7290     int vector_len = 0;
7291     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7292   %}
7293   ins_pipe( pipe_slow );
7294 %}
7295 
7296 instruct vsub32B_reg_avx(vecY dst, vecY src1, vecY src2) %{
7297   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 32);
7298   match(Set dst (SubVB src1 src2));
7299   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed32B" %}
7300   ins_encode %{
7301     int vector_len = 1;
7302     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7303   %}
7304   ins_pipe( pipe_slow );
7305 %}
7306 
7307 instruct vsub32B_reg_evex(vecY dst, vecY src1, vecY src2) %{
7308   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
7309   match(Set dst (SubVB src1 src2));
7310   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed32B" %}
7311   ins_encode %{
7312     int vector_len = 1;
7313     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7314   %}
7315   ins_pipe( pipe_slow );
7316 %}
7317 
7318 instruct vsub32B_reg_evex_special(vecY dst, vecY src1, vecY src2) %{
7319   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 32);
7320   match(Set dst (SubVB dst src2));
7321   effect(TEMP src1);
7322   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed32B" %}
7323   ins_encode %{
7324     int vector_len = 1;
7325     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7326   %}
7327   ins_pipe( pipe_slow );
7328 %}
7329 
7330 instruct vsub32B_mem_avx(vecY dst, vecY src, memory mem) %{
7331   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 32);
7332   match(Set dst (SubVB src (LoadVector mem)));
7333   format %{ "vpsubb  $dst,$src,$mem\t! sub packed32B" %}
7334   ins_encode %{
7335     int vector_len = 1;
7336     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7337   %}
7338   ins_pipe( pipe_slow );
7339 %}
7340 
7341 instruct vsub32B_mem_evex(vecY dst, vecY src, memory mem) %{
7342   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
7343   match(Set dst (SubVB src (LoadVector mem)));
7344   format %{ "vpsubb  $dst,$src,$mem\t! sub packed32B" %}
7345   ins_encode %{
7346     int vector_len = 1;
7347     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7348   %}
7349   ins_pipe( pipe_slow );
7350 %}
7351 
7352 instruct vsub32B_mem_evex_special(vecY dst, vecY src, memory mem) %{
7353   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 32);
7354   match(Set dst (SubVB dst (LoadVector mem)));
7355   effect(TEMP src);
7356   format %{ "vpsubb  $dst,$src,$mem\t! sub packed32B" %}
7357   ins_encode %{
7358     int vector_len = 1;
7359     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7360   %}
7361   ins_pipe( pipe_slow );
7362 %}
7363 
7364 instruct vsub64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
7365   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
7366   match(Set dst (SubVB src1 src2));
7367   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed64B" %}
7368   ins_encode %{
7369     int vector_len = 2;
7370     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7371   %}
7372   ins_pipe( pipe_slow );
7373 %}
7374 
7375 instruct vsub64B_mem(vecZ dst, vecZ src, memory mem) %{
7376   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
7377   match(Set dst (SubVB src (LoadVector mem)));
7378   format %{ "vpsubb  $dst,$src,$mem\t! sub packed64B" %}
7379   ins_encode %{
7380     int vector_len = 2;
7381     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7382   %}
7383   ins_pipe( pipe_slow );
7384 %}
7385 
7386 // Shorts/Chars vector sub
7387 instruct vsub2S(vecS dst, vecS src) %{
7388   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
7389   match(Set dst (SubVS dst src));
7390   format %{ "psubw   $dst,$src\t! sub packed2S" %}
7391   ins_encode %{
7392     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
7393   %}
7394   ins_pipe( pipe_slow );
7395 %}
7396 
7397 instruct vsub2S_reg_avx(vecS dst, vecS src1, vecS src2) %{
7398   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
7399   match(Set dst (SubVS src1 src2));
7400   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed2S" %}
7401   ins_encode %{
7402     int vector_len = 0;
7403     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7404   %}
7405   ins_pipe( pipe_slow );
7406 %}
7407 
7408 instruct vsub2S_reg_evex(vecS dst, vecS src1, vecS src2) %{
7409   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
7410   match(Set dst (SubVS src1 src2));
7411   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed2S" %}
7412   ins_encode %{
7413     int vector_len = 0;
7414     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7415   %}
7416   ins_pipe( pipe_slow );
7417 %}
7418 
7419 instruct vsub2S_reg_evex_special(vecS dst, vecS src1, vecS src2) %{
7420   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
7421   match(Set dst (SubVS dst src2));
7422   effect(TEMP src1);
7423   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed2S" %}
7424   ins_encode %{
7425     int vector_len = 0;
7426     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7427   %}
7428   ins_pipe( pipe_slow );
7429 %}
7430 
7431 instruct vsub2S_mem_avx(vecS dst, vecS src, memory mem) %{
7432   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
7433   match(Set dst (SubVS src (LoadVector mem)));
7434   format %{ "vpsubw  $dst,$src,$mem\t! sub packed2S" %}
7435   ins_encode %{
7436     int vector_len = 0;
7437     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7438   %}
7439   ins_pipe( pipe_slow );
7440 %}
7441 
7442 instruct vsub2S_mem_evex(vecS dst, vecS src, memory mem) %{
7443   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
7444   match(Set dst (SubVS src (LoadVector mem)));
7445   format %{ "vpsubw  $dst,$src,$mem\t! sub packed2S" %}
7446   ins_encode %{
7447     int vector_len = 0;
7448     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7449   %}
7450   ins_pipe( pipe_slow );
7451 %}
7452 
7453 instruct vsub2S_mem_evex_special(vecS dst, vecS src, memory mem) %{
7454   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
7455   match(Set dst (SubVS dst (LoadVector mem)));
7456   effect(TEMP src);
7457   format %{ "vpsubw  $dst,$src,$mem\t! sub packed2S" %}
7458   ins_encode %{
7459     int vector_len = 0;
7460     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7461   %}
7462   ins_pipe( pipe_slow );
7463 %}
7464 
7465 instruct vsub4S(vecD dst, vecD src) %{
7466   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
7467   match(Set dst (SubVS dst src));
7468   format %{ "psubw   $dst,$src\t! sub packed4S" %}
7469   ins_encode %{
7470     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
7471   %}
7472   ins_pipe( pipe_slow );
7473 %}
7474 
7475 instruct vsub4S_reg_avx(vecD dst, vecD src1, vecD src2) %{
7476   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
7477   match(Set dst (SubVS src1 src2));
7478   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed4S" %}
7479   ins_encode %{
7480     int vector_len = 0;
7481     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7482   %}
7483   ins_pipe( pipe_slow );
7484 %}
7485 
7486 instruct vsub4S_reg_evex(vecD dst, vecD src1, vecD src2) %{
7487   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
7488   match(Set dst (SubVS src1 src2));
7489   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed4S" %}
7490   ins_encode %{
7491     int vector_len = 0;
7492     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7493   %}
7494   ins_pipe( pipe_slow );
7495 %}
7496 
7497 instruct vsub4S_reg_evex_special(vecD dst, vecD src1, vecD src2) %{
7498   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
7499   match(Set dst (SubVS dst src2));
7500   effect(TEMP src1);
7501   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed4S" %}
7502   ins_encode %{
7503     int vector_len = 0;
7504     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7505   %}
7506   ins_pipe( pipe_slow );
7507 %}
7508 
7509 instruct vsub4S_mem_avx(vecD dst, vecD src, memory mem) %{
7510   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
7511   match(Set dst (SubVS src (LoadVector mem)));
7512   format %{ "vpsubw  $dst,$src,$mem\t! sub packed4S" %}
7513   ins_encode %{
7514     int vector_len = 0;
7515     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7516   %}
7517   ins_pipe( pipe_slow );
7518 %}
7519 
7520 instruct vsub4S_mem_evex(vecD dst, vecD src, memory mem) %{
7521   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
7522   match(Set dst (SubVS src (LoadVector mem)));
7523   format %{ "vpsubw  $dst,$src,$mem\t! sub packed4S" %}
7524   ins_encode %{
7525     int vector_len = 0;
7526     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7527   %}
7528   ins_pipe( pipe_slow );
7529 %}
7530 
7531 instruct vsub4S_mem_evex_special(vecD dst, vecD src, memory mem) %{
7532   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
7533   match(Set dst (SubVS dst (LoadVector mem)));
7534   effect(TEMP src);
7535   format %{ "vpsubw  $dst,$src,$mem\t! sub packed4S" %}
7536   ins_encode %{
7537     int vector_len = 0;
7538     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7539   %}
7540   ins_pipe( pipe_slow );
7541 %}
7542 
7543 instruct vsub8S(vecX dst, vecX src) %{
7544   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
7545   match(Set dst (SubVS dst src));
7546   format %{ "psubw   $dst,$src\t! sub packed8S" %}
7547   ins_encode %{
7548     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
7549   %}
7550   ins_pipe( pipe_slow );
7551 %}
7552 
7553 instruct vsub8S_reg_avx(vecX dst, vecX src1, vecX src2) %{
7554   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
7555   match(Set dst (SubVS src1 src2));
7556   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed8S" %}
7557   ins_encode %{
7558     int vector_len = 0;
7559     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7560   %}
7561   ins_pipe( pipe_slow );
7562 %}
7563 
7564 instruct vsub8S_reg_evex(vecX dst, vecX src1, vecX src2) %{
7565   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
7566   match(Set dst (SubVS src1 src2));
7567   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed8S" %}
7568   ins_encode %{
7569     int vector_len = 0;
7570     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7571   %}
7572   ins_pipe( pipe_slow );
7573 %}
7574 
7575 instruct vsub8S_reg_evex_special(vecX dst, vecX src1, vecX src2) %{
7576   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
7577   match(Set dst (SubVS dst src2));
7578   effect(TEMP src1);
7579   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed8S" %}
7580   ins_encode %{
7581     int vector_len = 0;
7582     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7583   %}
7584   ins_pipe( pipe_slow );
7585 %}
7586 
7587 instruct vsub8S_mem_avx(vecX dst, vecX src, memory mem) %{
7588   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
7589   match(Set dst (SubVS src (LoadVector mem)));
7590   format %{ "vpsubw  $dst,$src,$mem\t! sub packed8S" %}
7591   ins_encode %{
7592     int vector_len = 0;
7593     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7594   %}
7595   ins_pipe( pipe_slow );
7596 %}
7597 
7598 instruct vsub8S_mem_evex(vecX dst, vecX src, memory mem) %{
7599   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
7600   match(Set dst (SubVS src (LoadVector mem)));
7601   format %{ "vpsubw  $dst,$src,$mem\t! sub packed8S" %}
7602   ins_encode %{
7603     int vector_len = 0;
7604     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7605   %}
7606   ins_pipe( pipe_slow );
7607 %}
7608 
7609 instruct vsub8S_mem_evex_special(vecX dst, vecX src, memory mem) %{
7610   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
7611   match(Set dst (SubVS dst (LoadVector mem)));
7612   effect(TEMP src);
7613   format %{ "vpsubw  $dst,$src,$mem\t! sub packed8S" %}
7614   ins_encode %{
7615     int vector_len = 0;
7616     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7617   %}
7618   ins_pipe( pipe_slow );
7619 %}
7620 
7621 instruct vsub16S_reg_avx(vecY dst, vecY src1, vecY src2) %{
7622   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
7623   match(Set dst (SubVS src1 src2));
7624   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed16S" %}
7625   ins_encode %{
7626     int vector_len = 1;
7627     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7628   %}
7629   ins_pipe( pipe_slow );
7630 %}
7631 
7632 instruct vsub16S_reg_evex(vecY dst, vecY src1, vecY src2) %{
7633   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
7634   match(Set dst (SubVS src1 src2));
7635   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed16S" %}
7636   ins_encode %{
7637     int vector_len = 1;
7638     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7639   %}
7640   ins_pipe( pipe_slow );
7641 %}
7642 
7643 instruct vsub16S_reg_evex_special(vecY dst, vecY src1, vecY src2) %{
7644   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
7645   match(Set dst (SubVS dst src2));
7646   effect(TEMP src1);
7647   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed16S" %}
7648   ins_encode %{
7649     int vector_len = 1;
7650     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7651   %}
7652   ins_pipe( pipe_slow );
7653 %}
7654 
7655 instruct vsub16S_mem_avx(vecY dst, vecY src, memory mem) %{
7656   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
7657   match(Set dst (SubVS src (LoadVector mem)));
7658   format %{ "vpsubw  $dst,$src,$mem\t! sub packed16S" %}
7659   ins_encode %{
7660     int vector_len = 1;
7661     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7662   %}
7663   ins_pipe( pipe_slow );
7664 %}
7665 
7666 instruct vsub16S_mem_evex(vecY dst, vecY src, memory mem) %{
7667   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
7668   match(Set dst (SubVS src (LoadVector mem)));
7669   format %{ "vpsubw  $dst,$src,$mem\t! sub packed16S" %}
7670   ins_encode %{
7671     int vector_len = 1;
7672     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7673   %}
7674   ins_pipe( pipe_slow );
7675 %}
7676 
7677 instruct vsub16S_mem_evex_special(vecY dst, vecY src, memory mem) %{
7678   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
7679   match(Set dst (SubVS dst (LoadVector mem)));
7680   effect(TEMP src);
7681   format %{ "vpsubw  $dst,$src,$mem\t! sub packed16S" %}
7682   ins_encode %{
7683     int vector_len = 1;
7684     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7685   %}
7686   ins_pipe( pipe_slow );
7687 %}
7688 
7689 instruct vsub32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
7690   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
7691   match(Set dst (SubVS src1 src2));
7692   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed32S" %}
7693   ins_encode %{
7694     int vector_len = 2;
7695     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7696   %}
7697   ins_pipe( pipe_slow );
7698 %}
7699 
7700 instruct vsub32S_mem(vecZ dst, vecZ src, memory mem) %{
7701   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
7702   match(Set dst (SubVS src (LoadVector mem)));
7703   format %{ "vpsubw  $dst,$src,$mem\t! sub packed32S" %}
7704   ins_encode %{
7705     int vector_len = 2;
7706     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7707   %}
7708   ins_pipe( pipe_slow );
7709 %}
7710 
7711 // Integers vector sub
7712 instruct vsub2I(vecD dst, vecD src) %{
7713   predicate(n->as_Vector()->length() == 2);
7714   match(Set dst (SubVI dst src));
7715   format %{ "psubd   $dst,$src\t! sub packed2I" %}
7716   ins_encode %{
7717     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
7718   %}
7719   ins_pipe( pipe_slow );
7720 %}
7721 
7722 instruct vsub2I_reg(vecD dst, vecD src1, vecD src2) %{
7723   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7724   match(Set dst (SubVI src1 src2));
7725   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed2I" %}
7726   ins_encode %{
7727     int vector_len = 0;
7728     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7729   %}
7730   ins_pipe( pipe_slow );
7731 %}
7732 
7733 instruct vsub2I_mem(vecD dst, vecD src, memory mem) %{
7734   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7735   match(Set dst (SubVI src (LoadVector mem)));
7736   format %{ "vpsubd  $dst,$src,$mem\t! sub packed2I" %}
7737   ins_encode %{
7738     int vector_len = 0;
7739     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7740   %}
7741   ins_pipe( pipe_slow );
7742 %}
7743 
7744 instruct vsub4I(vecX dst, vecX src) %{
7745   predicate(n->as_Vector()->length() == 4);
7746   match(Set dst (SubVI dst src));
7747   format %{ "psubd   $dst,$src\t! sub packed4I" %}
7748   ins_encode %{
7749     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
7750   %}
7751   ins_pipe( pipe_slow );
7752 %}
7753 
7754 instruct vsub4I_reg(vecX dst, vecX src1, vecX src2) %{
7755   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7756   match(Set dst (SubVI src1 src2));
7757   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed4I" %}
7758   ins_encode %{
7759     int vector_len = 0;
7760     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7761   %}
7762   ins_pipe( pipe_slow );
7763 %}
7764 
7765 instruct vsub4I_mem(vecX dst, vecX src, memory mem) %{
7766   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7767   match(Set dst (SubVI src (LoadVector mem)));
7768   format %{ "vpsubd  $dst,$src,$mem\t! sub packed4I" %}
7769   ins_encode %{
7770     int vector_len = 0;
7771     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7772   %}
7773   ins_pipe( pipe_slow );
7774 %}
7775 
7776 instruct vsub8I_reg(vecY dst, vecY src1, vecY src2) %{
7777   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
7778   match(Set dst (SubVI src1 src2));
7779   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed8I" %}
7780   ins_encode %{
7781     int vector_len = 1;
7782     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7783   %}
7784   ins_pipe( pipe_slow );
7785 %}
7786 
7787 instruct vsub8I_mem(vecY dst, vecY src, memory mem) %{
7788   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
7789   match(Set dst (SubVI src (LoadVector mem)));
7790   format %{ "vpsubd  $dst,$src,$mem\t! sub packed8I" %}
7791   ins_encode %{
7792     int vector_len = 1;
7793     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7794   %}
7795   ins_pipe( pipe_slow );
7796 %}
7797 
7798 instruct vsub16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
7799   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7800   match(Set dst (SubVI src1 src2));
7801   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed16I" %}
7802   ins_encode %{
7803     int vector_len = 2;
7804     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7805   %}
7806   ins_pipe( pipe_slow );
7807 %}
7808 
7809 instruct vsub16I_mem(vecZ dst, vecZ src, memory mem) %{
7810   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7811   match(Set dst (SubVI src (LoadVector mem)));
7812   format %{ "vpsubd  $dst,$src,$mem\t! sub packed16I" %}
7813   ins_encode %{
7814     int vector_len = 2;
7815     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7816   %}
7817   ins_pipe( pipe_slow );
7818 %}
7819 
7820 // Longs vector sub
7821 instruct vsub2L(vecX dst, vecX src) %{
7822   predicate(n->as_Vector()->length() == 2);
7823   match(Set dst (SubVL dst src));
7824   format %{ "psubq   $dst,$src\t! sub packed2L" %}
7825   ins_encode %{
7826     __ psubq($dst$$XMMRegister, $src$$XMMRegister);
7827   %}
7828   ins_pipe( pipe_slow );
7829 %}
7830 
7831 instruct vsub2L_reg(vecX dst, vecX src1, vecX src2) %{
7832   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7833   match(Set dst (SubVL src1 src2));
7834   format %{ "vpsubq  $dst,$src1,$src2\t! sub packed2L" %}
7835   ins_encode %{
7836     int vector_len = 0;
7837     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7838   %}
7839   ins_pipe( pipe_slow );
7840 %}
7841 
7842 instruct vsub2L_mem(vecX dst, vecX src, memory mem) %{
7843   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7844   match(Set dst (SubVL src (LoadVector mem)));
7845   format %{ "vpsubq  $dst,$src,$mem\t! sub packed2L" %}
7846   ins_encode %{
7847     int vector_len = 0;
7848     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7849   %}
7850   ins_pipe( pipe_slow );
7851 %}
7852 
7853 instruct vsub4L_reg(vecY dst, vecY src1, vecY src2) %{
7854   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
7855   match(Set dst (SubVL src1 src2));
7856   format %{ "vpsubq  $dst,$src1,$src2\t! sub packed4L" %}
7857   ins_encode %{
7858     int vector_len = 1;
7859     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7860   %}
7861   ins_pipe( pipe_slow );
7862 %}
7863 
7864 instruct vsub4L_mem(vecY dst, vecY src, memory mem) %{
7865   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
7866   match(Set dst (SubVL src (LoadVector mem)));
7867   format %{ "vpsubq  $dst,$src,$mem\t! sub packed4L" %}
7868   ins_encode %{
7869     int vector_len = 1;
7870     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7871   %}
7872   ins_pipe( pipe_slow );
7873 %}
7874 
7875 instruct vsub8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
7876   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7877   match(Set dst (SubVL src1 src2));
7878   format %{ "vpsubq  $dst,$src1,$src2\t! sub packed8L" %}
7879   ins_encode %{
7880     int vector_len = 2;
7881     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7882   %}
7883   ins_pipe( pipe_slow );
7884 %}
7885 
7886 instruct vsub8L_mem(vecZ dst, vecZ src, memory mem) %{
7887   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7888   match(Set dst (SubVL src (LoadVector mem)));
7889   format %{ "vpsubq  $dst,$src,$mem\t! sub packed8L" %}
7890   ins_encode %{
7891     int vector_len = 2;
7892     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7893   %}
7894   ins_pipe( pipe_slow );
7895 %}
7896 
7897 // Floats vector sub
7898 instruct vsub2F(vecD dst, vecD src) %{
7899   predicate(n->as_Vector()->length() == 2);
7900   match(Set dst (SubVF dst src));
7901   format %{ "subps   $dst,$src\t! sub packed2F" %}
7902   ins_encode %{
7903     __ subps($dst$$XMMRegister, $src$$XMMRegister);
7904   %}
7905   ins_pipe( pipe_slow );
7906 %}
7907 
7908 instruct vsub2F_reg(vecD dst, vecD src1, vecD src2) %{
7909   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7910   match(Set dst (SubVF src1 src2));
7911   format %{ "vsubps  $dst,$src1,$src2\t! sub packed2F" %}
7912   ins_encode %{
7913     int vector_len = 0;
7914     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7915   %}
7916   ins_pipe( pipe_slow );
7917 %}
7918 
7919 instruct vsub2F_mem(vecD dst, vecD src, memory mem) %{
7920   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7921   match(Set dst (SubVF src (LoadVector mem)));
7922   format %{ "vsubps  $dst,$src,$mem\t! sub packed2F" %}
7923   ins_encode %{
7924     int vector_len = 0;
7925     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7926   %}
7927   ins_pipe( pipe_slow );
7928 %}
7929 
7930 instruct vsub4F(vecX dst, vecX src) %{
7931   predicate(n->as_Vector()->length() == 4);
7932   match(Set dst (SubVF dst src));
7933   format %{ "subps   $dst,$src\t! sub packed4F" %}
7934   ins_encode %{
7935     __ subps($dst$$XMMRegister, $src$$XMMRegister);
7936   %}
7937   ins_pipe( pipe_slow );
7938 %}
7939 
7940 instruct vsub4F_reg(vecX dst, vecX src1, vecX src2) %{
7941   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7942   match(Set dst (SubVF src1 src2));
7943   format %{ "vsubps  $dst,$src1,$src2\t! sub packed4F" %}
7944   ins_encode %{
7945     int vector_len = 0;
7946     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7947   %}
7948   ins_pipe( pipe_slow );
7949 %}
7950 
7951 instruct vsub4F_mem(vecX dst, vecX src, memory mem) %{
7952   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7953   match(Set dst (SubVF src (LoadVector mem)));
7954   format %{ "vsubps  $dst,$src,$mem\t! sub packed4F" %}
7955   ins_encode %{
7956     int vector_len = 0;
7957     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7958   %}
7959   ins_pipe( pipe_slow );
7960 %}
7961 
7962 instruct vsub8F_reg(vecY dst, vecY src1, vecY src2) %{
7963   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7964   match(Set dst (SubVF src1 src2));
7965   format %{ "vsubps  $dst,$src1,$src2\t! sub packed8F" %}
7966   ins_encode %{
7967     int vector_len = 1;
7968     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7969   %}
7970   ins_pipe( pipe_slow );
7971 %}
7972 
7973 instruct vsub8F_mem(vecY dst, vecY src, memory mem) %{
7974   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7975   match(Set dst (SubVF src (LoadVector mem)));
7976   format %{ "vsubps  $dst,$src,$mem\t! sub packed8F" %}
7977   ins_encode %{
7978     int vector_len = 1;
7979     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7980   %}
7981   ins_pipe( pipe_slow );
7982 %}
7983 
7984 instruct vsub16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
7985   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7986   match(Set dst (SubVF src1 src2));
7987   format %{ "vsubps  $dst,$src1,$src2\t! sub packed16F" %}
7988   ins_encode %{
7989     int vector_len = 2;
7990     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7991   %}
7992   ins_pipe( pipe_slow );
7993 %}
7994 
7995 instruct vsub16F_mem(vecZ dst, vecZ src, memory mem) %{
7996   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7997   match(Set dst (SubVF src (LoadVector mem)));
7998   format %{ "vsubps  $dst,$src,$mem\t! sub packed16F" %}
7999   ins_encode %{
8000     int vector_len = 2;
8001     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8002   %}
8003   ins_pipe( pipe_slow );
8004 %}
8005 
8006 // Doubles vector sub
8007 instruct vsub2D(vecX dst, vecX src) %{
8008   predicate(n->as_Vector()->length() == 2);
8009   match(Set dst (SubVD dst src));
8010   format %{ "subpd   $dst,$src\t! sub packed2D" %}
8011   ins_encode %{
8012     __ subpd($dst$$XMMRegister, $src$$XMMRegister);
8013   %}
8014   ins_pipe( pipe_slow );
8015 %}
8016 
8017 instruct vsub2D_reg(vecX dst, vecX src1, vecX src2) %{
8018   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8019   match(Set dst (SubVD src1 src2));
8020   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed2D" %}
8021   ins_encode %{
8022     int vector_len = 0;
8023     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8024   %}
8025   ins_pipe( pipe_slow );
8026 %}
8027 
8028 instruct vsub2D_mem(vecX dst, vecX src, memory mem) %{
8029   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8030   match(Set dst (SubVD src (LoadVector mem)));
8031   format %{ "vsubpd  $dst,$src,$mem\t! sub packed2D" %}
8032   ins_encode %{
8033     int vector_len = 0;
8034     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8035   %}
8036   ins_pipe( pipe_slow );
8037 %}
8038 
8039 instruct vsub4D_reg(vecY dst, vecY src1, vecY src2) %{
8040   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8041   match(Set dst (SubVD src1 src2));
8042   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed4D" %}
8043   ins_encode %{
8044     int vector_len = 1;
8045     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8046   %}
8047   ins_pipe( pipe_slow );
8048 %}
8049 
8050 instruct vsub4D_mem(vecY dst, vecY src, memory mem) %{
8051   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8052   match(Set dst (SubVD src (LoadVector mem)));
8053   format %{ "vsubpd  $dst,$src,$mem\t! sub packed4D" %}
8054   ins_encode %{
8055     int vector_len = 1;
8056     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8057   %}
8058   ins_pipe( pipe_slow );
8059 %}
8060 
8061 instruct vsub8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
8062   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8063   match(Set dst (SubVD src1 src2));
8064   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed8D" %}
8065   ins_encode %{
8066     int vector_len = 2;
8067     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8068   %}
8069   ins_pipe( pipe_slow );
8070 %}
8071 
8072 instruct vsub8D_mem(vecZ dst, vecZ src, memory mem) %{
8073   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8074   match(Set dst (SubVD src (LoadVector mem)));
8075   format %{ "vsubpd  $dst,$src,$mem\t! sub packed8D" %}
8076   ins_encode %{
8077     int vector_len = 2;
8078     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8079   %}
8080   ins_pipe( pipe_slow );
8081 %}
8082 
8083 // --------------------------------- MUL --------------------------------------
8084 
8085 // Byte vector mul
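// There is no byte-granularity packed multiply on x86, so the rules below widen
// the byte lanes to words (pmovsxbw), multiply with pmullw, mask each product
// down to its low byte (0x00ff per word, i.e. Java's truncating byte multiply)
// and re-pack the words into bytes with packuswb.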
8086 
8087 instruct mul4B_reg(vecS dst, vecS src1, vecS src2, vecS tmp2, vecS tmp) %{
8088   predicate(UseSSE > 3 && n->as_Vector()->length() == 4);
8089   match(Set dst (MulVB src1 src2));
8090   effect(TEMP dst, TEMP tmp2, TEMP tmp);
8091   format %{"pmovsxbw  $tmp,$src1\n\t"
8092            "pmovsxbw  $tmp2,$src2\n\t"
8093            "pmullw    $tmp,$tmp2\n\t"
8094            "movdqu    $tmp2,[0x00ff00ff0x00ff00ff]\n\t"
8095            "pand      $tmp,$tmp2\n\t"
8096            "packuswb  $tmp,$tmp\n\t"
8097            "movss     $dst,$tmp\t! mul packed4B" %}
8098   ins_encode %{
8099     __ pmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister);
8100     __ pmovsxbw($tmp2$$XMMRegister, $src2$$XMMRegister);
8101     __ pmullw($tmp$$XMMRegister, $tmp2$$XMMRegister);
8102     __ movdqu($tmp2$$XMMRegister, ExternalAddress(vector_byte_saturationmask()));
8103     __ pand($tmp$$XMMRegister, $tmp2$$XMMRegister);
8104     __ packuswb($tmp$$XMMRegister, $tmp$$XMMRegister);
8105     __ movss($dst$$XMMRegister, $tmp$$XMMRegister);
8106   %}
8107   ins_pipe( pipe_slow );
8108 %}
8109 
8110 instruct mul8B_reg(vecD dst, vecD src1, vecD src2, vecD tmp2, vecD tmp) %{
8111   predicate(UseSSE > 3 && n->as_Vector()->length() == 8);
8112   match(Set dst (MulVB src1 src2));
8113   effect(TEMP dst, TEMP tmp2, TEMP tmp);
8114   format %{"pmovsxbw  $tmp,$src1\n\t"
8115            "pmovsxbw  $tmp2,$src2\n\t"
8116            "pmullw    $tmp,$tmp2\n\t"
8117            "movdqu    $tmp2,[0x00ff00ff0x00ff00ff]\n\t"
8118            "pand      $tmp,$tmp2\n\t"
8119            "packuswb  $tmp,$tmp\n\t"
8120            "movsd     $dst,$tmp\t! mul packed8B" %}
8121   ins_encode %{
8122     __ pmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister);
8123     __ pmovsxbw($tmp2$$XMMRegister, $src2$$XMMRegister);
8124     __ pmullw($tmp$$XMMRegister, $tmp2$$XMMRegister);
8125     __ movdqu($tmp2$$XMMRegister, ExternalAddress(vector_byte_saturationmask()));
8126     __ pand($tmp$$XMMRegister, $tmp2$$XMMRegister);
8127     __ packuswb($tmp$$XMMRegister, $tmp$$XMMRegister);
8128     __ movsd($dst$$XMMRegister, $tmp$$XMMRegister);
8129   %}
8130   ins_pipe( pipe_slow );
8131 %}
8132 
8133 instruct mul16B_reg(vecX dst, vecX src1, vecX src2, vecX tmp3, vecX tmp2, vecX tmp) %{
8134   predicate(UseSSE > 3 && n->as_Vector()->length() == 16);
8135   match(Set dst (MulVB src1 src2));
8136   effect(TEMP tmp3, TEMP tmp2, TEMP tmp);
8137   format %{"pmovsxbw  $tmp,$src1\n\t"
8138            "pmovsxbw  $tmp2,$src2\n\t"
8139            "pmullw    $tmp,$tmp2\n\t"
8140            "pshufd    $tmp2,$src1\n\t"
8141            "pshufd    $tmp3,$src2\n\t"
8142            "pmovsxbw  $tmp2,$tmp2\n\t"
8143            "pmovsxbw  $tmp3,$tmp3\n\t"
8144            "pmullw    $tmp2,$tmp3\n\t"
8145            "movdqu    $tmp3,[0x00ff00ff0x00ff00ff]\n\t"
8146            "pand      $tmp,$tmp3\n\t"
8147            "pand      $tmp2,$tmp3\n\t"
8148            "packuswb  $tmp,$tmp2\n\t"
8149            "movdqu    $dst,$tmp \n\t! mul packed16B" %}
8150   ins_encode %{
8151     __ pmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister);
8152     __ pmovsxbw($tmp2$$XMMRegister, $src2$$XMMRegister);
8153     __ pmullw($tmp$$XMMRegister, $tmp2$$XMMRegister);
8154     __ pshufd($tmp2$$XMMRegister, $src1$$XMMRegister, 238);
8155     __ pshufd($tmp3$$XMMRegister, $src2$$XMMRegister, 238);
8156     __ pmovsxbw($tmp2$$XMMRegister, $tmp2$$XMMRegister);
8157     __ pmovsxbw($tmp3$$XMMRegister, $tmp3$$XMMRegister);
8158     __ pmullw($tmp2$$XMMRegister, $tmp3$$XMMRegister);
8159     __ movdqu($tmp3$$XMMRegister, ExternalAddress(vector_byte_saturationmask()));
8160     __ pand($tmp$$XMMRegister, $tmp3$$XMMRegister);
8161     __ pand($tmp2$$XMMRegister, $tmp3$$XMMRegister);
8162     __ packuswb($tmp$$XMMRegister, $tmp2$$XMMRegister);
8163     __ movdqu($dst$$XMMRegister, $tmp$$XMMRegister);
8164   %}
8165   ins_pipe( pipe_slow );
8166 %}
8167 
8168 instruct vmul16B_reg_avx(vecX dst, vecX src1, vecX src2, vecY tmp2, vecY tmp) %{
8169   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
8170   match(Set dst (MulVB src1 src2));
8171   effect(TEMP dst, TEMP tmp2, TEMP tmp);
8172   format %{"vpmovsxbw  $tmp,$src1\n\t"
8173            "vpmovsxbw  $tmp2,$src2\n\t"
8174            "vpmullw    $tmp,$tmp2\n\t"
8175            "vmovdqu    $tmp2,[0x00ff00ff0x00ff00ff]\n\t"
8176            "vpand      $tmp,$tmp2\n\t"
8177            "vextracti128_high  $tmp2,$tmp\n\t"
8178            "vpackuswb  $dst,$tmp, $tmp2\n\t! mul packed16B" %}
8179   ins_encode %{
8180     int vector_len = 1;
8181     __ vpmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister, vector_len);
8182     __ vpmovsxbw($tmp2$$XMMRegister, $src2$$XMMRegister, vector_len);
8183     __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
8184     __ vmovdqu($tmp2$$XMMRegister, ExternalAddress(vector_byte_saturationmask()));
8185     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
8186     __ vextracti128_high($tmp2$$XMMRegister, $tmp$$XMMRegister);
8187     __ vpackuswb($dst$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
8188   %}
8189   ins_pipe( pipe_slow );
8190 %}
8191 
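// 32-byte variant: each 128-bit half is widened and multiplied separately.
// vpackuswb packs within 128-bit lanes, so the trailing vpermq (0xD8) restores
// the original element order across lanes.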
8192 instruct vmul32B_reg_avx(vecY dst, vecY src1, vecY src2, vecY tmp1, vecY tmp2, vecY tmp3) %{
8193   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
8194   match(Set dst (MulVB src1 src2));
8195   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3);
8196   format %{"vextracti128_high  $tmp1,$src1\n\t"
8197            "vextracti128_high  $tmp3,$src2\n\t"
8198            "vpmovsxbw $tmp1,$tmp1\n\t"
8199            "vpmovsxbw $tmp3,$tmp3\n\t"
8200            "vpmullw $tmp1,$tmp1,$tmp3\n\t"
8201            "vpmovsxbw $tmp2,$src1\n\t"
8202            "vpmovsxbw $tmp3,$src2\n\t"
8203            "vpmullw $tmp2,$tmp2,$tmp3\n\t"
8204            "vmovdqu $tmp3, [0x00ff00ff0x00ff00ff]\n\t"
8205            "vpbroadcastd $tmp3, $tmp3\n\t"
8206            "vpand $tmp2,$tmp2,$tmp3\n\t"
8207            "vpand $tmp1,$tmp1,$tmp3\n\t"
8208            "vpackuswb $dst,$tmp2,$tmp1\n\t"
8209            "vpermq $dst, $dst, 0xD8\t! mul packed32B" %}
8210   ins_encode %{
8211     int vector_len = 1;
8212     __ vextracti128_high($tmp1$$XMMRegister, $src1$$XMMRegister);
8213     __ vextracti128_high($tmp3$$XMMRegister, $src2$$XMMRegister);
8214     __ vpmovsxbw($tmp1$$XMMRegister, $tmp1$$XMMRegister, vector_len);
8215     __ vpmovsxbw($tmp3$$XMMRegister, $tmp3$$XMMRegister, vector_len);
8216     __ vpmullw($tmp1$$XMMRegister, $tmp1$$XMMRegister, $tmp3$$XMMRegister, vector_len);
8217     __ vpmovsxbw($tmp2$$XMMRegister, $src1$$XMMRegister, vector_len);
8218     __ vpmovsxbw($tmp3$$XMMRegister, $src2$$XMMRegister, vector_len);
8219     __ vpmullw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, vector_len);
8220     __ vmovdqu($tmp3$$XMMRegister, ExternalAddress(vector_byte_saturationmask()));
8221     __ vpbroadcastd($tmp3$$XMMRegister, $tmp3$$XMMRegister);
8222     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $tmp3$$XMMRegister, vector_len);
8223     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, vector_len);
8224     __ vpackuswb($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp1$$XMMRegister, vector_len);
8225     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vector_len);
8226   %}
8227   ins_pipe( pipe_slow );
8228 %}
8229 
8230 // Shorts/Chars vector mul
8231 instruct vmul2S(vecS dst, vecS src) %{
8232   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8233   match(Set dst (MulVS dst src));
8234   format %{ "pmullw $dst,$src\t! mul packed2S" %}
8235   ins_encode %{
8236     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
8237   %}
8238   ins_pipe( pipe_slow );
8239 %}
8240 
8241 instruct vmul2S_reg_avx(vecS dst, vecS src1, vecS src2) %{
8242   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
8243   match(Set dst (MulVS src1 src2));
8244   format %{ "vpmullw $dst,$src1,$src2\t! mul packed2S" %}
8245   ins_encode %{
8246     int vector_len = 0;
8247     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8248   %}
8249   ins_pipe( pipe_slow );
8250 %}
8251 
8252 instruct vmul2S_reg_evex(vecS dst, vecS src1, vecS src2) %{
8253   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
8254   match(Set dst (MulVS src1 src2));
8255   format %{ "vpmullw $dst,$src1,$src2\t! mul packed2S" %}
8256   ins_encode %{
8257     int vector_len = 0;
8258     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8259   %}
8260   ins_pipe( pipe_slow );
8261 %}
8262 
8263 instruct vmul2S_evex_special(vecS dst, vecS src1, vecS src2) %{
8264   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
8265   match(Set dst (MulVS dst src2));
8266   effect(TEMP src1);
8267   format %{ "vpmullw $dst,$src1,$src2\t! mul packed2S" %}
8268   ins_encode %{
8269     int vector_len = 0;
8270     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8271   %}
8272   ins_pipe( pipe_slow );
8273 %}
8274 
8275 instruct vmul2S_mem_avx(vecS dst, vecS src, memory mem) %{
8276   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
8277   match(Set dst (MulVS src (LoadVector mem)));
8278   format %{ "vpmullw $dst,$src,$mem\t! mul packed2S" %}
8279   ins_encode %{
8280     int vector_len = 0;
8281     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8282   %}
8283   ins_pipe( pipe_slow );
8284 %}
8285 
8286 instruct vmul2S_mem_evex(vecS dst, vecS src, memory mem) %{
8287   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
8288   match(Set dst (MulVS src (LoadVector mem)));
8289   format %{ "vpmullw $dst,$src,$mem\t! mul packed2S" %}
8290   ins_encode %{
8291     int vector_len = 0;
8292     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8293   %}
8294   ins_pipe( pipe_slow );
8295 %}
8296 
8297 instruct vmul2S_mem_evex_special(vecS dst, vecS src, memory mem) %{
8298   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
8299   match(Set dst (MulVS dst (LoadVector mem)));
8300   effect(TEMP src);
8301   format %{ "vpmullw $dst,$src,$mem\t! mul packed2S" %}
8302   ins_encode %{
8303     int vector_len = 0;
8304     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8305   %}
8306   ins_pipe( pipe_slow );
8307 %}
8308 
8309 instruct vmul4S(vecD dst, vecD src) %{
8310   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
8311   match(Set dst (MulVS dst src));
8312   format %{ "pmullw  $dst,$src\t! mul packed4S" %}
8313   ins_encode %{
8314     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
8315   %}
8316   ins_pipe( pipe_slow );
8317 %}
8318 
8319 instruct vmul4S_reg_avx(vecD dst, vecD src1, vecD src2) %{
8320   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
8321   match(Set dst (MulVS src1 src2));
8322   format %{ "vpmullw $dst,$src1,$src2\t! mul packed4S" %}
8323   ins_encode %{
8324     int vector_len = 0;
8325     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8326   %}
8327   ins_pipe( pipe_slow );
8328 %}
8329 
8330 instruct vmul4S_reg_evex(vecD dst, vecD src1, vecD src2) %{
8331   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
8332   match(Set dst (MulVS src1 src2));
8333   format %{ "vpmullw $dst,$src1,$src2\t! mul packed4S" %}
8334   ins_encode %{
8335     int vector_len = 0;
8336     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8337   %}
8338   ins_pipe( pipe_slow );
8339 %}
8340 
8341 instruct vmul4S_reg_evex_special(vecD dst, vecD src1, vecD src2) %{
8342   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
8343   match(Set dst (MulVS dst src2));
8344   effect(TEMP src1);
8345   format %{ "vpmullw $dst,$src1,$src2\t! mul packed4S" %}
8346   ins_encode %{
8347     int vector_len = 0;
8348     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8349   %}
8350   ins_pipe( pipe_slow );
8351 %}
8352 
8353 instruct vmul4S_mem_avx(vecD dst, vecD src, memory mem) %{
8354   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
8355   match(Set dst (MulVS src (LoadVector mem)));
8356   format %{ "vpmullw $dst,$src,$mem\t! mul packed4S" %}
8357   ins_encode %{
8358     int vector_len = 0;
8359     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8360   %}
8361   ins_pipe( pipe_slow );
8362 %}
8363 
8364 instruct vmul4S_mem_evex(vecD dst, vecD src, memory mem) %{
8365   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
8366   match(Set dst (MulVS src (LoadVector mem)));
8367   format %{ "vpmullw $dst,$src,$mem\t! mul packed4S" %}
8368   ins_encode %{
8369     int vector_len = 0;
8370     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8371   %}
8372   ins_pipe( pipe_slow );
8373 %}
8374 
8375 instruct vmul4S_mem_evex_special(vecD dst, vecD src, memory mem) %{
8376   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
8377   match(Set dst (MulVS dst (LoadVector mem)));
8378   effect(TEMP src);
8379   format %{ "vpmullw $dst,$src,$mem\t! mul packed4S" %}
8380   ins_encode %{
8381     int vector_len = 0;
8382     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8383   %}
8384   ins_pipe( pipe_slow );
8385 %}
8386 
8387 instruct vmul8S(vecX dst, vecX src) %{
8388   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
8389   match(Set dst (MulVS dst src));
8390   format %{ "pmullw  $dst,$src\t! mul packed8S" %}
8391   ins_encode %{
8392     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
8393   %}
8394   ins_pipe( pipe_slow );
8395 %}
8396 
8397 instruct vmul8S_reg_avx(vecX dst, vecX src1, vecX src2) %{
8398   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
8399   match(Set dst (MulVS src1 src2));
8400   format %{ "vpmullw $dst,$src1,$src2\t! mul packed8S" %}
8401   ins_encode %{
8402     int vector_len = 0;
8403     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8404   %}
8405   ins_pipe( pipe_slow );
8406 %}
8407 
8408 instruct vmul8S_reg_evex(vecX dst, vecX src1, vecX src2) %{
8409   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
8410   match(Set dst (MulVS src1 src2));
8411   format %{ "vpmullw $dst,$src1,$src2\t! mul packed8S" %}
8412   ins_encode %{
8413     int vector_len = 0;
8414     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8415   %}
8416   ins_pipe( pipe_slow );
8417 %}
8418 
8419 instruct vmul8S_reg_evex_special(vecX dst, vecX src1, vecX src2) %{
8420   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
8421   match(Set dst (MulVS dst src2));
8422   effect(TEMP src1);
8423   format %{ "vpmullw $dst,$src1,$src2\t! mul packed8S" %}
8424   ins_encode %{
8425     int vector_len = 0;
8426     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8427   %}
8428   ins_pipe( pipe_slow );
8429 %}
8430 
8431 instruct vmul8S_mem_avx(vecX dst, vecX src, memory mem) %{
8432   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
8433   match(Set dst (MulVS src (LoadVector mem)));
8434   format %{ "vpmullw $dst,$src,$mem\t! mul packed8S" %}
8435   ins_encode %{
8436     int vector_len = 0;
8437     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8438   %}
8439   ins_pipe( pipe_slow );
8440 %}
8441 
8442 instruct vmul8S_mem_evex(vecX dst, vecX src, memory mem) %{
8443   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
8444   match(Set dst (MulVS src (LoadVector mem)));
8445   format %{ "vpmullw $dst,$src,$mem\t! mul packed8S" %}
8446   ins_encode %{
8447     int vector_len = 0;
8448     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8449   %}
8450   ins_pipe( pipe_slow );
8451 %}
8452 
8453 instruct vmul8S_mem_evex_special(vecX dst, vecX src, memory mem) %{
8454   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
8455   match(Set dst (MulVS dst (LoadVector mem)));
8456   effect(TEMP src);
8457   format %{ "vpmullw $dst,$src,$mem\t! mul packed8S" %}
8458   ins_encode %{
8459     int vector_len = 0;
8460     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8461   %}
8462   ins_pipe( pipe_slow );
8463 %}
8464 
8465 instruct vmul16S_reg_avx(vecY dst, vecY src1, vecY src2) %{
8466   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
8467   match(Set dst (MulVS src1 src2));
8468   format %{ "vpmullw $dst,$src1,$src2\t! mul packed16S" %}
8469   ins_encode %{
8470     int vector_len = 1;
8471     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8472   %}
8473   ins_pipe( pipe_slow );
8474 %}
8475 
8476 instruct vmul16S_reg_evex(vecY dst, vecY src1, vecY src2) %{
8477   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
8478   match(Set dst (MulVS src1 src2));
8479   format %{ "vpmullw $dst,$src1,$src2\t! mul packed16S" %}
8480   ins_encode %{
8481     int vector_len = 1;
8482     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8483   %}
8484   ins_pipe( pipe_slow );
8485 %}
8486 
8487 instruct vmul16S_reg_evex_special(vecY dst, vecY src1, vecY src2) %{
8488   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
8489   match(Set dst (MulVS dst src2));
8490   effect(TEMP src1);
8491   format %{ "vpmullw $dst,$src1,$src2\t! mul packed16S" %}
8492   ins_encode %{
8493     int vector_len = 1;
8494     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8495   %}
8496   ins_pipe( pipe_slow );
8497 %}
8498 
8499 instruct vmul16S_mem_avx(vecY dst, vecY src, memory mem) %{
8500   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
8501   match(Set dst (MulVS src (LoadVector mem)));
8502   format %{ "vpmullw $dst,$src,$mem\t! mul packed16S" %}
8503   ins_encode %{
8504     int vector_len = 1;
8505     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8506   %}
8507   ins_pipe( pipe_slow );
8508 %}
8509 
8510 instruct vmul16S_mem_evex(vecY dst, vecY src, memory mem) %{
8511   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
8512   match(Set dst (MulVS src (LoadVector mem)));
8513   format %{ "vpmullw $dst,$src,$mem\t! mul packed16S" %}
8514   ins_encode %{
8515     int vector_len = 1;
8516     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8517   %}
8518   ins_pipe( pipe_slow );
8519 %}
8520 
8521 instruct vmul16S_mem_evex_special(vecY dst, vecY src, memory mem) %{
8522   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
8523   match(Set dst (MulVS dst (LoadVector mem)));
8524   effect(TEMP src);
8525   format %{ "vpmullw $dst,$src,$mem\t! mul packed16S" %}
8526   ins_encode %{
8527     int vector_len = 1;
8528     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8529   %}
8530   ins_pipe( pipe_slow );
8531 %}
8532 
8533 instruct vmul32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
8534   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
8535   match(Set dst (MulVS src1 src2));
8536   format %{ "vpmullw $dst,$src1,$src2\t! mul packed32S" %}
8537   ins_encode %{
8538     int vector_len = 2;
8539     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8540   %}
8541   ins_pipe( pipe_slow );
8542 %}
8543 
8544 instruct vmul32S_mem(vecZ dst, vecZ src, memory mem) %{
8545   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
8546   match(Set dst (MulVS src (LoadVector mem)));
8547   format %{ "vpmullw $dst,$src,$mem\t! mul packed32S" %}
8548   ins_encode %{
8549     int vector_len = 2;
8550     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8551   %}
8552   ins_pipe( pipe_slow );
8553 %}
8554 
8555 // Integers vector mul (sse4_1)
8556 instruct vmul2I(vecD dst, vecD src) %{
8557   predicate(UseSSE > 3 && n->as_Vector()->length() == 2);
8558   match(Set dst (MulVI dst src));
8559   format %{ "pmulld  $dst,$src\t! mul packed2I" %}
8560   ins_encode %{
8561     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
8562   %}
8563   ins_pipe( pipe_slow );
8564 %}
8565 
8566 instruct vmul2I_reg(vecD dst, vecD src1, vecD src2) %{
8567   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8568   match(Set dst (MulVI src1 src2));
8569   format %{ "vpmulld $dst,$src1,$src2\t! mul packed2I" %}
8570   ins_encode %{
8571     int vector_len = 0;
8572     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8573   %}
8574   ins_pipe( pipe_slow );
8575 %}
8576 
8577 instruct vmul2I_mem(vecD dst, vecD src, memory mem) %{
8578   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8579   match(Set dst (MulVI src (LoadVector mem)));
8580   format %{ "vpmulld $dst,$src,$mem\t! mul packed2I" %}
8581   ins_encode %{
8582     int vector_len = 0;
8583     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8584   %}
8585   ins_pipe( pipe_slow );
8586 %}
8587 
8588 instruct vmul4I(vecX dst, vecX src) %{
8589   predicate(UseSSE > 3 && n->as_Vector()->length() == 4);
8590   match(Set dst (MulVI dst src));
8591   format %{ "pmulld  $dst,$src\t! mul packed4I" %}
8592   ins_encode %{
8593     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
8594   %}
8595   ins_pipe( pipe_slow );
8596 %}
8597 
8598 instruct vmul4I_reg(vecX dst, vecX src1, vecX src2) %{
8599   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8600   match(Set dst (MulVI src1 src2));
8601   format %{ "vpmulld $dst,$src1,$src2\t! mul packed4I" %}
8602   ins_encode %{
8603     int vector_len = 0;
8604     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8605   %}
8606   ins_pipe( pipe_slow );
8607 %}
8608 
8609 instruct vmul4I_mem(vecX dst, vecX src, memory mem) %{
8610   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8611   match(Set dst (MulVI src (LoadVector mem)));
8612   format %{ "vpmulld $dst,$src,$mem\t! mul packed4I" %}
8613   ins_encode %{
8614     int vector_len = 0;
8615     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8616   %}
8617   ins_pipe( pipe_slow );
8618 %}
8619 
8620 // Long vector mul
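// Without AVX-512DQ there is no packed 64x64->64 multiply, so it is composed
// from 32-bit multiplies: the cross products a_hi*b_lo and a_lo*b_hi are formed
// with pmulld on swapped halves, summed with phaddd, shifted left by 32, and
// added to the unsigned low product produced by pmuludq.  With AVX-512DQ the
// single vpmullq instruction is used instead.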
8621 
8622 instruct mul2L_reg(vecX dst, vecX src2, vecX tmp) %{
8623   predicate(UseSSE > 3 && n->as_Vector()->length() == 2 && VM_Version::supports_sse4_1());
8624   match(Set dst (MulVL dst src2));
8625   effect(TEMP dst, TEMP tmp);
8626   format %{ "pshufd $tmp,$src2, 177\n\t"
8627             "pmulld $tmp,$dst\n\t"
8628             "phaddd $tmp,$tmp\n\t"
8629             "pmovzxdq $tmp,$tmp\n\t"
8630             "psllq $tmp, 32\n\t"
8631             "pmuludq $dst,$src2\n\t"
8632             "paddq $dst,$tmp\n\t! mul packed2L" %} 
8633 
8634   ins_encode %{
8635     int vector_len = 0;
8636     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 177);
8637     __ pmulld($tmp$$XMMRegister, $dst$$XMMRegister);
8638     __ phaddd($tmp$$XMMRegister, $tmp$$XMMRegister);
8639     __ pmovzxdq($tmp$$XMMRegister, $tmp$$XMMRegister);
8640     __ psllq($tmp$$XMMRegister, 32);
8641     __ pmuludq($dst$$XMMRegister, $src2$$XMMRegister);
8642     __ paddq($dst$$XMMRegister, $tmp$$XMMRegister);
8643   %}
8644   ins_pipe( pipe_slow );
8645 %}
8646 
8647 instruct vmul2L_reg_avx(vecX dst, vecX src1, vecX src2, vecX tmp1, vecX tmp) %{
8648   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && VM_Version::supports_avx());
8649   match(Set dst (MulVL src1 src2));
8650   effect(TEMP tmp1, TEMP tmp);
8651   format %{ "vpshufd $tmp,$src2\n\t"
8652             "vpmulld $tmp,$src1,$tmp\n\t"
8653             "vphaddd $tmp,$tmp,$tmp\n\t"
8654             "vpmovzxdq $tmp,$tmp\n\t"
8655             "vpsllq $tmp,$tmp\n\t"
8656             "vpmuludq $tmp1,$src1,$src2\n\t"
8657             "vpaddq $dst,$tmp,$tmp1\t! mul packed2L" %}
8658   ins_encode %{
8659     int vector_len = 0;
8660     __ vpshufd($tmp$$XMMRegister, $src2$$XMMRegister, 177, vector_len);
8661     __ vpmulld($tmp$$XMMRegister, $src1$$XMMRegister, $tmp$$XMMRegister, vector_len);
8662     __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
8663     __ vpmovzxdq($tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
8664     __ vpsllq($tmp$$XMMRegister, $tmp$$XMMRegister, 32, vector_len);
8665     __ vpmuludq($tmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8666     __ vpaddq($dst$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, vector_len);
8667   %}
8668   ins_pipe( pipe_slow );
8669 %}
8670 
8671 instruct vmul2L_reg(vecX dst, vecX src1, vecX src2) %{
8672   predicate(UseAVX > 2 && n->as_Vector()->length() == 2 && VM_Version::supports_avx512dq());
8673   match(Set dst (MulVL src1 src2));
8674   format %{ "vpmullq $dst,$src1,$src2\t! mul packed2L" %}
8675   ins_encode %{
8676     int vector_len = 0;
8677     __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8678   %}
8679   ins_pipe( pipe_slow );
8680 %}
8681 
8682 instruct vmul2L_mem(vecX dst, vecX src, memory mem) %{
8683   predicate(UseAVX > 2 && n->as_Vector()->length() == 2 && VM_Version::supports_avx512dq());
8684   match(Set dst (MulVL src (LoadVector mem)));
8685   format %{ "vpmullq $dst,$src,$mem\t! mul packed2L" %}
8686   ins_encode %{
8687     int vector_len = 0;
8688     __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8689   %}
8690   ins_pipe( pipe_slow );
8691 %}
8692 
8693 instruct vmul4L_reg_avx(vecY dst, vecY src1, vecY src2, vecY tmp, vecY tmp1) %{
8694   predicate(UseAVX > 1 && n->as_Vector()->length() == 4 && VM_Version::supports_avx2());
8695   match(Set dst (MulVL src1 src2));
8696   effect(TEMP tmp1, TEMP tmp);
8697   format %{ "vpshufd $tmp,$src2\n\t"
8698             "vpmulld $tmp,$src1,$tmp\n\t"
8699             "vphaddd $tmp,$tmp,$tmp\n\t"
8700             "vpmovzxdq $tmp,$tmp\n\t"
8701             "vpsllq $tmp,$tmp\n\t"
8702             "vpmuludq $tmp1,$src1,$src2\n\t"
8703             "vpaddq $dst,$tmp,$tmp1\t! mul packed4L" %}
8704   ins_encode %{
8705     int vector_len = 1;
8706     __ vpshufd($tmp$$XMMRegister, $src2$$XMMRegister, 177, vector_len);
8707     __ vpmulld($tmp$$XMMRegister, $src1$$XMMRegister, $tmp$$XMMRegister, vector_len);
8708     __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
8709     __ vpmovzxdq($tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
8710     __ vpsllq($tmp$$XMMRegister, $tmp$$XMMRegister, 32, vector_len);
8711     __ vpmuludq($tmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8712     __ vpaddq($dst$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, vector_len);
8713   %}
8714   ins_pipe( pipe_slow );
8715 %}
8716 
8717 instruct vmul4L_reg(vecY dst, vecY src1, vecY src2) %{
8718   predicate(UseAVX > 2 && n->as_Vector()->length() == 4 && VM_Version::supports_avx512dq());
8719   match(Set dst (MulVL src1 src2));
8720   format %{ "vpmullq $dst,$src1,$src2\t! mul packed4L" %}
8721   ins_encode %{
8722     int vector_len = 1;
8723     __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8724   %}
8725   ins_pipe( pipe_slow );
8726 %}
8727 
8728 instruct vmul4L_mem(vecY dst, vecY src, memory mem) %{
8729   predicate(UseAVX > 2 && n->as_Vector()->length() == 4 && VM_Version::supports_avx512dq());
8730   match(Set dst (MulVL src (LoadVector mem)));
8731   format %{ "vpmullq $dst,$src,$mem\t! mul packed4L" %}
8732   ins_encode %{
8733     int vector_len = 1;
8734     __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8735   %}
8736   ins_pipe( pipe_slow );
8737 %}
8738 
8739 instruct vmul8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
8740   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && VM_Version::supports_avx512dq());
8741   match(Set dst (MulVL src1 src2));
8742   format %{ "vpmullq $dst,$src1,$src2\t! mul packed8L" %}
8743   ins_encode %{
8744     int vector_len = 2;
8745     __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8746   %}
8747   ins_pipe( pipe_slow );
8748 %}
8749 
8750 instruct vmul8L_mem(vecZ dst, vecZ src, memory mem) %{
8751   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && VM_Version::supports_avx512dq());
8752   match(Set dst (MulVL src (LoadVector mem)));
8753   format %{ "vpmullq $dst,$src,$mem\t! mul packed8L" %}
8754   ins_encode %{
8755     int vector_len = 2;
8756     __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8757   %}
8758   ins_pipe( pipe_slow );
8759 %}
8760 
8761 instruct vmul8I_reg(vecY dst, vecY src1, vecY src2) %{
8762   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
8763   match(Set dst (MulVI src1 src2));
8764   format %{ "vpmulld $dst,$src1,$src2\t! mul packed8I" %}
8765   ins_encode %{
8766     int vector_len = 1;
8767     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8768   %}
8769   ins_pipe( pipe_slow );
8770 %}
8771 
8772 instruct vmul8I_mem(vecY dst, vecY src, memory mem) %{
8773   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
8774   match(Set dst (MulVI src (LoadVector mem)));
8775   format %{ "vpmulld $dst,$src,$mem\t! mul packed8I" %}
8776   ins_encode %{
8777     int vector_len = 1;
8778     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8779   %}
8780   ins_pipe( pipe_slow );
8781 %}
8782 
8783 instruct vmul16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
8784   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8785   match(Set dst (MulVI src1 src2));
8786   format %{ "vpmulld $dst,$src1,$src2\t! mul packed16I" %}
8787   ins_encode %{
8788     int vector_len = 2;
8789     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8790   %}
8791   ins_pipe( pipe_slow );
8792 %}
8793 
8794 instruct vmul16I_mem(vecZ dst, vecZ src, memory mem) %{
8795   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8796   match(Set dst (MulVI src (LoadVector mem)));
8797   format %{ "vpmulld $dst,$src,$mem\t! mul packed16I" %}
8798   ins_encode %{
8799     int vector_len = 2;
8800     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8801   %}
8802   ins_pipe( pipe_slow );
8803 %}
8804 
8805 // Floats vector mul
8806 instruct vmul2F(vecD dst, vecD src) %{
8807   predicate(n->as_Vector()->length() == 2);
8808   match(Set dst (MulVF dst src));
8809   format %{ "mulps   $dst,$src\t! mul packed2F" %}
8810   ins_encode %{
8811     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
8812   %}
8813   ins_pipe( pipe_slow );
8814 %}
8815 
8816 instruct vmul2F_reg(vecD dst, vecD src1, vecD src2) %{
8817   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8818   match(Set dst (MulVF src1 src2));
8819   format %{ "vmulps  $dst,$src1,$src2\t! mul packed2F" %}
8820   ins_encode %{
8821     int vector_len = 0;
8822     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8823   %}
8824   ins_pipe( pipe_slow );
8825 %}
8826 
8827 instruct vmul2F_mem(vecD dst, vecD src, memory mem) %{
8828   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8829   match(Set dst (MulVF src (LoadVector mem)));
8830   format %{ "vmulps  $dst,$src,$mem\t! mul packed2F" %}
8831   ins_encode %{
8832     int vector_len = 0;
8833     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8834   %}
8835   ins_pipe( pipe_slow );
8836 %}
8837 
8838 instruct vmul4F(vecX dst, vecX src) %{
8839   predicate(n->as_Vector()->length() == 4);
8840   match(Set dst (MulVF dst src));
8841   format %{ "mulps   $dst,$src\t! mul packed4F" %}
8842   ins_encode %{
8843     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
8844   %}
8845   ins_pipe( pipe_slow );
8846 %}
8847 
8848 instruct vmul4F_reg(vecX dst, vecX src1, vecX src2) %{
8849   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8850   match(Set dst (MulVF src1 src2));
8851   format %{ "vmulps  $dst,$src1,$src2\t! mul packed4F" %}
8852   ins_encode %{
8853     int vector_len = 0;
8854     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8855   %}
8856   ins_pipe( pipe_slow );
8857 %}
8858 
8859 instruct vmul4F_mem(vecX dst, vecX src, memory mem) %{
8860   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8861   match(Set dst (MulVF src (LoadVector mem)));
8862   format %{ "vmulps  $dst,$src,$mem\t! mul packed4F" %}
8863   ins_encode %{
8864     int vector_len = 0;
8865     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8866   %}
8867   ins_pipe( pipe_slow );
8868 %}
8869 
8870 instruct vmul8F_reg(vecY dst, vecY src1, vecY src2) %{
8871   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8872   match(Set dst (MulVF src1 src2));
8873   format %{ "vmulps  $dst,$src1,$src2\t! mul packed8F" %}
8874   ins_encode %{
8875     int vector_len = 1;
8876     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8877   %}
8878   ins_pipe( pipe_slow );
8879 %}
8880 
8881 instruct vmul8F_mem(vecY dst, vecY src, memory mem) %{
8882   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8883   match(Set dst (MulVF src (LoadVector mem)));
8884   format %{ "vmulps  $dst,$src,$mem\t! mul packed8F" %}
8885   ins_encode %{
8886     int vector_len = 1;
8887     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8888   %}
8889   ins_pipe( pipe_slow );
8890 %}
8891 
8892 instruct vmul16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
8893   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8894   match(Set dst (MulVF src1 src2));
8895   format %{ "vmulps  $dst,$src1,$src2\t! mul packed16F" %}
8896   ins_encode %{
8897     int vector_len = 2;
8898     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8899   %}
8900   ins_pipe( pipe_slow );
8901 %}
8902 
8903 instruct vmul16F_mem(vecZ dst, vecZ src, memory mem) %{
8904   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8905   match(Set dst (MulVF src (LoadVector mem)));
8906   format %{ "vmulps  $dst,$src,$mem\t! mul packed16F" %}
8907   ins_encode %{
8908     int vector_len = 2;
8909     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8910   %}
8911   ins_pipe( pipe_slow );
8912 %}
8913 
8914 // Doubles vector mul
8915 instruct vmul2D(vecX dst, vecX src) %{
8916   predicate(n->as_Vector()->length() == 2);
8917   match(Set dst (MulVD dst src));
8918   format %{ "mulpd   $dst,$src\t! mul packed2D" %}
8919   ins_encode %{
8920     __ mulpd($dst$$XMMRegister, $src$$XMMRegister);
8921   %}
8922   ins_pipe( pipe_slow );
8923 %}
8924 
8925 instruct vmul2D_reg(vecX dst, vecX src1, vecX src2) %{
8926   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8927   match(Set dst (MulVD src1 src2));
8928   format %{ "vmulpd  $dst,$src1,$src2\t! mul packed2D" %}
8929   ins_encode %{
8930     int vector_len = 0;
8931     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8932   %}
8933   ins_pipe( pipe_slow );
8934 %}
8935 
8936 instruct vmul2D_mem(vecX dst, vecX src, memory mem) %{
8937   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8938   match(Set dst (MulVD src (LoadVector mem)));
8939   format %{ "vmulpd  $dst,$src,$mem\t! mul packed2D" %}
8940   ins_encode %{
8941     int vector_len = 0;
8942     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8943   %}
8944   ins_pipe( pipe_slow );
8945 %}
8946 
8947 instruct vmul4D_reg(vecY dst, vecY src1, vecY src2) %{
8948   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8949   match(Set dst (MulVD src1 src2));
8950   format %{ "vmulpd  $dst,$src1,$src2\t! mul packed4D" %}
8951   ins_encode %{
8952     int vector_len = 1;
8953     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8954   %}
8955   ins_pipe( pipe_slow );
8956 %}
8957 
8958 instruct vmul4D_mem(vecY dst, vecY src, memory mem) %{
8959   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8960   match(Set dst (MulVD src (LoadVector mem)));
8961   format %{ "vmulpd  $dst,$src,$mem\t! mul packed4D" %}
8962   ins_encode %{
8963     int vector_len = 1;
8964     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8965   %}
8966   ins_pipe( pipe_slow );
8967 %}
8968 
8969 instruct vmul8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
8970   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8971   match(Set dst (MulVD src1 src2));
8972   format %{ "vmulpd  $dst k0,$src1,$src2\t! mul packed8D" %}
8973   ins_encode %{
8974     int vector_len = 2;
8975     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8976   %}
8977   ins_pipe( pipe_slow );
8978 %}
8979 
8980 instruct vmul8D_mem(vecZ dst, vecZ src, memory mem) %{
8981   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8982   match(Set dst (MulVD src (LoadVector mem)));
8983   format %{ "vmulpd  $dst k0,$src,$mem\t! mul packed8D" %}
8984   ins_encode %{
8985     int vector_len = 2;
8986     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8987   %}
8988   ins_pipe( pipe_slow );
8989 %}
8990 
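// Vector conditional move (AVX1/AVX2 only): the packed compare writes an
// all-ones or all-zeros mask per element into $dst, which the variable blend
// then uses to select between $src1 and $src2.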
8991 instruct vcmov8F_reg(vecY dst, vecY src1, vecY src2, immI8 cop, cmpOp_vcmppd copnd) %{
8992   predicate(UseAVX > 0 && UseAVX < 3 && n->as_Vector()->length() == 8);
8993   match(Set dst (CMoveVF (Binary copnd cop) (Binary src1 src2)));
8994   effect(TEMP dst, USE src1, USE src2);
8995   format %{ "cmpps.$copnd  $dst, $src1, $src2  ! vcmovevf, cond=$cop\n\t"
8996             "blendvps $dst,$src1,$src2,$dst ! vcmovevf\n\t"
8997          %}
8998   ins_encode %{
8999     int vector_len = 1;
9000     int cond = (Assembler::Condition)($copnd$$cmpcode);
9001     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vector_len);
9002     __ vblendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vector_len);
9003   %}
9004   ins_pipe( pipe_slow );
9005 %}
9006 
9007 instruct vcmov4D_reg(vecY dst, vecY src1, vecY src2, immI8 cop, cmpOp_vcmppd copnd) %{
9008   predicate(UseAVX > 0 && UseAVX < 3 && n->as_Vector()->length() == 4);
9009   match(Set dst (CMoveVD (Binary copnd cop) (Binary src1 src2)));
9010   effect(TEMP dst, USE src1, USE src2);
9011   format %{ "cmppd.$copnd  $dst, $src1, $src2  ! vcmovevd, cond=$cop\n\t"
9012             "vblendvpd $dst,$src1,$src2,$dst ! vcmovevd\n\t"
9013          %}
9014   ins_encode %{
9015     int vector_len = 1;
9016     int cond = (Assembler::Condition)($copnd$$cmpcode);
9017     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vector_len);
9018     __ vblendvpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vector_len);
9019   %}
9020   ins_pipe( pipe_slow );
9021 %}
9022 
9023 // --------------------------------- DIV --------------------------------------
9024 
9025 // Floats vector div
9026 instruct vdiv2F(vecD dst, vecD src) %{
9027   predicate(n->as_Vector()->length() == 2);
9028   match(Set dst (DivVF dst src));
9029   format %{ "divps   $dst,$src\t! div packed2F" %}
9030   ins_encode %{
9031     __ divps($dst$$XMMRegister, $src$$XMMRegister);
9032   %}
9033   ins_pipe( pipe_slow );
9034 %}
9035 
9036 instruct vdiv2F_reg(vecD dst, vecD src1, vecD src2) %{
9037   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9038   match(Set dst (DivVF src1 src2));
9039   format %{ "vdivps  $dst,$src1,$src2\t! div packed2F" %}
9040   ins_encode %{
9041     int vector_len = 0;
9042     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9043   %}
9044   ins_pipe( pipe_slow );
9045 %}
9046 
9047 instruct vdiv2F_mem(vecD dst, vecD src, memory mem) %{
9048   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9049   match(Set dst (DivVF src (LoadVector mem)));
9050   format %{ "vdivps  $dst,$src,$mem\t! div packed2F" %}
9051   ins_encode %{
9052     int vector_len = 0;
9053     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9054   %}
9055   ins_pipe( pipe_slow );
9056 %}
9057 
9058 instruct vdiv4F(vecX dst, vecX src) %{
9059   predicate(n->as_Vector()->length() == 4);
9060   match(Set dst (DivVF dst src));
9061   format %{ "divps   $dst,$src\t! div packed4F" %}
9062   ins_encode %{
9063     __ divps($dst$$XMMRegister, $src$$XMMRegister);
9064   %}
9065   ins_pipe( pipe_slow );
9066 %}
9067 
9068 instruct vdiv4F_reg(vecX dst, vecX src1, vecX src2) %{
9069   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
9070   match(Set dst (DivVF src1 src2));
9071   format %{ "vdivps  $dst,$src1,$src2\t! div packed4F" %}
9072   ins_encode %{
9073     int vector_len = 0;
9074     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9075   %}
9076   ins_pipe( pipe_slow );
9077 %}
9078 
9079 instruct vdiv4F_mem(vecX dst, vecX src, memory mem) %{
9080   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
9081   match(Set dst (DivVF src (LoadVector mem)));
9082   format %{ "vdivps  $dst,$src,$mem\t! div packed4F" %}
9083   ins_encode %{
9084     int vector_len = 0;
9085     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9086   %}
9087   ins_pipe( pipe_slow );
9088 %}
9089 
9090 instruct vdiv8F_reg(vecY dst, vecY src1, vecY src2) %{
9091   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
9092   match(Set dst (DivVF src1 src2));
9093   format %{ "vdivps  $dst,$src1,$src2\t! div packed8F" %}
9094   ins_encode %{
9095     int vector_len = 1;
9096     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9097   %}
9098   ins_pipe( pipe_slow );
9099 %}
9100 
9101 instruct vdiv8F_mem(vecY dst, vecY src, memory mem) %{
9102   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
9103   match(Set dst (DivVF src (LoadVector mem)));
9104   format %{ "vdivps  $dst,$src,$mem\t! div packed8F" %}
9105   ins_encode %{
9106     int vector_len = 1;
9107     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9108   %}
9109   ins_pipe( pipe_slow );
9110 %}
9111 
9112 instruct vdiv16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
9113   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
9114   match(Set dst (DivVF src1 src2));
9115   format %{ "vdivps  $dst,$src1,$src2\t! div packed16F" %}
9116   ins_encode %{
9117     int vector_len = 2;
9118     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9119   %}
9120   ins_pipe( pipe_slow );
9121 %}
9122 
9123 instruct vdiv16F_mem(vecZ dst, vecZ src, memory mem) %{
9124   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
9125   match(Set dst (DivVF src (LoadVector mem)));
9126   format %{ "vdivps  $dst,$src,$mem\t! div packed16F" %}
9127   ins_encode %{
9128     int vector_len = 2;
9129     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9130   %}
9131   ins_pipe( pipe_slow );
9132 %}
9133 
9134 // Doubles vector div
9135 instruct vdiv2D(vecX dst, vecX src) %{
9136   predicate(n->as_Vector()->length() == 2);
9137   match(Set dst (DivVD dst src));
9138   format %{ "divpd   $dst,$src\t! div packed2D" %}
9139   ins_encode %{
9140     __ divpd($dst$$XMMRegister, $src$$XMMRegister);
9141   %}
9142   ins_pipe( pipe_slow );
9143 %}
9144 
9145 instruct vdiv2D_reg(vecX dst, vecX src1, vecX src2) %{
9146   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9147   match(Set dst (DivVD src1 src2));
9148   format %{ "vdivpd  $dst,$src1,$src2\t! div packed2D" %}
9149   ins_encode %{
9150     int vector_len = 0;
9151     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9152   %}
9153   ins_pipe( pipe_slow );
9154 %}
9155 
9156 instruct vdiv2D_mem(vecX dst, vecX src, memory mem) %{
9157   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9158   match(Set dst (DivVD src (LoadVector mem)));
9159   format %{ "vdivpd  $dst,$src,$mem\t! div packed2D" %}
9160   ins_encode %{
9161     int vector_len = 0;
9162     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9163   %}
9164   ins_pipe( pipe_slow );
9165 %}
9166 
9167 instruct vdiv4D_reg(vecY dst, vecY src1, vecY src2) %{
9168   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
9169   match(Set dst (DivVD src1 src2));
9170   format %{ "vdivpd  $dst,$src1,$src2\t! div packed4D" %}
9171   ins_encode %{
9172     int vector_len = 1;
9173     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9174   %}
9175   ins_pipe( pipe_slow );
9176 %}
9177 
9178 instruct vdiv4D_mem(vecY dst, vecY src, memory mem) %{
9179   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
9180   match(Set dst (DivVD src (LoadVector mem)));
9181   format %{ "vdivpd  $dst,$src,$mem\t! div packed4D" %}
9182   ins_encode %{
9183     int vector_len = 1;
9184     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9185   %}
9186   ins_pipe( pipe_slow );
9187 %}
9188 
9189 instruct vdiv8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
9190   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
9191   match(Set dst (DivVD src1 src2));
9192   format %{ "vdivpd  $dst,$src1,$src2\t! div packed8D" %}
9193   ins_encode %{
9194     int vector_len = 2;
9195     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9196   %}
9197   ins_pipe( pipe_slow );
9198 %}
9199 
9200 instruct vdiv8D_mem(vecZ dst, vecZ src, memory mem) %{
9201   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
9202   match(Set dst (DivVD src (LoadVector mem)));
9203   format %{ "vdivpd  $dst,$src,$mem\t! div packed8D" %}
9204   ins_encode %{
9205     int vector_len = 2;
9206     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9207   %}
9208   ins_pipe( pipe_slow );
9209 %}
9210 
9211 // ------------------------------ Shift ---------------------------------------
9212 
9213 // Left and right shift count vectors are the same on x86
9214 // (only lowest bits of xmm reg are used for count).
9215 instruct vshiftcnt(vecS dst, rRegI cnt) %{
9216   match(Set dst (LShiftCntV cnt));
9217   match(Set dst (RShiftCntV cnt));
9218   format %{ "movd    $dst,$cnt\t! load shift count" %}
9219   ins_encode %{
9220     __ movdl($dst$$XMMRegister, $cnt$$Register);
9221   %}
9222   ins_pipe( pipe_slow );
9223 %}
9224 
9225 // --------------------------------- Sqrt --------------------------------------
9226 
9227 // Floating point vector sqrt
9228 instruct vsqrt2D_reg(vecX dst, vecX src) %{
9229   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9230   match(Set dst (SqrtVD src));
9231   format %{ "vsqrtpd  $dst,$src\t! sqrt packed2D" %}
9232   ins_encode %{
9233     int vector_len = 0;
9234     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
9235   %}
9236   ins_pipe( pipe_slow );
9237 %}
9238 
9239 instruct vsqrt2D_mem(vecX dst, memory mem) %{
9240   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9241   match(Set dst (SqrtVD (LoadVector mem)));
9242   format %{ "vsqrtpd  $dst,$mem\t! sqrt packed2D" %}
9243   ins_encode %{
9244     int vector_len = 0;
9245     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
9246   %}
9247   ins_pipe( pipe_slow );
9248 %}
9249 
9250 instruct vsqrt4D_reg(vecY dst, vecY src) %{
9251   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
9252   match(Set dst (SqrtVD src));
9253   format %{ "vsqrtpd  $dst,$src\t! sqrt packed4D" %}
9254   ins_encode %{
9255     int vector_len = 1;
9256     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
9257   %}
9258   ins_pipe( pipe_slow );
9259 %}
9260 
9261 instruct vsqrt4D_mem(vecY dst, memory mem) %{
9262   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
9263   match(Set dst (SqrtVD (LoadVector mem)));
9264   format %{ "vsqrtpd  $dst,$mem\t! sqrt packed4D" %}
9265   ins_encode %{
9266     int vector_len = 1;
9267     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
9268   %}
9269   ins_pipe( pipe_slow );
9270 %}
9271 
9272 instruct vsqrt8D_reg(vecZ dst, vecZ src) %{
9273   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
9274   match(Set dst (SqrtVD src));
9275   format %{ "vsqrtpd  $dst,$src\t! sqrt packed8D" %}
9276   ins_encode %{
9277     int vector_len = 2;
9278     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
9279   %}
9280   ins_pipe( pipe_slow );
9281 %}
9282 
9283 instruct vsqrt8D_mem(vecZ dst, memory mem) %{
9284   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
9285   match(Set dst (SqrtVD (LoadVector mem)));
9286   format %{ "vsqrtpd  $dst,$mem\t! sqrt packed8D" %}
9287   ins_encode %{
9288     int vector_len = 2;
9289     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
9290   %}
9291   ins_pipe( pipe_slow );
9292 %}
9293 
9294 instruct vsqrt2F_reg(vecD dst, vecD src) %{
9295   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9296   match(Set dst (SqrtVF src));
9297   format %{ "vsqrtps  $dst,$src\t! sqrt packed2F" %}
9298   ins_encode %{
9299     int vector_len = 0;
9300     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
9301   %}
9302   ins_pipe( pipe_slow );
9303 %}
9304 
9305 instruct vsqrt2F_mem(vecD dst, memory mem) %{
9306   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9307   match(Set dst (SqrtVF (LoadVector mem)));
9308   format %{ "vsqrtps  $dst,$mem\t! sqrt packed2F" %}
9309   ins_encode %{
9310     int vector_len = 0;
9311     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vector_len);
9312   %}
9313   ins_pipe( pipe_slow );
9314 %}
9315 
9316 instruct vsqrt4F_reg(vecX dst, vecX src) %{
9317   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
9318   match(Set dst (SqrtVF src));
9319   format %{ "vsqrtps  $dst,$src\t! sqrt packed4F" %}
9320   ins_encode %{
9321     int vector_len = 0;
9322     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
9323   %}
9324   ins_pipe( pipe_slow );
9325 %}
9326 
9327 instruct vsqrt4F_mem(vecX dst, memory mem) %{
9328   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
9329   match(Set dst (SqrtVF (LoadVector mem)));
9330   format %{ "vsqrtps  $dst,$mem\t! sqrt packed4F" %}
9331   ins_encode %{
9332     int vector_len = 0;
9333     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vector_len);
9334   %}
9335   ins_pipe( pipe_slow );
9336 %}
9337 
9338 instruct vsqrt8F_reg(vecY dst, vecY src) %{
9339   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
9340   match(Set dst (SqrtVF src));
9341   format %{ "vsqrtps  $dst,$src\t! sqrt packed8F" %}
9342   ins_encode %{
9343     int vector_len = 1;
9344     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
9345   %}
9346   ins_pipe( pipe_slow );
9347 %}
9348 
9349 instruct vsqrt8F_mem(vecY dst, memory mem) %{
9350   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
9351   match(Set dst (SqrtVF (LoadVector mem)));
9352   format %{ "vsqrtps  $dst,$mem\t! sqrt packed8F" %}
9353   ins_encode %{
9354     int vector_len = 1;
9355     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vector_len);
9356   %}
9357   ins_pipe( pipe_slow );
9358 %}
9359 
9360 instruct vsqrt16F_reg(vecZ dst, vecZ src) %{
9361   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
9362   match(Set dst (SqrtVF src));
9363   format %{ "vsqrtps  $dst,$src\t! sqrt packed16F" %}
9364   ins_encode %{
9365     int vector_len = 2;
9366     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
9367   %}
9368   ins_pipe( pipe_slow );
9369 %}
9370 
9371 instruct vsqrt16F_mem(vecZ dst, memory mem) %{
9372   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
9373   match(Set dst (SqrtVF (LoadVector mem)));
9374   format %{ "vsqrtps  $dst,$mem\t! sqrt packed16F" %}
9375   ins_encode %{
9376     int vector_len = 2;
9377     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vector_len);
9378   %}
9379   ins_pipe( pipe_slow );
9380 %}
9381 
9382 // ------------------------------ LeftShift -----------------------------------
9383 
9384 // Shorts/Chars vector left shift
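//
// Each vector size below has several variants, selected by predicate:
//   UseAVX == 0                       - two-operand SSE form (dst is also an input),
//   VM_Version::supports_avxonly()    - three-operand AVX form (no EVEX),
//   VM_Version::supports_avx512bw()   - EVEX form when AVX512BW is available,
//   VM_Version::supports_avx512nobw() - EVEX-capable CPU without BW: keeps the
//                                       two-operand pattern and takes src as a TEMP.
// Illustrative Java source (an assumption, for orientation only):
//   for (int i = 0; i < a.length; i++) a[i] = (short)(a[i] << 3);
// vectorizes into LShiftVS with an immI8 count, matching the *_imm rules.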
9385 instruct vsll2S(vecS dst, vecS shift) %{
9386   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
9387   match(Set dst (LShiftVS dst shift));
9388   format %{ "psllw   $dst,$shift\t! left shift packed2S" %}
9389   ins_encode %{
9390     __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
9391   %}
9392   ins_pipe( pipe_slow );
9393 %}
9394 
9395 instruct vsll2S_imm(vecS dst, immI8 shift) %{
9396   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
9397   match(Set dst (LShiftVS dst shift));
9398   format %{ "psllw   $dst,$shift\t! left shift packed2S" %}
9399   ins_encode %{
9400     __ psllw($dst$$XMMRegister, (int)$shift$$constant);
9401   %}
9402   ins_pipe( pipe_slow );
9403 %}
9404 
9405 instruct vsll2S_reg_avx(vecS dst, vecS src, vecS shift) %{
9406   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
9407   match(Set dst (LShiftVS src shift));
9408   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
9409   ins_encode %{
9410     int vector_len = 0;
9411     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9412   %}
9413   ins_pipe( pipe_slow );
9414 %}
9415 
9416 instruct vsll2S_reg_evex(vecS dst, vecS src, vecS shift) %{
9417   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
9418   match(Set dst (LShiftVS src shift));
9419   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
9420   ins_encode %{
9421     int vector_len = 0;
9422     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9423   %}
9424   ins_pipe( pipe_slow );
9425 %}
9426 
9427 instruct vsll2S_reg_evex_special(vecS dst, vecS src, vecS shift) %{
9428   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
9429   match(Set dst (LShiftVS dst shift));
9430   effect(TEMP src);
9431   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
9432   ins_encode %{
9433     int vector_len = 0;
9434     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9435   %}
9436   ins_pipe( pipe_slow );
9437 %}
9438 
9439 instruct vsll2S_reg_imm_avx(vecS dst, vecS src, immI8 shift) %{
9440   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
9441   match(Set dst (LShiftVS src shift));
9442   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
9443   ins_encode %{
9444     int vector_len = 0;
9445     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9446   %}
9447   ins_pipe( pipe_slow );
9448 %}
9449 
9450 instruct vsll2S_reg_imm_evex(vecS dst, vecS src, immI8 shift) %{
9451   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
9452   match(Set dst (LShiftVS src shift));
9453   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
9454   ins_encode %{
9455     int vector_len = 0;
9456     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9457   %}
9458   ins_pipe( pipe_slow );
9459 %}
9460 
9461 instruct vsll2S_reg_imm_evex_special(vecS dst, vecS src, immI8 shift) %{
9462   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
9463   match(Set dst (LShiftVS dst shift));
9464   effect(TEMP src);
9465   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
9466   ins_encode %{
9467     int vector_len = 0;
9468     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9469   %}
9470   ins_pipe( pipe_slow );
9471 %}
9472 
9473 instruct vsll4S(vecD dst, vecS shift) %{
9474   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
9475   match(Set dst (LShiftVS dst shift));
9476   format %{ "psllw   $dst,$shift\t! left shift packed4S" %}
9477   ins_encode %{
9478     __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
9479   %}
9480   ins_pipe( pipe_slow );
9481 %}
9482 
9483 instruct vsll4S_imm(vecD dst, immI8 shift) %{
9484   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
9485   match(Set dst (LShiftVS dst shift));
9486   format %{ "psllw   $dst,$shift\t! left shift packed4S" %}
9487   ins_encode %{
9488     __ psllw($dst$$XMMRegister, (int)$shift$$constant);
9489   %}
9490   ins_pipe( pipe_slow );
9491 %}
9492 
9493 instruct vsll4S_reg_avx(vecD dst, vecD src, vecS shift) %{
9494   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
9495   match(Set dst (LShiftVS src shift));
9496   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
9497   ins_encode %{
9498     int vector_len = 0;
9499     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9500   %}
9501   ins_pipe( pipe_slow );
9502 %}
9503 
9504 instruct vsll4S_reg_evex(vecD dst, vecD src, vecS shift) %{
9505   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
9506   match(Set dst (LShiftVS src shift));
9507   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
9508   ins_encode %{
9509     int vector_len = 0;
9510     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9511   %}
9512   ins_pipe( pipe_slow );
9513 %}
9514 
9515 instruct vsll4S_reg_evex_special(vecD dst, vecD src, vecS shift) %{
9516   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
9517   match(Set dst (LShiftVS dst shift));
9518   effect(TEMP src);
9519   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
9520   ins_encode %{
9521     int vector_len = 0;
9522     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9523   %}
9524   ins_pipe( pipe_slow );
9525 %}
9526 
9527 instruct vsll4S_reg_imm_avx(vecD dst, vecD src, immI8 shift) %{
9528   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
9529   match(Set dst (LShiftVS src shift));
9530   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
9531   ins_encode %{
9532     int vector_len = 0;
9533     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9534   %}
9535   ins_pipe( pipe_slow );
9536 %}
9537 
9538 instruct vsll4S_reg_imm_evex(vecD dst, vecD src, immI8 shift) %{
9539   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
9540   match(Set dst (LShiftVS src shift));
9541   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
9542   ins_encode %{
9543     int vector_len = 0;
9544     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9545   %}
9546   ins_pipe( pipe_slow );
9547 %}
9548 
9549 instruct vsll4S_reg_imm_evex_special(vecD dst, vecD src, immI8 shift) %{
9550   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
9551   match(Set dst (LShiftVS dst shift));
9552   effect(TEMP src);
9553   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
9554   ins_encode %{
9555     int vector_len = 0;
9556     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9557   %}
9558   ins_pipe( pipe_slow );
9559 %}
9560 
9561 instruct vsll8S(vecX dst, vecS shift) %{
9562   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
9563   match(Set dst (LShiftVS dst shift));
9564   format %{ "psllw   $dst,$shift\t! left shift packed8S" %}
9565   ins_encode %{
9566     __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
9567   %}
9568   ins_pipe( pipe_slow );
9569 %}
9570 
9571 instruct vsll8S_imm(vecX dst, immI8 shift) %{
9572   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
9573   match(Set dst (LShiftVS dst shift));
9574   format %{ "psllw   $dst,$shift\t! left shift packed8S" %}
9575   ins_encode %{
9576     __ psllw($dst$$XMMRegister, (int)$shift$$constant);
9577   %}
9578   ins_pipe( pipe_slow );
9579 %}
9580 
9581 instruct vsll8S_reg_avx(vecX dst, vecX src, vecS shift) %{
9582   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
9583   match(Set dst (LShiftVS src shift));
9584   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
9585   ins_encode %{
9586     int vector_len = 0;
9587     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9588   %}
9589   ins_pipe( pipe_slow );
9590 %}
9591 
9592 instruct vsll8S_reg_evex(vecX dst, vecX src, vecS shift) %{
9593   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
9594   match(Set dst (LShiftVS src shift));
9595   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
9596   ins_encode %{
9597     int vector_len = 0;
9598     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9599   %}
9600   ins_pipe( pipe_slow );
9601 %}
9602 
9603 instruct vsll8S_reg_evex_special(vecX dst, vecX src, vecS shift) %{
9604   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
9605   match(Set dst (LShiftVS dst shift));
9606   effect(TEMP src);
9607   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
9608   ins_encode %{
9609     int vector_len = 0;
9610     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9611   %}
9612   ins_pipe( pipe_slow );
9613 %}
9614 
9615 instruct vsll8S_reg_imm_avx(vecX dst, vecX src, immI8 shift) %{
9616   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
9617   match(Set dst (LShiftVS src shift));
9618   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
9619   ins_encode %{
9620     int vector_len = 0;
9621     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9622   %}
9623   ins_pipe( pipe_slow );
9624 %}
9625 
9626 instruct vsll8S_reg_imm_evex(vecX dst, vecX src, immI8 shift) %{
9627   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
9628   match(Set dst (LShiftVS src shift));
9629   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
9630   ins_encode %{
9631     int vector_len = 0;
9632     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9633   %}
9634   ins_pipe( pipe_slow );
9635 %}
9636 
9637 instruct vsll8S_reg_imm_evex_special(vecX dst, vecX src, immI8 shift) %{
9638   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
9639   match(Set dst (LShiftVS dst shift));
9640   effect(TEMP src);
9641   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
9642   ins_encode %{
9643     int vector_len = 0;
9644     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9645   %}
9646   ins_pipe( pipe_slow );
9647 %}
9648 
9649 instruct vsll16S_reg_avx(vecY dst, vecY src, vecS shift) %{
9650   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
9651   match(Set dst (LShiftVS src shift));
9652   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
9653   ins_encode %{
9654     int vector_len = 1;
9655     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9656   %}
9657   ins_pipe( pipe_slow );
9658 %}
9659 
9660 instruct vsll16S_reg_evex(vecY dst, vecY src, vecS shift) %{
9661   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
9662   match(Set dst (LShiftVS src shift));
9663   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
9664   ins_encode %{
9665     int vector_len = 1;
9666     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9667   %}
9668   ins_pipe( pipe_slow );
9669 %}
9670 
9671 instruct vsll16S_reg_evex_special(vecY dst, vecY src, vecS shift) %{
9672   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
9673   match(Set dst (LShiftVS dst shift));
9674   effect(TEMP src);
9675   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
9676   ins_encode %{
9677     int vector_len = 1;
9678     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9679   %}
9680   ins_pipe( pipe_slow );
9681 %}
9682 
9683 instruct vsll16S_reg_imm_avx(vecY dst, vecY src, immI8 shift) %{
9684   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
9685   match(Set dst (LShiftVS src shift));
9686   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
9687   ins_encode %{
9688     int vector_len = 1;
9689     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9690   %}
9691   ins_pipe( pipe_slow );
9692 %}
9693 
9694 instruct vsll16S_reg_imm_evex(vecY dst, vecY src, immI8 shift) %{
9695   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
9696   match(Set dst (LShiftVS src shift));
9697   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
9698   ins_encode %{
9699     int vector_len = 1;
9700     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9701   %}
9702   ins_pipe( pipe_slow );
9703 %}
9704 
9705 instruct vsll16S_reg_imm_evex_special(vecY dst, vecY src, immI8 shift) %{
9706   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
9707   match(Set dst (LShiftVS dst shift));
9708   effect(TEMP src);
9709   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
9710   ins_encode %{
9711     int vector_len = 1;
9712     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9713   %}
9714   ins_pipe( pipe_slow );
9715 %}
9716 
9717 instruct vsll32S_reg(vecZ dst, vecZ src, vecS shift) %{
9718   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
9719   match(Set dst (LShiftVS src shift));
9720   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed32S" %}
9721   ins_encode %{
9722     int vector_len = 2;
9723     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9724   %}
9725   ins_pipe( pipe_slow );
9726 %}
9727 
9728 instruct vsll32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
9729   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
9730   match(Set dst (LShiftVS src shift));
9731   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed32S" %}
9732   ins_encode %{
9733     int vector_len = 2;
9734     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9735   %}
9736   ins_pipe( pipe_slow );
9737 %}
9738 
9739 // Integers vector left shift
9740 instruct vsll2I(vecD dst, vecS shift) %{
9741   predicate(n->as_Vector()->length() == 2);
9742   match(Set dst (LShiftVI dst shift));
9743   format %{ "pslld   $dst,$shift\t! left shift packed2I" %}
9744   ins_encode %{
9745     __ pslld($dst$$XMMRegister, $shift$$XMMRegister);
9746   %}
9747   ins_pipe( pipe_slow );
9748 %}
9749 
9750 instruct vsll2I_imm(vecD dst, immI8 shift) %{
9751   predicate(n->as_Vector()->length() == 2);
9752   match(Set dst (LShiftVI dst shift));
9753   format %{ "pslld   $dst,$shift\t! left shift packed2I" %}
9754   ins_encode %{
9755     __ pslld($dst$$XMMRegister, (int)$shift$$constant);
9756   %}
9757   ins_pipe( pipe_slow );
9758 %}
9759 
9760 instruct vsll2I_reg(vecD dst, vecD src, vecS shift) %{
9761   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9762   match(Set dst (LShiftVI src shift));
9763   format %{ "vpslld  $dst,$src,$shift\t! left shift packed2I" %}
9764   ins_encode %{
9765     int vector_len = 0;
9766     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9767   %}
9768   ins_pipe( pipe_slow );
9769 %}
9770 
9771 instruct vsll2I_reg_imm(vecD dst, vecD src, immI8 shift) %{
9772   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9773   match(Set dst (LShiftVI src shift));
9774   format %{ "vpslld  $dst,$src,$shift\t! left shift packed2I" %}
9775   ins_encode %{
9776     int vector_len = 0;
9777     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9778   %}
9779   ins_pipe( pipe_slow );
9780 %}
9781 
9782 instruct vsll4I(vecX dst, vecS shift) %{
9783   predicate(n->as_Vector()->length() == 4);
9784   match(Set dst (LShiftVI dst shift));
9785   format %{ "pslld   $dst,$shift\t! left shift packed4I" %}
9786   ins_encode %{
9787     __ pslld($dst$$XMMRegister, $shift$$XMMRegister);
9788   %}
9789   ins_pipe( pipe_slow );
9790 %}
9791 
9792 instruct vsll4I_imm(vecX dst, immI8 shift) %{
9793   predicate(n->as_Vector()->length() == 4);
9794   match(Set dst (LShiftVI dst shift));
9795   format %{ "pslld   $dst,$shift\t! left shift packed4I" %}
9796   ins_encode %{
9797     __ pslld($dst$$XMMRegister, (int)$shift$$constant);
9798   %}
9799   ins_pipe( pipe_slow );
9800 %}
9801 
9802 instruct vsll4I_reg(vecX dst, vecX src, vecS shift) %{
9803   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
9804   match(Set dst (LShiftVI src shift));
9805   format %{ "vpslld  $dst,$src,$shift\t! left shift packed4I" %}
9806   ins_encode %{
9807     int vector_len = 0;
9808     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9809   %}
9810   ins_pipe( pipe_slow );
9811 %}
9812 
9813 instruct vsll4I_reg_imm(vecX dst, vecX src, immI8 shift) %{
9814   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
9815   match(Set dst (LShiftVI src shift));
9816   format %{ "vpslld  $dst,$src,$shift\t! left shift packed4I" %}
9817   ins_encode %{
9818     int vector_len = 0;
9819     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9820   %}
9821   ins_pipe( pipe_slow );
9822 %}
9823 
9824 instruct vsll8I_reg(vecY dst, vecY src, vecS shift) %{
9825   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
9826   match(Set dst (LShiftVI src shift));
9827   format %{ "vpslld  $dst,$src,$shift\t! left shift packed8I" %}
9828   ins_encode %{
9829     int vector_len = 1;
9830     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9831   %}
9832   ins_pipe( pipe_slow );
9833 %}
9834 
9835 instruct vsll8I_reg_imm(vecY dst, vecY src, immI8 shift) %{
9836   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
9837   match(Set dst (LShiftVI src shift));
9838   format %{ "vpslld  $dst,$src,$shift\t! left shift packed8I" %}
9839   ins_encode %{
9840     int vector_len = 1;
9841     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9842   %}
9843   ins_pipe( pipe_slow );
9844 %}
9845 
9846 instruct vsll16I_reg(vecZ dst, vecZ src, vecS shift) %{
9847   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
9848   match(Set dst (LShiftVI src shift));
9849   format %{ "vpslld  $dst,$src,$shift\t! left shift packed16I" %}
9850   ins_encode %{
9851     int vector_len = 2;
9852     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9853   %}
9854   ins_pipe( pipe_slow );
9855 %}
9856 
9857 instruct vsll16I_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
9858   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
9859   match(Set dst (LShiftVI src shift));
9860   format %{ "vpslld  $dst,$src,$shift\t! left shift packed16I" %}
9861   ins_encode %{
9862     int vector_len = 2;
9863     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9864   %}
9865   ins_pipe( pipe_slow );
9866 %}
9867 
9868 // Longs vector left shift
9869 instruct vsll2L(vecX dst, vecS shift) %{
9870   predicate(n->as_Vector()->length() == 2);
9871   match(Set dst (LShiftVL dst shift));
9872   format %{ "psllq   $dst,$shift\t! left shift packed2L" %}
9873   ins_encode %{
9874     __ psllq($dst$$XMMRegister, $shift$$XMMRegister);
9875   %}
9876   ins_pipe( pipe_slow );
9877 %}
9878 
9879 instruct vsll2L_imm(vecX dst, immI8 shift) %{
9880   predicate(n->as_Vector()->length() == 2);
9881   match(Set dst (LShiftVL dst shift));
9882   format %{ "psllq   $dst,$shift\t! left shift packed2L" %}
9883   ins_encode %{
9884     __ psllq($dst$$XMMRegister, (int)$shift$$constant);
9885   %}
9886   ins_pipe( pipe_slow );
9887 %}
9888 
9889 instruct vsll2L_reg(vecX dst, vecX src, vecS shift) %{
9890   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9891   match(Set dst (LShiftVL src shift));
9892   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed2L" %}
9893   ins_encode %{
9894     int vector_len = 0;
9895     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9896   %}
9897   ins_pipe( pipe_slow );
9898 %}
9899 
9900 instruct vsll2L_reg_imm(vecX dst, vecX src, immI8 shift) %{
9901   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9902   match(Set dst (LShiftVL src shift));
9903   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed2L" %}
9904   ins_encode %{
9905     int vector_len = 0;
9906     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9907   %}
9908   ins_pipe( pipe_slow );
9909 %}
9910 
9911 instruct vsll4L_reg(vecY dst, vecY src, vecS shift) %{
9912   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
9913   match(Set dst (LShiftVL src shift));
9914   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed4L" %}
9915   ins_encode %{
9916     int vector_len = 1;
9917     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9918   %}
9919   ins_pipe( pipe_slow );
9920 %}
9921 
9922 instruct vsll4L_reg_imm(vecY dst, vecY src, immI8 shift) %{
9923   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
9924   match(Set dst (LShiftVL src shift));
9925   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed4L" %}
9926   ins_encode %{
9927     int vector_len = 1;
9928     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9929   %}
9930   ins_pipe( pipe_slow );
9931 %}
9932 
9933 instruct vsll8L_reg(vecZ dst, vecZ src, vecS shift) %{
9934   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
9935   match(Set dst (LShiftVL src shift));
9936   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed8L" %}
9937   ins_encode %{
9938     int vector_len = 2;
9939     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9940   %}
9941   ins_pipe( pipe_slow );
9942 %}
9943 
9944 instruct vsll8L_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
9945   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
9946   match(Set dst (LShiftVL src shift));
9947   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed8L" %}
9948   ins_encode %{
9949     int vector_len = 2;
9950     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9951   %}
9952   ins_pipe( pipe_slow );
9953 %}
9954 
9955 // ----------------------- LogicalRightShift -----------------------------------
9956 
9957 // Shorts vector logical right shift produces an incorrect Java result
9958 // for negative data because Java code converts a short value into an int with
9959 // sign extension before the shift. But char vectors are fine since chars are
9960 // unsigned values.
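// For example (illustrative): with a short lane holding -4 (0xFFFC), Java
// evaluates (short)(s >>> 2) as ((int)-4) >>> 2 = 0x3FFFFFFF, truncated to
// 0xFFFF (-1), whereas a 16-bit psrlw lane yields 0x3FFF (16383). For a char
// holding 0xFFFC both paths yield 0x3FFF, so char data shifts correctly.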
9961 
9962 instruct vsrl2S(vecS dst, vecS shift) %{
9963   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
9964   match(Set dst (URShiftVS dst shift));
9965   format %{ "psrlw   $dst,$shift\t! logical right shift packed2S" %}
9966   ins_encode %{
9967     __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
9968   %}
9969   ins_pipe( pipe_slow );
9970 %}
9971 
9972 instruct vsrl2S_imm(vecS dst, immI8 shift) %{
9973   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
9974   match(Set dst (URShiftVS dst shift));
9975   format %{ "psrlw   $dst,$shift\t! logical right shift packed2S" %}
9976   ins_encode %{
9977     __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
9978   %}
9979   ins_pipe( pipe_slow );
9980 %}
9981 
9982 instruct vsrl2S_reg_avx(vecS dst, vecS src, vecS shift) %{
9983   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
9984   match(Set dst (URShiftVS src shift));
9985   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
9986   ins_encode %{
9987     int vector_len = 0;
9988     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9989   %}
9990   ins_pipe( pipe_slow );
9991 %}
9992 
9993 instruct vsrl2S_reg_evex(vecS dst, vecS src, vecS shift) %{
9994   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
9995   match(Set dst (URShiftVS src shift));
9996   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
9997   ins_encode %{
9998     int vector_len = 0;
9999     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10000   %}
10001   ins_pipe( pipe_slow );
10002 %}
10003 
10004 instruct vsrl2S_reg_evex_special(vecS dst, vecS src, vecS shift) %{
10005   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
10006   match(Set dst (URShiftVS dst shift));
10007   effect(TEMP src);
10008   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
10009   ins_encode %{
10010     int vector_len = 0;
10011     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10012   %}
10013   ins_pipe( pipe_slow );
10014 %}
10015 
10016 instruct vsrl2S_reg_imm_avx(vecS dst, vecS src, immI8 shift) %{
10017   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
10018   match(Set dst (URShiftVS src shift));
10019   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
10020   ins_encode %{
10021     int vector_len = 0;
10022     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10023   %}
10024   ins_pipe( pipe_slow );
10025 %}
10026 
10027 instruct vsrl2S_reg_imm_evex(vecS dst, vecS src, immI8 shift) %{
10028   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
10029   match(Set dst (URShiftVS src shift));
10030   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
10031   ins_encode %{
10032     int vector_len = 0;
10033     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10034   %}
10035   ins_pipe( pipe_slow );
10036 %}
10037 
10038 instruct vsrl2S_reg_imm_evex_special(vecS dst, vecS src, immI8 shift) %{
10039   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
10040   match(Set dst (URShiftVS dst shift));
10041   effect(TEMP src);
10042   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
10043   ins_encode %{
10044     int vector_len = 0;
10045     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10046   %}
10047   ins_pipe( pipe_slow );
10048 %}
10049 
10050 instruct vsrl4S(vecD dst, vecS shift) %{
10051   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
10052   match(Set dst (URShiftVS dst shift));
10053   format %{ "psrlw   $dst,$shift\t! logical right shift packed4S" %}
10054   ins_encode %{
10055     __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
10056   %}
10057   ins_pipe( pipe_slow );
10058 %}
10059 
10060 instruct vsrl4S_imm(vecD dst, immI8 shift) %{
10061   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
10062   match(Set dst (URShiftVS dst shift));
10063   format %{ "psrlw   $dst,$shift\t! logical right shift packed4S" %}
10064   ins_encode %{
10065     __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
10066   %}
10067   ins_pipe( pipe_slow );
10068 %}
10069 
10070 instruct vsrl4S_reg_avx(vecD dst, vecD src, vecS shift) %{
10071   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
10072   match(Set dst (URShiftVS src shift));
10073   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
10074   ins_encode %{
10075     int vector_len = 0;
10076     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10077   %}
10078   ins_pipe( pipe_slow );
10079 %}
10080 
10081 instruct vsrl4S_reg_evex(vecD dst, vecD src, vecS shift) %{
10082   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
10083   match(Set dst (URShiftVS src shift));
10084   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
10085   ins_encode %{
10086     int vector_len = 0;
10087     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10088   %}
10089   ins_pipe( pipe_slow );
10090 %}
10091 
10092 instruct vsrl4S_reg_evex_special(vecD dst, vecD src, vecS shift) %{
10093   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
10094   match(Set dst (URShiftVS dst shift));
10095   effect(TEMP src);
10096   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
10097   ins_encode %{
10098     int vector_len = 0;
10099     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10100   %}
10101   ins_pipe( pipe_slow );
10102 %}
10103 
10104 instruct vsrl4S_reg_imm_avx(vecD dst, vecD src, immI8 shift) %{
10105   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
10106   match(Set dst (URShiftVS src shift));
10107   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
10108   ins_encode %{
10109     int vector_len = 0;
10110     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10111   %}
10112   ins_pipe( pipe_slow );
10113 %}
10114 
10115 instruct vsrl4S_reg_imm_evex(vecD dst, vecD src, immI8 shift) %{
10116   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
10117   match(Set dst (URShiftVS src shift));
10118   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
10119   ins_encode %{
10120     int vector_len = 0;
10121     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10122   %}
10123   ins_pipe( pipe_slow );
10124 %}
10125 
10126 instruct vsrl4S_reg_imm_evex_special(vecD dst, vecD src, immI8 shift) %{
10127   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
10128   match(Set dst (URShiftVS dst shift));
10129   effect(TEMP src);
10130   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
10131   ins_encode %{
10132     int vector_len = 0;
10133     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10134   %}
10135   ins_pipe( pipe_slow );
10136 %}
10137 
10138 instruct vsrl8S(vecX dst, vecS shift) %{
10139   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
10140   match(Set dst (URShiftVS dst shift));
10141   format %{ "psrlw   $dst,$shift\t! logical right shift packed8S" %}
10142   ins_encode %{
10143     __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
10144   %}
10145   ins_pipe( pipe_slow );
10146 %}
10147 
10148 instruct vsrl8S_imm(vecX dst, immI8 shift) %{
10149   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
10150   match(Set dst (URShiftVS dst shift));
10151   format %{ "psrlw   $dst,$shift\t! logical right shift packed8S" %}
10152   ins_encode %{
10153     __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
10154   %}
10155   ins_pipe( pipe_slow );
10156 %}
10157 
10158 instruct vsrl8S_reg_avx(vecX dst, vecX src, vecS shift) %{
10159   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
10160   match(Set dst (URShiftVS src shift));
10161   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
10162   ins_encode %{
10163     int vector_len = 0;
10164     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10165   %}
10166   ins_pipe( pipe_slow );
10167 %}
10168 
10169 instruct vsrl8S_reg_evex(vecX dst, vecX src, vecS shift) %{
10170   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
10171   match(Set dst (URShiftVS src shift));
10172   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
10173   ins_encode %{
10174     int vector_len = 0;
10175     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10176   %}
10177   ins_pipe( pipe_slow );
10178 %}
10179 
10180 instruct vsrl8S_reg_evex_special(vecX dst, vecX src, vecS shift) %{
10181   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
10182   match(Set dst (URShiftVS dst shift));
10183   effect(TEMP src);
10184   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
10185   ins_encode %{
10186     int vector_len = 0;
10187     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10188   %}
10189   ins_pipe( pipe_slow );
10190 %}
10191 
10192 instruct vsrl8S_reg_imm_avx(vecX dst, vecX src, immI8 shift) %{
10193   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
10194   match(Set dst (URShiftVS src shift));
10195   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
10196   ins_encode %{
10197     int vector_len = 0;
10198     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10199   %}
10200   ins_pipe( pipe_slow );
10201 %}
10202 
10203 instruct vsrl8S_reg_imm_evex(vecX dst, vecX src, immI8 shift) %{
10204   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
10205   match(Set dst (URShiftVS src shift));
10206   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
10207   ins_encode %{
10208     int vector_len = 0;
10209     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10210   %}
10211   ins_pipe( pipe_slow );
10212 %}
10213 
10214 instruct vsrl8S_reg_imm_evex_special(vecX dst, vecX src, immI8 shift) %{
10215   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
10216   match(Set dst (URShiftVS dst shift));
10217   effect(TEMP src);
10218   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
10219   ins_encode %{
10220     int vector_len = 0;
10221     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10222   %}
10223   ins_pipe( pipe_slow );
10224 %}
10225 
10226 instruct vsrl16S_reg_avx(vecY dst, vecY src, vecS shift) %{
10227   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
10228   match(Set dst (URShiftVS src shift));
10229   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
10230   ins_encode %{
10231     int vector_len = 1;
10232     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10233   %}
10234   ins_pipe( pipe_slow );
10235 %}
10236 
10237 instruct vsrl16S_reg_evex(vecY dst, vecY src, vecS shift) %{
10238   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
10239   match(Set dst (URShiftVS src shift));
10240   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
10241   ins_encode %{
10242     int vector_len = 1;
10243     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10244   %}
10245   ins_pipe( pipe_slow );
10246 %}
10247 
10248 instruct vsrl16S_reg_evex_special(vecY dst, vecY src, vecS shift) %{
10249   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
10250   match(Set dst (URShiftVS dst shift));
10251   effect(TEMP src);
10252   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
10253   ins_encode %{
10254     int vector_len = 1;
10255     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10256   %}
10257   ins_pipe( pipe_slow );
10258 %}
10259 
10260 instruct vsrl16S_reg_imm_avx(vecY dst, vecY src, immI8 shift) %{
10261   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
10262   match(Set dst (URShiftVS src shift));
10263   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
10264   ins_encode %{
10265     int vector_len = 1;
10266     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10267   %}
10268   ins_pipe( pipe_slow );
10269 %}
10270 
10271 instruct vsrl16S_reg_imm_evex(vecY dst, vecY src, immI8 shift) %{
10272   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
10273   match(Set dst (URShiftVS src shift));
10274   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
10275   ins_encode %{
10276     int vector_len = 1;
10277     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10278   %}
10279   ins_pipe( pipe_slow );
10280 %}
10281 
10282 instruct vsrl16S_reg_imm_evex_special(vecY dst, vecY src, immI8 shift) %{
10283   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
10284   match(Set dst (URShiftVS dst shift));
10285   effect(TEMP src);
10286   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
10287   ins_encode %{
10288     int vector_len = 1;
10289     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10290   %}
10291   ins_pipe( pipe_slow );
10292 %}
10293 
10294 instruct vsrl32S_reg(vecZ dst, vecZ src, vecS shift) %{
10295   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
10296   match(Set dst (URShiftVS src shift));
10297   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed32S" %}
10298   ins_encode %{
10299     int vector_len = 2;
10300     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10301   %}
10302   ins_pipe( pipe_slow );
10303 %}
10304 
10305 instruct vsrl32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
10306   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
10307   match(Set dst (URShiftVS src shift));
10308   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed32S" %}
10309   ins_encode %{
10310     int vector_len = 2;
10311     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10312   %}
10313   ins_pipe( pipe_slow );
10314 %}
10315 
10316 // Integers vector logical right shift
10317 instruct vsrl2I(vecD dst, vecS shift) %{
10318   predicate(n->as_Vector()->length() == 2);
10319   match(Set dst (URShiftVI dst shift));
10320   format %{ "psrld   $dst,$shift\t! logical right shift packed2I" %}
10321   ins_encode %{
10322     __ psrld($dst$$XMMRegister, $shift$$XMMRegister);
10323   %}
10324   ins_pipe( pipe_slow );
10325 %}
10326 
10327 instruct vsrl2I_imm(vecD dst, immI8 shift) %{
10328   predicate(n->as_Vector()->length() == 2);
10329   match(Set dst (URShiftVI dst shift));
10330   format %{ "psrld   $dst,$shift\t! logical right shift packed2I" %}
10331   ins_encode %{
10332     __ psrld($dst$$XMMRegister, (int)$shift$$constant);
10333   %}
10334   ins_pipe( pipe_slow );
10335 %}
10336 
10337 instruct vsrl2I_reg(vecD dst, vecD src, vecS shift) %{
10338   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
10339   match(Set dst (URShiftVI src shift));
10340   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed2I" %}
10341   ins_encode %{
10342     int vector_len = 0;
10343     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10344   %}
10345   ins_pipe( pipe_slow );
10346 %}
10347 
10348 instruct vsrl2I_reg_imm(vecD dst, vecD src, immI8 shift) %{
10349   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
10350   match(Set dst (URShiftVI src shift));
10351   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed2I" %}
10352   ins_encode %{
10353     int vector_len = 0;
10354     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10355   %}
10356   ins_pipe( pipe_slow );
10357 %}
10358 
10359 instruct vsrl4I(vecX dst, vecS shift) %{
10360   predicate(n->as_Vector()->length() == 4);
10361   match(Set dst (URShiftVI dst shift));
10362   format %{ "psrld   $dst,$shift\t! logical right shift packed4I" %}
10363   ins_encode %{
10364     __ psrld($dst$$XMMRegister, $shift$$XMMRegister);
10365   %}
10366   ins_pipe( pipe_slow );
10367 %}
10368 
10369 instruct vsrl4I_imm(vecX dst, immI8 shift) %{
10370   predicate(n->as_Vector()->length() == 4);
10371   match(Set dst (URShiftVI dst shift));
10372   format %{ "psrld   $dst,$shift\t! logical right shift packed4I" %}
10373   ins_encode %{
10374     __ psrld($dst$$XMMRegister, (int)$shift$$constant);
10375   %}
10376   ins_pipe( pipe_slow );
10377 %}
10378 
10379 instruct vsrl4I_reg(vecX dst, vecX src, vecS shift) %{
10380   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
10381   match(Set dst (URShiftVI src shift));
10382   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed4I" %}
10383   ins_encode %{
10384     int vector_len = 0;
10385     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10386   %}
10387   ins_pipe( pipe_slow );
10388 %}
10389 
10390 instruct vsrl4I_reg_imm(vecX dst, vecX src, immI8 shift) %{
10391   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
10392   match(Set dst (URShiftVI src shift));
10393   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed4I" %}
10394   ins_encode %{
10395     int vector_len = 0;
10396     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10397   %}
10398   ins_pipe( pipe_slow );
10399 %}
10400 
10401 instruct vsrl8I_reg(vecY dst, vecY src, vecS shift) %{
10402   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
10403   match(Set dst (URShiftVI src shift));
10404   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed8I" %}
10405   ins_encode %{
10406     int vector_len = 1;
10407     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10408   %}
10409   ins_pipe( pipe_slow );
10410 %}
10411 
10412 instruct vsrl8I_reg_imm(vecY dst, vecY src, immI8 shift) %{
10413   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
10414   match(Set dst (URShiftVI src shift));
10415   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed8I" %}
10416   ins_encode %{
10417     int vector_len = 1;
10418     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10419   %}
10420   ins_pipe( pipe_slow );
10421 %}
10422 
10423 instruct vsrl16I_reg(vecZ dst, vecZ src, vecS shift) %{
10424   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
10425   match(Set dst (URShiftVI src shift));
10426   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed16I" %}
10427   ins_encode %{
10428     int vector_len = 2;
10429     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10430   %}
10431   ins_pipe( pipe_slow );
10432 %}
10433 
10434 instruct vsrl16I_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
10435   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
10436   match(Set dst (URShiftVI src shift));
10437   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed16I" %}
10438   ins_encode %{
10439     int vector_len = 2;
10440     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10441   %}
10442   ins_pipe( pipe_slow );
10443 %}
10444 
10445 // Longs vector logical right shift
10446 instruct vsrl2L(vecX dst, vecS shift) %{
10447   predicate(n->as_Vector()->length() == 2);
10448   match(Set dst (URShiftVL dst shift));
10449   format %{ "psrlq   $dst,$shift\t! logical right shift packed2L" %}
10450   ins_encode %{
10451     __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
10452   %}
10453   ins_pipe( pipe_slow );
10454 %}
10455 
10456 instruct vsrl2L_imm(vecX dst, immI8 shift) %{
10457   predicate(n->as_Vector()->length() == 2);
10458   match(Set dst (URShiftVL dst shift));
10459   format %{ "psrlq   $dst,$shift\t! logical right shift packed2L" %}
10460   ins_encode %{
10461     __ psrlq($dst$$XMMRegister, (int)$shift$$constant);
10462   %}
10463   ins_pipe( pipe_slow );
10464 %}
10465 
10466 instruct vsrl2L_reg(vecX dst, vecX src, vecS shift) %{
10467   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
10468   match(Set dst (URShiftVL src shift));
10469   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed2L" %}
10470   ins_encode %{
10471     int vector_len = 0;
10472     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10473   %}
10474   ins_pipe( pipe_slow );
10475 %}
10476 
10477 instruct vsrl2L_reg_imm(vecX dst, vecX src, immI8 shift) %{
10478   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
10479   match(Set dst (URShiftVL src shift));
10480   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed2L" %}
10481   ins_encode %{
10482     int vector_len = 0;
10483     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10484   %}
10485   ins_pipe( pipe_slow );
10486 %}
10487 
10488 instruct vsrl4L_reg(vecY dst, vecY src, vecS shift) %{
10489   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
10490   match(Set dst (URShiftVL src shift));
10491   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed4L" %}
10492   ins_encode %{
10493     int vector_len = 1;
10494     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10495   %}
10496   ins_pipe( pipe_slow );
10497 %}
10498 
10499 instruct vsrl4L_reg_imm(vecY dst, vecY src, immI8 shift) %{
10500   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
10501   match(Set dst (URShiftVL src shift));
10502   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed4L" %}
10503   ins_encode %{
10504     int vector_len = 1;
10505     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10506   %}
10507   ins_pipe( pipe_slow );
10508 %}
10509 
10510 instruct vsrl8L_reg(vecZ dst, vecZ src, vecS shift) %{
10511   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
10512   match(Set dst (URShiftVL src shift));
10513   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed8L" %}
10514   ins_encode %{
10515     int vector_len = 2;
10516     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10517   %}
10518   ins_pipe( pipe_slow );
10519 %}
10520 
10521 instruct vsrl8L_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
10522   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
10523   match(Set dst (URShiftVL src shift));
10524   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed8L" %}
10525   ins_encode %{
10526     int vector_len = 2;
10527     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10528   %}
10529   ins_pipe( pipe_slow );
10530 %}
10531 
10532 // ------------------- ArithmeticRightShift -----------------------------------
10533 
10534 // Shorts/Chars vector arithmetic right shift
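// Note (informational): unlike the logical right shift above, psraw
// sign-extends, so a 16-bit arithmetic shift of a short lane agrees with
// Java's (short)(s >> n) even for negative short data.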
10535 instruct vsra2S(vecS dst, vecS shift) %{
10536   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
10537   match(Set dst (RShiftVS dst shift));
10538   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed2S" %}
10539   ins_encode %{
10540     __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
10541   %}
10542   ins_pipe( pipe_slow );
10543 %}
10544 
10545 instruct vsra2S_imm(vecS dst, immI8 shift) %{
10546   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
10547   match(Set dst (RShiftVS dst shift));
10548   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed2S" %}
10549   ins_encode %{
10550     __ psraw($dst$$XMMRegister, (int)$shift$$constant);
10551   %}
10552   ins_pipe( pipe_slow );
10553 %}
10554 
10555 instruct vsra2S_reg_avx(vecS dst, vecS src, vecS shift) %{
10556   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
10557   match(Set dst (RShiftVS src shift));
10558   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
10559   ins_encode %{
10560     int vector_len = 0;
10561     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10562   %}
10563   ins_pipe( pipe_slow );
10564 %}
10565 
10566 instruct vsra2S_reg_evex(vecS dst, vecS src, vecS shift) %{
10567   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
10568   match(Set dst (RShiftVS src shift));
10569   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
10570   ins_encode %{
10571     int vector_len = 0;
10572     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10573   %}
10574   ins_pipe( pipe_slow );
10575 %}
10576 
10577 instruct vsra2S_reg_evex_special(vecS dst, vecS src, vecS shift) %{
10578   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
10579   match(Set dst (RShiftVS dst shift));
10580   effect(TEMP src);
10581   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
10582   ins_encode %{
10583     int vector_len = 0;
10584     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10585   %}
10586   ins_pipe( pipe_slow );
10587 %}
10588 
10589 instruct vsra2S_reg_imm_avx(vecS dst, vecS src, immI8 shift) %{
10590   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
10591   match(Set dst (RShiftVS src shift));
10592   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
10593   ins_encode %{
10594     int vector_len = 0;
10595     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10596   %}
10597   ins_pipe( pipe_slow );
10598 %}
10599 
10600 instruct vsra2S_reg_imm_evex(vecS dst, vecS src, immI8 shift) %{
10601   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
10602   match(Set dst (RShiftVS src shift));
10603   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
10604   ins_encode %{
10605     int vector_len = 0;
10606     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10607   %}
10608   ins_pipe( pipe_slow );
10609 %}
10610 
10611 instruct vsra2S_reg_imm_evex_special(vecS dst, vecS src, immI8 shift) %{
10612   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
10613   match(Set dst (RShiftVS dst shift));
10614   effect(TEMP src);
10615   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
10616   ins_encode %{
10617     int vector_len = 0;
10618     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10619   %}
10620   ins_pipe( pipe_slow );
10621 %}
10622 
10623 instruct vsra4S(vecD dst, vecS shift) %{
10624   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
10625   match(Set dst (RShiftVS dst shift));
10626   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed4S" %}
10627   ins_encode %{
10628     __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
10629   %}
10630   ins_pipe( pipe_slow );
10631 %}
10632 
10633 instruct vsra4S_imm(vecD dst, immI8 shift) %{
10634   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
10635   match(Set dst (RShiftVS dst shift));
10636   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed4S" %}
10637   ins_encode %{
10638     __ psraw($dst$$XMMRegister, (int)$shift$$constant);
10639   %}
10640   ins_pipe( pipe_slow );
10641 %}
10642 
10643 instruct vsra4S_reg_avx(vecD dst, vecD src, vecS shift) %{
10644   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
10645   match(Set dst (RShiftVS src shift));
10646   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
10647   ins_encode %{
10648     int vector_len = 0;
10649     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10650   %}
10651   ins_pipe( pipe_slow );
10652 %}
10653 
10654 instruct vsra4S_reg_evex(vecD dst, vecD src, vecS shift) %{
10655   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
10656   match(Set dst (RShiftVS src shift));
10657   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
10658   ins_encode %{
10659     int vector_len = 0;
10660     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10661   %}
10662   ins_pipe( pipe_slow );
10663 %}
10664 
10665 instruct vsra4S_reg_evex_special(vecD dst, vecD src, vecS shift) %{
10666   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
10667   match(Set dst (RShiftVS dst shift));
10668   effect(TEMP src);
10669   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
10670   ins_encode %{
10671     int vector_len = 0;
10672     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10673   %}
10674   ins_pipe( pipe_slow );
10675 %}
10676 
10677 instruct vsra4S_reg_imm_avx(vecD dst, vecD src, immI8 shift) %{
10678   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
10679   match(Set dst (RShiftVS src shift));
10680   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
10681   ins_encode %{
10682     int vector_len = 0;
10683     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10684   %}
10685   ins_pipe( pipe_slow );
10686 %}
10687 
10688 instruct vsra4S_reg_imm_evex(vecD dst, vecD src, immI8 shift) %{
10689   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
10690   match(Set dst (RShiftVS src shift));
10691   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
10692   ins_encode %{
10693     int vector_len = 0;
10694     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10695   %}
10696   ins_pipe( pipe_slow );
10697 %}
10698 
10699 instruct vsra4S_reg_imm_evex_special(vecD dst, vecD src, immI8 shift) %{
10700   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
10701   match(Set dst (RShiftVS dst shift));
10702   effect(TEMP src);
10703   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
10704   ins_encode %{
10705     int vector_len = 0;
10706     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10707   %}
10708   ins_pipe( pipe_slow );
10709 %}
10710 
10711 instruct vsra8S(vecX dst, vecS shift) %{
10712   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
10713   match(Set dst (RShiftVS dst shift));
10714   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed8S" %}
10715   ins_encode %{
10716     __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
10717   %}
10718   ins_pipe( pipe_slow );
10719 %}
10720 
10721 instruct vsra8S_imm(vecX dst, immI8 shift) %{
10722   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
10723   match(Set dst (RShiftVS dst shift));
10724   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed8S" %}
10725   ins_encode %{
10726     __ psraw($dst$$XMMRegister, (int)$shift$$constant);
10727   %}
10728   ins_pipe( pipe_slow );
10729 %}
10730 
10731 instruct vsra8S_reg_avx(vecX dst, vecX src, vecS shift) %{
10732   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
10733   match(Set dst (RShiftVS src shift));
10734   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
10735   ins_encode %{
10736     int vector_len = 0;
10737     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10738   %}
10739   ins_pipe( pipe_slow );
10740 %}
10741 
10742 instruct vsra8S_reg_evex(vecX dst, vecX src, vecS shift) %{
10743   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
10744   match(Set dst (RShiftVS src shift));
10745   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
10746   ins_encode %{
10747     int vector_len = 0;
10748     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10749   %}
10750   ins_pipe( pipe_slow );
10751 %}
10752 
10753 instruct vsra8S_reg_evex_special(vecX dst, vecX src, vecS shift) %{
10754   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
10755   match(Set dst (RShiftVS dst shift));
10756   effect(TEMP src);
10757   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
10758   ins_encode %{
10759     int vector_len = 0;
10760     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10761   %}
10762   ins_pipe( pipe_slow );
10763 %}
10764 
10765 instruct vsra8S_reg_imm_avx(vecX dst, vecX src, immI8 shift) %{
10766   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
10767   match(Set dst (RShiftVS src shift));
10768   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
10769   ins_encode %{
10770     int vector_len = 0;
10771     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10772   %}
10773   ins_pipe( pipe_slow );
10774 %}
10775 
10776 instruct vsra8S_reg_imm_evex(vecX dst, vecX src, immI8 shift) %{
10777   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
10778   match(Set dst (RShiftVS src shift));
10779   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
10780   ins_encode %{
10781     int vector_len = 0;
10782     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10783   %}
10784   ins_pipe( pipe_slow );
10785 %}
10786 
10787 instruct vsra8S_reg_imm_evex_special(vecX dst, vecX src, immI8 shift) %{
10788   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
10789   match(Set dst (RShiftVS dst shift));
10790   effect(TEMP src);
10791   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
10792   ins_encode %{
10793     int vector_len = 0;
10794     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10795   %}
10796   ins_pipe( pipe_slow );
10797 %}
10798 
10799 instruct vsra16S_reg_avx(vecY dst, vecY src, vecS shift) %{
10800   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
10801   match(Set dst (RShiftVS src shift));
10802   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
10803   ins_encode %{
10804     int vector_len = 1;
10805     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10806   %}
10807   ins_pipe( pipe_slow );
10808 %}
10809 
10810 instruct vsra16S_reg_evex(vecY dst, vecY src, vecS shift) %{
10811   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
10812   match(Set dst (RShiftVS src shift));
10813   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
10814   ins_encode %{
10815     int vector_len = 1;
10816     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10817   %}
10818   ins_pipe( pipe_slow );
10819 %}
10820 
10821 instruct vsra16S_reg_evex_special(vecY dst, vecY src, vecS shift) %{
10822   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
10823   match(Set dst (RShiftVS dst shift));
10824   effect(TEMP src);
10825   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
10826   ins_encode %{
10827     int vector_len = 1;
10828     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10829   %}
10830   ins_pipe( pipe_slow );
10831 %}
10832 
10833 instruct vsra16S_reg_imm_avx(vecY dst, vecY src, immI8 shift) %{
10834   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
10835   match(Set dst (RShiftVS src shift));
10836   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
10837   ins_encode %{
10838     int vector_len = 1;
10839     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10840   %}
10841   ins_pipe( pipe_slow );
10842 %}
10843 
10844 instruct vsra16S_reg_imm_evex(vecY dst, vecY src, immI8 shift) %{
10845   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
10846   match(Set dst (RShiftVS src shift));
10847   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
10848   ins_encode %{
10849     int vector_len = 1;
10850     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10851   %}
10852   ins_pipe( pipe_slow );
10853 %}
10854 
10855 instruct vsra16S_reg_imm_evex_special(vecY dst, vecY src, immI8 shift) %{
10856   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
10857   match(Set dst (RShiftVS dst shift));
10858   effect(TEMP src);
10859   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
10860   ins_encode %{
10861     int vector_len = 1;
10862     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10863   %}
10864   ins_pipe( pipe_slow );
10865 %}
10866 
10867 instruct vsra32S_reg(vecZ dst, vecZ src, vecS shift) %{
10868   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
10869   match(Set dst (RShiftVS src shift));
10870   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed32S" %}
10871   ins_encode %{
10872     int vector_len = 2;
10873     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10874   %}
10875   ins_pipe( pipe_slow );
10876 %}
10877 
10878 instruct vsra32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
10879   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
10880   match(Set dst (RShiftVS src shift));
10881   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed32S" %}
10882   ins_encode %{
10883     int vector_len = 2;
10884     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10885   %}
10886   ins_pipe( pipe_slow );
10887 %}
10888 
10889 // Integer vector arithmetic right shift
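// Note: the vector_len argument passed to the assembler selects the encoded
// operation width: 0 = 128-bit (Assembler::AVX_128bit), 1 = 256-bit
// (Assembler::AVX_256bit), 2 = 512-bit (Assembler::AVX_512bit).
// For illustration only: a loop such as
//     for (int i = 0; i < a.length; i++) { a[i] = a[i] >> 3; }
// may be auto-vectorized by SuperWord into RShiftVI nodes that the rules
// below match (the immediate forms apply when the shift count is a constant).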
10890 instruct vsra2I(vecD dst, vecS shift) %{
10891   predicate(n->as_Vector()->length() == 2);
10892   match(Set dst (RShiftVI dst shift));
10893   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed2I" %}
10894   ins_encode %{
10895     __ psrad($dst$$XMMRegister, $shift$$XMMRegister);
10896   %}
10897   ins_pipe( pipe_slow );
10898 %}
10899 
10900 instruct vsra2I_imm(vecD dst, immI8 shift) %{
10901   predicate(n->as_Vector()->length() == 2);
10902   match(Set dst (RShiftVI dst shift));
10903   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed2I" %}
10904   ins_encode %{
10905     __ psrad($dst$$XMMRegister, (int)$shift$$constant);
10906   %}
10907   ins_pipe( pipe_slow );
10908 %}
10909 
10910 instruct vsra2I_reg(vecD dst, vecD src, vecS shift) %{
10911   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
10912   match(Set dst (RShiftVI src shift));
10913   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed2I" %}
10914   ins_encode %{
10915     int vector_len = 0;
10916     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10917   %}
10918   ins_pipe( pipe_slow );
10919 %}
10920 
10921 instruct vsra2I_reg_imm(vecD dst, vecD src, immI8 shift) %{
10922   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
10923   match(Set dst (RShiftVI src shift));
10924   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed2I" %}
10925   ins_encode %{
10926     int vector_len = 0;
10927     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10928   %}
10929   ins_pipe( pipe_slow );
10930 %}
10931 
10932 instruct vsra4I(vecX dst, vecS shift) %{
10933   predicate(n->as_Vector()->length() == 4);
10934   match(Set dst (RShiftVI dst shift));
10935   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed4I" %}
10936   ins_encode %{
10937     __ psrad($dst$$XMMRegister, $shift$$XMMRegister);
10938   %}
10939   ins_pipe( pipe_slow );
10940 %}
10941 
10942 instruct vsra4I_imm(vecX dst, immI8 shift) %{
10943   predicate(n->as_Vector()->length() == 4);
10944   match(Set dst (RShiftVI dst shift));
10945   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed4I" %}
10946   ins_encode %{
10947     __ psrad($dst$$XMMRegister, (int)$shift$$constant);
10948   %}
10949   ins_pipe( pipe_slow );
10950 %}
10951 
10952 instruct vsra4I_reg(vecX dst, vecX src, vecS shift) %{
10953   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
10954   match(Set dst (RShiftVI src shift));
10955   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed4I" %}
10956   ins_encode %{
10957     int vector_len = 0;
10958     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10959   %}
10960   ins_pipe( pipe_slow );
10961 %}
10962 
10963 instruct vsra4I_reg_imm(vecX dst, vecX src, immI8 shift) %{
10964   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
10965   match(Set dst (RShiftVI src shift));
10966   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed4I" %}
10967   ins_encode %{
10968     int vector_len = 0;
10969     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10970   %}
10971   ins_pipe( pipe_slow );
10972 %}
10973 
10974 instruct vsra8I_reg(vecY dst, vecY src, vecS shift) %{
10975   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
10976   match(Set dst (RShiftVI src shift));
10977   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed8I" %}
10978   ins_encode %{
10979     int vector_len = 1;
10980     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10981   %}
10982   ins_pipe( pipe_slow );
10983 %}
10984 
10985 instruct vsra8I_reg_imm(vecY dst, vecY src, immI8 shift) %{
10986   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
10987   match(Set dst (RShiftVI src shift));
10988   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed8I" %}
10989   ins_encode %{
10990     int vector_len = 1;
10991     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10992   %}
10993   ins_pipe( pipe_slow );
10994 %}
10995 
10996 instruct vsra16I_reg(vecZ dst, vecZ src, vecS shift) %{
10997   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
10998   match(Set dst (RShiftVI src shift));
10999   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed16I" %}
11000   ins_encode %{
11001     int vector_len = 2;
11002     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
11003   %}
11004   ins_pipe( pipe_slow );
11005 %}
11006 
11007 instruct vsra16I_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
11008   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
11009   match(Set dst (RShiftVI src shift));
11010   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed16I" %}
11011   ins_encode %{
11012     int vector_len = 2;
11013     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
11014   %}
11015   ins_pipe( pipe_slow );
11016 %}
11017 
11018 // There are no vector arithmetic right shift instructions for longs.
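// The SSE/AVX ISA provides psraw/vpsraw (16-bit) and psrad/vpsrad (32-bit)
// arithmetic right shifts, but no 64-bit form; vpsraq only exists with
// AVX-512 (EVEX encoding), so RShiftVL is not handled by these rules.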
11019 
11020 
11021 // --------------------------------- AND --------------------------------------
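// Bitwise logical operations are element-size agnostic, so a single set of
// rules keyed on length_in_bytes() covers every vector element type; only the
// overall vector width selects the operand class and encoding.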
11022 
11023 instruct vand4B(vecS dst, vecS src) %{
11024   predicate(n->as_Vector()->length_in_bytes() == 4);
11025   match(Set dst (AndV dst src));
11026   format %{ "pand    $dst,$src\t! and vectors (4 bytes)" %}
11027   ins_encode %{
11028     __ pand($dst$$XMMRegister, $src$$XMMRegister);
11029   %}
11030   ins_pipe( pipe_slow );
11031 %}
11032 
11033 instruct vand4B_reg(vecS dst, vecS src1, vecS src2) %{
11034   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
11035   match(Set dst (AndV src1 src2));
11036   format %{ "vpand   $dst,$src1,$src2\t! and vectors (4 bytes)" %}
11037   ins_encode %{
11038     int vector_len = 0;
11039     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
11040   %}
11041   ins_pipe( pipe_slow );
11042 %}
11043 
11044 instruct vand4B_mem(vecS dst, vecS src, memory mem) %{
11045   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
11046   match(Set dst (AndV src (LoadVector mem)));
11047   format %{ "vpand   $dst,$src,$mem\t! and vectors (4 bytes)" %}
11048   ins_encode %{
11049     int vector_len = 0;
11050     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
11051   %}
11052   ins_pipe( pipe_slow );
11053 %}
11054 
11055 instruct vand8B(vecD dst, vecD src) %{
11056   predicate(n->as_Vector()->length_in_bytes() == 8);
11057   match(Set dst (AndV dst src));
11058   format %{ "pand    $dst,$src\t! and vectors (8 bytes)" %}
11059   ins_encode %{
11060     __ pand($dst$$XMMRegister, $src$$XMMRegister);
11061   %}
11062   ins_pipe( pipe_slow );
11063 %}
11064 
11065 instruct vand8B_reg(vecD dst, vecD src1, vecD src2) %{
11066   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
11067   match(Set dst (AndV src1 src2));
11068   format %{ "vpand   $dst,$src1,$src2\t! and vectors (8 bytes)" %}
11069   ins_encode %{
11070     int vector_len = 0;
11071     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
11072   %}
11073   ins_pipe( pipe_slow );
11074 %}
11075 
11076 instruct vand8B_mem(vecD dst, vecD src, memory mem) %{
11077   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
11078   match(Set dst (AndV src (LoadVector mem)));
11079   format %{ "vpand   $dst,$src,$mem\t! and vectors (8 bytes)" %}
11080   ins_encode %{
11081     int vector_len = 0;
11082     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
11083   %}
11084   ins_pipe( pipe_slow );
11085 %}
11086 
11087 instruct vand16B(vecX dst, vecX src) %{
11088   predicate(n->as_Vector()->length_in_bytes() == 16);
11089   match(Set dst (AndV dst src));
11090   format %{ "pand    $dst,$src\t! and vectors (16 bytes)" %}
11091   ins_encode %{
11092     __ pand($dst$$XMMRegister, $src$$XMMRegister);
11093   %}
11094   ins_pipe( pipe_slow );
11095 %}
11096 
11097 instruct vand16B_reg(vecX dst, vecX src1, vecX src2) %{
11098   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
11099   match(Set dst (AndV src1 src2));
11100   format %{ "vpand   $dst,$src1,$src2\t! and vectors (16 bytes)" %}
11101   ins_encode %{
11102     int vector_len = 0;
11103     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
11104   %}
11105   ins_pipe( pipe_slow );
11106 %}
11107 
11108 instruct vand16B_mem(vecX dst, vecX src, memory mem) %{
11109   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
11110   match(Set dst (AndV src (LoadVector mem)));
11111   format %{ "vpand   $dst,$src,$mem\t! and vectors (16 bytes)" %}
11112   ins_encode %{
11113     int vector_len = 0;
11114     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
11115   %}
11116   ins_pipe( pipe_slow );
11117 %}
11118 
11119 instruct vand32B_reg(vecY dst, vecY src1, vecY src2) %{
11120   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
11121   match(Set dst (AndV src1 src2));
11122   format %{ "vpand   $dst,$src1,$src2\t! and vectors (32 bytes)" %}
11123   ins_encode %{
11124     int vector_len = 1;
11125     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
11126   %}
11127   ins_pipe( pipe_slow );
11128 %}
11129 
11130 instruct vand32B_mem(vecY dst, vecY src, memory mem) %{
11131   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
11132   match(Set dst (AndV src (LoadVector mem)));
11133   format %{ "vpand   $dst,$src,$mem\t! and vectors (32 bytes)" %}
11134   ins_encode %{
11135     int vector_len = 1;
11136     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
11137   %}
11138   ins_pipe( pipe_slow );
11139 %}
11140 
11141 instruct vand64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
11142   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
11143   match(Set dst (AndV src1 src2));
11144   format %{ "vpand   $dst,$src1,$src2\t! and vectors (64 bytes)" %}
11145   ins_encode %{
11146     int vector_len = 2;
11147     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
11148   %}
11149   ins_pipe( pipe_slow );
11150 %}
11151 
11152 instruct vand64B_mem(vecZ dst, vecZ src, memory mem) %{
11153   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
11154   match(Set dst (AndV src (LoadVector mem)));
11155   format %{ "vpand   $dst,$src,$mem\t! and vectors (64 bytes)" %}
11156   ins_encode %{
11157     int vector_len = 2;
11158     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
11159   %}
11160   ins_pipe( pipe_slow );
11161 %}
11162 
11163 // --------------------------------- OR ---------------------------------------
11164 
11165 instruct vor4B(vecS dst, vecS src) %{
11166   predicate(n->as_Vector()->length_in_bytes() == 4);
11167   match(Set dst (OrV dst src));
11168   format %{ "por     $dst,$src\t! or vectors (4 bytes)" %}
11169   ins_encode %{
11170     __ por($dst$$XMMRegister, $src$$XMMRegister);
11171   %}
11172   ins_pipe( pipe_slow );
11173 %}
11174 
11175 instruct vor4B_reg(vecS dst, vecS src1, vecS src2) %{
11176   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
11177   match(Set dst (OrV src1 src2));
11178   format %{ "vpor    $dst,$src1,$src2\t! or vectors (4 bytes)" %}
11179   ins_encode %{
11180     int vector_len = 0;
11181     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
11182   %}
11183   ins_pipe( pipe_slow );
11184 %}
11185 
11186 instruct vor4B_mem(vecS dst, vecS src, memory mem) %{
11187   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
11188   match(Set dst (OrV src (LoadVector mem)));
11189   format %{ "vpor    $dst,$src,$mem\t! or vectors (4 bytes)" %}
11190   ins_encode %{
11191     int vector_len = 0;
11192     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
11193   %}
11194   ins_pipe( pipe_slow );
11195 %}
11196 
11197 instruct vor8B(vecD dst, vecD src) %{
11198   predicate(n->as_Vector()->length_in_bytes() == 8);
11199   match(Set dst (OrV dst src));
11200   format %{ "por     $dst,$src\t! or vectors (8 bytes)" %}
11201   ins_encode %{
11202     __ por($dst$$XMMRegister, $src$$XMMRegister);
11203   %}
11204   ins_pipe( pipe_slow );
11205 %}
11206 
11207 instruct vor8B_reg(vecD dst, vecD src1, vecD src2) %{
11208   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
11209   match(Set dst (OrV src1 src2));
11210   format %{ "vpor    $dst,$src1,$src2\t! or vectors (8 bytes)" %}
11211   ins_encode %{
11212     int vector_len = 0;
11213     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
11214   %}
11215   ins_pipe( pipe_slow );
11216 %}
11217 
11218 instruct vor8B_mem(vecD dst, vecD src, memory mem) %{
11219   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
11220   match(Set dst (OrV src (LoadVector mem)));
11221   format %{ "vpor    $dst,$src,$mem\t! or vectors (8 bytes)" %}
11222   ins_encode %{
11223     int vector_len = 0;
11224     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
11225   %}
11226   ins_pipe( pipe_slow );
11227 %}
11228 
11229 instruct vor16B(vecX dst, vecX src) %{
11230   predicate(n->as_Vector()->length_in_bytes() == 16);
11231   match(Set dst (OrV dst src));
11232   format %{ "por     $dst,$src\t! or vectors (16 bytes)" %}
11233   ins_encode %{
11234     __ por($dst$$XMMRegister, $src$$XMMRegister);
11235   %}
11236   ins_pipe( pipe_slow );
11237 %}
11238 
11239 instruct vor16B_reg(vecX dst, vecX src1, vecX src2) %{
11240   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
11241   match(Set dst (OrV src1 src2));
11242   format %{ "vpor    $dst,$src1,$src2\t! or vectors (16 bytes)" %}
11243   ins_encode %{
11244     int vector_len = 0;
11245     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
11246   %}
11247   ins_pipe( pipe_slow );
11248 %}
11249 
11250 instruct vor16B_mem(vecX dst, vecX src, memory mem) %{
11251   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
11252   match(Set dst (OrV src (LoadVector mem)));
11253   format %{ "vpor    $dst,$src,$mem\t! or vectors (16 bytes)" %}
11254   ins_encode %{
11255     int vector_len = 0;
11256     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
11257   %}
11258   ins_pipe( pipe_slow );
11259 %}
11260 
11261 instruct vor32B_reg(vecY dst, vecY src1, vecY src2) %{
11262   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
11263   match(Set dst (OrV src1 src2));
11264   format %{ "vpor    $dst,$src1,$src2\t! or vectors (32 bytes)" %}
11265   ins_encode %{
11266     int vector_len = 1;
11267     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
11268   %}
11269   ins_pipe( pipe_slow );
11270 %}
11271 
11272 instruct vor32B_mem(vecY dst, vecY src, memory mem) %{
11273   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
11274   match(Set dst (OrV src (LoadVector mem)));
11275   format %{ "vpor    $dst,$src,$mem\t! or vectors (32 bytes)" %}
11276   ins_encode %{
11277     int vector_len = 1;
11278     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
11279   %}
11280   ins_pipe( pipe_slow );
11281 %}
11282 
11283 instruct vor64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
11284   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
11285   match(Set dst (OrV src1 src2));
11286   format %{ "vpor    $dst,$src1,$src2\t! or vectors (64 bytes)" %}
11287   ins_encode %{
11288     int vector_len = 2;
11289     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
11290   %}
11291   ins_pipe( pipe_slow );
11292 %}
11293 
11294 instruct vor64B_mem(vecZ dst, vecZ src, memory mem) %{
11295   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
11296   match(Set dst (OrV src (LoadVector mem)));
11297   format %{ "vpor    $dst,$src,$mem\t! or vectors (64 bytes)" %}
11298   ins_encode %{
11299     int vector_len = 2;
11300     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
11301   %}
11302   ins_pipe( pipe_slow );
11303 %}
11304 
11305 // --------------------------------- XOR --------------------------------------
11306 
11307 instruct vxor4B(vecS dst, vecS src) %{
11308   predicate(n->as_Vector()->length_in_bytes() == 4);
11309   match(Set dst (XorV dst src));
11310   format %{ "pxor    $dst,$src\t! xor vectors (4 bytes)" %}
11311   ins_encode %{
11312     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
11313   %}
11314   ins_pipe( pipe_slow );
11315 %}
11316 
11317 instruct vxor4B_reg(vecS dst, vecS src1, vecS src2) %{
11318   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
11319   match(Set dst (XorV src1 src2));
11320   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (4 bytes)" %}
11321   ins_encode %{
11322     int vector_len = 0;
11323     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
11324   %}
11325   ins_pipe( pipe_slow );
11326 %}
11327 
11328 instruct vxor4B_mem(vecS dst, vecS src, memory mem) %{
11329   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
11330   match(Set dst (XorV src (LoadVector mem)));
11331   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (4 bytes)" %}
11332   ins_encode %{
11333     int vector_len = 0;
11334     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
11335   %}
11336   ins_pipe( pipe_slow );
11337 %}
11338 
11339 instruct vxor8B(vecD dst, vecD src) %{
11340   predicate(n->as_Vector()->length_in_bytes() == 8);
11341   match(Set dst (XorV dst src));
11342   format %{ "pxor    $dst,$src\t! xor vectors (8 bytes)" %}
11343   ins_encode %{
11344     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
11345   %}
11346   ins_pipe( pipe_slow );
11347 %}
11348 
11349 instruct vxor8B_reg(vecD dst, vecD src1, vecD src2) %{
11350   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
11351   match(Set dst (XorV src1 src2));
11352   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (8 bytes)" %}
11353   ins_encode %{
11354     int vector_len = 0;
11355     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
11356   %}
11357   ins_pipe( pipe_slow );
11358 %}
11359 
11360 instruct vxor8B_mem(vecD dst, vecD src, memory mem) %{
11361   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
11362   match(Set dst (XorV src (LoadVector mem)));
11363   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (8 bytes)" %}
11364   ins_encode %{
11365     int vector_len = 0;
11366     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
11367   %}
11368   ins_pipe( pipe_slow );
11369 %}
11370 
11371 instruct vxor16B(vecX dst, vecX src) %{
11372   predicate(n->as_Vector()->length_in_bytes() == 16);
11373   match(Set dst (XorV dst src));
11374   format %{ "pxor    $dst,$src\t! xor vectors (16 bytes)" %}
11375   ins_encode %{
11376     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
11377   %}
11378   ins_pipe( pipe_slow );
11379 %}
11380 
11381 instruct vxor16B_reg(vecX dst, vecX src1, vecX src2) %{
11382   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
11383   match(Set dst (XorV src1 src2));
11384   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (16 bytes)" %}
11385   ins_encode %{
11386     int vector_len = 0;
11387     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
11388   %}
11389   ins_pipe( pipe_slow );
11390 %}
11391 
11392 instruct vxor16B_mem(vecX dst, vecX src, memory mem) %{
11393   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
11394   match(Set dst (XorV src (LoadVector mem)));
11395   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (16 bytes)" %}
11396   ins_encode %{
11397     int vector_len = 0;
11398     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
11399   %}
11400   ins_pipe( pipe_slow );
11401 %}
11402 
11403 instruct vxor32B_reg(vecY dst, vecY src1, vecY src2) %{
11404   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
11405   match(Set dst (XorV src1 src2));
11406   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (32 bytes)" %}
11407   ins_encode %{
11408     int vector_len = 1;
11409     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
11410   %}
11411   ins_pipe( pipe_slow );
11412 %}
11413 
11414 instruct vxor32B_mem(vecY dst, vecY src, memory mem) %{
11415   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
11416   match(Set dst (XorV src (LoadVector mem)));
11417   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (32 bytes)" %}
11418   ins_encode %{
11419     int vector_len = 1;
11420     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
11421   %}
11422   ins_pipe( pipe_slow );
11423 %}
11424 
11425 instruct vxor64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
11426   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
11427   match(Set dst (XorV src1 src2));
11428   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (64 bytes)" %}
11429   ins_encode %{
11430     int vector_len = 2;
11431     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
11432   %}
11433   ins_pipe( pipe_slow );
11434 %}
11435 
11436 instruct vxor64B_mem(vecZ dst, vecZ src, memory mem) %{
11437   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
11438   match(Set dst (XorV src (LoadVector mem)));
11439   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (64 bytes)" %}
11440   ins_encode %{
11441     int vector_len = 2;
11442     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
11443   %}
11444   ins_pipe( pipe_slow );
11445 %}
11446 
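// --------------------------------- FLOAT TO DOUBLE CONVERSION ---------------
// vcvtps2pd widens each packed single to a packed double, so the destination
// vector is twice as wide (in bytes) as the converted lanes of the source.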
11447 instruct vcvt2Fto2D_reg(vecX dst, vecD src) %{
11448   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
11449   match(Set dst (ConvertVF2VD src));
11450   format %{ "vcvtps2pd   $dst,$src\t! convert 2F to 2D vector" %}
11451   ins_encode %{
11452     int vector_len = 0;
11453     __ vcvtps2pd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
11454   %}
11455   ins_pipe( pipe_slow );
11456 %}
11457 
11458 instruct vcvt4Fto4D_reg(vecY dst, vecX src) %{
11459   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 32);
11460   match(Set dst (ConvertVF2VD src));
11461   format %{ "vcvtps2pd   $dst,$src\t! convert 4F to 4D vector" %}
11462   ins_encode %{
11463     int vector_len = 1;
11464     __ vcvtps2pd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
11465   %}
11466   ins_pipe( pipe_slow );
11467 %}
11468 
11469 instruct vcvt8Fto4D_reg(vecY dst, vecY src) %{
11470   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 32);
11471   match(Set dst (ConvertVF2VD src));
11472   format %{ "vcvtps2pd   $dst,$src\t! convert 8F to 4D vector" %}
11473   ins_encode %{
11474     int vector_len = 1;
11475     __ vcvtps2pd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
11476   %}
11477   ins_pipe( pipe_slow );
11478 %}
11479 
11480 instruct vcvt8Fto8D_reg(vecZ dst, vecY src) %{
11481   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
11482   match(Set dst (ConvertVF2VD src));
11483   format %{ "evcvtps2pd   $dst,$src\t! convert 8F to 8D vector" %}
11484   ins_encode %{
11485     int vector_len = 2;
11486     __ evcvtps2pd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
11487   %}
11488   ins_pipe( pipe_slow );
11489 %}
11490 
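// --------------------------------- FP VECTOR COMPARE ------------------------
// Each rule below materializes a per-lane mask (all ones for true, all zeros
// for false) for one BoolTest predicate, using the quiet (non-signaling)
// AVX comparison predicates.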
11491 instruct vcmpeq2F(vecD dst, vecD src1, vecD src2) %{
11492   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
11493             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
11494             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
11495   match(Set dst (VectorMaskCmp src1 src2));
11496   format %{ "vcmpeqps  $dst,$src1,$src2\t! cmpeq packed2F" %}
11497   ins_encode %{
11498     int vector_len = 0;
11499     Assembler::ComparisonPredicateFP cmp = Assembler::EQ_OQ;  // ordered non-signaling
11500     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
11501   %}
11502   ins_pipe( pipe_slow );
11503 %}
11504 
11505 instruct vcmpeq4F(vecX dst, vecX src1, vecX src2) %{
11506   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
11507             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
11508             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
11509   match(Set dst (VectorMaskCmp src1 src2));
11510   format %{ "vcmpeqps  $dst,$src1,$src2\t! cmpeq packed4F" %}
11511   ins_encode %{
11512     int vector_len = 0;
11513     Assembler::ComparisonPredicateFP cmp = Assembler::EQ_OQ;  // ordered non-signaling
11514     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
11515   %}
11516   ins_pipe( pipe_slow );
11517 %}
11518 
11519 instruct vcmpeq8F(vecY dst, vecY src1, vecY src2) %{
11520   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
11521             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
11522             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
11523   match(Set dst (VectorMaskCmp src1 src2));
11524   format %{ "vcmpeqps  $dst,$src1,$src2\t! cmpeq packed8F" %}
11525   ins_encode %{
11526     int vector_len = 1;
11527     Assembler::ComparisonPredicateFP cmp = Assembler::EQ_OQ;  // ordered non-signaling
11528     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
11529   %}
11530   ins_pipe( pipe_slow );
11531 %}
11532 
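// For 512-bit compares, evcmpps/evcmppd writes an opmask register rather than
// a vector of lane masks. k2 is used as a hardcoded temporary (C2 does not
// allocate k registers here), and a zero-masked load of an all-ones constant
// then expands the opmask into the all-ones/all-zeros lane mask expected by
// the rest of the IR.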
11533 instruct vcmpeq16F(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
11534   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 &&
11535             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
11536             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
11537   match(Set dst (VectorMaskCmp src1 src2));
11538   effect(TEMP dst, TEMP scratch);
11539   format %{ "vcmpeqps  k2,$src1,$src2\n\t"
11540             "vmovdqu8  $dst, k2{z}, 0xFFFFFFFFFF \t! cmpeq packed16F" %}
11541   ins_encode %{
11542     int vector_len = 2;
11543     Assembler::ComparisonPredicateFP cmp = Assembler::EQ_OQ;  // ordered non-signaling
11544     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
11545     KRegister mask = k0; // The comparison itself is not being masked.
11546     __ evcmpps(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
11547     __ evmovdqul($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
11548   %}
11549   ins_pipe( pipe_slow );
11550 %}
11551 
11552 instruct vcmplt2F(vecD dst, vecD src1, vecD src2) %{
11553   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
11554             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
11555             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
11556   match(Set dst (VectorMaskCmp src1 src2));
11557   format %{ "vcmpltps  $dst,$src1,$src2\t! cmplt packed2F" %}
11558   ins_encode %{
11559     int vector_len = 0;
11560     Assembler::ComparisonPredicateFP cmp = Assembler::LT_OQ; //ordered non-signaling
11561     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
11562   %}
11563   ins_pipe( pipe_slow );
11564 %}
11565 
11566 instruct vcmplt4F(vecX dst, vecX src1, vecX src2) %{
11567   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
11568             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
11569             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
11570   match(Set dst (VectorMaskCmp src1 src2));
11571   format %{ "vcmpltps  $dst,$src1,$src2\t! cmplt packed4F" %}
11572   ins_encode %{
11573     int vector_len = 0;
11574     Assembler::ComparisonPredicateFP cmp = Assembler::LT_OQ; //ordered non-signaling
11575     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
11576   %}
11577   ins_pipe( pipe_slow );
11578 %}
11579 
11580 instruct vcmplt8F(vecY dst, vecY src1, vecY src2) %{
11581   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
11582             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
11583             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
11584   match(Set dst (VectorMaskCmp src1 src2));
11585   format %{ "vcmpltps  $dst,$src1,$src2\t! cmplt packed8F" %}
11586   ins_encode %{
11587     int vector_len = 1;
11588     Assembler::ComparisonPredicateFP cmp = Assembler::LT_OQ; //ordered non-signaling
11589     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
11590   %}
11591   ins_pipe( pipe_slow );
11592 %}
11593 
11594 instruct vcmplt16F(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
11595   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 &&
11596             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
11597             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
11598   match(Set dst (VectorMaskCmp src1 src2));
11599   effect(TEMP dst, TEMP scratch);
11600   format %{ "vcmpltps  k2,$src1,$src2\n\t"
11601             "vmovdqu8  $dst, k2{z}, 0xFFFFFFFFFF \t! cmplt packed16F" %}
11602   ins_encode %{
11603     int vector_len = 2;
11604     Assembler::ComparisonPredicateFP cmp = Assembler::LT_OQ; //ordered non-signaling
11605     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
11606     KRegister mask = k0; // The comparison itself is not being masked.
11607     __ evcmpps(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
11608     __ evmovdqul($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
11609   %}
11610   ins_pipe( pipe_slow );
11611 %}
11612 
11613 instruct vcmpgt2F(vecD dst, vecD src1, vecD src2) %{
11614   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
11615             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
11616             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
11617   match(Set dst (VectorMaskCmp src1 src2));
11618   format %{ "vcmpgtps  $dst,$src1,$src2\t! cmpgt packed2F" %}
11619   ins_encode %{
11620     int vector_len = 0;
11621     Assembler::ComparisonPredicateFP cmp = Assembler::GT_OQ; //ordered non-signaling
11622     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
11623   %}
11624   ins_pipe( pipe_slow );
11625 %}
11626 
11627 instruct vcmpgt4F(vecX dst, vecX src1, vecX src2) %{
11628   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
11629             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
11630             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
11631   match(Set dst (VectorMaskCmp src1 src2));
11632   format %{ "vcmpgtps  $dst,$src1,$src2\t! cmpgt packed4F" %}
11633   ins_encode %{
11634     int vector_len = 0;
11635     Assembler::ComparisonPredicateFP cmp = Assembler::GT_OQ; //ordered non-signaling
11636     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
11637   %}
11638   ins_pipe( pipe_slow );
11639 %}
11640 
11641 instruct vcmpgt8F(vecY dst, vecY src1, vecY src2) %{
11642   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
11643             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
11644             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
11645   match(Set dst (VectorMaskCmp src1 src2));
11646   format %{ "vcmpgtps  $dst,$src1,$src2\t! cmpgt packed8F" %}
11647   ins_encode %{
11648     int vector_len = 1;
11649     Assembler::ComparisonPredicateFP cmp = Assembler::GT_OQ; //ordered non-signaling
11650     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
11651   %}
11652   ins_pipe( pipe_slow );
11653 %}
11654 
11655 instruct vcmpgt16F(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
11656   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 &&
11657             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
11658             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
11659   match(Set dst (VectorMaskCmp src1 src2));
11660   effect(TEMP dst, TEMP scratch);
11661   format %{ "vcmpgtps  k2,$src1,$src2\n\t"
11662             "vmovdqu8  $dst, k2{z}, 0xFFFFFFFFFF \t! cmpgt packed16F" %}
11663   ins_encode %{
11664     int vector_len = 2;
11665     Assembler::ComparisonPredicateFP cmp = Assembler::GT_OQ; //ordered non-signaling
11666     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
11667     KRegister mask = k0; // The comparison itself is not being masked.
11668     __ evcmpps(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
11669     __ evmovdqul($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
11670   %}
11671   ins_pipe( pipe_slow );
11672 %}
11673 
11674 instruct vcmpge2F(vecD dst, vecD src1, vecD src2) %{
11675   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
11676             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
11677             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
11678   match(Set dst (VectorMaskCmp src1 src2));
11679   format %{ "vcmpgeps  $dst,$src1,$src2\t! cmpge packed2F" %}
11680   ins_encode %{
11681     int vector_len = 0;
11682     Assembler::ComparisonPredicateFP cmp = Assembler::GE_OQ; //ordered non-signaling
11683     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
11684   %}
11685   ins_pipe( pipe_slow );
11686 %}
11687 
11688 instruct vcmpge4F(vecX dst, vecX src1, vecX src2) %{
11689   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
11690             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
11691             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
11692   match(Set dst (VectorMaskCmp src1 src2));
11693   format %{ "vcmpgeps  $dst,$src1,$src2\t! cmpge packed4F" %}
11694   ins_encode %{
11695     int vector_len = 0;
11696     Assembler::ComparisonPredicateFP cmp = Assembler::GE_OQ; //ordered non-signaling
11697     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
11698   %}
11699   ins_pipe( pipe_slow );
11700 %}
11701 
11702 instruct vcmpge8F(vecY dst, vecY src1, vecY src2) %{
11703   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
11704             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
11705             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
11706   match(Set dst (VectorMaskCmp src1 src2));
11707   format %{ "vcmpgeps  $dst,$src1,$src2\t! cmpge packed8F" %}
11708   ins_encode %{
11709     int vector_len = 1;
11710     Assembler::ComparisonPredicateFP cmp = Assembler::GE_OQ; //ordered non-signaling
11711     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
11712   %}
11713   ins_pipe( pipe_slow );
11714 %}
11715 
11716 instruct vcmpge16F(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
11717   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 &&
11718             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
11719             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
11720   match(Set dst (VectorMaskCmp src1 src2));
11721   effect(TEMP dst, TEMP scratch);
11722   format %{ "vcmpgeps  k2,$src1,$src2\n\t"
11723             "vmovdqu8  $dst, k2{z}, 0xFFFFFFFFFF \t! cmpge packed16F" %}
11724   ins_encode %{
11725     int vector_len = 2;
11726     Assembler::ComparisonPredicateFP cmp = Assembler::GE_OQ; //ordered non-signaling
11727     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
11728     KRegister mask = k0; // The comparison itself is not being masked.
11729     __ evcmpps(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
11730     __ evmovdqul($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
11731   %}
11732   ins_pipe( pipe_slow );
11733 %}
11734 
11735 instruct vcmple2F(vecD dst, vecD src1, vecD src2) %{
11736   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
11737             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
11738             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
11739   match(Set dst (VectorMaskCmp src1 src2));
11740   format %{ "vcmpleps  $dst,$src1,$src2\t! cmple packed2F" %}
11741   ins_encode %{
11742     int vector_len = 0;
11743     Assembler::ComparisonPredicateFP cmp = Assembler::LE_OQ; //ordered non-signaling
11744     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
11745   %}
11746   ins_pipe( pipe_slow );
11747 %}
11748 
11749 instruct vcmple4F(vecX dst, vecX src1, vecX src2) %{
11750   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
11751             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
11752             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
11753   match(Set dst (VectorMaskCmp src1 src2));
11754   format %{ "vcmpleps  $dst,$src1,$src2\t! cmple packed4F" %}
11755   ins_encode %{
11756     int vector_len = 0;
11757     Assembler::ComparisonPredicateFP cmp = Assembler::LE_OQ; //ordered non-signaling
11758     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
11759   %}
11760   ins_pipe( pipe_slow );
11761 %}
11762 
11763 instruct vcmple8F(vecY dst, vecY src1, vecY src2) %{
11764   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
11765             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
11766             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
11767   match(Set dst (VectorMaskCmp src1 src2));
11768   format %{ "vcmpleps  $dst,$src1,$src2\t! cmple packed8F" %}
11769   ins_encode %{
11770     int vector_len = 1;
11771     Assembler::ComparisonPredicateFP cmp = Assembler::LE_OQ; //ordered non-signaling
11772     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
11773   %}
11774   ins_pipe( pipe_slow );
11775 %}
11776 
11777 instruct vcmple16F(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
11778   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 &&
11779             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
11780             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
11781   match(Set dst (VectorMaskCmp src1 src2));
11782   effect(TEMP dst, TEMP scratch);
11783   format %{ "vcmpleps  k2,$src1,$src2\n\t"
11784             "vmovdqu8  $dst, k2{z}, 0xFFFFFFFFFF \t! cmple packed16F" %}
11785   ins_encode %{
11786     int vector_len = 2;
11787     Assembler::ComparisonPredicateFP cmp = Assembler::LE_OQ; //ordered non-signaling
11788     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
11789     KRegister mask = k0; // The comparison itself is not being masked.
11790     __ evcmpps(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
11791     __ evmovdqul($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
11792   %}
11793   ins_pipe( pipe_slow );
11794 %}
11795 
11796 instruct vcmpne2F(vecD dst, vecD src1, vecD src2) %{
11797   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
11798             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
11799             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
11800   match(Set dst (VectorMaskCmp src1 src2));
11801   format %{ "vcmpneps  $dst,$src1,$src2\t! cmpne packed2F" %}
11802   ins_encode %{
11803     int vector_len = 0;
11804     // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare.
11805     Assembler::ComparisonPredicateFP cmp = Assembler::NEQ_UQ; //unordered non-signaling
11806     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
11807   %}
11808   ins_pipe( pipe_slow );
11809 %}
11810 
11811 instruct vcmpne4F(vecX dst, vecX src1, vecX src2) %{
11812   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
11813             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
11814             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
11815   match(Set dst (VectorMaskCmp src1 src2));
11816   format %{ "vcmpneps  $dst,$src1,$src2\t! cmpne packed4F" %}
11817   ins_encode %{
11818     int vector_len = 0;
11819     // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare.
11820     Assembler::ComparisonPredicateFP cmp = Assembler::NEQ_UQ; //unordered non-signaling
11821     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
11822   %}
11823   ins_pipe( pipe_slow );
11824 %}
11825 
11826 instruct vcmpne8F(vecY dst, vecY src1, vecY src2) %{
11827   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
11828             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
11829             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
11830   match(Set dst (VectorMaskCmp src1 src2));
11831   format %{ "vcmpneps  $dst,$src1,$src2\t! cmpne packed8F" %}
11832   ins_encode %{
11833     int vector_len = 1;
11834     // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare.
11835     Assembler::ComparisonPredicateFP cmp = Assembler::NEQ_UQ; //unordered non-signaling
11836     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
11837   %}
11838   ins_pipe( pipe_slow );
11839 %}
11840 
11841 instruct vcmpne16F(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
11842   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 &&
11843             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
11844             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
11845   match(Set dst (VectorMaskCmp src1 src2));
11846   effect(TEMP dst, TEMP scratch);
11847   format %{ "vcmpneps  k2,$src1,$src2\n\t"
11848             "vmovdqu8  $dst, k2{z}, 0xFFFFFFFFFF \t! cmpne packed16F" %}
11849   ins_encode %{
11850     int vector_len = 2;
11851     // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare.
11852     Assembler::ComparisonPredicateFP cmp = Assembler::NEQ_UQ; //unordered non-signaling
11853     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
11854     KRegister mask = k0; // The comparison itself is not being masked.
11855     __ evcmpps(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
11856     __ evmovdqul($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
11857   %}
11858   ins_pipe( pipe_slow );
11859 %}
11860 
11861 instruct vcmpeq1D(vecD dst, vecD src1, vecD src2) %{
11862   predicate(UseAVX > 0 && n->as_Vector()->length() == 1 &&
11863             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
11864             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
11865   match(Set dst (VectorMaskCmp src1 src2));
11866   format %{ "vcmpeqpd  $dst,$src1,$src2\t! cmpeq packed1D" %}
11867   ins_encode %{
11868     int vector_len = 0;
11869     Assembler::ComparisonPredicateFP cmp = Assembler::EQ_OQ;  // ordered non-signaling
11870     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
11871   %}
11872   ins_pipe( pipe_slow );
11873 %}
11874 
11875 instruct vcmpeq2D(vecX dst, vecX src1, vecX src2) %{
11876   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
11877             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
11878             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
11879   match(Set dst (VectorMaskCmp src1 src2));
11880   format %{ "vcmpeqpd  $dst,$src1,$src2\t! cmpeq packed2D" %}
11881   ins_encode %{
11882     int vector_len = 0;
11883     Assembler::ComparisonPredicateFP cmp = Assembler::EQ_OQ;  // ordered non-signaling
11884     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
11885   %}
11886   ins_pipe( pipe_slow );
11887 %}
11888 
11889 instruct vcmpeq4D(vecY dst, vecY src1, vecY src2) %{
11890   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
11891             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
11892             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
11893   match(Set dst (VectorMaskCmp src1 src2));
11894   format %{ "vcmpeqpd  $dst,$src1,$src2\t! cmpeq packed4D" %}
11895   ins_encode %{
11896     int vector_len = 1;
11897     Assembler::ComparisonPredicateFP cmp = Assembler::EQ_OQ;  // ordered non-signaling
11898     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
11899   %}
11900   ins_pipe( pipe_slow );
11901 %}
11902 
11903 instruct vcmpeq8D(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
11904   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 &&
11905             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
11906             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
11907   match(Set dst (VectorMaskCmp src1 src2));
11908   effect(TEMP dst, TEMP scratch);
11909   format %{ "vcmpeqpd  k2,$src1,$src2\n\t"
11910             "vmovdqu8  $dst, k2{z}, 0xFFFFFFFFFF \t! cmpeq packed8D" %}
11911   ins_encode %{
11912     int vector_len = 2;
11913     Assembler::ComparisonPredicateFP cmp = Assembler::EQ_OQ;  // ordered non-signaling
11914     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
11915     KRegister mask = k0; // The comparison itself is not being masked.
11916     __ evcmppd(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
11917     __ evmovdqul($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
11918   %}
11919   ins_pipe( pipe_slow );
11920 %}
11921 
11922 instruct vcmplt1D(vecD dst, vecD src1, vecD src2) %{
11923   predicate(UseAVX > 0 && n->as_Vector()->length() == 1 &&
11924             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
11925             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
11926   match(Set dst (VectorMaskCmp src1 src2));
11927   format %{ "vcmpltpd  $dst,$src1,$src2\t! cmplt packed1D" %}
11928   ins_encode %{
11929     int vector_len = 0;
11930     Assembler::ComparisonPredicateFP cmp = Assembler::LT_OQ; //ordered non-signaling
11931     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
11932   %}
11933   ins_pipe( pipe_slow );
11934 %}
11935 
11936 instruct vcmplt2D(vecX dst, vecX src1, vecX src2) %{
11937   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
11938             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
11939             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
11940   match(Set dst (VectorMaskCmp src1 src2));
11941   format %{ "vcmpltpd  $dst,$src1,$src2\t! cmplt packed2D" %}
11942   ins_encode %{
11943     int vector_len = 0;
11944     Assembler::ComparisonPredicateFP cmp = Assembler::LT_OQ; //ordered non-signaling
11945     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
11946   %}
11947   ins_pipe( pipe_slow );
11948 %}
11949 
11950 instruct vcmplt4D(vecY dst, vecY src1, vecY src2) %{
11951   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
11952             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
11953             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
11954   match(Set dst (VectorMaskCmp src1 src2));
11955   format %{ "vcmpltpd  $dst,$src1,$src2\t! cmplt packed4D" %}
11956   ins_encode %{
11957     int vector_len = 1;
11958     Assembler::ComparisonPredicateFP cmp = Assembler::LT_OQ; //ordered non-signaling
11959     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
11960   %}
11961   ins_pipe( pipe_slow );
11962 %}
11963 
11964 instruct vcmplt8D(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
11965   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 &&
11966             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
11967             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
11968   match(Set dst (VectorMaskCmp src1 src2));
11969   effect(TEMP dst, TEMP scratch);
11970   format %{ "vcmpltpd  k2,$src1,$src2\n\t"
11971             "vmovdqu8  $dst, k2{z}, 0xFFFFFFFFFF \t! cmplt packed8D" %}
11972   ins_encode %{
11973     int vector_len = 2;
11974     Assembler::ComparisonPredicateFP cmp = Assembler::LT_OQ; //ordered non-signaling
11975     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
11976     KRegister mask = k0; // The comparison itself is not being masked.
11977     __ evcmppd(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
11978     __ evmovdqul($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
11979   %}
11980   ins_pipe( pipe_slow );
11981 %}
11982 
11983 instruct vcmpgt1D(vecD dst, vecD src1, vecD src2) %{
11984   predicate(UseAVX > 0 && n->as_Vector()->length() == 1 &&
11985             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
11986             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
11987   match(Set dst (VectorMaskCmp src1 src2));
11988   format %{ "vcmpgtpd  $dst,$src1,$src2\t! cmpgt packed1D" %}
11989   ins_encode %{
11990     int vector_len = 0;
11991     Assembler::ComparisonPredicateFP cmp = Assembler::GT_OQ; //ordered non-signaling
11992     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
11993   %}
11994   ins_pipe( pipe_slow );
11995 %}
11996 
11997 instruct vcmpgt2D(vecX dst, vecX src1, vecX src2) %{
11998   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
11999             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
12000             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
12001   match(Set dst (VectorMaskCmp src1 src2));
12002   format %{ "vcmpgtpd  $dst,$src1,$src2\t! cmpgt packed2D" %}
12003   ins_encode %{
12004     int vector_len = 0;
12005     Assembler::ComparisonPredicateFP cmp = Assembler::GT_OQ; //ordered non-signaling
12006     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
12007   %}
12008   ins_pipe( pipe_slow );
12009 %}
12010 
12011 instruct vcmpgt4D(vecY dst, vecY src1, vecY src2) %{
12012   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
12013             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
12014             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
12015   match(Set dst (VectorMaskCmp src1 src2));
12016   format %{ "vcmpgtpd  $dst,$src1,$src2\t! cmpgt packed4D" %}
12017   ins_encode %{
12018     int vector_len = 1;
12019     Assembler::ComparisonPredicateFP cmp = Assembler::GT_OQ; //ordered non-signaling
12020     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
12021   %}
12022   ins_pipe( pipe_slow );
12023 %}
12024 
12025 instruct vcmpgt8D(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
12026   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 &&
12027             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
12028             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
12029   match(Set dst (VectorMaskCmp src1 src2));
12030   effect(TEMP dst, TEMP scratch);
12031   format %{ "vcmpgtpd  k2,$src1,$src2\n\t"
12032             "vmovdqu32 $dst, k2{z}, 0xFFFFFFFFFF \t! cmpgt packed8D" %}
12033   ins_encode %{
12034     int vector_len = 2;
12035     Assembler::ComparisonPredicateFP cmp = Assembler::GT_OQ; //ordered non-signaling
12036     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
12037     KRegister mask = k0; // The comparison itself is not being masked.
12038     __ evcmppd(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
12039     __ evmovdqul($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
12040   %}
12041   ins_pipe( pipe_slow );
12042 %}
12043 
12044 instruct vcmpge1D(vecD dst, vecD src1, vecD src2) %{
12045   predicate(UseAVX > 0 && n->as_Vector()->length() == 1 &&
12046             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
12047             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
12048   match(Set dst (VectorMaskCmp src1 src2));
12049   format %{ "vcmpgepd  $dst,$src1,$src2\t! cmpge packed1D" %}
12050   ins_encode %{
12051     int vector_len = 0;
12052     Assembler::ComparisonPredicateFP cmp = Assembler::GE_OQ; //ordered non-signaling
12053     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
12054   %}
12055   ins_pipe( pipe_slow );
12056 %}
12057 
12058 instruct vcmpge2D(vecX dst, vecX src1, vecX src2) %{
12059   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
12060             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
12061             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
12062   match(Set dst (VectorMaskCmp src1 src2));
12063   format %{ "vcmpgepd  $dst,$src1,$src2\t! cmpge packed2D" %}
12064   ins_encode %{
12065     int vector_len = 0;
12066     Assembler::ComparisonPredicateFP cmp = Assembler::GE_OQ; //ordered non-signaling
12067     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
12068   %}
12069   ins_pipe( pipe_slow );
12070 %}
12071 
12072 instruct vcmpge4D(vecY dst, vecY src1, vecY src2) %{
12073   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
12074             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
12075             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
12076   match(Set dst (VectorMaskCmp src1 src2));
12077   format %{ "vcmpgepd  $dst,$src1,$src2\t! cmpge packed4D" %}
12078   ins_encode %{
12079     int vector_len = 1;
12080     Assembler::ComparisonPredicateFP cmp = Assembler::GE_OQ; //ordered non-signaling
12081     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
12082   %}
12083   ins_pipe( pipe_slow );
12084 %}
12085 
12086 instruct vcmpge8D(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
12087   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 &&
12088             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
12089             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
12090   match(Set dst (VectorMaskCmp src1 src2));
12091   effect(TEMP dst, TEMP scratch);
12092   format %{ "vcmpgepd  k2,$src1,$src2\n\t"
12093             "vmovdqu32 $dst, k2{z}, 0xFFFFFFFFFF \t! cmpge packed8D" %}
12094   ins_encode %{
12095     int vector_len = 2;
12096     Assembler::ComparisonPredicateFP cmp = Assembler::GE_OQ; //ordered non-signaling
12097     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
12098     KRegister mask = k0; // The comparison itself is not being masked.
12099     __ evcmppd(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
12100     __ evmovdqul($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
12101   %}
12102   ins_pipe( pipe_slow );
12103 %}
12104 
12105 instruct vcmple1D(vecD dst, vecD src1, vecD src2) %{
12106   predicate(UseAVX > 0 && n->as_Vector()->length() == 1 &&
12107             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
12108             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
12109   match(Set dst (VectorMaskCmp src1 src2));
12110   format %{ "vcmplepd  $dst,$src1,$src2\t! cmple packed1D" %}
12111   ins_encode %{
12112     int vector_len = 0;
12113     Assembler::ComparisonPredicateFP cmp = Assembler::LE_OQ; //ordered non-signaling
12114     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
12115   %}
12116   ins_pipe( pipe_slow );
12117 %}
12118 
12119 instruct vcmple2D(vecX dst, vecX src1, vecX src2) %{
12120   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
12121             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
12122             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
12123   match(Set dst (VectorMaskCmp src1 src2));
12124   format %{ "vcmplepd  $dst,$src1,$src2\t! cmple packed2D" %}
12125   ins_encode %{
12126     int vector_len = 0;
12127     Assembler::ComparisonPredicateFP cmp = Assembler::LE_OQ; //ordered non-signaling
12128     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
12129   %}
12130   ins_pipe( pipe_slow );
12131 %}
12132 
12133 instruct vcmple4D(vecY dst, vecY src1, vecY src2) %{
12134   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
12135             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
12136             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
12137   match(Set dst (VectorMaskCmp src1 src2));
12138   format %{ "vcmplepd  $dst,$src1,$src2\t! cmple packed4D" %}
12139   ins_encode %{
12140     int vector_len = 1;
12141     Assembler::ComparisonPredicateFP cmp = Assembler::LE_OQ; //ordered non-signaling
12142     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
12143   %}
12144   ins_pipe( pipe_slow );
12145 %}
12146 
12147 instruct vcmple8D(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
12148   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 &&
12149             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
12150             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
12151   match(Set dst (VectorMaskCmp src1 src2));
12152   effect(TEMP dst, TEMP scratch);
12153   format %{ "vcmplepd  k2,$src1,$src2\n\t"
12154             "vmovdqu32 $dst, k2{z}, 0xFFFFFFFFFF \t! cmple packed8D" %}
12155   ins_encode %{
12156     int vector_len = 2;
12157     Assembler::ComparisonPredicateFP cmp = Assembler::LE_OQ; //ordered non-signaling
12158     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
12159     KRegister mask = k0; // The comparison itself is not being masked.
12160     __ evcmppd(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
12161     __ evmovdqul($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
12162   %}
12163   ins_pipe( pipe_slow );
12164 %}
12165 
12166 instruct vcmpne1D(vecD dst, vecD src1, vecD src2) %{
12167   predicate(UseAVX > 0 && n->as_Vector()->length() == 1 &&
12168             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
12169             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
12170   match(Set dst (VectorMaskCmp src1 src2));
12171   format %{ "vcmpnepd  $dst,$src1,$src2\t! cmpne packed1D" %}
12172   ins_encode %{
12173     int vector_len = 0;
12174     // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare.
12175     Assembler::ComparisonPredicateFP cmp = Assembler::NEQ_UQ; //unordered non-signaling
12176     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
12177   %}
12178   ins_pipe( pipe_slow );
12179 %}
12180 
12181 instruct vcmpne2D(vecX dst, vecX src1, vecX src2) %{
12182   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
12183             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
12184             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
12185   match(Set dst (VectorMaskCmp src1 src2));
12186   format %{ "vcmpnepd  $dst,$src1,$src2\t! cmpne packed2D" %}
12187   ins_encode %{
12188     int vector_len = 0;
12189     // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare.
12190     Assembler::ComparisonPredicateFP cmp = Assembler::NEQ_UQ; //unordered non-signaling
12191     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
12192   %}
12193   ins_pipe( pipe_slow );
12194 %}
12195 
12196 instruct vcmpne4D(vecY dst, vecY src1, vecY src2) %{
12197   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
12198             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
12199             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
12200   match(Set dst (VectorMaskCmp src1 src2));
12201   format %{ "vcmpnepd  $dst,$src1,$src2\t! cmpne packed4D" %}
12202   ins_encode %{
12203     int vector_len = 1;
12204     // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare.
12205     Assembler::ComparisonPredicateFP cmp = Assembler::NEQ_UQ; //unordered non-signaling
12206     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
12207   %}
12208   ins_pipe( pipe_slow );
12209 %}
12210 
12211 instruct vcmpne8D(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
12212   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 &&
12213             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
12214             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
12215   match(Set dst (VectorMaskCmp src1 src2));
12216   effect(TEMP dst, TEMP scratch);
12217   format %{ "vcmpnepd  k2,$src1,$src2\n\t"
12218             "vmovdqu32 $dst, k2{z}, 0xFFFFFFFFFF \t! cmpne packed8D" %}
12219   ins_encode %{
12220     int vector_len = 2;
12221     // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare.
12222     Assembler::ComparisonPredicateFP cmp = Assembler::NEQ_UQ; //unordered non-signaling
12223     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
12224     KRegister mask = k0; // The comparison itself is not being masked.
12225     __ evcmppd(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
12226     __ evmovdqul($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
12227   %}
12228   ins_pipe( pipe_slow );
12229 %}
12230 
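// Packed integer compares.  AVX/AVX2 only provide "equal" (vpcmpeq*) and
// signed "greater-than" (vpcmpgt*), so the remaining predicates are
// synthesized from them:
//   lt : gt with the operands swapped
//   ge : gt with the operands swapped, then inverted (vpxor with all ones)
//   le : gt, then inverted
//   ne : eq, then inverted
// The 512-bit (vecZ) forms instead compare into the opmask register k2
// (hardcoded, since C2 does not allocate k registers) and then materialize
// the boolean vector with a zero-masked load of an all-ones constant.
// As an illustrative sketch, cmpge on packed16I is emitted roughly as:
//   vpcmpnltd  k2,src1,src2                       (compare into the opmask)
//   vmovdqu32  dst, k2{z}, [vector_all_bits_set]  (expand mask to all-ones lanes)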
12231 instruct vcmpeq2I(vecD dst, vecD src1, vecD src2) %{
12232   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
12233             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
12234             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
12235   match(Set dst (VectorMaskCmp src1 src2));
12236   format %{ "vpcmpeqd  $dst,$src1,$src2\t! cmpeq packed2I" %}
12237   ins_encode %{
12238     int vector_len = 0;
12239     __ vpcmpeqd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12240   %}
12241   ins_pipe( pipe_slow );
12242 %}
12243 
12244 instruct vcmpeq4I(vecX dst, vecX src1, vecX src2) %{
12245   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
12246             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
12247             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
12248   match(Set dst (VectorMaskCmp src1 src2));
12249   format %{ "vpcmpeqd  $dst,$src1,$src2\t! cmpeq packed4I" %}
12250   ins_encode %{
12251     int vector_len = 0;
12252     __ vpcmpeqd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12253   %}
12254   ins_pipe( pipe_slow );
12255 %}
12256 
12257 instruct vcmpeq8I(vecY dst, vecY src1, vecY src2) %{
12258   predicate(UseAVX > 1 && n->as_Vector()->length() == 8 &&
12259             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
12260             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
12261   match(Set dst (VectorMaskCmp src1 src2));
12262   format %{ "vpcmpeqd  $dst,$src1,$src2\t! cmpeq packed8I" %}
12263   ins_encode %{
12264     int vector_len = 1;
12265     __ vpcmpeqd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12266   %}
12267   ins_pipe( pipe_slow );
12268 %}
12269 
12270 instruct vcmpeq16I(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
12271   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 &&
12272             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
12273             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
12274   match(Set dst (VectorMaskCmp src1 src2));
12275   effect(TEMP dst, TEMP scratch);
12276   format %{ "vpcmpeqd  k2,$src1,$src2\n\t"
12277             "vmovdqu32 $dst, k2{z}, 0xFFFFFFFFFF \t! cmpeq packed16I" %}
12278   ins_encode %{
12279     int vector_len = 2;
12280     Assembler::ComparisonPredicate cmp = Assembler::eq;
12281     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
12282     KRegister mask = k0; // The comparison itself is not being masked.
12283     __ evpcmpd(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
12284     __ evmovdqul($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
12285   %}
12286   ins_pipe( pipe_slow );
12287 %}
12288 
12289 instruct vcmplt2I(vecD dst, vecD src1, vecD src2) %{
12290   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
12291             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
12292             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
12293   match(Set dst (VectorMaskCmp src1 src2));
12294   format %{ "vpcmpgtd  $dst,$src2,$src1\t! cmplt packed2I" %}
12295   ins_encode %{
12296     int vector_len = 0;
12297     __ vpcmpgtd($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
12298   %}
12299   ins_pipe( pipe_slow );
12300 %}
12301 
12302 instruct vcmplt4I(vecX dst, vecX src1, vecX src2) %{
12303   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
12304             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
12305             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
12306   match(Set dst (VectorMaskCmp src1 src2));
12307   format %{ "vpcmpgtd  $dst,$src2,$src1\t! cmplt packed4I" %}
12308   ins_encode %{
12309     int vector_len = 0;
12310     __ vpcmpgtd($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
12311   %}
12312   ins_pipe( pipe_slow );
12313 %}
12314 
12315 instruct vcmplt8I(vecY dst, vecY src1, vecY src2) %{
12316   predicate(UseAVX > 1 && n->as_Vector()->length() == 8 &&
12317             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
12318             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
12319   match(Set dst (VectorMaskCmp src1 src2));
12320   format %{ "vpcmpgtd  $dst,$src2,$src1\t! cmplt packed8I" %}
12321   ins_encode %{
12322     int vector_len = 1;
12323     __ vpcmpgtd($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
12324   %}
12325   ins_pipe( pipe_slow );
12326 %}
12327 
12328 instruct vcmplt16I(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
12329   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 &&
12330             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
12331             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
12332   match(Set dst (VectorMaskCmp src1 src2));
12333   effect(TEMP dst, TEMP scratch);
12334   format %{ "vpcmpltd  k2,$src1,$src2\n\t"
12335             "vmovdqu32   $dst, k2{z}, 0xFFFFFFFFFF \t! cmplt packed16I" %}
12336   ins_encode %{
12337     int vector_len = 2;
12338     Assembler::ComparisonPredicate cmp = Assembler::lt;
12339     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
12340     KRegister mask = k0; // The comparison itself is not being masked.
12341     __ evpcmpd(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
12342     __ evmovdqul($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
12343   %}
12344   ins_pipe( pipe_slow );
12345 %}
12346 
12347 instruct vcmpgt2I(vecD dst, vecD src1, vecD src2) %{
12348   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
12349             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
12350             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
12351   match(Set dst (VectorMaskCmp src1 src2));
12352   format %{ "vpcmpgtd  $dst,$src1,$src2\t! cmpgt packed2I" %}
12353   ins_encode %{
12354     int vector_len = 0;
12355     __ vpcmpgtd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12356   %}
12357   ins_pipe( pipe_slow );
12358 %}
12359 
12360 instruct vcmpgt4I(vecX dst, vecX src1, vecX src2) %{
12361   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
12362             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
12363             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
12364   match(Set dst (VectorMaskCmp src1 src2));
12365   format %{ "vpcmpgtd  $dst,$src1,$src2\t! cmpgt packed4I" %}
12366   ins_encode %{
12367     int vector_len = 0;
12368     __ vpcmpgtd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12369   %}
12370   ins_pipe( pipe_slow );
12371 %}
12372 
12373 instruct vcmpgt8I(vecY dst, vecY src1, vecY src2) %{
12374   predicate(UseAVX > 1 && n->as_Vector()->length() == 8 &&
12375             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
12376             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
12377   match(Set dst (VectorMaskCmp src1 src2));
12378   format %{ "vpcmpgtd  $dst,$src1,$src2\t! cmpgt packed8I" %}
12379   ins_encode %{
12380     int vector_len = 1;
12381     __ vpcmpgtd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12382   %}
12383   ins_pipe( pipe_slow );
12384 %}
12385 
12386 instruct vcmpgt16I(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
12387   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 &&
12388             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
12389             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
12390   match(Set dst (VectorMaskCmp src1 src2));
12391   effect(TEMP dst, TEMP scratch);
12392   format %{ "vpcmpnled  k2,$src1,$src2\n\t"
12393             "vmovdqu32   $dst, k2{z}, 0xFFFFFFFFFF \t! cmpgt packed16I" %}
12394   ins_encode %{
12395     int vector_len = 2;
12396     Assembler::ComparisonPredicate cmp = Assembler::nle;
12397     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
12398     KRegister mask = k0; // The comparison itself is not being masked.
12399     __ evpcmpd(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
12400     __ evmovdqul($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
12401   %}
12402   ins_pipe( pipe_slow );
12403 %}
12404 
12405 instruct vcmpge2I(vecD dst, vecD src1, vecD src2, rRegL scratch) %{
12406   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
12407             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
12408             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
12409   match(Set dst (VectorMaskCmp src1 src2));
12410   effect(TEMP scratch);
12411   format %{ "vpcmpgtd  $dst,$src2,$src1\n\t"
12412             "vpxor $dst,$dst,0xFFFFFFFF\t! cmpge packed2I" %}
12413   ins_encode %{
12414     int vector_len = 0;
12415     __ vpcmpgtd($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
12416     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
12417   %}
12418   ins_pipe( pipe_slow );
12419 %}
12420 
12421 instruct vcmpge4I(vecX dst, vecX src1, vecX src2, rRegL scratch) %{
12422   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
12423             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
12424             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
12425   match(Set dst (VectorMaskCmp src1 src2));
12426   effect(TEMP scratch);
12427   format %{ "vpcmpgtd  $dst,$src2,$src1\n\t"
12428             "vpxor $dst,$dst,0xFFFFFFFF\t! cmpge packed4I" %}
12429   ins_encode %{
12430     int vector_len = 0;
12431     __ vpcmpgtd($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
12432     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
12433   %}
12434   ins_pipe( pipe_slow );
12435 %}
12436 
12437 instruct vcmpge8I(vecY dst, vecY src1, vecY src2, rRegL scratch) %{
12438   predicate(UseAVX > 1 && n->as_Vector()->length() == 8 &&
12439             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
12440             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
12441   match(Set dst (VectorMaskCmp src1 src2));
12442   effect(TEMP scratch);
12443   format %{ "vpcmpgtd  $dst,$src2,$src1\n\t"
12444             "vpxor $dst,$dst,0xFFFFFFFF\t! cmpge packed8I" %}
12445   ins_encode %{
12446     int vector_len = 1;
12447     __ vpcmpgtd($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
12448     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
12449   %}
12450   ins_pipe( pipe_slow );
12451 %}
12452 
12453 instruct vcmpge16I(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
12454   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 &&
12455             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
12456             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
12457   match(Set dst (VectorMaskCmp src1 src2));
12458   effect(TEMP dst, TEMP scratch);
12459   format %{ "vpcmpnltd  k2,$src1,$src2\n\t"
12460             "vmovdqu32   $dst, k2{z}, 0xFFFFFFFFFF \t! cmpge packed16I" %}
12461   ins_encode %{
12462     int vector_len = 2;
12463     Assembler::ComparisonPredicate cmp = Assembler::nlt;
12464     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
12465     KRegister mask = k0; // The comparison itself is not being masked.
12466     __ evpcmpd(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
12467     __ evmovdqul($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
12468   %}
12469   ins_pipe( pipe_slow );
12470 %}
12471 
12472 instruct vcmple2I(vecD dst, vecD src1, vecD src2, rRegL scratch) %{
12473   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
12474             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
12475             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
12476   match(Set dst (VectorMaskCmp src1 src2));
12477   effect(TEMP scratch);
12478   format %{ "vpcmpgtd  $dst,$src1,$src2\n\t"
12479             "vpxor $dst,$dst,0xFFFFFFFF\t! cmple packed2I" %}
12480   ins_encode %{
12481     int vector_len = 0;
12482     __ vpcmpgtd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12483     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
12484   %}
12485   ins_pipe( pipe_slow );
12486 %}
12487 
12488 instruct vcmple4I(vecX dst, vecX src1, vecX src2, rRegL scratch) %{
12489   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
12490             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
12491             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
12492   match(Set dst (VectorMaskCmp src1 src2));
12493   effect(TEMP scratch);
12494   format %{ "vpcmpgtd  $dst,$src1,$src2\n\t"
12495             "vpxor $dst,$dst,0xFFFFFFFF\t! cmple packed4I" %}
12496   ins_encode %{
12497     int vector_len = 0;
12498     __ vpcmpgtd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12499     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
12500   %}
12501   ins_pipe( pipe_slow );
12502 %}
12503 
12504 instruct vcmple8I(vecY dst, vecY src1, vecY src2, rRegL scratch) %{
12505   predicate(UseAVX > 1 && n->as_Vector()->length() == 8 &&
12506             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
12507             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
12508   match(Set dst (VectorMaskCmp src1 src2));
12509   effect(TEMP scratch);
12510   format %{ "vpcmpgtd  $dst,$src1,$src2\n\t"
12511             "vpxor $dst,$dst,0xFFFFFFFF\t! cmple packed8I" %}
12512   ins_encode %{
12513     int vector_len = 1;
12514     __ vpcmpgtd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12515     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
12516   %}
12517   ins_pipe( pipe_slow );
12518 %}
12519 
12520 instruct vcmple16I(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
12521   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 &&
12522             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
12523             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
12524   match(Set dst (VectorMaskCmp src1 src2));
12525   effect(TEMP dst, TEMP scratch);
12526   format %{ "vpcmpled  k2,$src1,$src2\n\t"
12527             "vmovdqu32   $dst, k2{z}, 0xFFFFFFFFFF \t! cmple packed16I" %}
12528   ins_encode %{
12529     int vector_len = 2;
12530     Assembler::ComparisonPredicate cmp = Assembler::le;
12531     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
12532     KRegister mask = k0; // The comparison itself is not being masked.
12533     __ evpcmpd(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
12534     __ evmovdqul($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
12535   %}
12536   ins_pipe( pipe_slow );
12537 %}
12538 
12539 instruct vcmpne2I(vecD dst, vecD src1, vecD src2, rRegL scratch) %{
12540   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
12541             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
12542             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
12543   match(Set dst (VectorMaskCmp src1 src2));
12544   effect(TEMP scratch);
12545   format %{ "vpcmpeqd  $dst,$src1,$src2\n\t"
12546             "vpxor $dst,$dst,0xFFFFFFFF\t! cmpneq packed2I" %}
12547   ins_encode %{
12548     int vector_len = 0;
12549     __ vpcmpeqd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12550     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
12551   %}
12552   ins_pipe( pipe_slow );
12553 %}
12554 
12555 instruct vcmpne4I(vecX dst, vecX src1, vecX src2, rRegL scratch) %{
12556   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
12557             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
12558             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
12559   match(Set dst (VectorMaskCmp src1 src2));
12560   effect(TEMP scratch);
12561   format %{ "vpcmpeqd  $dst,$src1,$src2\n\t"
12562             "vpxor $dst,$dst,0xFFFFFFFF\t! cmpneq packed4I" %}
12563   ins_encode %{
12564     int vector_len = 0;
12565     __ vpcmpeqd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12566     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
12567   %}
12568   ins_pipe( pipe_slow );
12569 %}
12570 
12571 instruct vcmpne8I(vecY dst, vecY src1, vecY src2, rRegL scratch) %{
12572   predicate(UseAVX > 1 && n->as_Vector()->length() == 8 &&
12573             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
12574             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
12575   match(Set dst (VectorMaskCmp src1 src2));
12576   effect(TEMP scratch);
12577   format %{ "vpcmpeqd  $dst,$src1,$src2\n\t"
12578             "vpxor $dst,$dst,0xFFFFFFFF\t! cmpneq packed8I" %}
12579   ins_encode %{
12580     int vector_len = 1;
12581     __ vpcmpeqd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12582     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
12583   %}
12584   ins_pipe( pipe_slow );
12585 %}
12586 
12587 instruct vcmpne16I(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
12588   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 &&
12589             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
12590             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
12591   match(Set dst (VectorMaskCmp src1 src2));
12592   effect(TEMP dst, TEMP scratch);
12593   format %{ "vpcmpneqd  k2,$src1,$src2\n\t"
12594             "vmovdqu32   $dst, k2{z}, 0xFFFFFFFFFF \t! cmpneq packed16I" %}
12595   ins_encode %{
12596     int vector_len = 2;
12597     Assembler::ComparisonPredicate cmp = Assembler::neq;
12598     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
12599     KRegister mask = k0; // The comparison itself is not being masked.
12600     __ evpcmpd(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
12601     __ evmovdqul($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
12602   %}
12603   ins_pipe( pipe_slow );
12604 %}
12605 
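// Packed byte compares follow the same scheme using vpcmpeqb/vpcmpgtb; the
// 64-byte (vecZ) forms additionally require AVX512BW and go through k2 with
// evpcmpb as above.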
12606 instruct vcmpeq8B(vecD dst, vecD src1, vecD src2) %{
12607   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
12608             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
12609             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
12610   match(Set dst (VectorMaskCmp src1 src2));
12611   format %{ "vpcmpeqb  $dst,$src1,$src2\t! cmpeq packed8B" %}
12612   ins_encode %{
12613     int vector_len = 0;
12614     __ vpcmpeqb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12615   %}
12616   ins_pipe( pipe_slow );
12617 %}
12618 
12619 instruct vcmpeq16B(vecX dst, vecX src1, vecX src2) %{
12620   predicate(UseAVX > 0 && n->as_Vector()->length() == 16 &&
12621             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
12622             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
12623   match(Set dst (VectorMaskCmp src1 src2));
12624   format %{ "vpcmpeqb  $dst,$src1,$src2\t! cmpeq packed16B" %}
12625   ins_encode %{
12626     int vector_len = 0;
12627     __ vpcmpeqb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12628   %}
12629   ins_pipe( pipe_slow );
12630 %}
12631 
12632 instruct vcmpeq32B(vecY dst, vecY src1, vecY src2) %{
12633   predicate(UseAVX > 1 && n->as_Vector()->length() == 32 &&
12634             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
12635             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
12636   match(Set dst (VectorMaskCmp src1 src2));
12637   format %{ "vpcmpeqb  $dst,$src1,$src2\t! cmpeq packed32B" %}
12638   ins_encode %{
12639     int vector_len = 1;
12640     __ vpcmpeqb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12641   %}
12642   ins_pipe( pipe_slow );
12643 %}
12644 
12645 instruct vcmpeq64B(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
12646   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64 &&
12647             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
12648             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
12649   match(Set dst (VectorMaskCmp src1 src2));
12650   effect(TEMP dst, TEMP scratch);
12651   format %{ "vpcmpeqb  k2,$src1,$src2\n\t"
12652             "vmovdqu8 $dst, k2{z}, 0xFFFFFFFFFF \t! cmpeq packed64B" %}
12653   ins_encode %{
12654     int vector_len = 2;
12655     Assembler::ComparisonPredicate cmp = Assembler::eq;
12656     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
12657     KRegister mask = k0; // The comparison itself is not being masked.
12658     __ evpcmpb(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
12659     __ evmovdqub($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
12660   %}
12661   ins_pipe( pipe_slow );
12662 %}
12663 
12664 instruct vcmplt8B(vecD dst, vecD src1, vecD src2) %{
12665   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
12666             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
12667             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
12668   match(Set dst (VectorMaskCmp src1 src2));
12669   format %{ "vpcmpgtb  $dst,$src2,$src1\t! cmplt packed8B" %}
12670   ins_encode %{
12671     int vector_len = 0;
12672     __ vpcmpgtb($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
12673   %}
12674   ins_pipe( pipe_slow );
12675 %}
12676 
12677 instruct vcmplt16B(vecX dst, vecX src1, vecX src2) %{
12678   predicate(UseAVX > 0 && n->as_Vector()->length() == 16 &&
12679             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
12680             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
12681   match(Set dst (VectorMaskCmp src1 src2));
12682   format %{ "vpcmpgtb  $dst,$src2,$src1\t! cmplt packed16B" %}
12683   ins_encode %{
12684     int vector_len = 0;
12685     __ vpcmpgtb($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
12686   %}
12687   ins_pipe( pipe_slow );
12688 %}
12689 
12690 instruct vcmplt32B(vecY dst, vecY src1, vecY src2) %{
12691   predicate(UseAVX > 1 && n->as_Vector()->length() == 32 &&
12692             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
12693             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
12694   match(Set dst (VectorMaskCmp src1 src2));
12695   format %{ "vpcmpgtb  $dst,$src2,$src1\t! cmplt packed32B" %}
12696   ins_encode %{
12697     int vector_len = 1;
12698     __ vpcmpgtb($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
12699   %}
12700   ins_pipe( pipe_slow );
12701 %}
12702 
12703 instruct vcmplt64B(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
12704   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64 &&
12705             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
12706             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
12707   match(Set dst (VectorMaskCmp src1 src2));
12708   effect(TEMP dst, TEMP scratch);
12709   format %{ "vpcmpltb  k2,$src1,$src2\n\t"
12710             "vmovdqu8   $dst, k2{z}, 0xFFFFFFFFFF \t! cmplt packed64B" %}
12711   ins_encode %{
12712     int vector_len = 2;
12713     Assembler::ComparisonPredicate cmp = Assembler::lt;
12714     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
12715     KRegister mask = k0; // The comparison itself is not being masked.
12716     __ evpcmpb(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
12717     __ evmovdqub($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
12718   %}
12719   ins_pipe( pipe_slow );
12720 %}
12721 
12722 instruct vcmpgt8B(vecD dst, vecD src1, vecD src2) %{
12723   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
12724             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
12725             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
12726   match(Set dst (VectorMaskCmp src1 src2));
12727   format %{ "vpcmpgtb  $dst,$src1,$src2\t! cmpgt packed8B" %}
12728   ins_encode %{
12729     int vector_len = 0;
12730     __ vpcmpgtb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12731   %}
12732   ins_pipe( pipe_slow );
12733 %}
12734 
12735 instruct vcmpgt16B(vecX dst, vecX src1, vecX src2) %{
12736   predicate(UseAVX > 0 && n->as_Vector()->length() == 16 &&
12737             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
12738             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
12739   match(Set dst (VectorMaskCmp src1 src2));
12740   format %{ "vpcmpgtb  $dst,$src1,$src2\t! cmpgt packed16B" %}
12741   ins_encode %{
12742     int vector_len = 0;
12743     __ vpcmpgtb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12744   %}
12745   ins_pipe( pipe_slow );
12746 %}
12747 
12748 instruct vcmpgt32B(vecY dst, vecY src1, vecY src2) %{
12749   predicate(UseAVX > 1 && n->as_Vector()->length() == 32 &&
12750             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
12751             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
12752   match(Set dst (VectorMaskCmp src1 src2));
12753   format %{ "vpcmpgtb  $dst,$src1,$src2\t! cmpgt packed32B" %}
12754   ins_encode %{
12755     int vector_len = 1;
12756     __ vpcmpgtb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12757   %}
12758   ins_pipe( pipe_slow );
12759 %}
12760 
12761 instruct vcmpgt64B(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
12762   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64 &&
12763             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
12764             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
12765   match(Set dst (VectorMaskCmp src1 src2));
12766   effect(TEMP dst, TEMP scratch);
12767   format %{ "vpcmpnleb  k2,$src1,$src2\n\t"
12768             "vmovdqu8   $dst, k2{z}, 0xFFFFFFFFFF \t! cmpgt packed64B" %}
12769   ins_encode %{
12770     int vector_len = 2;
12771     Assembler::ComparisonPredicate cmp = Assembler::nle;
12772     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
12773     KRegister mask = k0; // The comparison itself is not being masked.
12774     __ evpcmpb(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
12775     __ evmovdqub($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
12776   %}
12777   ins_pipe( pipe_slow );
12778 %}
12779 
12780 instruct vcmpge8B(vecD dst, vecD src1, vecD src2, rRegL scratch) %{
12781   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
12782             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
12783             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
12784   match(Set dst (VectorMaskCmp src1 src2));
12785   effect(TEMP scratch);
12786   format %{ "vpcmpgtb  $dst,$src2,$src1\n\t"
12787             "vpxor $dst,$dst,0xFFFFFFFF\t! cmpge packed8B" %}
12788   ins_encode %{
12789     int vector_len = 0;
12790     __ vpcmpgtb($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
12791     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
12792   %}
12793   ins_pipe( pipe_slow );
12794 %}
12795 
12796 instruct vcmpge16B(vecX dst, vecX src1, vecX src2, rRegL scratch) %{
12797   predicate(UseAVX > 0 && n->as_Vector()->length() == 16 &&
12798             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
12799             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
12800   match(Set dst (VectorMaskCmp src1 src2));
12801   effect(TEMP scratch);
12802   format %{ "vpcmpgtb  $dst,$src2,$src1\n\t"
12803             "vpxor $dst,$dst,0xFFFFFFFF\t! cmpge packed16B" %}
12804   ins_encode %{
12805     int vector_len = 0;
12806     __ vpcmpgtb($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
12807     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
12808   %}
12809   ins_pipe( pipe_slow );
12810 %}
12811 
12812 instruct vcmpge32B(vecY dst, vecY src1, vecY src2, rRegL scratch) %{
12813   predicate(UseAVX > 1 && n->as_Vector()->length() == 32 &&
12814             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
12815             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
12816   match(Set dst (VectorMaskCmp src1 src2));
12817   effect(TEMP scratch);
12818   format %{ "vpcmpgtb  $dst,$src2,$src1\n\t"
12819             "vpxor $dst,$dst,0xFFFFFFFF\t! cmpge packed32B" %}
12820   ins_encode %{
12821     int vector_len = 1;
12822     __ vpcmpgtb($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
12823     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
12824   %}
12825   ins_pipe( pipe_slow );
12826 %}
12827 
12828 instruct vcmpge64B(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
12829   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64 &&
12830             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
12831             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
12832   match(Set dst (VectorMaskCmp src1 src2));
12833   effect(TEMP dst, TEMP scratch);
12834   format %{ "vpcmpnltb  k2,$src1,$src2\n\t"
12835             "vmovdqu8   $dst, k2{z}, 0xFFFFFFFFFF \t! cmpge packed64B" %}
12836   ins_encode %{
12837     int vector_len = 2;
12838     Assembler::ComparisonPredicate cmp = Assembler::nlt;
12839     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
12840     KRegister mask = k0; // The comparison itself is not being masked.
12841     __ evpcmpb(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
12842     __ evmovdqub($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
12843   %}
12844   ins_pipe( pipe_slow );
12845 %}
12846 
12847 instruct vcmple8B(vecD dst, vecD src1, vecD src2, rRegL scratch) %{
12848   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
12849             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
12850             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
12851   match(Set dst (VectorMaskCmp src1 src2));
12852   effect(TEMP scratch);
12853   format %{ "vpcmpgtb  $dst,$src1,$src2\n\t"
12854             "vpxor $dst,$dst,0xFFFFFFFF\t! cmple packed8B" %}
12855   ins_encode %{
12856     int vector_len = 0;
12857     __ vpcmpgtb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12858     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
12859   %}
12860   ins_pipe( pipe_slow );
12861 %}
12862 
12863 instruct vcmple16B(vecX dst, vecX src1, vecX src2, rRegL scratch) %{
12864   predicate(UseAVX > 0 && n->as_Vector()->length() == 16 &&
12865             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
12866             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
12867   match(Set dst (VectorMaskCmp src1 src2));
12868   effect(TEMP scratch);
12869   format %{ "vpcmpgtb  $dst,$src1,$src2\n\t"
12870             "vpxor $dst,$dst,0xFFFFFFFF\t! cmple packed16B" %}
12871   ins_encode %{
12872     int vector_len = 0;
12873     __ vpcmpgtb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12874     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
12875   %}
12876   ins_pipe( pipe_slow );
12877 %}
12878 
12879 instruct vcmple32B(vecY dst, vecY src1, vecY src2, rRegL scratch) %{
12880   predicate(UseAVX > 1 && n->as_Vector()->length() == 32 &&
12881             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
12882             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
12883   match(Set dst (VectorMaskCmp src1 src2));
12884   effect(TEMP scratch);
12885   format %{ "vpcmpgtb  $dst,$src1,$src2\n\t"
12886             "vpxor $dst,$dst,0xFFFFFFFF\t! cmple packed32B" %}
12887   ins_encode %{
12888     int vector_len = 1;
12889     __ vpcmpgtb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12890     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
12891   %}
12892   ins_pipe( pipe_slow );
12893 %}
12894 
12895 instruct vcmple64B(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
12896   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64 &&
12897             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
12898             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
12899   match(Set dst (VectorMaskCmp src1 src2));
12900   effect(TEMP dst, TEMP scratch);
12901   format %{ "vpcmpleb  k2,$src1,$src2\n\t"
12902             "vmovdqu8   $dst, k2{z}, 0xFFFFFFFFFF \t! cmple packed64B" %}
12903   ins_encode %{
12904     int vector_len = 2;
12905     Assembler::ComparisonPredicate cmp = Assembler::le;
12906     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
12907     KRegister mask = k0; // The comparison itself is not being masked.
12908     __ evpcmpb(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
12909     __ evmovdqub($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
12910   %}
12911   ins_pipe( pipe_slow );
12912 %}
12913 
12914 instruct vcmpne8B(vecD dst, vecD src1, vecD src2, rRegL scratch) %{
12915   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
12916             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
12917             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
12918   match(Set dst (VectorMaskCmp src1 src2));
12919   effect(TEMP scratch);
12920   format %{ "vpcmpeqb  $dst,$src1,$src2\n\t"
12921             "vpxor $dst,$dst,0xFFFFFFFF\t! cmpneq packed8B" %}
12922   ins_encode %{
12923     int vector_len = 0;
12924     __ vpcmpeqb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12925     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
12926   %}
12927   ins_pipe( pipe_slow );
12928 %}
12929 
12930 instruct vcmpne16B(vecX dst, vecX src1, vecX src2, rRegL scratch) %{
12931   predicate(UseAVX > 0 && n->as_Vector()->length() == 16 &&
12932             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
12933             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
12934   match(Set dst (VectorMaskCmp src1 src2));
12935   effect(TEMP scratch);
12936   format %{ "vpcmpeqb  $dst,$src1,$src2\n\t"
12937             "vpxor $dst,$dst,0xFFFFFFFF\t! cmpneq packed16B" %}
12938   ins_encode %{
12939     int vector_len = 0;
12940     __ vpcmpeqb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12941     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
12942   %}
12943   ins_pipe( pipe_slow );
12944 %}
12945 
12946 instruct vcmpne32B(vecY dst, vecY src1, vecY src2, rRegL scratch) %{
12947   predicate(UseAVX > 1 && n->as_Vector()->length() == 32 &&
12948             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
12949             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
12950   match(Set dst (VectorMaskCmp src1 src2));
12951   effect(TEMP scratch);
12952   format %{ "vpcmpeqb  $dst,$src1,$src2\n\t"
12953             "vpxor $dst,$dst,0xFFFFFFFF\t! cmpneq packed32B" %}
12954   ins_encode %{
12955     int vector_len = 1;
12956     __ vpcmpeqb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12957     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
12958   %}
12959   ins_pipe( pipe_slow );
12960 %}
12961 
12962 instruct vcmpne64B(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
12963   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64 &&
12964             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
12965             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
12966   match(Set dst (VectorMaskCmp src1 src2));
12967   effect(TEMP dst, TEMP scratch);
12968   format %{ "vpcmpneqb  k2,$src1,$src2\n\t"
12969             "vmovdqu8   $dst, k2{z}, 0xFFFFFFFFFF \t! cmpneq packed64B" %}
12970   ins_encode %{
12971     int vector_len = 2;
12972     Assembler::ComparisonPredicate cmp = Assembler::neq;
12973     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
12974     KRegister mask = k0; // The comparison itself is not being masked.
12975     __ evpcmpb(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
12976     __ evmovdqub($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
12977   %}
12978   ins_pipe( pipe_slow );
12979 %}
12980 
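// Packed short compares use vpcmpeqw/vpcmpgtw in the same way; the 32-short
// (vecZ) forms compare into k2 with evpcmpw and expand the mask via a
// zero-masked vmovdqu16 load of the all-ones constant.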
12981 instruct vcmpeq4S(vecD dst, vecD src1, vecD src2) %{
12982   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
12983             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
12984             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
12985   match(Set dst (VectorMaskCmp src1 src2));
12986   format %{ "vpcmpeqw  $dst,$src1,$src2\t! cmpeq packed4S" %}
12987   ins_encode %{
12988     int vector_len = 0;
12989     __ vpcmpeqw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12990   %}
12991   ins_pipe( pipe_slow );
12992 %}
12993 
12994 instruct vcmpeq8S(vecX dst, vecX src1, vecX src2) %{
12995   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
12996             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
12997             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
12998   match(Set dst (VectorMaskCmp src1 src2));
12999   format %{ "vpcmpeqw  $dst,$src1,$src2\t! cmpeq packed8S" %}
13000   ins_encode %{
13001     int vector_len = 0;
13002     __ vpcmpeqw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13003   %}
13004   ins_pipe( pipe_slow );
13005 %}
13006 
13007 instruct vcmpeq16S(vecY dst, vecY src1, vecY src2) %{
13008   predicate(UseAVX > 1 && n->as_Vector()->length() == 16 &&
13009             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
13010             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
13011   match(Set dst (VectorMaskCmp src1 src2));
13012   format %{ "vpcmpeqw  $dst,$src1,$src2\t! cmpeq packed16S" %}
13013   ins_encode %{
13014     int vector_len = 1;
13015     __ vpcmpeqw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13016   %}
13017   ins_pipe( pipe_slow );
13018 %}
13019 
13020 instruct vcmpeq32S(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
13021   predicate(UseAVX > 2 && n->as_Vector()->length() == 32 &&
13022             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
13023             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
13024   match(Set dst (VectorMaskCmp src1 src2));
13025   effect(TEMP dst, TEMP scratch);
13026   format %{ "vpcmpeqw  k2,$src1,$src2\n\t"
13027             "vmovdqu16 $dst, k2{z}, 0xFFFFFFFFFF \t! cmpeq packed32S" %}
13028   ins_encode %{
13029     int vector_len = 2;
13030     Assembler::ComparisonPredicate cmp = Assembler::eq;
13031     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
13032     KRegister mask = k0; // The comparison itself is not being masked.
13033     __ evpcmpw(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
13034     __ evmovdquw($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
13035   %}
13036   ins_pipe( pipe_slow );
13037 %}
13038 
13039 instruct vcmplt4S(vecD dst, vecD src1, vecD src2) %{
13040   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
13041             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
13042             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
13043   match(Set dst (VectorMaskCmp src1 src2));
13044   format %{ "vpcmpgtw  $dst,$src2,$src1\t! cmplt packed4S" %}
13045   ins_encode %{
13046     int vector_len = 0;
13047     __ vpcmpgtw($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
13048   %}
13049   ins_pipe( pipe_slow );
13050 %}
13051 
13052 instruct vcmplt8S(vecX dst, vecX src1, vecX src2) %{
13053   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
13054             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
13055             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
13056   match(Set dst (VectorMaskCmp src1 src2));
13057   format %{ "vpcmpgtw  $dst,$src2,$src1\t! cmplt packed8S" %}
13058   ins_encode %{
13059     int vector_len = 0;
13060     __ vpcmpgtw($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
13061   %}
13062   ins_pipe( pipe_slow );
13063 %}
13064 
13065 instruct vcmplt16S(vecY dst, vecY src1, vecY src2) %{
13066   predicate(UseAVX > 1 && n->as_Vector()->length() == 16 &&
13067             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
13068             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
13069   match(Set dst (VectorMaskCmp src1 src2));
13070   format %{ "vpcmpgtw  $dst,$src2,$src1\t! cmplt packed16S" %}
13071   ins_encode %{
13072     int vector_len = 1;
13073     __ vpcmpgtw($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
13074   %}
13075   ins_pipe( pipe_slow );
13076 %}
13077 
13078 instruct vcmplt32S(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
13079   predicate(UseAVX > 2 && n->as_Vector()->length() == 32 &&
13080             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
13081             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
13082   match(Set dst (VectorMaskCmp src1 src2));
13083   effect(TEMP dst, TEMP scratch);
13084   format %{ "vpcmpltw  k2,$src1,$src2\n\t"
13085             "vmovdqu16   $dst, k2{z}, 0xFFFFFFFFFF \t! cmplt packed32S" %}
13086   ins_encode %{
13087     int vector_len = 2;
13088     Assembler::ComparisonPredicate cmp = Assembler::lt;
13089     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
13090     KRegister mask = k0; // The comparison itself is not being masked.
13091     __ evpcmpw(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
13092     __ evmovdquw($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
13093   %}
13094   ins_pipe( pipe_slow );
13095 %}
13096 
13097 instruct vcmpgt4S(vecD dst, vecD src1, vecD src2) %{
13098   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
13099             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
13100             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
13101   match(Set dst (VectorMaskCmp src1 src2));
13102   format %{ "vpcmpgtw  $dst,$src1,$src2\t! cmpgt packed4S" %}
13103   ins_encode %{
13104     int vector_len = 0;
13105     __ vpcmpgtw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13106   %}
13107   ins_pipe( pipe_slow );
13108 %}
13109 
13110 instruct vcmpgt8S(vecX dst, vecX src1, vecX src2) %{
13111   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
13112             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
13113             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
13114   match(Set dst (VectorMaskCmp src1 src2));
13115   format %{ "vpcmpgtw  $dst,$src1,$src2\t! cmpgt packed8S" %}
13116   ins_encode %{
13117     int vector_len = 0;
13118     __ vpcmpgtw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13119   %}
13120   ins_pipe( pipe_slow );
13121 %}
13122 
13123 instruct vcmpgt16S(vecY dst, vecY src1, vecY src2) %{
13124   predicate(UseAVX > 1 && n->as_Vector()->length() == 16 &&
13125             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
13126             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
13127   match(Set dst (VectorMaskCmp src1 src2));
13128   format %{ "vpcmpgtw  $dst,$src1,$src2\t! cmpgt packed16S" %}
13129   ins_encode %{
13130     int vector_len = 1;
13131     __ vpcmpgtw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13132   %}
13133   ins_pipe( pipe_slow );
13134 %}
13135 
13136 instruct vcmpgt32S(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
13137   predicate(UseAVX > 2 && n->as_Vector()->length() == 32 &&
13138             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
13139             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
13140   match(Set dst (VectorMaskCmp src1 src2));
13141   effect(TEMP dst, TEMP scratch);
13142   format %{ "vpcmpnlew  k2,$src1,$src2\n\t"
13143             "vmovdqu16   $dst, k2{z}, 0xFFFFFFFFFF \t! cmpgt packed32S" %}
13144   ins_encode %{
13145     int vector_len = 2;
13146     Assembler::ComparisonPredicate cmp = Assembler::nle;
13147     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
13148     KRegister mask = k0; // The comparison itself is not being masked.
13149     __ evpcmpw(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
13150     __ evmovdquw($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
13151   %}
13152   ins_pipe( pipe_slow );
13153 %}
13154 
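// "ge" has no direct instruction either; it is the inverted, swapped greater-than:
// src1 >= src2  <=>  !(src2 > src1), hence the compare followed by an XOR with all ones.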
13155 instruct vcmpge4S(vecD dst, vecD src1, vecD src2, rRegL scratch) %{
13156   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
13157             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
13158             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
13159   match(Set dst (VectorMaskCmp src1 src2));
13160   effect(TEMP scratch);
13161   format %{ "vpcmpgtw  $dst,$src2,$src1\n\t"
13162             "vpxor $dst,$dst,0xFFFFFFFF\t! cmpge packed4S" %}
13163   ins_encode %{
13164     int vector_len = 0;
13165     __ vpcmpgtw($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
13166     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
13167   %}
13168   ins_pipe( pipe_slow );
13169 %}
13170 
13171 instruct vcmpge8S(vecX dst, vecX src1, vecX src2, rRegL scratch) %{
13172   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
13173             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
13174             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
13175   match(Set dst (VectorMaskCmp src1 src2));
13176   effect(TEMP scratch);
13177   format %{ "vpcmpgtw  $dst,$src2,$src1\n\t"
13178             "vpxor $dst,$dst,0xFFFFFFFF\t! cmpge packed8S" %}
13179   ins_encode %{
13180     int vector_len = 0;
13181     __ vpcmpgtw($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
13182     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
13183   %}
13184   ins_pipe( pipe_slow );
13185 %}
13186 
13187 instruct vcmpge16S(vecY dst, vecY src1, vecY src2, rRegL scratch) %{
13188   predicate(UseAVX > 1 && n->as_Vector()->length() == 16 &&
13189             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
13190             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
13191   match(Set dst (VectorMaskCmp src1 src2));
13192   effect(TEMP scratch);
13193   format %{ "vpcmpgtw  $dst,$src2,$src1\n\t"
13194             "vpxor $dst,$dst,0xFFFFFFFF\t! cmpge packed16S" %}
13195   ins_encode %{
13196     int vector_len = 1;
13197     __ vpcmpgtw($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
13198     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
13199   %}
13200   ins_pipe( pipe_slow );
13201 %}
13202 
13203 instruct vcmpge32S(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
13204   predicate(UseAVX > 2 && n->as_Vector()->length() == 32 &&
13205             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
13206             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
13207   match(Set dst (VectorMaskCmp src1 src2));
13208   effect(TEMP dst, TEMP scratch);
13209   format %{ "vpcmpnltw  k2,$src1,$src2\n\t"
13210             "vmovdqu16   $dst, k2{z}, 0xFFFFFFFFFF \t! cmpge packed32S" %}
13211   ins_encode %{
13212     int vector_len = 2;
13213     Assembler::ComparisonPredicate cmp = Assembler::nlt;
13214     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
13215     KRegister mask = k0; // The comparison itself is not being masked.
13216     __ evpcmpw(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
13217     __ evmovdquw($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
13218   %}
13219   ins_pipe( pipe_slow );
13220 %}
13221 
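// "le" is the inverted greater-than compare: src1 <= src2  <=>  !(src1 > src2).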
13222 instruct vcmple4S(vecD dst, vecD src1, vecD src2, rRegL scratch) %{
13223   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
13224             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
13225             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
13226   match(Set dst (VectorMaskCmp src1 src2));
13227   effect(TEMP scratch);
13228   format %{ "vpcmpgtw  $dst,$src1,$src2\n\t"
13229             "vpxor $dst,$dst,0xFFFFFFFF\t! cmple packed4S" %}
13230   ins_encode %{
13231     int vector_len = 0;
13232     __ vpcmpgtw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13233     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
13234   %}
13235   ins_pipe( pipe_slow );
13236 %}
13237 
13238 instruct vcmple8S(vecX dst, vecX src1, vecX src2, rRegL scratch) %{
13239   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
13240             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
13241             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
13242   match(Set dst (VectorMaskCmp src1 src2));
13243   effect(TEMP scratch);
13244   format %{ "vpcmpgtw  $dst,$src1,$src2\n\t"
13245             "vpxor $dst,$dst,0xFFFFFFFF\t! cmple packed8S" %}
13246   ins_encode %{
13247     int vector_len = 0;
13248     __ vpcmpgtw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13249     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
13250   %}
13251   ins_pipe( pipe_slow );
13252 %}
13253 
13254 instruct vcmple16S(vecY dst, vecY src1, vecY src2, rRegL scratch) %{
13255   predicate(UseAVX > 1 && n->as_Vector()->length() == 16 &&
13256             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
13257             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
13258   match(Set dst (VectorMaskCmp src1 src2));
13259   effect(TEMP scratch);
13260   format %{ "vpcmpgtw  $dst,$src1,$src2\n\t"
13261             "vpxor $dst,$dst,0xFFFFFFFF\t! cmple packed16S" %}
13262   ins_encode %{
13263     int vector_len = 1;
13264     __ vpcmpgtw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13265     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
13266   %}
13267   ins_pipe( pipe_slow );
13268 %}
13269 
13270 instruct vcmple32S(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
13271   predicate(UseAVX > 2 && n->as_Vector()->length() == 32 &&
13272             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
13273             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
13274   match(Set dst (VectorMaskCmp src1 src2));
13275   effect(TEMP dst, TEMP scratch);
13276   format %{ "vpcmplew  k2,$src1,$src2\n\t"
13277             "vmovdqu16   $dst, k2{z}, 0xFFFFFFFFFF \t! cmple packed32S" %}
13278   ins_encode %{
13279     int vector_len = 2;
13280     Assembler::ComparisonPredicate cmp = Assembler::le;
13281     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
13282     KRegister mask = k0; // The comparison itself is not being masked.
13283     __ evpcmpw(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
13284     __ evmovdquw($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
13285   %}
13286   ins_pipe( pipe_slow );
13287 %}
13288 
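// "ne" is the inverted equality compare: vpcmpeq followed by an XOR with all ones.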
13289 instruct vcmpne4S(vecD dst, vecD src1, vecD src2, rRegL scratch) %{
13290   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
13291             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
13292             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
13293   match(Set dst (VectorMaskCmp src1 src2));
13294   effect(TEMP scratch);
13295   format %{ "vpcmpeqw  $dst,$src1,$src2\n\t"
13296             "vpxor $dst,$dst,0xFFFFFFFF\t! cmpneq packed4S" %}
13297   ins_encode %{
13298     int vector_len = 0;
13299     __ vpcmpeqw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13300     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
13301   %}
13302   ins_pipe( pipe_slow );
13303 %}
13304 
13305 instruct vcmpne8S(vecX dst, vecX src1, vecX src2, rRegL scratch) %{
13306   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
13307             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
13308             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
13309   match(Set dst (VectorMaskCmp src1 src2));
13310   effect(TEMP scratch);
13311   format %{ "vpcmpeqw  $dst,$src1,$src2\n\t"
13312             "vpxor $dst,$dst,0xFFFFFFFF\t! cmpneq packed8S" %}
13313   ins_encode %{
13314     int vector_len = 0;
13315     __ vpcmpeqw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13316     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
13317   %}
13318   ins_pipe( pipe_slow );
13319 %}
13320 
13321 instruct vcmpne16S(vecY dst, vecY src1, vecY src2, rRegL scratch) %{
13322   predicate(UseAVX > 1 && n->as_Vector()->length() == 16 &&
13323             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
13324             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
13325   match(Set dst (VectorMaskCmp src1 src2));
13326   effect(TEMP scratch);
13327   format %{ "vpcmpeqw  $dst,$src1,$src2\n\t"
13328             "vpxor $dst,$dst,0xFFFFFFFF\t! cmpneq packed16S" %}
13329   ins_encode %{
13330     int vector_len = 1;
13331     __ vpcmpeqw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13332     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
13333   %}
13334   ins_pipe( pipe_slow );
13335 %}
13336 
13337 instruct vcmpne32S(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
13338   predicate(UseAVX > 2 && n->as_Vector()->length() == 32 &&
13339             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
13340             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
13341   match(Set dst (VectorMaskCmp src1 src2));
13342   effect(TEMP dst, TEMP scratch);
13343   format %{ "vpcmpneqw  k2,$src1,$src2\n\t"
13344             "vmovdqu16   $dst, k2{z}, 0xFFFFFFFFFF \t! cmpneq packed32S" %}
13345   ins_encode %{
13346     int vector_len = 2;
13347     Assembler::ComparisonPredicate cmp = Assembler::neq;
13348     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
13349     KRegister mask = k0; // The comparison itself is not being masked.
13350     __ evpcmpw(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
13351     __ evmovdquw($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
13352   %}
13353   ins_pipe( pipe_slow );
13354 %}
13355 
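// The same comparison patterns repeat below for packed longs, using the quadword
// compares (vpcmpeqq/vpcmpgtq, and evpcmpq for 512-bit vectors).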
13356 instruct vcmpeq1L(vecD dst, vecD src1, vecD src2) %{
13357   predicate(UseAVX > 0 && n->as_Vector()->length() == 1 &&
13358             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
13359             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
13360   match(Set dst (VectorMaskCmp src1 src2));
13361   format %{ "vpcmpeqq  $dst,$src1,$src2\t! cmpeq packed1L" %}
13362   ins_encode %{
13363     int vector_len = 0;
13364     __ vpcmpeqq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13365   %}
13366   ins_pipe( pipe_slow );
13367 %}
13368 
13369 instruct vcmpeq2L(vecX dst, vecX src1, vecX src2) %{
13370   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
13371             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
13372             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
13373   match(Set dst (VectorMaskCmp src1 src2));
13374   format %{ "vpcmpeqq  $dst,$src1,$src2\t! cmpeq packed2L" %}
13375   ins_encode %{
13376     int vector_len = 0;
13377     __ vpcmpeqq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13378   %}
13379   ins_pipe( pipe_slow );
13380 %}
13381 
13382 instruct vcmpeq4L(vecY dst, vecY src1, vecY src2) %{
13383   predicate(UseAVX > 1 && n->as_Vector()->length() == 4 &&
13384             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
13385             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
13386   match(Set dst (VectorMaskCmp src1 src2));
13387   format %{ "vpcmpeqq  $dst,$src1,$src2\t! cmpeq packed4L" %}
13388   ins_encode %{
13389     int vector_len = 1;
13390     __ vpcmpeqq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13391   %}
13392   ins_pipe( pipe_slow );
13393 %}
13394 
13395 instruct vcmpeq8L(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
13396   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 &&
13397             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
13398             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
13399   match(Set dst (VectorMaskCmp src1 src2));
13400   effect(TEMP dst, TEMP scratch);
13401   format %{ "vpcmpeqq  k2,$src1,$src2\n\t"
13402             "vmovdqu64 $dst, k2{z}, 0xFFFFFFFFFF \t! cmpeq packed8L" %}
13403   ins_encode %{
13404     int vector_len = 2;
13405     Assembler::ComparisonPredicate cmp = Assembler::eq;
13406     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
13407     KRegister mask = k0; // The comparison itself is not being masked.
13408     __ evpcmpq(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
13409     __ evmovdquq($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
13410   %}
13411   ins_pipe( pipe_slow );
13412 %}
13413 
13414 instruct vcmplt1L(vecD dst, vecD src1, vecD src2) %{
13415   predicate(UseAVX > 0 && n->as_Vector()->length() == 1 &&
13416             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
13417             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
13418   match(Set dst (VectorMaskCmp src1 src2));
13419   format %{ "vpcmpgtq  $dst,$src2,$src1\t! cmplt packed1L" %}
13420   ins_encode %{
13421     int vector_len = 0;
13422     __ vpcmpgtq($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
13423   %}
13424   ins_pipe( pipe_slow );
13425 %}
13426 
13427 instruct vcmplt2L(vecX dst, vecX src1, vecX src2) %{
13428   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
13429             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
13430             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
13431   match(Set dst (VectorMaskCmp src1 src2));
13432   format %{ "vpcmpgtq  $dst,$src2,$src1\t! cmplt packed2L" %}
13433   ins_encode %{
13434     int vector_len = 0;
13435     __ vpcmpgtq($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
13436   %}
13437   ins_pipe( pipe_slow );
13438 %}
13439 
13440 instruct vcmplt4L(vecY dst, vecY src1, vecY src2) %{
13441   predicate(UseAVX > 1 && n->as_Vector()->length() == 4 &&
13442             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
13443             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
13444   match(Set dst (VectorMaskCmp src1 src2));
13445   format %{ "vpcmpgtq  $dst,$src2,$src1\t! cmplt packed4L" %}
13446   ins_encode %{
13447     int vector_len = 1;
13448     __ vpcmpgtq($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
13449   %}
13450   ins_pipe( pipe_slow );
13451 %}
13452 
13453 instruct vcmplt8L(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
13454   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 &&
13455             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
13456             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
13457   match(Set dst (VectorMaskCmp src1 src2));
13458   effect(TEMP dst, TEMP scratch);
13459   format %{ "vpcmpltq  k2,$src1,$src2\n\t"
13460             "vmovdqu64   $dst, k2{z}, 0xFFFFFFFFFF \t! cmplt packed8L" %}
13461   ins_encode %{
13462     int vector_len = 2;
13463     Assembler::ComparisonPredicate cmp = Assembler::lt;
13464     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
13465     KRegister mask = k0; // The comparison itself is not being masked.
13466     __ evpcmpq(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
13467     __ evmovdquq($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
13468   %}
13469   ins_pipe( pipe_slow );
13470 %}
13471 
13472 instruct vcmpgt1L(vecD dst, vecD src1, vecD src2) %{
13473   predicate(UseAVX > 0 && n->as_Vector()->length() == 1 &&
13474             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
13475             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
13476   match(Set dst (VectorMaskCmp src1 src2));
13477   format %{ "vpcmpgtq  $dst,$src1,$src2\t! cmpgt packed1L" %}
13478   ins_encode %{
13479     int vector_len = 0;
13480     __ vpcmpgtq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13481   %}
13482   ins_pipe( pipe_slow );
13483 %}
13484 
13485 instruct vcmpgt2L(vecX dst, vecX src1, vecX src2) %{
13486   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
13487             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
13488             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
13489   match(Set dst (VectorMaskCmp src1 src2));
13490   format %{ "vpcmpgtq  $dst,$src1,$src2\t! cmpgt packed2L" %}
13491   ins_encode %{
13492     int vector_len = 0;
13493     __ vpcmpgtq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13494   %}
13495   ins_pipe( pipe_slow );
13496 %}
13497 
13498 instruct vcmpgt4L(vecY dst, vecY src1, vecY src2) %{
13499   predicate(UseAVX > 1 && n->as_Vector()->length() == 4 &&
13500             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
13501             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
13502   match(Set dst (VectorMaskCmp src1 src2));
13503   format %{ "vpcmpgtq  $dst,$src1,$src2\t! cmpgt packed4L" %}
13504   ins_encode %{
13505     int vector_len = 1;
13506     __ vpcmpgtq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13507   %}
13508   ins_pipe( pipe_slow );
13509 %}
13510 
13511 instruct vcmpgt8L(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
13512   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 &&
13513             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
13514             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
13515   match(Set dst (VectorMaskCmp src1 src2));
13516   effect(TEMP dst, TEMP scratch);
13517   format %{ "vpcmpnleq  k2,$src1,$src2\n\t"
13518             "vmovdqu64   $dst, k2{z}, 0xFFFFFFFFFF \t! cmpgt packed8L" %}
13519   ins_encode %{
13520     int vector_len = 2;
13521     Assembler::ComparisonPredicate cmp = Assembler::nle;
13522     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
13523     KRegister mask = k0; // The comparison itself is not being masked.
13524     __ evpcmpq(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
13525     __ evmovdquq($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
13526   %}
13527   ins_pipe( pipe_slow );
13528 %}
13529 
13530 instruct vcmpge1L(vecD dst, vecD src1, vecD src2, rRegL scratch) %{
13531   predicate(UseAVX > 0 && n->as_Vector()->length() == 1 &&
13532             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
13533             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
13534   match(Set dst (VectorMaskCmp src1 src2));
13535   effect(TEMP scratch);
13536   format %{ "vpcmpgtq  $dst,$src2,$src1\n\t"
13537             "vpxor $dst,$dst,0xFFFFFFFF\t! cmpge packed1L" %}
13538   ins_encode %{
13539     int vector_len = 0;
13540     __ vpcmpgtq($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
13541     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
13542   %}
13543   ins_pipe( pipe_slow );
13544 %}
13545 
13546 instruct vcmpge2L(vecX dst, vecX src1, vecX src2, rRegL scratch) %{
13547   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
13548             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
13549             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
13550   match(Set dst (VectorMaskCmp src1 src2));
13551   effect(TEMP scratch);
13552   format %{ "vpcmpgtq  $dst,$src2,$src1\n\t"
13553             "vpxor $dst,$dst,0xFFFFFFFF\t! cmpge packed2L" %}
13554   ins_encode %{
13555     int vector_len = 0;
13556     __ vpcmpgtq($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
13557     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
13558   %}
13559   ins_pipe( pipe_slow );
13560 %}
13561 
13562 instruct vcmpge4L(vecY dst, vecY src1, vecY src2, rRegL scratch) %{
13563   predicate(UseAVX > 1 && n->as_Vector()->length() == 4 &&
13564             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
13565             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
13566   match(Set dst (VectorMaskCmp src1 src2));
13567   effect(TEMP scratch);
13568   format %{ "vpcmpgtq  $dst,$src2,$src1\n\t"
13569             "vpxor $dst,$dst,0xFFFFFFFF\t! cmpge packed4L" %}
13570   ins_encode %{
13571     int vector_len = 1;
13572     __ vpcmpgtq($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
13573     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
13574   %}
13575   ins_pipe( pipe_slow );
13576 %}
13577 
13578 instruct vcmpge8L(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
13579   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 &&
13580             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
13581             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
13582   match(Set dst (VectorMaskCmp src1 src2));
13583   effect(TEMP dst, TEMP scratch);
13584   format %{ "vpcmpnltq  k2,$src1,$src2\n\t"
13585             "vmovdqu64   $dst, k2{z}, 0xFFFFFFFFFF \t! cmpge packed8L" %}
13586   ins_encode %{
13587     int vector_len = 2;
13588     Assembler::ComparisonPredicate cmp = Assembler::nlt;
13589     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
13590     KRegister mask = k0; // The comparison itself is not being masked.
13591     __ evpcmpq(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
13592     __ evmovdquq($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
13593   %}
13594   ins_pipe( pipe_slow );
13595 %}
13596 
13597 instruct vcmple1L(vecD dst, vecD src1, vecD src2, rRegL scratch) %{
13598   predicate(UseAVX > 0 && n->as_Vector()->length() == 1 &&
13599             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
13600             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
13601   match(Set dst (VectorMaskCmp src1 src2));
13602   effect(TEMP scratch);
13603   format %{ "vpcmpgtq  $dst,$src1,$src2\n\t"
13604             "vpxor $dst,$dst,0xFFFFFFFF\t! cmple packed1L" %}
13605   ins_encode %{
13606     int vector_len = 0;
13607     __ vpcmpgtq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13608     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
13609   %}
13610   ins_pipe( pipe_slow );
13611 %}
13612 
13613 instruct vcmple2L(vecX dst, vecX src1, vecX src2, rRegL scratch) %{
13614   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
13615             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
13616             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
13617   match(Set dst (VectorMaskCmp src1 src2));
13618   effect(TEMP scratch);
13619   format %{ "vpcmpgtq  $dst,$src1,$src2\n\t"
13620             "vpxor $dst,$dst,0xFFFFFFFF\t! cmple packed2L" %}
13621   ins_encode %{
13622     int vector_len = 0;
13623     __ vpcmpgtq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13624     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
13625   %}
13626   ins_pipe( pipe_slow );
13627 %}
13628 
13629 instruct vcmple4L(vecY dst, vecY src1, vecY src2, rRegL scratch) %{
13630   predicate(UseAVX > 1 && n->as_Vector()->length() == 4 &&
13631             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
13632             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
13633   match(Set dst (VectorMaskCmp src1 src2));
13634   effect(TEMP scratch);
13635   format %{ "vpcmpgtq  $dst,$src1,$src2\n\t"
13636             "vpxor $dst,$dst,0xFFFFFFFF\t! cmple packed4L" %}
13637   ins_encode %{
13638     int vector_len = 1;
13639     __ vpcmpgtq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13640     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
13641   %}
13642   ins_pipe( pipe_slow );
13643 %}
13644 
13645 instruct vcmple8L(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
13646   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 &&
13647             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
13648             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
13649   match(Set dst (VectorMaskCmp src1 src2));
13650   effect(TEMP dst, TEMP scratch);
13651   format %{ "vpcmpleq  k2,$src1,$src2\n\t"
13652             "vmovdqu64   $dst, k2{z}, 0xFFFFFFFFFF \t! cmple packed8L" %}
13653   ins_encode %{
13654     int vector_len = 2;
13655     Assembler::ComparisonPredicate cmp = Assembler::le;
13656     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
13657     KRegister mask = k0; // The comparison itself is not being masked.
13658     __ evpcmpq(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
13659     __ evmovdquq($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
13660   %}
13661   ins_pipe( pipe_slow );
13662 %}
13663 
13664 instruct vcmpne1L(vecD dst, vecD src1, vecD src2, rRegL scratch) %{
13665   predicate(UseAVX > 0 && n->as_Vector()->length() == 1 &&
13666             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
13667             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
13668   match(Set dst (VectorMaskCmp src1 src2));
13669   effect(TEMP scratch);
13670   format %{ "vpcmpeqq  $dst,$src1,$src2\n\t"
13671             "vpxor $dst,$dst,0xFFFFFFFF\t! cmpneq packed1L" %}
13672   ins_encode %{
13673     int vector_len = 0;
13674     __ vpcmpeqq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13675     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
13676   %}
13677   ins_pipe( pipe_slow );
13678 %}
13679 
13680 instruct vcmpne2L(vecX dst, vecX src1, vecX src2, rRegL scratch) %{
13681   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
13682             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
13683             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
13684   match(Set dst (VectorMaskCmp src1 src2));
13685   effect(TEMP scratch);
13686   format %{ "vpcmpeqq  $dst,$src1,$src2\n\t"
13687             "vpxor $dst,$dst,0xFFFFFFFF\t! cmpneq packed2L" %}
13688   ins_encode %{
13689     int vector_len = 0;
13690     __ vpcmpeqq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13691     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
13692   %}
13693   ins_pipe( pipe_slow );
13694 %}
13695 
13696 instruct vcmpne4L(vecY dst, vecY src1, vecY src2, rRegL scratch) %{
13697   predicate(UseAVX > 1 && n->as_Vector()->length() == 4 &&
13698             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
13699             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
13700   match(Set dst (VectorMaskCmp src1 src2));
13701   effect(TEMP scratch);
13702   format %{ "vpcmpeqq  $dst,$src1,$src2\n\t"
13703             "vpxor $dst,$dst,0xFFFFFFFF\t! cmpneq packed4L" %}
13704   ins_encode %{
13705     int vector_len = 1;
13706     __ vpcmpeqq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13707     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
13708   %}
13709   ins_pipe( pipe_slow );
13710 %}
13711 
13712 instruct vcmpne8L(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
13713   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 &&
13714             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
13715             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
13716   match(Set dst (VectorMaskCmp src1 src2));
13717   effect(TEMP dst, TEMP scratch);
13718   format %{ "vpcmpneqq  k2,$src1,$src2\n\t"
13719             "vmovdqu64   $dst, k2{z}, 0xFFFFFFFFFF \t! cmpneq packed8L" %}
13720   ins_encode %{
13721     int vector_len = 2;
13722     Assembler::ComparisonPredicate cmp = Assembler::neq;
13723     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
13724     KRegister mask = k0; // The comparison itself is not being masked.
13725     __ evpcmpq(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
13726     __ evmovdquq($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
13727   %}
13728   ins_pipe( pipe_slow );
13729 %}
13730 
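// --------------------------------- Blend -------------------------------------
// Vector blend selects between two sources element-wise under a mask.
// The SSE4.1 forms take the mask implicitly in xmm0, the AVX forms take an explicit
// mask register, and the 512-bit forms first turn the vector mask into a k register
// (by comparing it against all ones) before using the masked EVEX blend.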
13731 instruct blendvps2F(vecD dst, vecD src, rxmm0 mask) %{
13732   predicate(UseAVX == 0 && UseSSE > 3 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
13733   match(Set dst (VectorBlend (Binary dst src) mask));
13734   format %{ "blendvps  $dst,$src,$mask\t! packed2F" %}
13735   ins_encode %{
13736     __ blendvps($dst$$XMMRegister, $src$$XMMRegister);
13737   %}
13738   ins_pipe( pipe_slow );
13739 %}
13740 
13741 instruct vblendvps2F(vecD dst, vecD src1, vecD src2, vecD mask) %{
13742   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
13743   match(Set dst (VectorBlend (Binary src1 src2) mask));
13744   format %{ "vblendvps  $dst,$src1,$src2,$mask\t! packed2F" %}
13745   ins_encode %{
13746     int vector_len = 0;
13747     __ vblendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vector_len);
13748   %}
13749   ins_pipe( pipe_slow );
13750 %}
13751 
13752 instruct blendvps4F(vecX dst, vecX src, rxmm0 mask) %{
13753   predicate(UseAVX == 0 && UseSSE > 3 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
13754   match(Set dst (VectorBlend (Binary dst src) mask));
13755   format %{ "blendvps  $dst,$src,$mask\t! packed4F" %}
13756   ins_encode %{
13757     __ blendvps($dst$$XMMRegister, $src$$XMMRegister);
13758   %}
13759   ins_pipe( pipe_slow );
13760 %}
13761 
13762 instruct vblendvps4F(vecX dst, vecX src1, vecX src2, vecX mask) %{
13763   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
13764   match(Set dst (VectorBlend (Binary src1 src2) mask));
13765   format %{ "vblendvps  $dst,$src1,$src2,$mask\t! packed4F" %}
13766   ins_encode %{
13767     int vector_len = 0;
13768     __ vblendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vector_len);
13769   %}
13770   ins_pipe( pipe_slow );
13771 %}
13772 
13773 instruct vblendvps8F(vecY dst, vecY src1, vecY src2, vecY mask) %{
13774   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
13775   match(Set dst (VectorBlend (Binary src1 src2) mask));
13776   format %{ "vblendvps  $dst,$src1,$src2,$mask\t! packed8F" %}
13777   ins_encode %{
13778     int vector_len = 1;
13779     __ vblendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vector_len);
13780   %}
13781   ins_pipe( pipe_slow );
13782 %}
13783 
13784 instruct vblendvps16F(vecZ dst, vecZ src1, vecZ src2, vecZ mask, rRegL scratch) %{
13785   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
13786   match(Set dst (VectorBlend (Binary src1 src2) mask));
13787   effect(TEMP scratch);
13788   format %{ "vpcmpeqd  k2,$mask,0xFFFFFFFF\n\t"
13789            "vblendmps $dst,k2,$src1,$src2\t! blend packed16F " %}
13790   ins_encode %{
13791     int vector_len = 2;
13792     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
13793     __ evpcmpeqd(ktmp, k0, $mask$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
13794     __ evblendmps($dst$$XMMRegister, ktmp, $src1$$XMMRegister, $src2$$XMMRegister, true, vector_len);
13795   %}
13796   ins_pipe( pipe_slow );
13797 %}
13798 
13799 instruct vblendvpd8D(vecZ dst, vecZ src1, vecZ src2, vecZ mask, rRegL scratch) %{
13800   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
13801   match(Set dst (VectorBlend (Binary src1 src2) mask));
13802   effect(TEMP scratch);
13803   format %{ "vpcmpeqq  k2,$mask,0xFFFFFFFF\n\t"
13804            "vblendmpd $dst,k2,$src1,$src2\t! blend packed8D" %}
13805   ins_encode %{
13806     int vector_len = 2;
13807     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
13808     __ evpcmpq(ktmp, k0, $mask$$XMMRegister, ExternalAddress(vector_all_bits_set()), Assembler::eq, vector_len, $scratch$$Register);
13809     __ evblendmpd($dst$$XMMRegister, ktmp, $src1$$XMMRegister, $src2$$XMMRegister, true, vector_len);
13810   %}
13811   ins_pipe( pipe_slow );
13812 %}
13813 
13814 instruct vpblendmb64B(vecZ dst, vecZ src1, vecZ src2, vecZ mask, rRegL scratch) %{
13815   predicate(UseAVX > 2 && n->as_Vector()->length() == 64 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE && VM_Version::supports_avx512bw());
13816   match(Set dst (VectorBlend (Binary src1 src2) mask));
13817   effect(TEMP scratch);
13818   format %{ "vpcmpeqb  k2,$mask,0xFFFFFFFF\n\t"
13819            "vpblendmb $dst,k2,$src1,$src2\t! blend packed64B " %}
13820   ins_encode %{
13821     int vector_len = 2;
13822     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
13823     __ evpcmpb(ktmp, k0, $mask$$XMMRegister, ExternalAddress(vector_all_bits_set()), Assembler::eq, vector_len, $scratch$$Register);
13824     __ evpblendmb($dst$$XMMRegister, ktmp, $src1$$XMMRegister, $src2$$XMMRegister, true, vector_len);
13825   %}
13826   ins_pipe( pipe_slow );
13827 %}
13828 
13829 instruct vpblendmw32S(vecZ dst, vecZ src1, vecZ src2, vecZ mask, rRegL scratch) %{
13830   predicate(UseAVX > 2 && n->as_Vector()->length() == 32 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT && VM_Version::supports_avx512bw());
13831   match(Set dst (VectorBlend (Binary src1 src2) mask));
13832   effect(TEMP scratch);
13833   format %{ "vpcmpeqw  k2,$mask,0xFFFFFFFF\n\t"
13834            "vpblendmw $dst,k2,$src1,$src2\t! blend packed32S " %}
13835   ins_encode %{
13836     int vector_len = 2;
13837     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
13838     __ evpcmpw(ktmp, k0, $mask$$XMMRegister, ExternalAddress(vector_all_bits_set()), Assembler::eq, vector_len, $scratch$$Register);
13839     __ evpblendmw($dst$$XMMRegister, ktmp, $src1$$XMMRegister, $src2$$XMMRegister, true, vector_len);
13840   %}
13841   ins_pipe( pipe_slow );
13842 %}
13843 
13844 instruct vpblendmd16I(vecZ dst, vecZ src1, vecZ src2, vecZ mask, rRegL scratch) %{
13845   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
13846   match(Set dst (VectorBlend (Binary src1 src2) mask));
13847   effect(TEMP scratch);
13848   format %{ "vpcmpeqd  k2,$mask,0xFFFFFFFF\n\t"
13849            "vpblendmd $dst,k2,$src1,$src2\t! blend packed16I " %}
13850   ins_encode %{
13851     int vector_len = 2;
13852     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
13853     __ evpcmpd(ktmp, k0, $mask$$XMMRegister, ExternalAddress(vector_all_bits_set()), Assembler::eq, vector_len, $scratch$$Register);
13854     __ evpblendmd($dst$$XMMRegister, ktmp, $src1$$XMMRegister, $src2$$XMMRegister, true, vector_len);
13855   %}
13856   ins_pipe( pipe_slow );
13857 %}
13858 
13859 instruct vpblendmq8L(vecZ dst, vecZ src1, vecZ src2, vecZ mask, rRegL scratch) %{
13860   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
13861   match(Set dst (VectorBlend (Binary src1 src2) mask));
13862   effect(TEMP scratch);
13863   format %{ "vpcmpeqq  k2,$mask,0xFFFFFFFF\n\t"
13864            "vpblendmq $dst,k2,$src1,$src2\t! blend packed8L " %}
13865   ins_encode %{
13866     int vector_len = 2;
13867     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
13868     __ evpcmpq(ktmp, k0, $mask$$XMMRegister, ExternalAddress(vector_all_bits_set()), Assembler::eq, vector_len, $scratch$$Register);
13869     __ evpblendmq($dst$$XMMRegister, ktmp, $src1$$XMMRegister, $src2$$XMMRegister, true, vector_len);
13870   %}
13871   ins_pipe( pipe_slow );
13872 %}
13873 
13874 
13875 instruct pblendvb2I(vecD dst, vecD src, rxmm0 mask) %{
13876   predicate(UseAVX == 0 && UseSSE > 3 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
13877   match(Set dst (VectorBlend (Binary dst src) mask));
13878   format %{ "pblendvb  $dst,$src,$mask\t! blend packed2I" %}
13879   ins_encode %{
13880     __ pblendvb($dst$$XMMRegister, $src$$XMMRegister);
13881   %}
13882   ins_pipe( pipe_slow );
13883 %}
13884 
13885 instruct vpblendvb2I(vecD dst, vecD src1, vecD src2, vecD mask) %{
13886   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
13887   match(Set dst (VectorBlend (Binary src1 src2) mask));
13888   format %{ "vpblendvb  $dst,$src1,$src2,$mask\t! blend packed2I" %}
13889   ins_encode %{
13890     int vector_len = 0;
13891     __ vpblendvb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vector_len);
13892   %}
13893   ins_pipe( pipe_slow );
13894 %}
13895 
13896 instruct pblendvb4I(vecX dst, vecX src, rxmm0 mask) %{
13897   predicate(UseAVX == 0 && UseSSE > 3 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
13898   match(Set dst (VectorBlend (Binary dst src) mask));
13899   format %{ "pblendvb  $dst,$src,$mask\t! blend packed4I" %}
13900   ins_encode %{
13901     __ pblendvb($dst$$XMMRegister, $src$$XMMRegister);
13902   %}
13903   ins_pipe( pipe_slow );
13904 %}
13905 
13906 instruct vpblendvb4I(vecX dst, vecX src1, vecX src2, vecX mask) %{
13907   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
13908   match(Set dst (VectorBlend (Binary src1 src2) mask));
13909   format %{ "vpblendvb  $dst,$src1,$src2,$mask\t! blend packed4I" %}
13910   ins_encode %{
13911     int vector_len = 0;
13912     __ vpblendvb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vector_len);
13913   %}
13914   ins_pipe( pipe_slow );
13915 %}
13916 
13917 instruct vpblendvb8I(vecY dst, vecY src1, vecY src2, vecY mask) %{
13918   predicate(UseAVX >= 2 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
13919   match(Set dst (VectorBlend (Binary src1 src2) mask));
13920   format %{ "vpblendvb  $dst,$src1,$src2,$mask\t! blend packed8I" %}
13921   ins_encode %{
13922     int vector_len = 1;
13923     __ vpblendvb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vector_len);
13924   %}
13925   ins_pipe( pipe_slow );
13926 %}
13927 
13928 instruct pblendvb8B(vecD dst, vecD src, rxmm0 mask) %{
13929   predicate(UseAVX == 0 && UseSSE > 3 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
13930   match(Set dst (VectorBlend (Binary dst src) mask));
13931   format %{ "pblendvb  $dst,$src,$mask\t! blend packed8B" %}
13932   ins_encode %{
13933     __ pblendvb($dst$$XMMRegister, $src$$XMMRegister);
13934   %}
13935   ins_pipe( pipe_slow );
13936 %}
13937 
13938 instruct vpblendvb8B(vecD dst, vecD src1, vecD src2, vecD mask) %{
13939   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
13940   match(Set dst (VectorBlend (Binary src1 src2) mask));
13941   format %{ "vpblendvb  $dst,$src1,$src2,$mask\t! blend packed8B" %}
13942   ins_encode %{
13943     int vector_len = 0;
13944     __ vpblendvb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vector_len);
13945   %}
13946   ins_pipe( pipe_slow );
13947 %}
13948 
13949 instruct pblendvb16B(vecX dst, vecX src, rxmm0 mask) %{
13950   predicate(UseAVX == 0 && UseSSE > 3 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
13951   match(Set dst (VectorBlend (Binary dst src) mask));
13952   format %{ "pblendvb  $dst,$src,$mask\t! blend packed16B" %}
13953   ins_encode %{
13954     __ pblendvb($dst$$XMMRegister, $src$$XMMRegister);
13955   %}
13956   ins_pipe( pipe_slow );
13957 %}
13958 
13959 instruct vpblendvb16B(vecX dst, vecX src1, vecX src2, vecX mask) %{
13960   predicate(UseAVX > 0 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
13961   match(Set dst (VectorBlend (Binary src1 src2) mask));
13962   format %{ "vpblendvb  $dst,$src1,$src2,$mask\t! blend packed16B" %}
13963   ins_encode %{
13964     int vector_len = 0;
13965     __ vpblendvb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vector_len);
13966   %}
13967   ins_pipe( pipe_slow );
13968 %}
13969 
13970 instruct vpblendvb32B(vecY dst, vecY src1, vecY src2, vecY mask) %{
13971   predicate(UseAVX >= 2 && n->as_Vector()->length() == 32 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
13972   match(Set dst (VectorBlend (Binary src1 src2) mask));
13973   format %{ "vpblendvb  $dst,$src1,$src2,$mask\t! blend packed32B" %}
13974   ins_encode %{
13975     int vector_len = 1;
13976     __ vpblendvb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vector_len);
13977   %}
13978   ins_pipe( pipe_slow );
13979 %}
13980 
13981 instruct pblendvb4S(vecD dst, vecD src, rxmm0 mask) %{
13982   predicate(UseAVX == 0 && UseSSE > 3 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
13983   match(Set dst (VectorBlend (Binary dst src) mask));
13984   format %{ "pblendvb  $dst,$src,$mask\t! blend packed4S" %}
13985   ins_encode %{
13986     __ pblendvb($dst$$XMMRegister, $src$$XMMRegister);
13987   %}
13988   ins_pipe( pipe_slow );
13989 %}
13990 
13991 instruct vpblendvb4S(vecD dst, vecD src1, vecD src2, vecD mask) %{
13992   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
13993   match(Set dst (VectorBlend (Binary src1 src2) mask));
13994   format %{ "vpblendvb  $dst,$src1,$src2,$mask\t! blend packed4S" %}
13995   ins_encode %{
13996     int vector_len = 0;
13997     __ vpblendvb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vector_len);
13998   %}
13999   ins_pipe( pipe_slow );
14000 %}
14001 
14002 instruct pblendvb8S(vecX dst, vecX src, rxmm0 mask) %{
14003   predicate(UseAVX == 0 && UseSSE > 3 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
14004   match(Set dst (VectorBlend (Binary dst src) mask));
14005   format %{ "pblendvb  $dst,$src,$mask\t! blend packed8S" %}
14006   ins_encode %{
14007     __ pblendvb($dst$$XMMRegister, $src$$XMMRegister);
14008   %}
14009   ins_pipe( pipe_slow );
14010 %}
14011 
14012 instruct vpblendvb8S(vecX dst, vecX src1, vecX src2, vecX mask) %{
14013   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
14014   match(Set dst (VectorBlend (Binary src1 src2) mask));
14015   format %{ "vpblendvb  $dst,$src1,$src2,$mask\t! blend packed8S" %}
14016   ins_encode %{
14017     int vector_len = 0;
14018     __ vpblendvb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vector_len);
14019   %}
14020   ins_pipe( pipe_slow );
14021 %}
14022 
14023 instruct vpblendvb16S(vecY dst, vecY src1, vecY src2, vecY mask) %{
14024   predicate(UseAVX >= 2 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
14025   match(Set dst (VectorBlend (Binary src1 src2) mask));
14026   format %{ "vpblendvb  $dst,$src1,$src2,$mask\t! blend packed16S" %}
14027   ins_encode %{
14028     int vector_len = 1;
14029     __ vpblendvb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vector_len);
14030   %}
14031   ins_pipe( pipe_slow );
14032 %}
14033 
14034 instruct pblendvb1L(vecD dst, vecD src, rxmm0 mask) %{
14035   predicate(UseAVX == 0 && UseSSE > 3 && n->as_Vector()->length() == 1 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
14036   match(Set dst (VectorBlend (Binary dst src) mask));
14037   format %{ "pblendvb  $dst,$src,$mask\t! blend packed1L" %}
14038   ins_encode %{
14039     __ pblendvb($dst$$XMMRegister, $src$$XMMRegister);
14040   %}
14041   ins_pipe( pipe_slow );
14042 %}
14043 
14044 instruct vpblendvb1L(vecD dst, vecD src1, vecD src2, vecD mask) %{
14045   predicate(UseAVX > 0 && n->as_Vector()->length() == 1 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
14046   match(Set dst (VectorBlend (Binary src1 src2) mask));
14047   format %{ "vpblendvb  $dst,$src1,$src2,$mask\t! blend packed1L" %}
14048   ins_encode %{
14049     int vector_len = 0;
14050     __ vpblendvb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vector_len);
14051   %}
14052   ins_pipe( pipe_slow );
14053 %}
14054 
14055 instruct pblendvb2L(vecX dst, vecX src, rxmm0 mask) %{
14056   predicate(UseAVX == 0 && UseSSE > 3 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
14057   match(Set dst (VectorBlend (Binary dst src) mask));
14058   format %{ "pblendvb  $dst,$src,$mask\t! blend packed2L" %}
14059   ins_encode %{
14060     __ pblendvb($dst$$XMMRegister, $src$$XMMRegister);
14061   %}
14062   ins_pipe( pipe_slow );
14063 %}
14064 
14065 instruct vpblendvb2L(vecX dst, vecX src1, vecX src2, vecX mask) %{
14066   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
14067   match(Set dst (VectorBlend (Binary src1 src2) mask));
14068   format %{ "vpblendvb  $dst,$src1,$src2,$mask\t! blend packed2L" %}
14069   ins_encode %{
14070     int vector_len = 0;
14071     __ vpblendvb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vector_len);
14072   %}
14073   ins_pipe( pipe_slow );
14074 %}
14075 
14076 instruct vpblendvb4L(vecY dst, vecY src1, vecY src2, vecY mask) %{
14077   predicate(UseAVX >= 2 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
14078   match(Set dst (VectorBlend (Binary src1 src2) mask));
14079   format %{ "vpblendvb  $dst,$src1,$src2,$mask\t! blend packed4L" %}
14080   ins_encode %{
14081     int vector_len = 1;
14082     __ vpblendvb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vector_len);
14083   %}
14084   ins_pipe( pipe_slow );
14085 %}
14086 
14087 instruct blendvpd1D(vecD dst, vecD src, rxmm0 mask) %{
14088   predicate(UseAVX == 0 && UseSSE > 3 && n->as_Vector()->length() == 1 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
14089   match(Set dst (VectorBlend (Binary dst src) mask));
14090   format %{ "blendvpd  $dst,$src,$mask\t! packed1D" %}
14091   ins_encode %{
14092     __ blendvpd($dst$$XMMRegister, $src$$XMMRegister);
14093   %}
14094   ins_pipe( pipe_slow );
14095 %}
14096 
14097 instruct vblendvpd1D(vecD dst, vecD src1, vecD src2, vecD mask) %{
14098   predicate(UseAVX > 0 && n->as_Vector()->length() == 1 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
14099   match(Set dst (VectorBlend (Binary src1 src2) mask));
14100   format %{ "vblendvpd  $dst,$src1,$src2,$mask\t! packed1D" %}
14101   ins_encode %{
14102     int vector_len = 0;
14103     __ vblendvpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vector_len);
14104   %}
14105   ins_pipe( pipe_slow );
14106 %}
14107 
14108 instruct blendvpd2D(vecX dst, vecX src, rxmm0 mask) %{
14109   predicate(UseAVX == 0 && UseSSE > 3 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
14110   match(Set dst (VectorBlend (Binary dst src) mask));
14111   format %{ "blendvpd  $dst,$src,$mask\t! packed2D" %}
14112   ins_encode %{
14113     __ blendvpd($dst$$XMMRegister, $src$$XMMRegister);
14114   %}
14115   ins_pipe( pipe_slow );
14116 %}
14117 
14118 instruct vblendvpd2D(vecX dst, vecX src1, vecX src2, vecX mask) %{
14119   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
14120   match(Set dst (VectorBlend (Binary src1 src2) mask));
14121   format %{ "vblendvpd  $dst,$src1,$src2,$mask\t! packed2D" %}
14122   ins_encode %{
14123     int vector_len = 0;
14124     __ vblendvpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vector_len);
14125   %}
14126   ins_pipe( pipe_slow );
14127 %}
14128 
14129 instruct vblendvpd4D(vecY dst, vecY src1, vecY src2, vecY mask) %{
14130   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
14131   match(Set dst (VectorBlend (Binary src1 src2) mask));
14132   format %{ "vblendvpd  $dst,$src1,$src2,$mask\t! packed4D" %}
14133   ins_encode %{
14134     int vector_len = 1;
14135     __ vblendvpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vector_len);
14136   %}
14137   ins_pipe( pipe_slow );
14138 %}
14139 
14140 // --------------------------------- NEG --------------------------------------
14141 // a = -a
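// There is no packed integer negate instruction; the result is computed as 0 - src
// (zero the destination with pxor, then subtract the source).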
14142 instruct vneg2I_reg(vecD dst, vecD src) %{
14143   predicate(UseSSE > 1 && n->as_Vector()->length() == 2);
14144   match(Set dst (NegVI  src));
14145   effect(TEMP dst);
14146   format %{ "pxor   $dst,$dst\n\t"
14147             "psubd  $dst, $src\t! neg packed2I" %}
14148   ins_cost(150);
14149   ins_encode %{
14150     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
14151     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
14152   %}
14153   ins_pipe( pipe_slow );
14154 %}
14155 
14156 instruct vneg4I_reg(vecX dst, vecX src) %{
14157   predicate(UseSSE > 1 && n->as_Vector()->length() == 4);
14158   match(Set dst (NegVI  src));
14159   effect(TEMP dst);
14160   format %{ "pxor   $dst,$dst\n\t"
14161             "psubd  $dst, $src\t! neg packed4I" %}
14162   ins_cost(150);
14163   ins_encode %{
14164     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
14165     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
14166   %}
14167   ins_pipe( pipe_slow );
14168 %}
14169 
14170 instruct vneg8I_reg(vecY dst, vecY src, vecY tmp) %{
14171   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
14172   match(Set dst (NegVI  src));
14173   effect(TEMP tmp);
14174   format %{ "vpxor   $tmp,$tmp,$tmp\n\t"
14175             "vpsubd  $dst,$tmp,$src\t! neg packed8I" %}
14176   ins_cost(150);
14177   ins_encode %{
14178     int vector_len = 1;
14179     __ vpxor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
14180     __ vpsubd($dst$$XMMRegister, $tmp$$XMMRegister, $src$$XMMRegister, vector_len);
14181   %}
14182   ins_pipe( pipe_slow );
14183 %}
14184 
14185 instruct vneg16I_reg(vecZ dst, vecZ src, vecZ tmp) %{
14186   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
14187   match(Set dst (NegVI  src));
14188   effect(TEMP tmp);
14189   format %{ "vpxor   $tmp,$tmp,$tmp\n\t"
14190             "vpsubd  $dst,$tmp,$src\t! neg packed16I" %}
14191   ins_cost(150);
14192   ins_encode %{
14193     int vector_len = 2;
14194     __ vpxor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
14195     __ vpsubd($dst$$XMMRegister, $tmp$$XMMRegister, $src$$XMMRegister, vector_len);
14196   %}
14197   ins_pipe( pipe_slow );
14198 %}
14199 
14200 instruct vneg1D(regD dst) %{
14201   predicate((UseSSE>=2) && (UseAVX == 0));
14202   match(Set dst (NegVD dst));
14203   ins_cost(150);
14204   format %{ "xorpd $dst,[0x8000000000000000] \t# $dst = -$dst neg packed1D" %}
14205   ins_encode %{
14206     __ xorpd($dst$$XMMRegister, ExternalAddress(double_signflip()));
14207   %}
14208   ins_pipe(pipe_slow);
14209 %}
14210 
14211 instruct vneg1D_reg(vecX dst, vecX src) %{
14212   predicate(UseAVX > 0 && n->as_Vector()->length() == 1);
14213   match(Set dst (NegVD  src));
14214   format %{ "vxorpd $dst,$src\t# $dst = -$src neg packed1D" %}
14215   ins_cost(150);
14216   ins_encode %{
14217     __ vnegatesd($dst$$XMMRegister, $src$$XMMRegister,
14218                  ExternalAddress(double_signflip()));
14219   %}
14220   ins_pipe( pipe_slow );
14221 %}
14222 
14223 instruct vneg2D_reg(vecX dst) %{
14224   predicate((UseSSE>=2));
14225   match(Set dst (NegVD dst));
14226   ins_cost(150);
14227   format %{ "xorpd $dst,[0x8000000000000000]\t# $dst = -$dst neg packed2D" %}
14228   ins_encode %{
14229     __ xorpd($dst$$XMMRegister, ExternalAddress(vector_double_signflip()));
14230   %}
14231   ins_pipe(pipe_slow);
14232 %}
14233 
14234 
14235 instruct vneg4D_reg(vecY dst, vecY src) %{
14236   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
14237   match(Set dst (NegVD  src));
14238   format %{ "vxorpd $dst,$src\t# $dst = -$src neg packed4D" %}
14239   ins_cost(150);
14240   ins_encode %{
14241     int vector_len = 1;
14242     __ vxorpd($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_double_signflip()), vector_len);
14243   %}
14244   ins_pipe( pipe_slow );
14245 %}
14246 
14247 instruct vneg8D_reg(vecZ dst, vecZ src) %{
14248   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
14249   match(Set dst (NegVD  src));
14250   format %{ "vxorpd $dst,$src\t# $dst = -$src neg packed8D" %}
14251   ins_cost(150);
14252   ins_encode %{
14253     int vector_len = 2;
14254     __ vxorpd($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_double_signflip()), vector_len);
14255   %}
14256   ins_pipe( pipe_slow );
14257 %}
14258 
14259 instruct vneg2F_reg(vecD dst) %{
14260   predicate(UseSSE > 0 && n->as_Vector()->length() == 2);
14261   match(Set dst (NegVF dst));
14262   format %{ "xorps $dst,[0x80000000]\t# $dst = -$dst neg packed2F" %}
14263   ins_cost(150);
14264   ins_encode %{
14265     __ xorps($dst$$XMMRegister, ExternalAddress(vector_float_signflip()));
14266   %}
14267   ins_pipe( pipe_slow );
14268 %}
14269 
14270 instruct vneg4F_reg(vecX dst) %{
14271   predicate(UseSSE > 0 && n->as_Vector()->length() == 4);
14272   match(Set dst (NegVF dst));
14273   format %{ "xorps $dst,[0x80000000]\t# $dst = -$dst neg packed4F" %}
14274   ins_cost(150);
14275   ins_encode %{
14276     __ xorps($dst$$XMMRegister, ExternalAddress(vector_float_signflip()));
14277   %}
14278   ins_pipe( pipe_slow );
14279 %}
14280 
14281 instruct vneg8F_reg(vecY dst, vecY src) %{
14282   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
14283   match(Set dst (NegVF  src));
14284   format %{ "vxorps $dst,$src\t# $dst = -$src neg packed8F" %}
14285   ins_cost(150);
14286   ins_encode %{
14287     int vector_len = 1;
14288     __ vxorps($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_float_signflip()), vector_len);
14289   %}
14290   ins_pipe( pipe_slow );
14291 %}
14292 
14293 instruct vneg16F_reg(vecZ dst, vecZ src) %{
14294   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
14295   match(Set dst (NegVF  src));
14296   format %{ "vxorps $dst,$src\t# $dst = -$src neg packed16F" %}
14297   ins_cost(150);
14298   ins_encode %{
14299     int vector_len = 2;
14300     __ vxorps($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_float_signflip()), vector_len);
14301   %}
14302   ins_pipe( pipe_slow );
14303 %}
14304 
14305 // --------------------------------- ABS --------------------------------------
14306 // a = |a|
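// Integer absolute value uses pabsd/vpabsd/evpabsd directly. Floating-point
// absolute value clears the sign bit by AND-ing with a sign mask
// (0x7FFFFFFF per float lane, 0x7FFFFFFFFFFFFFFF per double lane).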
14307 instruct vabs2I_reg(vecD dst, vecD src) %{
14308   predicate(UseSSE > 2 && n->as_Vector()->length() == 2);
14309   match(Set dst (AbsVI  src));
14310   format %{ "pabsd $dst,$src\t# $dst = |$src| abs packed2I" %}
14311   ins_cost(150);
14312   ins_encode %{
14313     __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
14314   %}
14315   ins_pipe( pipe_slow );
14316 %}
14317 
14318 instruct vabs4I_reg(vecX dst, vecX src) %{
14319   predicate(UseSSE > 2 && n->as_Vector()->length() == 4);
14320   match(Set dst (AbsVI  src));
14321   format %{ "pabsd $dst,$src\t# $dst = |$src| abs packed4I" %}
14322   ins_cost(150);
14323   ins_encode %{
14324     __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
14325   %}
14326   ins_pipe( pipe_slow );
14327 %}
14328 
14329 instruct vabs8I_reg(vecY dst, vecY src) %{
14330   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
14331   match(Set dst (AbsVI  src));
14332   format %{ "vpabsd $dst,$src\t# $dst = |$src| abs packed8I" %}
14333   ins_cost(150);
14334   ins_encode %{
14335     int vector_len = 1;
14336     __ vpabsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
14337   %}
14338   ins_pipe( pipe_slow );
14339 %}
14340 
14341 instruct vabs16I_reg(vecZ dst, vecZ src) %{
14342   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
14343   match(Set dst (AbsVI  src));
14344   format %{ "evpabsd $dst,$src\t# $dst = |$src| abs packed16I" %}
14345   ins_cost(150);
14346   ins_encode %{
14347     int vector_len = 2;
14348     __ evpabsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
14349   %}
14350   ins_pipe( pipe_slow );
14351 %}
14352 
14353 instruct vabs1D_reg(vecD dst) %{
14354   predicate(UseSSE > 0 && n->as_Vector()->length() == 1);
14355   match(Set dst (AbsVD  dst));
14356   format %{ "andpd $dst,[0x7FFFFFFFFFFFFFFF]\t# $dst = |$dst| abs packed1D" %}
14357   ins_cost(150);
14358   ins_encode %{
14359     __ andpd($dst$$XMMRegister, ExternalAddress(vector_double_signmask()));
14360   %}
14361   ins_pipe( pipe_slow );
14362 %}
14363 
14364 instruct vabs2D_reg(vecX dst) %{
14365   predicate(UseSSE > 0 && n->as_Vector()->length() == 2);
14366   match(Set dst (AbsVD  dst));
14367   format %{ "andpd $dst,[0x7FFFFFFFFFFFFFFF]\t# $dst = |$dst| abs packed2D" %}
14368   ins_cost(150);
14369   ins_encode %{
14370     __ andpd($dst$$XMMRegister, ExternalAddress(vector_double_signmask()));
14371   %}
14372   ins_pipe( pipe_slow );
14373 %}
14374 
14375 instruct vabs4D_reg(vecY dst, vecY src) %{
14376   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
14377   match(Set dst (AbsVD  src));
14378   format %{ "vandpd $dst,$src\t# $dst = |$src| abs packed4D" %}
14379   ins_cost(150);
14380   ins_encode %{
14381     int vector_len = 1;
14382     __ vandpd($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_double_signmask()), vector_len);
14383   %}
14384   ins_pipe( pipe_slow );
14385 %}
14386 
14387 instruct vabs8D_reg(vecZ dst, vecZ src) %{
14388   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
14389   match(Set dst (AbsVD  src));
14390   format %{ "vandpd $dst,$src\t# $dst = |$src| abs packed8D" %}
14391   ins_cost(150);
14392   ins_encode %{
14393     int vector_len = 2;
14394     __ vandpd($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_double_signmask()), vector_len);
14395   %}
14396   ins_pipe( pipe_slow );
14397 %}
14398 
14399 instruct vabs2F_reg(vecD dst) %{
14400   predicate(UseSSE > 0 && n->as_Vector()->length() == 2);
14401   match(Set dst (AbsVF  dst));
14402   format %{ "andps $dst,[0x7FFFFFFF]\t# $dst = |$dst| abs packed2F" %}
14403   ins_cost(150);
14404   ins_encode %{
14405     __ andps($dst$$XMMRegister, ExternalAddress(vector_float_signmask()));
14406   %}
14407   ins_pipe( pipe_slow );
14408 %}
14409 
14410 instruct vabs4F_reg(vecX dst) %{
14411   predicate(UseSSE > 0 && n->as_Vector()->length() == 4);
14412   match(Set dst (AbsVF  dst));
  format %{ "andps $dst,[0x7FFFFFFF]\t# $dst = |$dst| abs packed4F" %}
14414   ins_cost(150);
14415   ins_encode %{
14416     __ andps($dst$$XMMRegister, ExternalAddress(vector_float_signmask()));
14417   %}
14418   ins_pipe( pipe_slow );
14419 %}
14420 
14421 instruct vabs8F_reg(vecY dst, vecY src) %{
14422   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
14423   match(Set dst (AbsVF  src));
14424   format %{ "vandps $dst,$src\t# $dst = |$src| abs packed8F" %}
14425   ins_cost(150);
14426   ins_encode %{
14427     int vector_len = 1;
14428     __ vandps($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_float_signmask()), vector_len);
14429   %}
14430   ins_pipe( pipe_slow );
14431 %}
14432 
14433 instruct vabs16F_reg(vecZ dst, vecZ src) %{
14434   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
14435   match(Set dst (AbsVF  src));
14436   format %{ "vandps $dst,$src\t# $dst = |$src| abs packed16F" %}
14437   ins_cost(150);
14438   ins_encode %{
14439     int vector_len = 2;
14440     __ vandps($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_float_signmask()), vector_len);
14441   %}
14442   ins_pipe( pipe_slow );
14443 %}
14444 
14445 //------------------------------------- NOT --------------------------------------------
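// Bitwise NOT is computed as an XOR with an all-bits-set constant
// (vector_all_bits_set()). The SSE forms first load the constant into $dst
// and XOR the source into it; the AVX forms fold the constant as a memory
// operand, with $scratch available for materializing its address.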
14446 instruct vnot4B(vecS dst, vecS src) %{
14447   predicate(UseSSE > 1 && n->as_Vector()->length_in_bytes() == 4);
14448   match(Set dst (NotV src));
14449   effect(TEMP dst);
  format %{ "movdl   $dst,0xFFFFFFFF\n\t"
            "pxor    $dst,$src\t! not vectors (4 bytes)" %}
14451   ins_encode %{
14452     __ movdl($dst$$XMMRegister, ExternalAddress(vector_all_bits_set()));
14453     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
14454   %}
14455   ins_pipe( pipe_slow );
14456 %}
14457 
14458 instruct vnot4B_reg(vecS dst, vecS src, rRegL scratch) %{
14459   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
14460   match(Set dst (NotV src));
14461   effect(TEMP scratch);
  format %{ "vpxor   $dst,$src,0xFFFFFFFF \t! not vectors (4 bytes)" %}
14463   ins_encode %{
14464     int vector_len = 0;
14465     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
14466   %}
14467   ins_pipe( pipe_slow );
14468 %}
14469 
14470 instruct vnot8B(vecD dst, vecD src) %{
14471   predicate(UseSSE > 1 && n->as_Vector()->length_in_bytes() == 8);
14472   match(Set dst (NotV src));
14473   effect(TEMP dst);
  format %{ "movq    $dst,0xFFFFFFFF\n\t"
            "pxor    $dst,$src\t! not vectors (8 bytes)" %}
14475   ins_encode %{
14476     __ movq($dst$$XMMRegister, ExternalAddress(vector_all_bits_set()));
14477     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
14478   %}
14479   ins_pipe( pipe_slow );
14480 %}
14481 
14482 instruct vnot8B_reg(vecD dst, vecD src, rRegL scratch) %{
14483   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
14484   match(Set dst (NotV src));
14485   effect(TEMP scratch);
14486   format %{ "vpxor   $dst,$src,0xFFFFFFFF \t! not vectors (8 bytes)" %}
14487   ins_encode %{
14488     int vector_len = 0;
14489     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
14490   %}
14491   ins_pipe( pipe_slow );
14492 %}
14493 
14494 instruct vnot16B(vecX dst, vecX src) %{
14495   predicate(UseSSE > 1 && n->as_Vector()->length_in_bytes() == 16);
14496   match(Set dst (NotV src));
14497   effect(TEMP dst);
  format %{ "movdqu  $dst,0xFFFFFFFF\n\t"
            "pxor    $dst,$src\t! not vectors (16 bytes)" %}
14499   ins_encode %{
14500     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_all_bits_set()));
14501     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
14502   %}
14503   ins_pipe( pipe_slow );
14504 %}
14505 
14506 instruct vnot16B_reg(vecX dst, vecX src, rRegL scratch) %{
14507   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
14508   match(Set dst (NotV src));
14509   effect(TEMP scratch);
14510   format %{ "vpxor   $dst,$src,0xFFFFFFFF \t! not vectors (16 bytes)" %}
14511   ins_encode %{
14512     int vector_len = 0;
14513     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
14514   %}
14515   ins_pipe( pipe_slow );
14516 %}
14517 
14518 instruct vnot32B_reg(vecY dst, vecY src, rRegL scratch) %{
14519   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 32);
14520   match(Set dst (NotV  src));
14521   effect(TEMP scratch);
14522   format %{ "vpxor   $dst,$src,0xFFFFFFFF \t! not vectors (32 bytes)" %}
14523   ins_encode %{
14524     int vector_len = 1;
14525     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
14526   %}
14527   ins_pipe( pipe_slow );
14528 %}
14529 
14530 instruct vnot64B_reg(vecZ dst, vecZ src, rRegL scratch) %{
14531   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
14532   match(Set dst (NotV src));
14533   effect(TEMP scratch);
14534   format %{ "vpxor   $dst,$src,0xFFFFFFFF \t! not vectors (64 bytes)" %}
14535   ins_encode %{
14536     int vector_len = 2;
14537     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
14538   %}
14539   ins_pipe( pipe_slow );
14540 %}
14541 
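// --------------------------------- VectorTest --------------------------------------
// VectorTest reduces a pair of vectors to a scalar 0/1 result via vptest,
// which sets CF when ($src2 AND NOT $src1) is all zeroes and ZF when
// ($src1 AND $src2) is all zeroes. The carrySet forms below therefore
// typically implement an "all lanes set" test and the notZero forms an
// "any lane set" test; the trailing movzbl clears the upper bits of the
// byte-sized setcc result.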
14542 instruct vptest4inae(rRegI dst, vecX src1, vecX src2) %{
14543   predicate(UseAVX > 0 && static_cast<const VectorTestNode*>(n)->get_predicate() == Assembler::carrySet);
14544   match(Set dst (VectorTest src1 src2 ));
14545   format %{ "vptest  $src1,$src2\n\t" 
14546             "setb  $dst\t!" %}
14547   ins_encode %{
14548     int vector_len = 0;
14549     __ vptest($src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14550     __ setb(Assembler::carrySet, $dst$$Register);
14551     __ movzbl($dst$$Register, $dst$$Register);
14552   %}
14553   ins_pipe( pipe_slow );
14554 %}
14555 
14556 instruct vptest4ieq(rRegI dst, vecX src1, vecX src2) %{
14557   predicate(UseAVX > 0 && static_cast<const VectorTestNode*>(n)->get_predicate() == Assembler::notZero);
14558   match(Set dst (VectorTest src1 src2 ));
14559   format %{ "vptest  $src1,$src2\n\t" 
            "setne $dst\t!" %}
14561   ins_encode %{
14562     int vector_len = 0;
14563     __ vptest($src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14564     __ setb(Assembler::notZero, $dst$$Register);
14565     __ movzbl($dst$$Register, $dst$$Register);
14566   %}
14567   ins_pipe( pipe_slow );
14568 %}
14569 
14570 instruct vptest8inae(rRegI dst, vecY src1, vecY src2) %{
14571   predicate(UseAVX > 0 && static_cast<const VectorTestNode*>(n)->get_predicate() == Assembler::carrySet);
14572   match(Set dst (VectorTest src1 src2 ));
14573   format %{ "vptest  $src1,$src2\n\t" 
14574             "setb  $dst\t!" %}
14575   ins_encode %{
14576     int vector_len = 1;
14577     __ vptest($src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14578     __ setb(Assembler::carrySet, $dst$$Register);
14579     __ movzbl($dst$$Register, $dst$$Register);
14580   %}
14581   ins_pipe( pipe_slow );
14582 %}
14583 
14584 instruct vptest8ieq(rRegI dst, vecY src1, vecY src2) %{
14585   predicate(UseAVX > 0 && static_cast<const VectorTestNode*>(n)->get_predicate() == Assembler::notZero);
14586   match(Set dst (VectorTest src1 src2 ));
14587   format %{ "vptest  $src1,$src2\n\t" 
            "setne $dst\t!" %}
14589   ins_encode %{
14590     int vector_len = 1;
14591     __ vptest($src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14592     __ setb(Assembler::notZero, $dst$$Register);
14593     __ movzbl($dst$$Register, $dst$$Register);
14594   %}
14595   ins_pipe( pipe_slow );
14596 %}
14597 
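// --------------------------------- VectorLoadMask --------------------------------------
// VectorLoadMask turns a vector of boolean bytes (0 or 1) into lane-wide
// masks (all zeroes or all ones): each boolean byte is negated to 0/-1 and,
// where the element type is wider than a byte, widened to the element size.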
14598 instruct loadmask8b(vecD dst, vecD src) %{
14599   predicate(UseSSE >= 2 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
14600   match(Set dst (VectorLoadMask src));
14601   effect(TEMP dst);
14602   format %{ "pxor  $dst,$dst\n\t"
14603            "psubb $dst,$src\t! load mask (8B to 8B)" %}
14604   ins_encode %{
14605     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
14606     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
14607   %}
14608   ins_pipe( pipe_slow );
14609 %}
14610 
14611 instruct loadmask16b(vecX dst, vecX src) %{
14612   predicate(UseSSE >= 2 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
14613   match(Set dst (VectorLoadMask src));
14614   effect(TEMP dst);
  format %{ "pxor  $dst,$dst\n\t"
           "psubb $dst,$src\t! load mask (16B to 16B)" %}
14617   ins_encode %{
14618     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
14619     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
14620   %}
14621   ins_pipe( pipe_slow );
14622 %}
14623 
14624 instruct loadmask32b(vecY dst, vecY src) %{
14625   predicate(UseAVX >= 2 && n->as_Vector()->length() == 32 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
14626   match(Set dst (VectorLoadMask src));
14627   effect(TEMP dst);
14628   format %{ "vpxor  $dst,$dst\n\t"
14629            "vpsubb $dst,$src\t! load mask (32B to 32B)" %}
14630   ins_encode %{
14631     int vector_len = 1;
14632     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
14633     __ vpsubb($dst$$XMMRegister, $dst$$XMMRegister, $src$$XMMRegister, vector_len);
14634   %}
14635   ins_pipe( pipe_slow );
14636 %}
14637 
14638 instruct loadmask64b(vecZ dst, vecZ src) %{
14639   predicate(UseAVX > 0 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
14640   match(Set dst (VectorLoadMask src));
14641   effect(TEMP dst);
14642   format %{ "vpxor  $dst,$dst\n\t"
14643            "vpsubb $dst,$src\t! load mask (64B to 64B)" %}
14644   ins_encode %{
14645     int vector_len = 2;
14646     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
14647     __ vpsubb($dst$$XMMRegister, $dst$$XMMRegister, $src$$XMMRegister, vector_len);
14648   %}
14649   ins_pipe( pipe_slow );
14650 %}
14651 
14652 instruct loadmask4s(vecD dst, vecS src) %{
14653   predicate(UseSSE >= 4 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
14654   match(Set dst (VectorLoadMask src));
14655   effect(TEMP dst);
14656   format %{ "pxor  $dst,$dst\n\t"
14657            "psubb $dst,$src\n\t"
14658            "pmovsxbw $dst\t! load mask (4B to 4S)" %}
14659   ins_encode %{
14660     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
14661     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
14662     __ pmovsxbw($dst$$XMMRegister, $dst$$XMMRegister);
14663   %}
14664   ins_pipe( pipe_slow );
14665 %}
14666 
14667 instruct loadmask8s(vecX dst, vecD src) %{
14668   predicate(UseSSE >= 4 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
14669   match(Set dst (VectorLoadMask src));
14670   effect(TEMP dst);
14671   format %{ "pxor  $dst,$dst\n\t"
14672            "psubb $dst,$src\n\t"
14673            "pmovsxbw $dst\t! load mask (8B to 8S)" %}
14674   ins_encode %{
14675     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
14676     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
14677     __ pmovsxbw($dst$$XMMRegister, $dst$$XMMRegister);
14678   %}
14679   ins_pipe( pipe_slow );
14680 %}
14681 
14682 instruct loadmask16s(vecY dst, vecX src) %{
14683   predicate(UseAVX >= 2 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
14684   match(Set dst (VectorLoadMask src));
14685   effect(TEMP dst);
14686   format %{ "vpxor  $dst,$dst\n\t"
14687            "vpsubb $dst,$src\n\t"
14688            "vpmovsxbw $dst\t! load mask (16B to 16S)" %}
14689   ins_encode %{
14690     int vector_len = 1;
14691     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
14692     __ vpsubb($dst$$XMMRegister, $dst$$XMMRegister, $src$$XMMRegister, 0);
14693     __ vpmovsxbw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
14694   %}
14695   ins_pipe( pipe_slow );
14696 %}
14697 
14698 instruct loadmask32s(vecZ dst, vecY src) %{
14699   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
14700   match(Set dst (VectorLoadMask src));
14701   effect(TEMP dst);
14702   format %{ "vpxor  $dst,$dst\n\t"
14703            "vpsubb $dst,$src\n\t"
14704            "vpmovsxbw $dst\t! load mask (32B to 32S)" %}
14705   ins_encode %{
14706     int vector_len = 2;
14707     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 1);
14708     __ vpsubb($dst$$XMMRegister, $dst$$XMMRegister, $src$$XMMRegister, 1);
14709     __ vpmovsxbw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
14710   %}
14711   ins_pipe( pipe_slow );
14712 %}
14713 
14714 instruct loadmask2i(vecD dst, vecS src) %{
14715   predicate(UseSSE >= 4 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
14716   match(Set dst (VectorLoadMask src));
14717   effect(TEMP dst);
14718   format %{ "pxor  $dst,$dst\n\t"
14719            "psubb $dst,$src\n\t"
14720            "pmovsxbd $dst\t! load mask (2B to 2I)" %}
14721   ins_encode %{
14722     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
14723     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
14724     __ pmovsxbd($dst$$XMMRegister, $dst$$XMMRegister);
14725   %}
14726   ins_pipe( pipe_slow );
14727 %}
14728 
14729 instruct loadmask4i(vecX dst, vecS src) %{
14730   predicate(UseSSE >= 4 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
14731   match(Set dst (VectorLoadMask src));
14732   effect(TEMP dst);
14733   format %{ "pxor  $dst,$dst\n\t"
14734            "psubb $dst,$src\n\t"
14735            "pmovsxbd $dst\t! load mask (4B to 4I)" %}
14736   ins_encode %{
14737     int vector_len = 0;
14738     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
14739     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
14740     __ pmovsxbd($dst$$XMMRegister, $dst$$XMMRegister);
14741   %}
14742   ins_pipe( pipe_slow );
14743 %}
14744 
14745 instruct loadmask8i(vecY dst, vecD src) %{
14746   predicate(UseAVX >= 2 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
14747   match(Set dst (VectorLoadMask src));
14748   effect(TEMP dst);
14749   format %{ "vpxor  $dst,$dst\n\t"
14750            "vpsubb $dst,$src\n\t"
14751            "vpmovsxbd $dst\t! load mask (8B to 8I)" %}
14752   ins_encode %{
14753     int vector_len = 1;
14754     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
14755     __ vpsubb($dst$$XMMRegister, $dst$$XMMRegister, $src$$XMMRegister, 0);
14756     __ vpmovsxbd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
14757   %}
14758   ins_pipe( pipe_slow );
14759 %}
14760 
14761 instruct loadmask16i(vecZ dst, vecX src, vecZ tmp) %{
14762   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
14763   match(Set dst (VectorLoadMask src));
14764   effect(TEMP dst, TEMP tmp);
14765   format %{ "vpxor  $dst,$dst\n\t"
14766            "vpmovzxbd $tmp,$src\n\t"
14767            "vpsubd $dst,$tmp\t! load mask (16B to 16I)" %}
14768   ins_encode %{
14769     int vector_len = 2;
14770     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
14771     __ vpmovzxbd($tmp$$XMMRegister, $src$$XMMRegister, vector_len);
14772     __ vpsubd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
14773   %}
14774   ins_pipe( pipe_slow );
14775 %}
14776 
14777 instruct loadmask1l(vecD dst, vecS src) %{
14778   predicate(UseSSE >= 4 && n->as_Vector()->length() == 1 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
14779   match(Set dst (VectorLoadMask src));
14780   effect(TEMP dst);
14781   format %{ "pxor  $dst,$dst\n\t"
14782            "psubb $dst,$src\n\t"
14783            "pmovsxbq $dst\t! load mask (1B to 1L)" %}
14784   ins_encode %{
14785     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
14786     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
14787     __ pmovsxbq($dst$$XMMRegister, $dst$$XMMRegister);
14788   %}
14789   ins_pipe( pipe_slow );
14790 %}
14791 
14792 instruct loadmask2l(vecX dst, vecS src) %{
14793   predicate(UseSSE >= 4 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
14794   match(Set dst (VectorLoadMask src));
14795   effect(TEMP dst);
14796   format %{ "pxor  $dst,$dst\n\t"
14797            "psubb $dst,$src\n\t"
14798            "pmovsxbq $dst\t! load mask (2B to 2L)" %}
14799   ins_encode %{
14800     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
14801     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
14802     __ pmovsxbq($dst$$XMMRegister, $dst$$XMMRegister);
14803   %}
14804   ins_pipe( pipe_slow );
14805 %}
14806 
14807 instruct loadmask4l(vecY dst, vecS src) %{
14808   predicate(UseAVX >= 2 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
14809   match(Set dst (VectorLoadMask src));
14810   effect(TEMP dst);
14811   format %{ "vpxor  $dst,$dst\n\t"
14812            "vpsubb $dst,$src\n\t"
14813            "vpmovsxbq $dst\t! load mask (4B to 4L)" %}
14814   ins_encode %{
14815     int vector_len = 1;
14816     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
14817     __ vpsubb($dst$$XMMRegister, $dst$$XMMRegister, $src$$XMMRegister, 0);
14818     __ vpmovsxbq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
14819   %}
14820   ins_pipe( pipe_slow );
14821 %}
14822 
14823 instruct loadmask8l(vecZ dst, vecD src, vecZ tmp) %{
14824   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
14825   match(Set dst (VectorLoadMask src));
14826   effect(TEMP dst, TEMP tmp);
14827   format %{ "vpxor  $dst,$dst\n\t"
14828            "vpmovzxbq $tmp,$src\n\t"
14829            "vpsubq $dst,$tmp\t! load mask (8B to 8L)" %}
14830   ins_encode %{
14831     int vector_len = 2;
14832     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
14833     __ vpmovzxbq($tmp$$XMMRegister, $src$$XMMRegister, vector_len);
14834     __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
14835   %}
14836   ins_pipe( pipe_slow );
14837 %}
14838 
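// --------------------------------- VectorStoreMask --------------------------------------
// VectorStoreMask is the inverse of VectorLoadMask: lane-wide 0/-1 masks are
// narrowed back to boolean bytes holding 0 or 1. The AVX forms take the
// absolute value of each lane and pack the result down to bytes; the
// AVX-512BW forms compare the lanes against all-ones into a k register and
// then perform a masked load of 0x01 bytes into $dst.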
14839 instruct storemask8b(vecD dst, vecD src) %{
14840   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 && static_cast<const VectorStoreMaskNode*>(n)->GetInputMaskSize() == 1);
14841   match(Set dst (VectorStoreMask src));
14842   format %{ "vpabsb $dst,$src\t! store mask (8B to 8B)" %}
14843   ins_encode %{
14844     int vector_len = 0;
14845     __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, vector_len);
14846   %}
14847   ins_pipe( pipe_slow );
14848 %}
14849 
14850 instruct storemask16b(vecX dst, vecX src) %{
14851   predicate(UseAVX > 0 && n->as_Vector()->length() == 16 && static_cast<const VectorStoreMaskNode*>(n)->GetInputMaskSize() == 1);
14852   match(Set dst (VectorStoreMask src));
14853   format %{ "vpabsb $dst,$src\t! store mask (16B to 16B)" %}
14854   ins_encode %{
14855     int vector_len = 0;
14856     __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, vector_len);
14857   %}
14858   ins_pipe( pipe_slow );
14859 %}
14860 
14861 instruct storemask32b(vecY dst, vecY src) %{
14862   predicate(UseAVX > 1 && n->as_Vector()->length() == 32 && static_cast<const VectorStoreMaskNode*>(n)->GetInputMaskSize() == 1);
14863   match(Set dst (VectorStoreMask src));
14864   format %{ "vpabsb $dst,$src\t! store mask (32B to 32B)" %}
14865   ins_encode %{
14866     int vector_len = 1;
14867     __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, vector_len);
14868   %}
14869   ins_pipe( pipe_slow );
14870 %}
14871 
14872 instruct storemask64b(vecZ dst, vecZ src, rRegL scratch) %{
14873   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64 && static_cast<const VectorStoreMaskNode*>(n)->GetInputMaskSize() == 1);
14874   match(Set dst (VectorStoreMask src));
14875   effect(TEMP scratch);
14876   format %{ "vpcmpeqb k2,$src,0xFFFFFFFF\n\t"
14877            "vmovdqub $dst,k2,0x01010101\t! store mask (64B to 64B)" %}
14878   ins_encode %{
14879     int vector_len = 2;
14880     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
14881     Assembler::ComparisonPredicate cp = Assembler::eq;
14882     __ evpcmpb(ktmp, k0, $src$$XMMRegister, ExternalAddress(vector_all_bits_set()), cp, vector_len, $scratch$$Register);
14883     __ evmovdqub($dst$$XMMRegister, ktmp, ExternalAddress(vector_byte_bitset()), true, vector_len, $scratch$$Register);
14884   %}
14885   ins_pipe( pipe_slow );
14886 %}
14887 
14888 instruct storemask4s(vecS dst, vecD src) %{
14889   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && static_cast<const VectorStoreMaskNode*>(n)->GetInputMaskSize() == 2);
14890   match(Set dst (VectorStoreMask src));
14891   format %{ "vpabsw $dst,$src\n\t"
14892            "vpackuswb $dst,$dst,$dst\t! store mask (4S to 4B)" %}
14893   ins_encode %{
14894     int vector_len = 0;
14895     __ vpabsw($dst$$XMMRegister, $src$$XMMRegister, vector_len);
14896     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
14897   %}
14898   ins_pipe( pipe_slow );
14899 %}
14900 
14901 instruct storemask8s(vecD dst, vecX src) %{
14902   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 && static_cast<const VectorStoreMaskNode*>(n)->GetInputMaskSize() == 2);
14903   match(Set dst (VectorStoreMask src));
14904   format %{ "vpabsw $dst,$src\n\t"
14905            "vpackuswb $dst,$dst,$dst\t! store mask (8S to 8B)" %}
14906   ins_encode %{
14907     int vector_len = 0;
14908     __ vpabsw($dst$$XMMRegister, $src$$XMMRegister, vector_len);
14909     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
14910   %}
14911   ins_pipe( pipe_slow );
14912 %}
14913 
14914 instruct storemask16s(vecX dst, vecY src, vecY tmp) %{
14915   predicate(UseAVX > 1 && n->as_Vector()->length() == 16 && static_cast<const VectorStoreMaskNode*>(n)->GetInputMaskSize() == 2);
14916   match(Set dst (VectorStoreMask src));
14917   effect(TEMP dst, TEMP tmp);
14918   format %{ "vpabsw $dst,$src\n\t"
14919            "vextracti128 $tmp,$dst\n\t"
14920            "vpackuswb $dst,$dst,$tmp\t! store mask (16S to 16B)" %}
14921   ins_encode %{
14922     int vector_len = 1;
14923     __ vpabsw($dst$$XMMRegister, $src$$XMMRegister, vector_len);
14924     __ vextracti128($tmp$$XMMRegister, $dst$$XMMRegister, 0x1);
14925     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
14926   %}
14927   ins_pipe( pipe_slow );
14928 %}
14929 
14930 instruct storemask32s(vecY dst, vecZ src, rRegL scratch) %{
14931   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32 && static_cast<const VectorStoreMaskNode*>(n)->GetInputMaskSize() == 2);
14932   match(Set dst (VectorStoreMask src));
14933   effect(TEMP scratch);
14934   format %{ "vpcmpeqw k2,$src,0xFFFFFFFF\n\t"
14935            "vmovdqub $dst,k2,0x01010101\t! store mask (32S to 32B)" %}
14936   ins_encode %{
14937     int vector_len = 2;
14938     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
14939     Assembler::ComparisonPredicate cp = Assembler::eq;
14940     __ evpcmpw(ktmp, k0, $src$$XMMRegister, ExternalAddress(vector_all_bits_set()), cp, vector_len, $scratch$$Register);
14941     // The dst is 256-bit - thus we can do a smaller move.
14942     __ evmovdqub($dst$$XMMRegister, ktmp, ExternalAddress(vector_byte_bitset()), true, 1, $scratch$$Register);
14943   %}
14944   ins_pipe( pipe_slow );
14945 %}
14946 
14947 
14948 instruct storemask2i(vecS dst, vecD src) %{
14949   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && static_cast<const VectorStoreMaskNode*>(n)->GetInputMaskSize() == 4);
14950   match(Set dst (VectorStoreMask src));
14951   format %{ "vpabsd $dst,$src\n\t"
14952            "vpackusdw $dst,$dst,$dst\n\t"
14953            "vpackuswb $dst,$dst,$dst\t! store mask (2I to 2B)" %}
14954   ins_encode %{
14955     int vector_len = 0;
14956     __ vpabsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
14957     __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
14958     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
14959   %}
14960   ins_pipe( pipe_slow );
14961 %}
14962 
14963 instruct storemask4i(vecS dst, vecX src) %{
14964   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && static_cast<const VectorStoreMaskNode*>(n)->GetInputMaskSize() == 4);
14965   match(Set dst (VectorStoreMask src));
14966   format %{ "vpabsd $dst,$src\n\t"
14967            "vpackusdw $dst,$dst,$dst\n\t"
14968            "vpackuswb $dst,$dst,$dst\t! store mask (4I to 4B)" %}
14969   ins_encode %{
14970     int vector_len = 0;
14971     __ vpabsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
14972     __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
14973     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
14974   %}
14975   ins_pipe( pipe_slow );
14976 %}
14977 
14978 instruct storemask8i(vecD dst, vecY src, vecY tmp) %{
14979   predicate(UseAVX > 1 && n->as_Vector()->length() == 8 && static_cast<const VectorStoreMaskNode*>(n)->GetInputMaskSize() == 4);
14980   match(Set dst (VectorStoreMask src));
14981   effect(TEMP dst, TEMP tmp);
14982   format %{ "vpxor  $dst,$dst\n\t"
14983            "vpsubd $dst,$src\n\t"
14984            "vextracti128 $tmp,$dst\n\t"
14985            "vpackusdw $dst,$dst,$tmp\n\t"
14986            "vpackuswb $dst,$dst,$dst\t! store mask (8I to 8B)" %}
14987   ins_encode %{
14988     int vector_len = 1;
14989     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
14990     __ vpsubd($dst$$XMMRegister, $dst$$XMMRegister, $src$$XMMRegister, vector_len);
14991     __ vextracti128($tmp$$XMMRegister, $dst$$XMMRegister, 0x1);
14992     __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
14993     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
14994   %}
14995   ins_pipe( pipe_slow );
14996 %}
14997 
14998 instruct storemask16i(vecX dst, vecZ src, rRegL scratch) %{
14999   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16 && static_cast<const VectorStoreMaskNode*>(n)->GetInputMaskSize() == 4);
15000   match(Set dst (VectorStoreMask src));
15001   effect(TEMP scratch);
15002   format %{ "vpcmpeqd k2,$src,0xFFFFFFFF\n\t"
15003            "vmovdqub $dst,k2,0x01010101\t! store mask (16I to 16B)" %}
15004   ins_encode %{
15005     int vector_len = 2;
15006     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
15007     __ evpcmpeqd(ktmp, k0, $src$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
15008     // The dst is only 128-bit - thus we can do a smaller move.
15009     __ evmovdqub($dst$$XMMRegister, ktmp, ExternalAddress(vector_byte_bitset()), true, 0, $scratch$$Register);
15010   %}
15011   ins_pipe( pipe_slow );
15012 %}
15013 
15014 instruct storemask1l(vecS dst, vecD src) %{
15015   predicate(UseAVX > 1 && n->as_Vector()->length() == 1 && static_cast<const VectorStoreMaskNode*>(n)->GetInputMaskSize() == 8);
15016   match(Set dst (VectorStoreMask src));
15017   format %{ "vpabsd $dst,$src\n\t"
15018            "vpackusdw $dst,$dst,$dst\n\t"
15019            "vpackuswb $dst,$dst,$dst\t! store mask (1L to 1B)" %}
15020   ins_encode %{
15021     int vector_len = 0;
15022     __ vpabsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
15023     __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
15024     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
15025   %}
15026   ins_pipe( pipe_slow );
15027 %}
15028 
15029 instruct storemask2l(vecS dst, vecX src) %{
15030   predicate(UseAVX > 1 && n->as_Vector()->length() == 2 && static_cast<const VectorStoreMaskNode*>(n)->GetInputMaskSize() == 8);
15031   match(Set dst (VectorStoreMask src));
15032   format %{ "vpshufd $dst,$src,0x8\n\t"
15033            "vpabsd $dst,$dst\n\t"
15034            "vpackusdw $dst,$dst,$dst\n\t"
15035            "vpackuswb $dst,$dst,$dst\t! store mask (2L to 2B)" %}
15036   ins_encode %{
15037     int vector_len = 0;
15038     __ vpshufd($dst$$XMMRegister, $src$$XMMRegister, 0x8, vector_len);
15039     __ vpabsd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
15040     __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
15041     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
15042   %}
15043   ins_pipe( pipe_slow );
15044 %}
15045 
15046 instruct storemask4l(vecS dst, vecY src, rRegL scratch) %{
15047   predicate(UseAVX > 1 && n->as_Vector()->length() == 4 && static_cast<const VectorStoreMaskNode*>(n)->GetInputMaskSize() == 8);
15048   match(Set dst (VectorStoreMask src));
15049   effect(TEMP scratch, TEMP dst);
15050   format %{ "vmovdqu $dst,[0,2,4,6,1,3,5,7]\n\t"
           "vpermd $dst,$dst,$src\n\t"
15052            "vpabsd $dst,$dst\n\t"
15053            "vpackusdw $dst,$dst,$dst\n\t"
15054            "vpackuswb $dst,$dst,$dst\t! store mask (4L to 4B)" %}
15055   ins_encode %{
15056     // vpermd and load are 256-bit, but all others are 128-bit instructions.
15057     int vector_len = 0;
15058     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_long_perm_mask()), $scratch$$Register);
15059     __ vpermd($dst$$XMMRegister, $dst$$XMMRegister, $src$$XMMRegister);
15060     __ vpabsd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
15061     __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
15062     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
15063   %}
15064   ins_pipe( pipe_slow );
15065 %}
15066 
15067 instruct storemask8l(vecD dst, vecZ src, rRegL scratch) %{
15068   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8 && static_cast<const VectorStoreMaskNode*>(n)->GetInputMaskSize() == 8);
15069   match(Set dst (VectorStoreMask src));
15070   effect(TEMP scratch);
15071   format %{ "vpcmpeqq k2,$src,0xFFFFFFFF\n\t"
15072            "vmovdqub $dst,k2,0x01010101\t! store mask (8L to 8B)" %}
15073   ins_encode %{
15074     int vector_len = 2;
15075     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
15076     Assembler::ComparisonPredicate cp = Assembler::eq;
15077     __ evpcmpq(ktmp, k0, $src$$XMMRegister, ExternalAddress(vector_all_bits_set()), cp, vector_len, $scratch$$Register);
15078     // The dst is only 128-bit - thus we can do a smaller move.
15079     __ evmovdqub($dst$$XMMRegister, ktmp, ExternalAddress(vector_byte_bitset()), true, 0, $scratch$$Register);
15080   %}
15081   ins_pipe( pipe_slow );
15082 %}
15083 
15084 // --------------------------------- FMA --------------------------------------
15085 
15086 // a * b + c
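// Each FMA form computes c = a * b + c with a single rounding step via the
// vfmad/vfmaf macro-assembler helpers; the _mem variants fold the load of b
// into the FMA instruction itself.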
15087 instruct vfma2D_reg(vecX a, vecX b, vecX c) %{
15088   predicate(UseFMA && n->as_Vector()->length() == 2);
15089   match(Set c (FmaVD  c (Binary a b)));
15090   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed2D" %}
15091   ins_cost(150);
15092   ins_encode %{
15093     int vector_len = 0;
15094     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
15095   %}
15096   ins_pipe( pipe_slow );
15097 %}
15098 
15099 // a * b + c
15100 instruct vfma2D_mem(vecX a, memory b, vecX c) %{
15101   predicate(UseFMA && n->as_Vector()->length() == 2);
15102   match(Set c (FmaVD  c (Binary a (LoadVector b))));
15103   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed2D" %}
15104   ins_cost(150);
15105   ins_encode %{
15106     int vector_len = 0;
15107     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
15108   %}
15109   ins_pipe( pipe_slow );
15110 %}
15111 
15112 
15113 // a * b + c
15114 instruct vfma4D_reg(vecY a, vecY b, vecY c) %{
15115   predicate(UseFMA && n->as_Vector()->length() == 4);
15116   match(Set c (FmaVD  c (Binary a b)));
15117   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed4D" %}
15118   ins_cost(150);
15119   ins_encode %{
15120     int vector_len = 1;
15121     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
15122   %}
15123   ins_pipe( pipe_slow );
15124 %}
15125 
15126 // a * b + c
15127 instruct vfma4D_mem(vecY a, memory b, vecY c) %{
15128   predicate(UseFMA && n->as_Vector()->length() == 4);
15129   match(Set c (FmaVD  c (Binary a (LoadVector b))));
15130   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed4D" %}
15131   ins_cost(150);
15132   ins_encode %{
15133     int vector_len = 1;
15134     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
15135   %}
15136   ins_pipe( pipe_slow );
15137 %}
15138 
15139 // a * b + c
15140 instruct vfma8D_reg(vecZ a, vecZ b, vecZ c) %{
15141   predicate(UseFMA && n->as_Vector()->length() == 8);
15142   match(Set c (FmaVD  c (Binary a b)));
15143   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed8D" %}
15144   ins_cost(150);
15145   ins_encode %{
15146     int vector_len = 2;
15147     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
15148   %}
15149   ins_pipe( pipe_slow );
15150 %}
15151 
15152 // a * b + c
15153 instruct vfma8D_mem(vecZ a, memory b, vecZ c) %{
15154   predicate(UseFMA && n->as_Vector()->length() == 8);
15155   match(Set c (FmaVD  c (Binary a (LoadVector b))));
15156   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed8D" %}
15157   ins_cost(150);
15158   ins_encode %{
15159     int vector_len = 2;
15160     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
15161   %}
15162   ins_pipe( pipe_slow );
15163 %}
15164 
15165 // a * b + c
15166 instruct vfma4F_reg(vecX a, vecX b, vecX c) %{
15167   predicate(UseFMA && n->as_Vector()->length() == 4);
15168   match(Set c (FmaVF  c (Binary a b)));
15169   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed4F" %}
15170   ins_cost(150);
15171   ins_encode %{
15172     int vector_len = 0;
15173     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
15174   %}
15175   ins_pipe( pipe_slow );
15176 %}
15177 
15178 // a * b + c
15179 instruct vfma4F_mem(vecX a, memory b, vecX c) %{
15180   predicate(UseFMA && n->as_Vector()->length() == 4);
15181   match(Set c (FmaVF  c (Binary a (LoadVector b))));
15182   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed4F" %}
15183   ins_cost(150);
15184   ins_encode %{
15185     int vector_len = 0;
15186     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
15187   %}
15188   ins_pipe( pipe_slow );
15189 %}
15190 
15191 // a * b + c
15192 instruct vfma8F_reg(vecY a, vecY b, vecY c) %{
15193   predicate(UseFMA && n->as_Vector()->length() == 8);
15194   match(Set c (FmaVF  c (Binary a b)));
15195   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed8F" %}
15196   ins_cost(150);
15197   ins_encode %{
15198     int vector_len = 1;
15199     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
15200   %}
15201   ins_pipe( pipe_slow );
15202 %}
15203 
15204 // a * b + c
15205 instruct vfma8F_mem(vecY a, memory b, vecY c) %{
15206   predicate(UseFMA && n->as_Vector()->length() == 8);
15207   match(Set c (FmaVF  c (Binary a (LoadVector b))));
15208   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed8F" %}
15209   ins_cost(150);
15210   ins_encode %{
15211     int vector_len = 1;
15212     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
15213   %}
15214   ins_pipe( pipe_slow );
15215 %}
15216 
15217 // a * b + c
15218 instruct vfma16F_reg(vecZ a, vecZ b, vecZ c) %{
15219   predicate(UseFMA && n->as_Vector()->length() == 16);
15220   match(Set c (FmaVF  c (Binary a b)));
15221   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed16F" %}
15222   ins_cost(150);
15223   ins_encode %{
15224     int vector_len = 2;
15225     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
15226   %}
15227   ins_pipe( pipe_slow );
15228 %}
15229 
15230 // a * b + c
15231 instruct vfma16F_mem(vecZ a, memory b, vecZ c) %{
15232   predicate(UseFMA && n->as_Vector()->length() == 16);
15233   match(Set c (FmaVF  c (Binary a (LoadVector b))));
15234   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed16F" %}
15235   ins_cost(150);
15236   ins_encode %{
15237     int vector_len = 2;
15238     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
15239   %}
15240   ins_pipe( pipe_slow );
15241 %}