1 //
   2 // Copyright (c) 2011, 2017, Oracle and/or its affiliates. All rights reserved.
   3 // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4 //
   5 // This code is free software; you can redistribute it and/or modify it
   6 // under the terms of the GNU General Public License version 2 only, as
   7 // published by the Free Software Foundation.
   8 //
   9 // This code is distributed in the hope that it will be useful, but WITHOUT
  10 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11 // FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12 // version 2 for more details (a copy is included in the LICENSE file that
  13 // accompanied this code).
  14 //
  15 // You should have received a copy of the GNU General Public License version
  16 // 2 along with this work; if not, write to the Free Software Foundation,
  17 // Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18 //
  19 // Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20 // or visit www.oracle.com if you need additional information or have any
  21 // questions.
  22 //
  23 //
  24 
  25 // X86 Common Architecture Description File
  26 
  27 //----------REGISTER DEFINITION BLOCK------------------------------------------
  28 // This information is used by the matcher and the register allocator to
  29 // describe individual registers and classes of registers within the target
  30 // architecture.
  31 
  32 register %{
  33 //----------Architecture Description Register Definitions----------------------
  34 // General Registers
  35 // "reg_def"  name ( register save type, C convention save type,
  36 //                   ideal register type, encoding );
  37 // Register Save Types:
  38 //
  39 // NS  = No-Save:       The register allocator assumes that these registers
  40 //                      can be used without saving upon entry to the method, &
  41 //                      that they do not need to be saved at call sites.
  42 //
  43 // SOC = Save-On-Call:  The register allocator assumes that these registers
  44 //                      can be used without saving upon entry to the method,
  45 //                      but that they must be saved at call sites.
  46 //
  47 // SOE = Save-On-Entry: The register allocator assumes that these registers
  48 //                      must be saved before using them upon entry to the
  49 //                      method, but they do not need to be saved at call
  50 //                      sites.
  51 //
  52 // AS  = Always-Save:   The register allocator assumes that these registers
  53 //                      must be saved before using them upon entry to the
  54 //                      method, & that they must be saved at call sites.
  55 //
  56 // Ideal Register Type is used to determine how to save & restore a
  57 // register.  Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get
  58 // spilled with LoadP/StoreP.  If the register supports both, use Op_RegI.
  59 //
  60 // The encoding number is the actual bit-pattern placed into the opcodes.
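     // In the XMM definitions below the encoding simply matches the hardware
     // register number: 0 for XMM0 up through 31 for XMM31.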
  61 
  62 // XMM registers.  512-bit registers of 16 words each, labeled (a)-p.
  63 // Word a in each register holds a Float, words ab hold a Double.
  64 // The whole registers are used in SSE4.2 intrinsics,
  65 // array copy stubs and superword operations (see UseSSE42Intrinsics,
  66 // UseXMMForArrayCopy and UseSuperWord flags).
  67 // For pre-EVEX architectures:
  68 //      XMM8-XMM15 must be encoded with REX (VEX for UseAVX)
  69 // For EVEX-enabled architectures:
  70 //      XMM8-XMM31 must be encoded with REX (EVEX for UseAVX).
  71 //
  72 // Linux ABI:   No XMM registers are preserved across function calls;
  73 //              XMM0-XMM7 might hold parameters
  74 // Windows ABI: XMM6-XMM15 are preserved across function calls;
  75 //              XMM0-XMM3 might hold parameters
  76 
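     // Each 512-bit register is modeled as sixteen 32-bit slots (XMMn, XMMnb .. XMMnp),
     // taken from the base VMReg and its ->next(1) .. ->next(15) neighbours; the
     // register classes further down pick one, two, four, eight or all sixteen of
     // these slots depending on the operand width.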
  77 reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
  78 reg_def XMM0b( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(1));
  79 reg_def XMM0c( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(2));
  80 reg_def XMM0d( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(3));
  81 reg_def XMM0e( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(4));
  82 reg_def XMM0f( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(5));
  83 reg_def XMM0g( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(6));
  84 reg_def XMM0h( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(7));
  85 reg_def XMM0i( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(8));
  86 reg_def XMM0j( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(9));
  87 reg_def XMM0k( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(10));
  88 reg_def XMM0l( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(11));
  89 reg_def XMM0m( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(12));
  90 reg_def XMM0n( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(13));
  91 reg_def XMM0o( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(14));
  92 reg_def XMM0p( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(15));
  93 
  94 reg_def XMM1 ( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg());
  95 reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(1));
  96 reg_def XMM1c( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(2));
  97 reg_def XMM1d( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(3));
  98 reg_def XMM1e( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(4));
  99 reg_def XMM1f( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(5));
 100 reg_def XMM1g( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(6));
 101 reg_def XMM1h( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(7));
 102 reg_def XMM1i( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(8));
 103 reg_def XMM1j( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(9));
 104 reg_def XMM1k( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(10));
 105 reg_def XMM1l( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(11));
 106 reg_def XMM1m( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(12));
 107 reg_def XMM1n( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(13));
 108 reg_def XMM1o( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(14));
 109 reg_def XMM1p( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(15));
 110 
 111 reg_def XMM2 ( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg());
 112 reg_def XMM2b( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(1));
 113 reg_def XMM2c( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(2));
 114 reg_def XMM2d( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(3));
 115 reg_def XMM2e( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(4));
 116 reg_def XMM2f( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(5));
 117 reg_def XMM2g( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(6));
 118 reg_def XMM2h( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(7));
 119 reg_def XMM2i( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(8));
 120 reg_def XMM2j( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(9));
 121 reg_def XMM2k( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(10));
 122 reg_def XMM2l( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(11));
 123 reg_def XMM2m( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(12));
 124 reg_def XMM2n( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(13));
 125 reg_def XMM2o( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(14));
 126 reg_def XMM2p( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(15));
 127 
 128 reg_def XMM3 ( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg());
 129 reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(1));
 130 reg_def XMM3c( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(2));
 131 reg_def XMM3d( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(3));
 132 reg_def XMM3e( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(4));
 133 reg_def XMM3f( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(5));
 134 reg_def XMM3g( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(6));
 135 reg_def XMM3h( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(7));
 136 reg_def XMM3i( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(8));
 137 reg_def XMM3j( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(9));
 138 reg_def XMM3k( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(10));
 139 reg_def XMM3l( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(11));
 140 reg_def XMM3m( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(12));
 141 reg_def XMM3n( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(13));
 142 reg_def XMM3o( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(14));
 143 reg_def XMM3p( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(15));
 144 
 145 reg_def XMM4 ( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg());
 146 reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(1));
 147 reg_def XMM4c( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(2));
 148 reg_def XMM4d( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(3));
 149 reg_def XMM4e( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(4));
 150 reg_def XMM4f( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(5));
 151 reg_def XMM4g( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(6));
 152 reg_def XMM4h( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(7));
 153 reg_def XMM4i( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(8));
 154 reg_def XMM4j( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(9));
 155 reg_def XMM4k( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(10));
 156 reg_def XMM4l( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(11));
 157 reg_def XMM4m( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(12));
 158 reg_def XMM4n( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(13));
 159 reg_def XMM4o( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(14));
 160 reg_def XMM4p( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(15));
 161 
 162 reg_def XMM5 ( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg());
 163 reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(1));
 164 reg_def XMM5c( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(2));
 165 reg_def XMM5d( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(3));
 166 reg_def XMM5e( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(4));
 167 reg_def XMM5f( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(5));
 168 reg_def XMM5g( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(6));
 169 reg_def XMM5h( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(7));
 170 reg_def XMM5i( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(8));
 171 reg_def XMM5j( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(9));
 172 reg_def XMM5k( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(10));
 173 reg_def XMM5l( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(11));
 174 reg_def XMM5m( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(12));
 175 reg_def XMM5n( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(13));
 176 reg_def XMM5o( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(14));
 177 reg_def XMM5p( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(15));
 178 
 179 reg_def XMM6 ( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg());
 180 reg_def XMM6b( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(1));
 181 reg_def XMM6c( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(2));
 182 reg_def XMM6d( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(3));
 183 reg_def XMM6e( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(4));
 184 reg_def XMM6f( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(5));
 185 reg_def XMM6g( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(6));
 186 reg_def XMM6h( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(7));
 187 reg_def XMM6i( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(8));
 188 reg_def XMM6j( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(9));
 189 reg_def XMM6k( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(10));
 190 reg_def XMM6l( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(11));
 191 reg_def XMM6m( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(12));
 192 reg_def XMM6n( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(13));
 193 reg_def XMM6o( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(14));
 194 reg_def XMM6p( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(15));
 195 
 196 reg_def XMM7 ( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg());
 197 reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(1));
 198 reg_def XMM7c( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(2));
 199 reg_def XMM7d( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(3));
 200 reg_def XMM7e( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(4));
 201 reg_def XMM7f( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(5));
 202 reg_def XMM7g( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(6));
 203 reg_def XMM7h( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(7));
 204 reg_def XMM7i( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(8));
 205 reg_def XMM7j( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(9));
 206 reg_def XMM7k( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(10));
 207 reg_def XMM7l( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(11));
 208 reg_def XMM7m( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(12));
 209 reg_def XMM7n( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(13));
 210 reg_def XMM7o( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(14));
 211 reg_def XMM7p( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(15));
 212 
 213 #ifdef _LP64
 214 
 215 reg_def XMM8 ( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg());
 216 reg_def XMM8b( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(1));
 217 reg_def XMM8c( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(2));
 218 reg_def XMM8d( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(3));
 219 reg_def XMM8e( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(4));
 220 reg_def XMM8f( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(5));
 221 reg_def XMM8g( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(6));
 222 reg_def XMM8h( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(7));
 223 reg_def XMM8i( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(8));
 224 reg_def XMM8j( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(9));
 225 reg_def XMM8k( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(10));
 226 reg_def XMM8l( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(11));
 227 reg_def XMM8m( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(12));
 228 reg_def XMM8n( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(13));
 229 reg_def XMM8o( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(14));
 230 reg_def XMM8p( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(15));
 231 
 232 reg_def XMM9 ( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg());
 233 reg_def XMM9b( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(1));
 234 reg_def XMM9c( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(2));
 235 reg_def XMM9d( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(3));
 236 reg_def XMM9e( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(4));
 237 reg_def XMM9f( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(5));
 238 reg_def XMM9g( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(6));
 239 reg_def XMM9h( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(7));
 240 reg_def XMM9i( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(8));
 241 reg_def XMM9j( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(9));
 242 reg_def XMM9k( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(10));
 243 reg_def XMM9l( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(11));
 244 reg_def XMM9m( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(12));
 245 reg_def XMM9n( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(13));
 246 reg_def XMM9o( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(14));
 247 reg_def XMM9p( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(15));
 248 
 249 reg_def XMM10 ( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg());
 250 reg_def XMM10b( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(1));
 251 reg_def XMM10c( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(2));
 252 reg_def XMM10d( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(3));
 253 reg_def XMM10e( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(4));
 254 reg_def XMM10f( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(5));
 255 reg_def XMM10g( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(6));
 256 reg_def XMM10h( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(7));
 257 reg_def XMM10i( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(8));
 258 reg_def XMM10j( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(9));
 259 reg_def XMM10k( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(10));
 260 reg_def XMM10l( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(11));
 261 reg_def XMM10m( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(12));
 262 reg_def XMM10n( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(13));
 263 reg_def XMM10o( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(14));
 264 reg_def XMM10p( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(15));
 265 
 266 reg_def XMM11 ( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg());
 267 reg_def XMM11b( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(1));
 268 reg_def XMM11c( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(2));
 269 reg_def XMM11d( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(3));
 270 reg_def XMM11e( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(4));
 271 reg_def XMM11f( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(5));
 272 reg_def XMM11g( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(6));
 273 reg_def XMM11h( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(7));
 274 reg_def XMM11i( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(8));
 275 reg_def XMM11j( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(9));
 276 reg_def XMM11k( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(10));
 277 reg_def XMM11l( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(11));
 278 reg_def XMM11m( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(12));
 279 reg_def XMM11n( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(13));
 280 reg_def XMM11o( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(14));
 281 reg_def XMM11p( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(15));
 282 
 283 reg_def XMM12 ( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg());
 284 reg_def XMM12b( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(1));
 285 reg_def XMM12c( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(2));
 286 reg_def XMM12d( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(3));
 287 reg_def XMM12e( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(4));
 288 reg_def XMM12f( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(5));
 289 reg_def XMM12g( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(6));
 290 reg_def XMM12h( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(7));
 291 reg_def XMM12i( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(8));
 292 reg_def XMM12j( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(9));
 293 reg_def XMM12k( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(10));
 294 reg_def XMM12l( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(11));
 295 reg_def XMM12m( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(12));
 296 reg_def XMM12n( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(13));
 297 reg_def XMM12o( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(14));
 298 reg_def XMM12p( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(15));
 299 
 300 reg_def XMM13 ( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg());
 301 reg_def XMM13b( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(1));
 302 reg_def XMM13c( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(2));
 303 reg_def XMM13d( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(3));
 304 reg_def XMM13e( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(4));
 305 reg_def XMM13f( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(5));
 306 reg_def XMM13g( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(6));
 307 reg_def XMM13h( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(7));
 308 reg_def XMM13i( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(8));
 309 reg_def XMM13j( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(9));
 310 reg_def XMM13k( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(10));
 311 reg_def XMM13l( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(11));
 312 reg_def XMM13m( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(12));
 313 reg_def XMM13n( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(13));
 314 reg_def XMM13o( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(14));
 315 reg_def XMM13p( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(15));
 316 
 317 reg_def XMM14 ( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg());
 318 reg_def XMM14b( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(1));
 319 reg_def XMM14c( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(2));
 320 reg_def XMM14d( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(3));
 321 reg_def XMM14e( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(4));
 322 reg_def XMM14f( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(5));
 323 reg_def XMM14g( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(6));
 324 reg_def XMM14h( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(7));
 325 reg_def XMM14i( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(8));
 326 reg_def XMM14j( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(9));
 327 reg_def XMM14k( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(10));
 328 reg_def XMM14l( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(11));
 329 reg_def XMM14m( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(12));
 330 reg_def XMM14n( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(13));
 331 reg_def XMM14o( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(14));
 332 reg_def XMM14p( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(15));
 333 
 334 reg_def XMM15 ( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg());
 335 reg_def XMM15b( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(1));
 336 reg_def XMM15c( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(2));
 337 reg_def XMM15d( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(3));
 338 reg_def XMM15e( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(4));
 339 reg_def XMM15f( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(5));
 340 reg_def XMM15g( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(6));
 341 reg_def XMM15h( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(7));
 342 reg_def XMM15i( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(8));
 343 reg_def XMM15j( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(9));
 344 reg_def XMM15k( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(10));
 345 reg_def XMM15l( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(11));
 346 reg_def XMM15m( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(12));
 347 reg_def XMM15n( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(13));
 348 reg_def XMM15o( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(14));
 349 reg_def XMM15p( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(15));
 350 
 351 reg_def XMM16 ( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg());
 352 reg_def XMM16b( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(1));
 353 reg_def XMM16c( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(2));
 354 reg_def XMM16d( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(3));
 355 reg_def XMM16e( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(4));
 356 reg_def XMM16f( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(5));
 357 reg_def XMM16g( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(6));
 358 reg_def XMM16h( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(7));
 359 reg_def XMM16i( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(8));
 360 reg_def XMM16j( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(9));
 361 reg_def XMM16k( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(10));
 362 reg_def XMM16l( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(11));
 363 reg_def XMM16m( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(12));
 364 reg_def XMM16n( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(13));
 365 reg_def XMM16o( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(14));
 366 reg_def XMM16p( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(15));
 367 
 368 reg_def XMM17 ( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg());
 369 reg_def XMM17b( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(1));
 370 reg_def XMM17c( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(2));
 371 reg_def XMM17d( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(3));
 372 reg_def XMM17e( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(4));
 373 reg_def XMM17f( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(5));
 374 reg_def XMM17g( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(6));
 375 reg_def XMM17h( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(7));
 376 reg_def XMM17i( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(8));
 377 reg_def XMM17j( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(9));
 378 reg_def XMM17k( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(10));
 379 reg_def XMM17l( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(11));
 380 reg_def XMM17m( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(12));
 381 reg_def XMM17n( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(13));
 382 reg_def XMM17o( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(14));
 383 reg_def XMM17p( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(15));
 384 
 385 reg_def XMM18 ( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg());
 386 reg_def XMM18b( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(1));
 387 reg_def XMM18c( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(2));
 388 reg_def XMM18d( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(3));
 389 reg_def XMM18e( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(4));
 390 reg_def XMM18f( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(5));
 391 reg_def XMM18g( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(6));
 392 reg_def XMM18h( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(7));
 393 reg_def XMM18i( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(8));
 394 reg_def XMM18j( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(9));
 395 reg_def XMM18k( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(10));
 396 reg_def XMM18l( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(11));
 397 reg_def XMM18m( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(12));
 398 reg_def XMM18n( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(13));
 399 reg_def XMM18o( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(14));
 400 reg_def XMM18p( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(15));
 401 
 402 reg_def XMM19 ( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg());
 403 reg_def XMM19b( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(1));
 404 reg_def XMM19c( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(2));
 405 reg_def XMM19d( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(3));
 406 reg_def XMM19e( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(4));
 407 reg_def XMM19f( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(5));
 408 reg_def XMM19g( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(6));
 409 reg_def XMM19h( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(7));
 410 reg_def XMM19i( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(8));
 411 reg_def XMM19j( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(9));
 412 reg_def XMM19k( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(10));
 413 reg_def XMM19l( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(11));
 414 reg_def XMM19m( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(12));
 415 reg_def XMM19n( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(13));
 416 reg_def XMM19o( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(14));
 417 reg_def XMM19p( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(15));
 418 
 419 reg_def XMM20 ( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg());
 420 reg_def XMM20b( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(1));
 421 reg_def XMM20c( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(2));
 422 reg_def XMM20d( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(3));
 423 reg_def XMM20e( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(4));
 424 reg_def XMM20f( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(5));
 425 reg_def XMM20g( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(6));
 426 reg_def XMM20h( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(7));
 427 reg_def XMM20i( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(8));
 428 reg_def XMM20j( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(9));
 429 reg_def XMM20k( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(10));
 430 reg_def XMM20l( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(11));
 431 reg_def XMM20m( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(12));
 432 reg_def XMM20n( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(13));
 433 reg_def XMM20o( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(14));
 434 reg_def XMM20p( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(15));
 435 
 436 reg_def XMM21 ( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg());
 437 reg_def XMM21b( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(1));
 438 reg_def XMM21c( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(2));
 439 reg_def XMM21d( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(3));
 440 reg_def XMM21e( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(4));
 441 reg_def XMM21f( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(5));
 442 reg_def XMM21g( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(6));
 443 reg_def XMM21h( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(7));
 444 reg_def XMM21i( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(8));
 445 reg_def XMM21j( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(9));
 446 reg_def XMM21k( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(10));
 447 reg_def XMM21l( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(11));
 448 reg_def XMM21m( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(12));
 449 reg_def XMM21n( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(13));
 450 reg_def XMM21o( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(14));
 451 reg_def XMM21p( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(15));
 452 
 453 reg_def XMM22 ( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg());
 454 reg_def XMM22b( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(1));
 455 reg_def XMM22c( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(2));
 456 reg_def XMM22d( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(3));
 457 reg_def XMM22e( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(4));
 458 reg_def XMM22f( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(5));
 459 reg_def XMM22g( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(6));
 460 reg_def XMM22h( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(7));
 461 reg_def XMM22i( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(8));
 462 reg_def XMM22j( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(9));
 463 reg_def XMM22k( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(10));
 464 reg_def XMM22l( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(11));
 465 reg_def XMM22m( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(12));
 466 reg_def XMM22n( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(13));
 467 reg_def XMM22o( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(14));
 468 reg_def XMM22p( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(15));
 469 
 470 reg_def XMM23 ( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg());
 471 reg_def XMM23b( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(1));
 472 reg_def XMM23c( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(2));
 473 reg_def XMM23d( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(3));
 474 reg_def XMM23e( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(4));
 475 reg_def XMM23f( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(5));
 476 reg_def XMM23g( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(6));
 477 reg_def XMM23h( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(7));
 478 reg_def XMM23i( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(8));
 479 reg_def XMM23j( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(9));
 480 reg_def XMM23k( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(10));
 481 reg_def XMM23l( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(11));
 482 reg_def XMM23m( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(12));
 483 reg_def XMM23n( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(13));
 484 reg_def XMM23o( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(14));
 485 reg_def XMM23p( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(15));
 486 
 487 reg_def XMM24 ( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg());
 488 reg_def XMM24b( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(1));
 489 reg_def XMM24c( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(2));
 490 reg_def XMM24d( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(3));
 491 reg_def XMM24e( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(4));
 492 reg_def XMM24f( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(5));
 493 reg_def XMM24g( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(6));
 494 reg_def XMM24h( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(7));
 495 reg_def XMM24i( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(8));
 496 reg_def XMM24j( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(9));
 497 reg_def XMM24k( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(10));
 498 reg_def XMM24l( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(11));
 499 reg_def XMM24m( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(12));
 500 reg_def XMM24n( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(13));
 501 reg_def XMM24o( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(14));
 502 reg_def XMM24p( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(15));
 503 
 504 reg_def XMM25 ( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg());
 505 reg_def XMM25b( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(1));
 506 reg_def XMM25c( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(2));
 507 reg_def XMM25d( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(3));
 508 reg_def XMM25e( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(4));
 509 reg_def XMM25f( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(5));
 510 reg_def XMM25g( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(6));
 511 reg_def XMM25h( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(7));
 512 reg_def XMM25i( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(8));
 513 reg_def XMM25j( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(9));
 514 reg_def XMM25k( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(10));
 515 reg_def XMM25l( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(11));
 516 reg_def XMM25m( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(12));
 517 reg_def XMM25n( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(13));
 518 reg_def XMM25o( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(14));
 519 reg_def XMM25p( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(15));
 520 
 521 reg_def XMM26 ( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg());
 522 reg_def XMM26b( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(1));
 523 reg_def XMM26c( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(2));
 524 reg_def XMM26d( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(3));
 525 reg_def XMM26e( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(4));
 526 reg_def XMM26f( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(5));
 527 reg_def XMM26g( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(6));
 528 reg_def XMM26h( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(7));
 529 reg_def XMM26i( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(8));
 530 reg_def XMM26j( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(9));
 531 reg_def XMM26k( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(10));
 532 reg_def XMM26l( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(11));
 533 reg_def XMM26m( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(12));
 534 reg_def XMM26n( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(13));
 535 reg_def XMM26o( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(14));
 536 reg_def XMM26p( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(15));
 537 
 538 reg_def XMM27 ( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg());
 539 reg_def XMM27b( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(1));
 540 reg_def XMM27c( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(2));
 541 reg_def XMM27d( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(3));
 542 reg_def XMM27e( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(4));
 543 reg_def XMM27f( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(5));
 544 reg_def XMM27g( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(6));
 545 reg_def XMM27h( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(7));
 546 reg_def XMM27i( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(8));
 547 reg_def XMM27j( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(9));
 548 reg_def XMM27k( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(10));
 549 reg_def XMM27l( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(11));
 550 reg_def XMM27m( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(12));
 551 reg_def XMM27n( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(13));
 552 reg_def XMM27o( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(14));
 553 reg_def XMM27p( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(15));
 554 
 555 reg_def XMM28 ( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg());
 556 reg_def XMM28b( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(1));
 557 reg_def XMM28c( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(2));
 558 reg_def XMM28d( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(3));
 559 reg_def XMM28e( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(4));
 560 reg_def XMM28f( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(5));
 561 reg_def XMM28g( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(6));
 562 reg_def XMM28h( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(7));
 563 reg_def XMM28i( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(8));
 564 reg_def XMM28j( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(9));
 565 reg_def XMM28k( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(10));
 566 reg_def XMM28l( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(11));
 567 reg_def XMM28m( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(12));
 568 reg_def XMM28n( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(13));
 569 reg_def XMM28o( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(14));
 570 reg_def XMM28p( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(15));
 571 
 572 reg_def XMM29 ( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg());
 573 reg_def XMM29b( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(1));
 574 reg_def XMM29c( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(2));
 575 reg_def XMM29d( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(3));
 576 reg_def XMM29e( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(4));
 577 reg_def XMM29f( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(5));
 578 reg_def XMM29g( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(6));
 579 reg_def XMM29h( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(7));
 580 reg_def XMM29i( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(8));
 581 reg_def XMM29j( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(9));
 582 reg_def XMM29k( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(10));
 583 reg_def XMM29l( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(11));
 584 reg_def XMM29m( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(12));
 585 reg_def XMM29n( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(13));
 586 reg_def XMM29o( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(14));
 587 reg_def XMM29p( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(15));
 588 
 589 reg_def XMM30 ( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg());
 590 reg_def XMM30b( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(1));
 591 reg_def XMM30c( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(2));
 592 reg_def XMM30d( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(3));
 593 reg_def XMM30e( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(4));
 594 reg_def XMM30f( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(5));
 595 reg_def XMM30g( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(6));
 596 reg_def XMM30h( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(7));
 597 reg_def XMM30i( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(8));
 598 reg_def XMM30j( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(9));
 599 reg_def XMM30k( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(10));
 600 reg_def XMM30l( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(11));
 601 reg_def XMM30m( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(12));
 602 reg_def XMM30n( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(13));
 603 reg_def XMM30o( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(14));
 604 reg_def XMM30p( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(15));
 605 
 606 reg_def XMM31 ( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg());
 607 reg_def XMM31b( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(1));
 608 reg_def XMM31c( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(2));
 609 reg_def XMM31d( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(3));
 610 reg_def XMM31e( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(4));
 611 reg_def XMM31f( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(5));
 612 reg_def XMM31g( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(6));
 613 reg_def XMM31h( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(7));
 614 reg_def XMM31i( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(8));
 615 reg_def XMM31j( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(9));
 616 reg_def XMM31k( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(10));
 617 reg_def XMM31l( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(11));
 618 reg_def XMM31m( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(12));
 619 reg_def XMM31n( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(13));
 620 reg_def XMM31o( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(14));
 621 reg_def XMM31p( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(15));
 622 
 623 #endif // _LP64
 624 
 625 #ifdef _LP64
 626 reg_def RFLAGS(SOC, SOC, 0, 16, VMRegImpl::Bad());
 627 #else
 628 reg_def RFLAGS(SOC, SOC, 0, 8, VMRegImpl::Bad());
 629 #endif // _LP64
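     // RFLAGS is given VMRegImpl::Bad() since the condition codes have no VMReg
     // (stack-slot) counterpart; the encoding (8 vs. 16) presumably just places it
     // after the general-purpose registers (8 on 32-bit x86, 16 on x86_64).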
 630 
 631 alloc_class chunk1(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
 632                    XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
 633                    XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
 634                    XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
 635                    XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
 636                    XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
 637                    XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
 638                    XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
 639 #ifdef _LP64
 640                   ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
 641                    XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
 642                    XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
 643                    XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
 644                    XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
 645                    XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
 646                    XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
 647                    XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
 648                   ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
 649                    XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
 650                    XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
 651                    XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
 652                    XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
 653                    XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
 654                    XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
 655                    XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
 656                    XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
 657                    XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
 658                    XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
 659                    XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
 660                    XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
 661                    XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
 662                    XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
 663                    XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
 664 #endif
 665                       );
 666 
 667 // flags allocation class should be last.
 668 alloc_class chunk2(RFLAGS);
 669 
 670 // Singleton class for condition codes
 671 reg_class int_flags(RFLAGS);
 672 
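     // The float, double and vector classes below come in two flavors: a *_legacy
     // class restricted to the XMM0-XMM15 set and an *_evex class that also covers
     // XMM16-XMM31, with a reg_class_dynamic entry choosing between them at runtime
     // (the selection is explained at the first reg_class_dynamic below).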
 673 // Class for pre-EVEX float registers
 674 reg_class float_reg_legacy(XMM0,
 675                     XMM1,
 676                     XMM2,
 677                     XMM3,
 678                     XMM4,
 679                     XMM5,
 680                     XMM6,
 681                     XMM7
 682 #ifdef _LP64
 683                    ,XMM8,
 684                     XMM9,
 685                     XMM10,
 686                     XMM11,
 687                     XMM12,
 688                     XMM13,
 689                     XMM14,
 690                     XMM15
 691 #endif
 692                     );
 693 
 694 // Class for EVEX float registers
 695 reg_class float_reg_evex(XMM0,
 696                     XMM1,
 697                     XMM2,
 698                     XMM3,
 699                     XMM4,
 700                     XMM5,
 701                     XMM6,
 702                     XMM7
 703 #ifdef _LP64
 704                    ,XMM8,
 705                     XMM9,
 706                     XMM10,
 707                     XMM11,
 708                     XMM12,
 709                     XMM13,
 710                     XMM14,
 711                     XMM15,
 712                     XMM16,
 713                     XMM17,
 714                     XMM18,
 715                     XMM19,
 716                     XMM20,
 717                     XMM21,
 718                     XMM22,
 719                     XMM23,
 720                     XMM24,
 721                     XMM25,
 722                     XMM26,
 723                     XMM27,
 724                     XMM28,
 725                     XMM29,
 726                     XMM30,
 727                     XMM31
 728 #endif
 729                     );
 730 
 731 reg_class_dynamic float_reg(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() %} );
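     // reg_class_dynamic selects the first class (float_reg_evex) when the predicate
     // VM_Version::supports_evex() is true and the second (float_reg_legacy)
     // otherwise; the double and vector classes below follow the same pattern.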
 732 
 733 // Class for pre-EVEX double registers
 734 reg_class double_reg_legacy(XMM0,  XMM0b,
 735                      XMM1,  XMM1b,
 736                      XMM2,  XMM2b,
 737                      XMM3,  XMM3b,
 738                      XMM4,  XMM4b,
 739                      XMM5,  XMM5b,
 740                      XMM6,  XMM6b,
 741                      XMM7,  XMM7b
 742 #ifdef _LP64
 743                     ,XMM8,  XMM8b,
 744                      XMM9,  XMM9b,
 745                      XMM10, XMM10b,
 746                      XMM11, XMM11b,
 747                      XMM12, XMM12b,
 748                      XMM13, XMM13b,
 749                      XMM14, XMM14b,
 750                      XMM15, XMM15b
 751 #endif
 752                      );
 753 
 754 // Class for EVEX double registers
 755 reg_class double_reg_evex(XMM0,  XMM0b,
 756                      XMM1,  XMM1b,
 757                      XMM2,  XMM2b,
 758                      XMM3,  XMM3b,
 759                      XMM4,  XMM4b,
 760                      XMM5,  XMM5b,
 761                      XMM6,  XMM6b,
 762                      XMM7,  XMM7b
 763 #ifdef _LP64
 764                     ,XMM8,  XMM8b,
 765                      XMM9,  XMM9b,
 766                      XMM10, XMM10b,
 767                      XMM11, XMM11b,
 768                      XMM12, XMM12b,
 769                      XMM13, XMM13b,
 770                      XMM14, XMM14b,
 771                      XMM15, XMM15b,
 772                      XMM16, XMM16b,
 773                      XMM17, XMM17b,
 774                      XMM18, XMM18b,
 775                      XMM19, XMM19b,
 776                      XMM20, XMM20b,
 777                      XMM21, XMM21b,
 778                      XMM22, XMM22b,
 779                      XMM23, XMM23b,
 780                      XMM24, XMM24b,
 781                      XMM25, XMM25b,
 782                      XMM26, XMM26b,
 783                      XMM27, XMM27b,
 784                      XMM28, XMM28b,
 785                      XMM29, XMM29b,
 786                      XMM30, XMM30b,
 787                      XMM31, XMM31b
 788 #endif
 789                      );
 790 
 791 reg_class_dynamic double_reg(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() %} );
 792 
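     // Vector register classes budget one 32-bit slot per 32 bits of vector width:
     // vectors_reg_* use one slot (32-bit vectors), vectord_reg_* two (64-bit),
     // vectorx_reg_* four (128-bit) and vectory_reg_* eight (256-bit).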
 793 // Class for pre-EVEX 32-bit vector registers
 794 reg_class vectors_reg_legacy(XMM0,
 795                       XMM1,
 796                       XMM2,
 797                       XMM3,
 798                       XMM4,
 799                       XMM5,
 800                       XMM6,
 801                       XMM7
 802 #ifdef _LP64
 803                      ,XMM8,
 804                       XMM9,
 805                       XMM10,
 806                       XMM11,
 807                       XMM12,
 808                       XMM13,
 809                       XMM14,
 810                       XMM15
 811 #endif
 812                       );
 813 
 814 // Class for EVEX 32-bit vector registers
 815 reg_class vectors_reg_evex(XMM0,
 816                       XMM1,
 817                       XMM2,
 818                       XMM3,
 819                       XMM4,
 820                       XMM5,
 821                       XMM6,
 822                       XMM7
 823 #ifdef _LP64
 824                      ,XMM8,
 825                       XMM9,
 826                       XMM10,
 827                       XMM11,
 828                       XMM12,
 829                       XMM13,
 830                       XMM14,
 831                       XMM15,
 832                       XMM16,
 833                       XMM17,
 834                       XMM18,
 835                       XMM19,
 836                       XMM20,
 837                       XMM21,
 838                       XMM22,
 839                       XMM23,
 840                       XMM24,
 841                       XMM25,
 842                       XMM26,
 843                       XMM27,
 844                       XMM28,
 845                       XMM29,
 846                       XMM30,
 847                       XMM31
 848 #endif
 849                       );
 850 
 851 reg_class_dynamic vectors_reg(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_evex() %} );
 852 
 853 // Class for pre-EVEX 64-bit vector registers
 854 reg_class vectord_reg_legacy(XMM0,  XMM0b,
 855                       XMM1,  XMM1b,
 856                       XMM2,  XMM2b,
 857                       XMM3,  XMM3b,
 858                       XMM4,  XMM4b,
 859                       XMM5,  XMM5b,
 860                       XMM6,  XMM6b,
 861                       XMM7,  XMM7b
 862 #ifdef _LP64
 863                      ,XMM8,  XMM8b,
 864                       XMM9,  XMM9b,
 865                       XMM10, XMM10b,
 866                       XMM11, XMM11b,
 867                       XMM12, XMM12b,
 868                       XMM13, XMM13b,
 869                       XMM14, XMM14b,
 870                       XMM15, XMM15b
 871 #endif
 872                       );
 873 
 874 // Class for EVEX 64-bit vector registers
 875 reg_class vectord_reg_evex(XMM0,  XMM0b,
 876                       XMM1,  XMM1b,
 877                       XMM2,  XMM2b,
 878                       XMM3,  XMM3b,
 879                       XMM4,  XMM4b,
 880                       XMM5,  XMM5b,
 881                       XMM6,  XMM6b,
 882                       XMM7,  XMM7b
 883 #ifdef _LP64
 884                      ,XMM8,  XMM8b,
 885                       XMM9,  XMM9b,
 886                       XMM10, XMM10b,
 887                       XMM11, XMM11b,
 888                       XMM12, XMM12b,
 889                       XMM13, XMM13b,
 890                       XMM14, XMM14b,
 891                       XMM15, XMM15b,
 892                       XMM16, XMM16b,
 893                       XMM17, XMM17b,
 894                       XMM18, XMM18b,
 895                       XMM19, XMM19b,
 896                       XMM20, XMM20b,
 897                       XMM21, XMM21b,
 898                       XMM22, XMM22b,
 899                       XMM23, XMM23b,
 900                       XMM24, XMM24b,
 901                       XMM25, XMM25b,
 902                       XMM26, XMM26b,
 903                       XMM27, XMM27b,
 904                       XMM28, XMM28b,
 905                       XMM29, XMM29b,
 906                       XMM30, XMM30b,
 907                       XMM31, XMM31b
 908 #endif
 909                       );
 910 
 911 reg_class_dynamic vectord_reg(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_evex() %} );
 912 
 913 // Class for pre-EVEX 128-bit vector registers
 914 reg_class vectorx_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,
 915                       XMM1,  XMM1b,  XMM1c,  XMM1d,
 916                       XMM2,  XMM2b,  XMM2c,  XMM2d,
 917                       XMM3,  XMM3b,  XMM3c,  XMM3d,
 918                       XMM4,  XMM4b,  XMM4c,  XMM4d,
 919                       XMM5,  XMM5b,  XMM5c,  XMM5d,
 920                       XMM6,  XMM6b,  XMM6c,  XMM6d,
 921                       XMM7,  XMM7b,  XMM7c,  XMM7d
 922 #ifdef _LP64
 923                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,
 924                       XMM9,  XMM9b,  XMM9c,  XMM9d,
 925                       XMM10, XMM10b, XMM10c, XMM10d,
 926                       XMM11, XMM11b, XMM11c, XMM11d,
 927                       XMM12, XMM12b, XMM12c, XMM12d,
 928                       XMM13, XMM13b, XMM13c, XMM13d,
 929                       XMM14, XMM14b, XMM14c, XMM14d,
 930                       XMM15, XMM15b, XMM15c, XMM15d
 931 #endif
 932                       );
 933 
 934 // Class for EVEX 128-bit vector registers
 935 reg_class vectorx_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,
 936                       XMM1,  XMM1b,  XMM1c,  XMM1d,
 937                       XMM2,  XMM2b,  XMM2c,  XMM2d,
 938                       XMM3,  XMM3b,  XMM3c,  XMM3d,
 939                       XMM4,  XMM4b,  XMM4c,  XMM4d,
 940                       XMM5,  XMM5b,  XMM5c,  XMM5d,
 941                       XMM6,  XMM6b,  XMM6c,  XMM6d,
 942                       XMM7,  XMM7b,  XMM7c,  XMM7d
 943 #ifdef _LP64
 944                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,
 945                       XMM9,  XMM9b,  XMM9c,  XMM9d,
 946                       XMM10, XMM10b, XMM10c, XMM10d,
 947                       XMM11, XMM11b, XMM11c, XMM11d,
 948                       XMM12, XMM12b, XMM12c, XMM12d,
 949                       XMM13, XMM13b, XMM13c, XMM13d,
 950                       XMM14, XMM14b, XMM14c, XMM14d,
 951                       XMM15, XMM15b, XMM15c, XMM15d,
 952                       XMM16, XMM16b, XMM16c, XMM16d,
 953                       XMM17, XMM17b, XMM17c, XMM17d,
 954                       XMM18, XMM18b, XMM18c, XMM18d,
 955                       XMM19, XMM19b, XMM19c, XMM19d,
 956                       XMM20, XMM20b, XMM20c, XMM20d,
 957                       XMM21, XMM21b, XMM21c, XMM21d,
 958                       XMM22, XMM22b, XMM22c, XMM22d,
 959                       XMM23, XMM23b, XMM23c, XMM23d,
 960                       XMM24, XMM24b, XMM24c, XMM24d,
 961                       XMM25, XMM25b, XMM25c, XMM25d,
 962                       XMM26, XMM26b, XMM26c, XMM26d,
 963                       XMM27, XMM27b, XMM27c, XMM27d,
 964                       XMM28, XMM28b, XMM28c, XMM28d,
 965                       XMM29, XMM29b, XMM29c, XMM29d,
 966                       XMM30, XMM30b, XMM30c, XMM30d,
 967                       XMM31, XMM31b, XMM31c, XMM31d
 968 #endif
 969                       );
 970 
 971 reg_class_dynamic vectorx_reg(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_evex() %} );
 972 
 973 // Class for all 256bit vector registers
 974 reg_class vectory_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 975                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 976                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 977                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 978                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
 979                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
 980                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
 981                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
 982 #ifdef _LP64
 983                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
 984                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
 985                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
 986                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
 987                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
 988                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 989                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 990                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h
 991 #endif
 992                       );
 993 
 994 // Class for all 256bit vector registers
 995 reg_class vectory_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 996                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 997                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 998                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 999                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
1000                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
1001                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
1002                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
1003 #ifdef _LP64
1004                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
1005                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
1006                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
1007                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
1008                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
1009                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
1010                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
1011                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h,
1012                       XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h,
1013                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h,
1014                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h,
1015                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h,
1016                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h,
1017                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h,
1018                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h,
1019                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h,
1020                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h,
1021                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h,
1022                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h,
1023                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h,
1024                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h,
1025                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h,
1026                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h,
1027                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h
1028 #endif
1029                       );
1030 
1031 reg_class_dynamic vectory_reg(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_evex() %} );
1032 
1033 // Class for all 512bit vector registers
1034 reg_class vectorz_reg(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
1035                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
1036                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
1037                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
1038                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
1039                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
1040                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
1041                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
1042 #ifdef _LP64
1043                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
1044                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
1045                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
1046                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
1047                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
1048                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
1049                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
1050                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
1051                      ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
1052                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
1053                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
1054                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
1055                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
1056                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
1057                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
1058                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
1059                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
1060                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
1061                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
1062                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
1063                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
1064                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
1065                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
1066                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
1067 #endif
1068                       );
1069 
1070 %}
1071 
1072 
1073 //----------SOURCE BLOCK-------------------------------------------------------
1074 // This is a block of C++ code which provides values, functions, and
1075 // definitions necessary in the rest of the architecture description
1076 
1077 source_hpp %{
1078 // Header information of the source block.
1079 // Method declarations/definitions which are used outside
1080 // the ad-scope can conveniently be defined here.
1081 //
1082 // To keep related declarations/definitions/uses close together,
1083 // we switch between source %{ }% and source_hpp %{ }% freely as needed.
1084 
1085 class NativeJump;
1086 
1087 class CallStubImpl {
1088 
1089   //--------------------------------------------------------------
1090   //---<  Used for optimization in Compile::shorten_branches  >---
1091   //--------------------------------------------------------------
1092 
1093  public:
1094   // Size of call trampoline stub.
1095   static uint size_call_trampoline() {
1096     return 0; // no call trampolines on this platform
1097   }
1098 
1099   // number of relocations needed by a call trampoline stub
1100   static uint reloc_call_trampoline() {
1101     return 0; // no call trampolines on this platform
1102   }
1103 };
1104 
1105 class HandlerImpl {
1106 
1107  public:
1108 
1109   static int emit_exception_handler(CodeBuffer &cbuf);
1110   static int emit_deopt_handler(CodeBuffer& cbuf);
1111 
1112   static uint size_exception_handler() {
1113     // NativeCall instruction size is the same as NativeJump.
    // The exception handler starts out as a jump and can be patched to
    // a call by deoptimization.  (4932387)
1116     // Note that this value is also credited (in output.cpp) to
1117     // the size of the code section.
1118     return NativeJump::instruction_size;
1119   }
1120 
1121 #ifdef _LP64
1122   static uint size_deopt_handler() {
1123     // three 5 byte instructions
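    // (Illustrative breakdown, assuming the final jump is emitted as a rel32 jmp:
    //  call rel32 = 5 bytes, sub qword ptr [rsp], imm8 = 5 bytes, jmp rel32 = 5 bytes;
    //  see emit_deopt_handler() below.)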
1124     return 15;
1125   }
1126 #else
1127   static uint size_deopt_handler() {
1128     // NativeCall instruction size is the same as NativeJump.
    // exception handler starts out as a jump and can be patched to
    // a call by deoptimization.  (4932387)
1131     // Note that this value is also credited (in output.cpp) to
1132     // the size of the code section.
1133     return 5 + NativeJump::instruction_size; // pushl(); jmp;
1134   }
1135 #endif
1136 };
1137 
1138 %} // end source_hpp
1139 
1140 source %{
1141 
1142 #include "opto/addnode.hpp"
1143 
1144 // Emit exception handler code.
// The handler simply jumps to the shared exception blob; no frame size needs to be passed.
1146 int HandlerImpl::emit_exception_handler(CodeBuffer& cbuf) {
1147 
1148   // Note that the code buffer's insts_mark is always relative to insts.
1149   // That's why we must use the macroassembler to generate a handler.
1150   MacroAssembler _masm(&cbuf);
1151   address base = __ start_a_stub(size_exception_handler());
1152   if (base == NULL) {
1153     ciEnv::current()->record_failure("CodeCache is full");
1154     return 0;  // CodeBuffer::expand failed
1155   }
1156   int offset = __ offset();
1157   __ jump(RuntimeAddress(OptoRuntime::exception_blob()->entry_point()));
1158   assert(__ offset() - offset <= (int) size_exception_handler(), "overflow");
1159   __ end_a_stub();
1160   return offset;
1161 }
1162 
1163 // Emit deopt handler code.
1164 int HandlerImpl::emit_deopt_handler(CodeBuffer& cbuf) {
1165 
1166   // Note that the code buffer's insts_mark is always relative to insts.
1167   // That's why we must use the macroassembler to generate a handler.
1168   MacroAssembler _masm(&cbuf);
1169   address base = __ start_a_stub(size_deopt_handler());
1170   if (base == NULL) {
1171     ciEnv::current()->record_failure("CodeCache is full");
1172     return 0;  // CodeBuffer::expand failed
1173   }
1174   int offset = __ offset();
1175 
1176 #ifdef _LP64
1177   address the_pc = (address) __ pc();
1178   Label next;
1179   // push a "the_pc" on the stack without destroying any registers
1180   // as they all may be live.
1181 
1182   // push address of "next"
1183   __ call(next, relocInfo::none); // reloc none is fine since it is a disp32
1184   __ bind(next);
1185   // adjust it so it matches "the_pc"
1186   __ subptr(Address(rsp, 0), __ offset() - offset);
1187 #else
1188   InternalAddress here(__ pc());
1189   __ pushptr(here.addr());
1190 #endif
1191 
1192   __ jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
1193   assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow");
1194   __ end_a_stub();
1195   return offset;
1196 }
1197 
1198 
1199 //=============================================================================
1200 
1201   // Float masks come from different places depending on platform.
1202 #ifdef _LP64
1203   static address float_signmask()  { return StubRoutines::x86::float_sign_mask(); }
1204   static address float_signflip()  { return StubRoutines::x86::float_sign_flip(); }
1205   static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
1206   static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
1207 #else
1208   static address float_signmask()  { return (address)float_signmask_pool; }
1209   static address float_signflip()  { return (address)float_signflip_pool; }
1210   static address double_signmask() { return (address)double_signmask_pool; }
1211   static address double_signflip() { return (address)double_signflip_pool; }
1212 #endif
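
// Illustrative note (not part of the stub data): the single-precision sign mask is
// 0x7FFFFFFF per lane and the sign-flip mask is 0x80000000 per lane, so the Abs/Neg
// instructions below simply AND or XOR with these constants, e.g.
//   0xBF800000 (-1.0f) & 0x7FFFFFFF = 0x3F800000 (1.0f).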
1213 
1214 
1215 const bool Matcher::match_rule_supported(int opcode) {
1216   if (!has_match_rule(opcode))
1217     return false;
1218 
1219   bool ret_value = true;
1220   switch (opcode) {
1221     case Op_PopCountI:
1222     case Op_PopCountL:
1223       if (!UsePopCountInstruction)
1224         ret_value = false;
1225       break;
1226     case Op_MulVI:
1227       if ((UseSSE < 4) && (UseAVX < 1)) // only with SSE4_1 or AVX
1228         ret_value = false;
1229       break;
1230     case Op_MulVL:
1231     case Op_MulReductionVL:
1232       if (VM_Version::supports_avx512dq() == false)
1233         ret_value = false;
1234       break;
1235     case Op_AddReductionVL:
1236       if (UseAVX < 3) // only EVEX : vector connectivity becomes an issue here
1237         ret_value = false;
1238       break;
1239     case Op_AddReductionVI:
1240       if (UseSSE < 3) // requires at least SSE3
1241         ret_value = false;
1242       break;
1243     case Op_MulReductionVI:
1244       if (UseSSE < 4) // requires at least SSE4
1245         ret_value = false;
1246       break;
1247     case Op_AddReductionVF:
1248     case Op_AddReductionVD:
1249     case Op_MulReductionVF:
1250     case Op_MulReductionVD:
1251       if (UseSSE < 1) // requires at least SSE
1252         ret_value = false;
1253       break;
1254     case Op_SqrtVD:
1255     case Op_SqrtVF:
1256       if (UseAVX < 1) // enabled for AVX only
1257         ret_value = false;
1258       break;
1259     case Op_CompareAndSwapL:
1260 #ifdef _LP64
1261     case Op_CompareAndSwapP:
1262 #endif
1263       if (!VM_Version::supports_cx8())
1264         ret_value = false;
1265       break;
1266     case Op_CMoveVF:
1267     case Op_CMoveVD:
1268       if (UseAVX < 1 || UseAVX > 2)
1269         ret_value = false;
1270       break;
1271     case Op_StrIndexOf:
1272       if (!UseSSE42Intrinsics)
1273         ret_value = false;
1274       break;
1275     case Op_StrIndexOfChar:
1276       if (!UseSSE42Intrinsics)
1277         ret_value = false;
1278       break;
1279     case Op_OnSpinWait:
1280       if (VM_Version::supports_on_spin_wait() == false)
1281         ret_value = false;
1282       break;
1283   }
1284 
  return ret_value;  // By default, match rules are supported.
1286 }
1287 
1288 const bool Matcher::match_rule_supported_vector(int opcode, int vlen) {
1289   // identify extra cases that we might want to provide match rules for
1290   // e.g. Op_ vector nodes and other intrinsics while guarding with vlen
1291   bool ret_value = match_rule_supported(opcode);
1292   if (ret_value) {
1293     switch (opcode) {
1294       case Op_AddVB:
1295       case Op_SubVB:
1296         if ((vlen == 64) && (VM_Version::supports_avx512bw() == false))
1297           ret_value = false;
1298         break;
1299       case Op_URShiftVS:
1300       case Op_RShiftVS:
1301       case Op_LShiftVS:
1302       case Op_MulVS:
1303       case Op_AddVS:
1304       case Op_SubVS:
1305         if ((vlen == 32) && (VM_Version::supports_avx512bw() == false))
1306           ret_value = false;
1307         break;
      case Op_CMoveVF:
        if (vlen != 8)
          ret_value = false;
        break;
      case Op_CMoveVD:
        if (vlen != 4)
          ret_value = false;
        break;
1315     }
1316   }
1317 
  return ret_value;  // By default, match rules are supported.
1319 }
1320 
1321 const bool Matcher::has_predicated_vectors(void) {
1322   bool ret_value = false;
1323   if (UseAVX > 2) {
1324     ret_value = VM_Version::supports_avx512vl();
1325   }
1326 
1327   return ret_value;
1328 }
1329 
1330 const int Matcher::float_pressure(int default_pressure_threshold) {
1331   int float_pressure_threshold = default_pressure_threshold;
1332 #ifdef _LP64
1333   if (UseAVX > 2) {
    // Increase the pressure threshold on machines with AVX-512 (UseAVX > 2),
    // which have twice as many XMM registers.
1336     float_pressure_threshold = default_pressure_threshold * 2;
1337   }
1338 #endif
1339   return float_pressure_threshold;
1340 }
1341 
1342 // Max vector size in bytes. 0 if not supported.
1343 const int Matcher::vector_width_in_bytes(BasicType bt) {
1344   assert(is_java_primitive(bt), "only primitive type vectors");
1345   if (UseSSE < 2) return 0;
1346   // SSE2 supports 128bit vectors for all types.
1347   // AVX2 supports 256bit vectors for all types.
  // AVX512/EVEX supports 512bit vectors for all types.
1349   int size = (UseAVX > 1) ? (1 << UseAVX) * 8 : 16;
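  // For example (illustrative values): UseAVX == 2 gives (1 << 2) * 8 = 32 bytes,
  // UseAVX == 3 gives 64 bytes, and anything below AVX2 starts from the 16-byte SSE2 width.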
1350   // AVX1 supports 256bit vectors only for FLOAT and DOUBLE.
1351   if (UseAVX > 0 && (bt == T_FLOAT || bt == T_DOUBLE))
1352     size = (UseAVX > 2) ? 64 : 32;
1353   // Use flag to limit vector size.
1354   size = MIN2(size,(int)MaxVectorSize);
1355   // Minimum 2 values in vector (or 4 for bytes).
1356   switch (bt) {
1357   case T_DOUBLE:
1358   case T_LONG:
1359     if (size < 16) return 0;
1360     break;
1361   case T_FLOAT:
1362   case T_INT:
1363     if (size < 8) return 0;
1364     break;
1365   case T_BOOLEAN:
1366     if (size < 4) return 0;
1367     break;
1368   case T_CHAR:
1369     if (size < 4) return 0;
1370     break;
1371   case T_BYTE:
1372     if (size < 4) return 0;
1373     break;
1374   case T_SHORT:
1375     if (size < 4) return 0;
1376     break;
1377   default:
1378     ShouldNotReachHere();
1379   }
1380   return size;
1381 }
1382 
1383 // Limits on vector size (number of elements) loaded into vector.
1384 const int Matcher::max_vector_size(const BasicType bt) {
1385   return vector_width_in_bytes(bt)/type2aelembytes(bt);
1386 }
1387 const int Matcher::min_vector_size(const BasicType bt) {
1388   int max_size = max_vector_size(bt);
1389   // Min size which can be loaded into vector is 4 bytes.
1390   int size = (type2aelembytes(bt) == 1) ? 4 : 2;
1391   return MIN2(size,max_size);
1392 }
1393 
// Vector ideal reg corresponding to specified size in bytes
1395 const uint Matcher::vector_ideal_reg(int size) {
1396   assert(MaxVectorSize >= size, "");
1397   switch(size) {
1398     case  4: return Op_VecS;
1399     case  8: return Op_VecD;
1400     case 16: return Op_VecX;
1401     case 32: return Op_VecY;
1402     case 64: return Op_VecZ;
1403   }
1404   ShouldNotReachHere();
1405   return 0;
1406 }
1407 
1408 // Only lowest bits of xmm reg are used for vector shift count.
1409 const uint Matcher::vector_shift_count_ideal_reg(int size) {
1410   return Op_VecS;
1411 }
1412 
1413 // x86 supports misaligned vectors store/load.
1414 const bool Matcher::misaligned_vectors_ok() {
1415   return !AlignVector; // can be changed by flag
1416 }
1417 
1418 // x86 AES instructions are compatible with SunJCE expanded
1419 // keys, hence we do not need to pass the original key to stubs
1420 const bool Matcher::pass_original_key_for_aes() {
1421   return false;
1422 }
1423 
1424 
1425 const bool Matcher::convi2l_type_required = true;
1426 
1427 // Check for shift by small constant as well
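// For example (illustrative): an array access such as a[i] on LP64 typically appears
// as (AddP base (LShiftL (ConvI2L i) 3)); cloning the small-constant shift (and the
// non-negative ConvI2L beneath it) lets the matcher fold the whole expression into a
// single [base + i*8 + disp] addressing mode instead of materializing the scaled index.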
1428 static bool clone_shift(Node* shift, Matcher* matcher, Matcher::MStack& mstack, VectorSet& address_visited) {
1429   if (shift->Opcode() == Op_LShiftX && shift->in(2)->is_Con() &&
1430       shift->in(2)->get_int() <= 3 &&
1431       // Are there other uses besides address expressions?
1432       !matcher->is_visited(shift)) {
1433     address_visited.set(shift->_idx); // Flag as address_visited
1434     mstack.push(shift->in(2), Matcher::Visit);
1435     Node *conv = shift->in(1);
1436 #ifdef _LP64
    // Allow the Matcher to match the rule which bypasses the
    // ConvI2L operation for an array index on LP64
    // if the index value is positive.
1440     if (conv->Opcode() == Op_ConvI2L &&
1441         conv->as_Type()->type()->is_long()->_lo >= 0 &&
1442         // Are there other uses besides address expressions?
1443         !matcher->is_visited(conv)) {
1444       address_visited.set(conv->_idx); // Flag as address_visited
1445       mstack.push(conv->in(1), Matcher::Pre_Visit);
1446     } else
1447 #endif
1448       mstack.push(conv, Matcher::Pre_Visit);
1449     return true;
1450   }
1451   return false;
1452 }
1453 
1454 // Should the Matcher clone shifts on addressing modes, expecting them
1455 // to be subsumed into complex addressing expressions or compute them
1456 // into registers?
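// x86 addressing modes can absorb the whole [base + index*scale + disp] form, so the
// code below flags the constant offset, an inner AddP and a small-constant shift as
// address-visited and clones them into the address expression rather than computing
// intermediate values into registers.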
1457 bool Matcher::clone_address_expressions(AddPNode* m, Matcher::MStack& mstack, VectorSet& address_visited) {
1458   Node *off = m->in(AddPNode::Offset);
1459   if (off->is_Con()) {
1460     address_visited.test_set(m->_idx); // Flag as address_visited
1461     Node *adr = m->in(AddPNode::Address);
1462 
1463     // Intel can handle 2 adds in addressing mode
1464     // AtomicAdd is not an addressing expression.
1465     // Cheap to find it by looking for screwy base.
1466     if (adr->is_AddP() &&
1467         !adr->in(AddPNode::Base)->is_top() &&
1468         // Are there other uses besides address expressions?
1469         !is_visited(adr)) {
1470       address_visited.set(adr->_idx); // Flag as address_visited
1471       Node *shift = adr->in(AddPNode::Offset);
1472       if (!clone_shift(shift, this, mstack, address_visited)) {
1473         mstack.push(shift, Pre_Visit);
1474       }
1475       mstack.push(adr->in(AddPNode::Address), Pre_Visit);
1476       mstack.push(adr->in(AddPNode::Base), Pre_Visit);
1477     } else {
1478       mstack.push(adr, Pre_Visit);
1479     }
1480 
1481     // Clone X+offset as it also folds into most addressing expressions
1482     mstack.push(off, Visit);
1483     mstack.push(m->in(AddPNode::Base), Pre_Visit);
1484     return true;
1485   } else if (clone_shift(off, this, mstack, address_visited)) {
1486     address_visited.test_set(m->_idx); // Flag as address_visited
1487     mstack.push(m->in(AddPNode::Address), Pre_Visit);
1488     mstack.push(m->in(AddPNode::Base), Pre_Visit);
1489     return true;
1490   }
1491   return false;
1492 }
1493 
1494 void Compile::reshape_address(AddPNode* addp) {
1495 }
1496 
1497 // Helper methods for MachSpillCopyNode::implementation().
1498 static int vec_mov_helper(CodeBuffer *cbuf, bool do_size, int src_lo, int dst_lo,
1499                           int src_hi, int dst_hi, uint ireg, outputStream* st) {
  // In the 64-bit VM, size calculation is very complex, so the size is
  // obtained by emitting the instructions into a scratch buffer.
1502   LP64_ONLY( assert(!do_size, "this method calculates size only for 32-bit VM"); )
1503   assert(ireg == Op_VecS || // 32bit vector
1504          (src_lo & 1) == 0 && (src_lo + 1) == src_hi &&
1505          (dst_lo & 1) == 0 && (dst_lo + 1) == dst_hi,
1506          "no non-adjacent vector moves" );
1507   if (cbuf) {
1508     MacroAssembler _masm(cbuf);
1509     int offset = __ offset();
1510     switch (ireg) {
1511     case Op_VecS: // copy whole register
1512     case Op_VecD:
1513     case Op_VecX:
1514       __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
1515       break;
1516     case Op_VecY:
1517       __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
1518       break;
1519     case Op_VecZ:
1520       __ evmovdquq(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2);
1521       break;
1522     default:
1523       ShouldNotReachHere();
1524     }
1525     int size = __ offset() - offset;
1526 #ifdef ASSERT
1527     // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
    assert(!do_size || size == 4, "incorrect size calculation");
1529 #endif
1530     return size;
1531 #ifndef PRODUCT
1532   } else if (!do_size) {
1533     switch (ireg) {
1534     case Op_VecS:
1535     case Op_VecD:
1536     case Op_VecX:
1537       st->print("movdqu  %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
1538       break;
1539     case Op_VecY:
1540     case Op_VecZ:
1541       st->print("vmovdqu %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
1542       break;
1543     default:
1544       ShouldNotReachHere();
1545     }
1546 #endif
1547   }
1548   // VEX_2bytes prefix is used if UseAVX > 0, and it takes the same 2 bytes as SIMD prefix.
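  // (Illustrative encoding sizes: a VEX-encoded reg-reg move is 2-byte prefix + opcode +
  //  ModRM = 4 bytes, while the EVEX form used when UseAVX > 2 carries a 4-byte prefix
  //  for a total of 6 bytes.)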
1549   return (UseAVX > 2) ? 6 : 4;
1550 }
1551 
1552 static int vec_spill_helper(CodeBuffer *cbuf, bool do_size, bool is_load,
1553                             int stack_offset, int reg, uint ireg, outputStream* st) {
  // In the 64-bit VM, size calculation is very complex, so the size is
  // obtained by emitting the instructions into a scratch buffer.
1556   LP64_ONLY( assert(!do_size, "this method calculates size only for 32-bit VM"); )
1557   if (cbuf) {
1558     MacroAssembler _masm(cbuf);
1559     int offset = __ offset();
1560     if (is_load) {
1561       switch (ireg) {
1562       case Op_VecS:
1563         __ movdl(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1564         break;
1565       case Op_VecD:
1566         __ movq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1567         break;
1568       case Op_VecX:
1569         __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1570         break;
1571       case Op_VecY:
1572         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1573         break;
1574       case Op_VecZ:
1575         __ evmovdquq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2);
1576         break;
1577       default:
1578         ShouldNotReachHere();
1579       }
1580     } else { // store
1581       switch (ireg) {
1582       case Op_VecS:
1583         __ movdl(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1584         break;
1585       case Op_VecD:
1586         __ movq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1587         break;
1588       case Op_VecX:
1589         __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1590         break;
1591       case Op_VecY:
1592         __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1593         break;
1594       case Op_VecZ:
1595         __ evmovdquq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2);
1596         break;
1597       default:
1598         ShouldNotReachHere();
1599       }
1600     }
1601     int size = __ offset() - offset;
1602 #ifdef ASSERT
1603     int offset_size = (stack_offset == 0) ? 0 : ((stack_offset < 0x80) ? 1 : (UseAVX > 2) ? 6 : 4);
1604     // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
    assert(!do_size || size == (5+offset_size), "incorrect size calculation");
1606 #endif
1607     return size;
1608 #ifndef PRODUCT
1609   } else if (!do_size) {
1610     if (is_load) {
1611       switch (ireg) {
1612       case Op_VecS:
1613         st->print("movd    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
1614         break;
1615       case Op_VecD:
1616         st->print("movq    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
1617         break;
1618        case Op_VecX:
1619         st->print("movdqu  %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
1620         break;
1621       case Op_VecY:
1622       case Op_VecZ:
1623         st->print("vmovdqu %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
1624         break;
1625       default:
1626         ShouldNotReachHere();
1627       }
1628     } else { // store
1629       switch (ireg) {
1630       case Op_VecS:
1631         st->print("movd    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1632         break;
1633       case Op_VecD:
1634         st->print("movq    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1635         break;
1636        case Op_VecX:
1637         st->print("movdqu  [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1638         break;
1639       case Op_VecY:
1640       case Op_VecZ:
1641         st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1642         break;
1643       default:
1644         ShouldNotReachHere();
1645       }
1646     }
1647 #endif
1648   }
1649   bool is_single_byte = false;
1650   int vec_len = 0;
1651   if ((UseAVX > 2) && (stack_offset != 0)) {
1652     int tuple_type = Assembler::EVEX_FVM;
1653     int input_size = Assembler::EVEX_32bit;
1654     switch (ireg) {
1655     case Op_VecS:
1656       tuple_type = Assembler::EVEX_T1S;
1657       break;
1658     case Op_VecD:
1659       tuple_type = Assembler::EVEX_T1S;
1660       input_size = Assembler::EVEX_64bit;
1661       break;
1662     case Op_VecX:
1663       break;
1664     case Op_VecY:
1665       vec_len = 1;
1666       break;
1667     case Op_VecZ:
1668       vec_len = 2;
1669       break;
1670     }
1671     is_single_byte = Assembler::query_compressed_disp_byte(stack_offset, true, vec_len, tuple_type, input_size, 0);
1672   }
1673   int offset_size = 0;
1674   int size = 5;
1675   if (UseAVX > 2 ) {
1676     if (VM_Version::supports_avx512novl() && (vec_len == 2)) {
1677       offset_size = (stack_offset == 0) ? 0 : ((is_single_byte) ? 1 : 4);
1678       size += 2; // Need an additional two bytes for EVEX encoding
1679     } else if (VM_Version::supports_avx512novl() && (vec_len < 2)) {
1680       offset_size = (stack_offset == 0) ? 0 : ((stack_offset <= 127) ? 1 : 4);
1681     } else {
1682       offset_size = (stack_offset == 0) ? 0 : ((is_single_byte) ? 1 : 4);
      size += 2; // Need an additional two bytes for EVEX encoding
1684     }
1685   } else {
1686     offset_size = (stack_offset == 0) ? 0 : ((stack_offset <= 127) ? 1 : 4);
1687   }
1688   // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
1689   return size+offset_size;
1690 }
1691 
1692 static inline jint replicate4_imm(int con, int width) {
1693   // Load a constant of "width" (in bytes) and replicate it to fill 32bit.
1694   assert(width == 1 || width == 2, "only byte or short types here");
1695   int bit_width = width * 8;
1696   jint val = con;
1697   val &= (1 << bit_width) - 1;  // mask off sign bits
1698   while(bit_width < 32) {
1699     val |= (val << bit_width);
1700     bit_width <<= 1;
1701   }
1702   return val;
1703 }
1704 
1705 static inline jlong replicate8_imm(int con, int width) {
1706   // Load a constant of "width" (in bytes) and replicate it to fill 64bit.
1707   assert(width == 1 || width == 2 || width == 4, "only byte, short or int types here");
1708   int bit_width = width * 8;
1709   jlong val = con;
1710   val &= (((jlong) 1) << bit_width) - 1;  // mask off sign bits
1711   while(bit_width < 64) {
1712     val |= (val << bit_width);
1713     bit_width <<= 1;
1714   }
1715   return val;
1716 }
1717 
1718 #ifndef PRODUCT
1719   void MachNopNode::format(PhaseRegAlloc*, outputStream* st) const {
1720     st->print("nop \t# %d bytes pad for loops and calls", _count);
1721   }
1722 #endif
1723 
1724   void MachNopNode::emit(CodeBuffer &cbuf, PhaseRegAlloc*) const {
1725     MacroAssembler _masm(&cbuf);
1726     __ nop(_count);
1727   }
1728 
1729   uint MachNopNode::size(PhaseRegAlloc*) const {
1730     return _count;
1731   }
1732 
1733 #ifndef PRODUCT
1734   void MachBreakpointNode::format(PhaseRegAlloc*, outputStream* st) const {
1735     st->print("# breakpoint");
1736   }
1737 #endif
1738 
1739   void MachBreakpointNode::emit(CodeBuffer &cbuf, PhaseRegAlloc* ra_) const {
1740     MacroAssembler _masm(&cbuf);
1741     __ int3();
1742   }
1743 
1744   uint MachBreakpointNode::size(PhaseRegAlloc* ra_) const {
1745     return MachNode::size(ra_);
1746   }
1747 
1748 %}
1749 
1750 encode %{
1751 
1752   enc_class call_epilog %{
1753     if (VerifyStackAtCalls) {
1754       // Check that stack depth is unchanged: find majik cookie on stack
1755       int framesize = ra_->reg2offset_unchecked(OptoReg::add(ra_->_matcher._old_SP, -3*VMRegImpl::slots_per_word));
1756       MacroAssembler _masm(&cbuf);
1757       Label L;
1758       __ cmpptr(Address(rsp, framesize), (int32_t)0xbadb100d);
1759       __ jccb(Assembler::equal, L);
1760       // Die if stack mismatch
1761       __ int3();
1762       __ bind(L);
1763     }
1764   %}
1765 
1766 %}
1767 
1768 
1769 //----------OPERANDS-----------------------------------------------------------
1770 // Operand definitions must precede instruction definitions for correct parsing
1771 // in the ADLC because operands constitute user defined types which are used in
1772 // instruction definitions.
1773 
// This operand generically applies only to EVEX targets, so only one version is needed.
1775 operand vecZ() %{
1776   constraint(ALLOC_IN_RC(vectorz_reg));
1777   match(VecZ);
1778 
1779   format %{ %}
1780   interface(REG_INTER);
1781 %}
1782 
1783 // Comparison Code for FP conditional move
1784 operand cmpOp_vcmppd() %{
1785   match(Bool);
1786 
1787   predicate(n->as_Bool()->_test._test != BoolTest::overflow &&
1788             n->as_Bool()->_test._test != BoolTest::no_overflow);
1789   format %{ "" %}
1790   interface(COND_INTER) %{
1791     equal        (0x0, "eq");
1792     less         (0x1, "lt");
1793     less_equal   (0x2, "le");
1794     not_equal    (0xC, "ne");
1795     greater_equal(0xD, "ge");
1796     greater      (0xE, "gt");
1797     //TODO cannot compile (adlc breaks) without two next lines with error:
1798     // x86_64.ad(13987) Syntax Error: :In operand cmpOp_vcmppd: Do not support this encode constant: ' %{
1799     // equal' for overflow.
1800     overflow     (0x20, "o");  // not really supported by the instruction
1801     no_overflow  (0x21, "no"); // not really supported by the instruction
1802   %}
1803 %}
1804 
1805 
1806 // INSTRUCTIONS -- Platform independent definitions (same for 32- and 64-bit)
1807 
1808 // ============================================================================
1809 
1810 instruct ShouldNotReachHere() %{
1811   match(Halt);
1812   format %{ "ud2\t# ShouldNotReachHere" %}
1813   ins_encode %{
1814     __ ud2();
1815   %}
1816   ins_pipe(pipe_slow);
1817 %}
1818 
1819 // =================================EVEX special===============================
1820 
1821 instruct setMask(rRegI dst, rRegI src) %{
1822   predicate(Matcher::has_predicated_vectors());
1823   match(Set dst (SetVectMaskI  src));
1824   effect(TEMP dst);
1825   format %{ "setvectmask   $dst, $src" %}
1826   ins_encode %{
1827     __ setvectmask($dst$$Register, $src$$Register);
1828   %}
1829   ins_pipe(pipe_slow);
1830 %}
1831 
1832 // ============================================================================
1833 
1834 instruct addF_reg(regF dst, regF src) %{
1835   predicate((UseSSE>=1) && (UseAVX == 0));
1836   match(Set dst (AddF dst src));
1837 
1838   format %{ "addss   $dst, $src" %}
1839   ins_cost(150);
1840   ins_encode %{
1841     __ addss($dst$$XMMRegister, $src$$XMMRegister);
1842   %}
1843   ins_pipe(pipe_slow);
1844 %}
1845 
1846 instruct addF_mem(regF dst, memory src) %{
1847   predicate((UseSSE>=1) && (UseAVX == 0));
1848   match(Set dst (AddF dst (LoadF src)));
1849 
1850   format %{ "addss   $dst, $src" %}
1851   ins_cost(150);
1852   ins_encode %{
1853     __ addss($dst$$XMMRegister, $src$$Address);
1854   %}
1855   ins_pipe(pipe_slow);
1856 %}
1857 
1858 instruct addF_imm(regF dst, immF con) %{
1859   predicate((UseSSE>=1) && (UseAVX == 0));
1860   match(Set dst (AddF dst con));
1861   format %{ "addss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
1862   ins_cost(150);
1863   ins_encode %{
1864     __ addss($dst$$XMMRegister, $constantaddress($con));
1865   %}
1866   ins_pipe(pipe_slow);
1867 %}
1868 
1869 instruct addF_reg_reg(regF dst, regF src1, regF src2) %{
1870   predicate(UseAVX > 0);
1871   match(Set dst (AddF src1 src2));
1872 
1873   format %{ "vaddss  $dst, $src1, $src2" %}
1874   ins_cost(150);
1875   ins_encode %{
1876     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
1877   %}
1878   ins_pipe(pipe_slow);
1879 %}
1880 
1881 instruct addF_reg_mem(regF dst, regF src1, memory src2) %{
1882   predicate(UseAVX > 0);
1883   match(Set dst (AddF src1 (LoadF src2)));
1884 
1885   format %{ "vaddss  $dst, $src1, $src2" %}
1886   ins_cost(150);
1887   ins_encode %{
1888     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
1889   %}
1890   ins_pipe(pipe_slow);
1891 %}
1892 
1893 instruct addF_reg_imm(regF dst, regF src, immF con) %{
1894   predicate(UseAVX > 0);
1895   match(Set dst (AddF src con));
1896 
1897   format %{ "vaddss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
1898   ins_cost(150);
1899   ins_encode %{
1900     __ vaddss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
1901   %}
1902   ins_pipe(pipe_slow);
1903 %}
1904 
1905 instruct addD_reg(regD dst, regD src) %{
1906   predicate((UseSSE>=2) && (UseAVX == 0));
1907   match(Set dst (AddD dst src));
1908 
1909   format %{ "addsd   $dst, $src" %}
1910   ins_cost(150);
1911   ins_encode %{
1912     __ addsd($dst$$XMMRegister, $src$$XMMRegister);
1913   %}
1914   ins_pipe(pipe_slow);
1915 %}
1916 
1917 instruct addD_mem(regD dst, memory src) %{
1918   predicate((UseSSE>=2) && (UseAVX == 0));
1919   match(Set dst (AddD dst (LoadD src)));
1920 
1921   format %{ "addsd   $dst, $src" %}
1922   ins_cost(150);
1923   ins_encode %{
1924     __ addsd($dst$$XMMRegister, $src$$Address);
1925   %}
1926   ins_pipe(pipe_slow);
1927 %}
1928 
1929 instruct addD_imm(regD dst, immD con) %{
1930   predicate((UseSSE>=2) && (UseAVX == 0));
1931   match(Set dst (AddD dst con));
1932   format %{ "addsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
1933   ins_cost(150);
1934   ins_encode %{
1935     __ addsd($dst$$XMMRegister, $constantaddress($con));
1936   %}
1937   ins_pipe(pipe_slow);
1938 %}
1939 
1940 instruct addD_reg_reg(regD dst, regD src1, regD src2) %{
1941   predicate(UseAVX > 0);
1942   match(Set dst (AddD src1 src2));
1943 
1944   format %{ "vaddsd  $dst, $src1, $src2" %}
1945   ins_cost(150);
1946   ins_encode %{
1947     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
1948   %}
1949   ins_pipe(pipe_slow);
1950 %}
1951 
1952 instruct addD_reg_mem(regD dst, regD src1, memory src2) %{
1953   predicate(UseAVX > 0);
1954   match(Set dst (AddD src1 (LoadD src2)));
1955 
1956   format %{ "vaddsd  $dst, $src1, $src2" %}
1957   ins_cost(150);
1958   ins_encode %{
1959     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
1960   %}
1961   ins_pipe(pipe_slow);
1962 %}
1963 
1964 instruct addD_reg_imm(regD dst, regD src, immD con) %{
1965   predicate(UseAVX > 0);
1966   match(Set dst (AddD src con));
1967 
1968   format %{ "vaddsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
1969   ins_cost(150);
1970   ins_encode %{
1971     __ vaddsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
1972   %}
1973   ins_pipe(pipe_slow);
1974 %}
1975 
1976 instruct subF_reg(regF dst, regF src) %{
1977   predicate((UseSSE>=1) && (UseAVX == 0));
1978   match(Set dst (SubF dst src));
1979 
1980   format %{ "subss   $dst, $src" %}
1981   ins_cost(150);
1982   ins_encode %{
1983     __ subss($dst$$XMMRegister, $src$$XMMRegister);
1984   %}
1985   ins_pipe(pipe_slow);
1986 %}
1987 
1988 instruct subF_mem(regF dst, memory src) %{
1989   predicate((UseSSE>=1) && (UseAVX == 0));
1990   match(Set dst (SubF dst (LoadF src)));
1991 
1992   format %{ "subss   $dst, $src" %}
1993   ins_cost(150);
1994   ins_encode %{
1995     __ subss($dst$$XMMRegister, $src$$Address);
1996   %}
1997   ins_pipe(pipe_slow);
1998 %}
1999 
2000 instruct subF_imm(regF dst, immF con) %{
2001   predicate((UseSSE>=1) && (UseAVX == 0));
2002   match(Set dst (SubF dst con));
2003   format %{ "subss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2004   ins_cost(150);
2005   ins_encode %{
2006     __ subss($dst$$XMMRegister, $constantaddress($con));
2007   %}
2008   ins_pipe(pipe_slow);
2009 %}
2010 
2011 instruct subF_reg_reg(regF dst, regF src1, regF src2) %{
2012   predicate(UseAVX > 0);
2013   match(Set dst (SubF src1 src2));
2014 
2015   format %{ "vsubss  $dst, $src1, $src2" %}
2016   ins_cost(150);
2017   ins_encode %{
2018     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2019   %}
2020   ins_pipe(pipe_slow);
2021 %}
2022 
2023 instruct subF_reg_mem(regF dst, regF src1, memory src2) %{
2024   predicate(UseAVX > 0);
2025   match(Set dst (SubF src1 (LoadF src2)));
2026 
2027   format %{ "vsubss  $dst, $src1, $src2" %}
2028   ins_cost(150);
2029   ins_encode %{
2030     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2031   %}
2032   ins_pipe(pipe_slow);
2033 %}
2034 
2035 instruct subF_reg_imm(regF dst, regF src, immF con) %{
2036   predicate(UseAVX > 0);
2037   match(Set dst (SubF src con));
2038 
2039   format %{ "vsubss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2040   ins_cost(150);
2041   ins_encode %{
2042     __ vsubss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2043   %}
2044   ins_pipe(pipe_slow);
2045 %}
2046 
2047 instruct subD_reg(regD dst, regD src) %{
2048   predicate((UseSSE>=2) && (UseAVX == 0));
2049   match(Set dst (SubD dst src));
2050 
2051   format %{ "subsd   $dst, $src" %}
2052   ins_cost(150);
2053   ins_encode %{
2054     __ subsd($dst$$XMMRegister, $src$$XMMRegister);
2055   %}
2056   ins_pipe(pipe_slow);
2057 %}
2058 
2059 instruct subD_mem(regD dst, memory src) %{
2060   predicate((UseSSE>=2) && (UseAVX == 0));
2061   match(Set dst (SubD dst (LoadD src)));
2062 
2063   format %{ "subsd   $dst, $src" %}
2064   ins_cost(150);
2065   ins_encode %{
2066     __ subsd($dst$$XMMRegister, $src$$Address);
2067   %}
2068   ins_pipe(pipe_slow);
2069 %}
2070 
2071 instruct subD_imm(regD dst, immD con) %{
2072   predicate((UseSSE>=2) && (UseAVX == 0));
2073   match(Set dst (SubD dst con));
2074   format %{ "subsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2075   ins_cost(150);
2076   ins_encode %{
2077     __ subsd($dst$$XMMRegister, $constantaddress($con));
2078   %}
2079   ins_pipe(pipe_slow);
2080 %}
2081 
2082 instruct subD_reg_reg(regD dst, regD src1, regD src2) %{
2083   predicate(UseAVX > 0);
2084   match(Set dst (SubD src1 src2));
2085 
2086   format %{ "vsubsd  $dst, $src1, $src2" %}
2087   ins_cost(150);
2088   ins_encode %{
2089     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2090   %}
2091   ins_pipe(pipe_slow);
2092 %}
2093 
2094 instruct subD_reg_mem(regD dst, regD src1, memory src2) %{
2095   predicate(UseAVX > 0);
2096   match(Set dst (SubD src1 (LoadD src2)));
2097 
2098   format %{ "vsubsd  $dst, $src1, $src2" %}
2099   ins_cost(150);
2100   ins_encode %{
2101     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2102   %}
2103   ins_pipe(pipe_slow);
2104 %}
2105 
2106 instruct subD_reg_imm(regD dst, regD src, immD con) %{
2107   predicate(UseAVX > 0);
2108   match(Set dst (SubD src con));
2109 
2110   format %{ "vsubsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2111   ins_cost(150);
2112   ins_encode %{
2113     __ vsubsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2114   %}
2115   ins_pipe(pipe_slow);
2116 %}
2117 
2118 instruct mulF_reg(regF dst, regF src) %{
2119   predicate((UseSSE>=1) && (UseAVX == 0));
2120   match(Set dst (MulF dst src));
2121 
2122   format %{ "mulss   $dst, $src" %}
2123   ins_cost(150);
2124   ins_encode %{
2125     __ mulss($dst$$XMMRegister, $src$$XMMRegister);
2126   %}
2127   ins_pipe(pipe_slow);
2128 %}
2129 
2130 instruct mulF_mem(regF dst, memory src) %{
2131   predicate((UseSSE>=1) && (UseAVX == 0));
2132   match(Set dst (MulF dst (LoadF src)));
2133 
2134   format %{ "mulss   $dst, $src" %}
2135   ins_cost(150);
2136   ins_encode %{
2137     __ mulss($dst$$XMMRegister, $src$$Address);
2138   %}
2139   ins_pipe(pipe_slow);
2140 %}
2141 
2142 instruct mulF_imm(regF dst, immF con) %{
2143   predicate((UseSSE>=1) && (UseAVX == 0));
2144   match(Set dst (MulF dst con));
2145   format %{ "mulss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2146   ins_cost(150);
2147   ins_encode %{
2148     __ mulss($dst$$XMMRegister, $constantaddress($con));
2149   %}
2150   ins_pipe(pipe_slow);
2151 %}
2152 
2153 instruct mulF_reg_reg(regF dst, regF src1, regF src2) %{
2154   predicate(UseAVX > 0);
2155   match(Set dst (MulF src1 src2));
2156 
2157   format %{ "vmulss  $dst, $src1, $src2" %}
2158   ins_cost(150);
2159   ins_encode %{
2160     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2161   %}
2162   ins_pipe(pipe_slow);
2163 %}
2164 
2165 instruct mulF_reg_mem(regF dst, regF src1, memory src2) %{
2166   predicate(UseAVX > 0);
2167   match(Set dst (MulF src1 (LoadF src2)));
2168 
2169   format %{ "vmulss  $dst, $src1, $src2" %}
2170   ins_cost(150);
2171   ins_encode %{
2172     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2173   %}
2174   ins_pipe(pipe_slow);
2175 %}
2176 
2177 instruct mulF_reg_imm(regF dst, regF src, immF con) %{
2178   predicate(UseAVX > 0);
2179   match(Set dst (MulF src con));
2180 
2181   format %{ "vmulss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2182   ins_cost(150);
2183   ins_encode %{
2184     __ vmulss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2185   %}
2186   ins_pipe(pipe_slow);
2187 %}
2188 
2189 instruct mulD_reg(regD dst, regD src) %{
2190   predicate((UseSSE>=2) && (UseAVX == 0));
2191   match(Set dst (MulD dst src));
2192 
2193   format %{ "mulsd   $dst, $src" %}
2194   ins_cost(150);
2195   ins_encode %{
2196     __ mulsd($dst$$XMMRegister, $src$$XMMRegister);
2197   %}
2198   ins_pipe(pipe_slow);
2199 %}
2200 
2201 instruct mulD_mem(regD dst, memory src) %{
2202   predicate((UseSSE>=2) && (UseAVX == 0));
2203   match(Set dst (MulD dst (LoadD src)));
2204 
2205   format %{ "mulsd   $dst, $src" %}
2206   ins_cost(150);
2207   ins_encode %{
2208     __ mulsd($dst$$XMMRegister, $src$$Address);
2209   %}
2210   ins_pipe(pipe_slow);
2211 %}
2212 
2213 instruct mulD_imm(regD dst, immD con) %{
2214   predicate((UseSSE>=2) && (UseAVX == 0));
2215   match(Set dst (MulD dst con));
2216   format %{ "mulsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2217   ins_cost(150);
2218   ins_encode %{
2219     __ mulsd($dst$$XMMRegister, $constantaddress($con));
2220   %}
2221   ins_pipe(pipe_slow);
2222 %}
2223 
2224 instruct mulD_reg_reg(regD dst, regD src1, regD src2) %{
2225   predicate(UseAVX > 0);
2226   match(Set dst (MulD src1 src2));
2227 
2228   format %{ "vmulsd  $dst, $src1, $src2" %}
2229   ins_cost(150);
2230   ins_encode %{
2231     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2232   %}
2233   ins_pipe(pipe_slow);
2234 %}
2235 
2236 instruct mulD_reg_mem(regD dst, regD src1, memory src2) %{
2237   predicate(UseAVX > 0);
2238   match(Set dst (MulD src1 (LoadD src2)));
2239 
2240   format %{ "vmulsd  $dst, $src1, $src2" %}
2241   ins_cost(150);
2242   ins_encode %{
2243     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2244   %}
2245   ins_pipe(pipe_slow);
2246 %}
2247 
2248 instruct mulD_reg_imm(regD dst, regD src, immD con) %{
2249   predicate(UseAVX > 0);
2250   match(Set dst (MulD src con));
2251 
2252   format %{ "vmulsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2253   ins_cost(150);
2254   ins_encode %{
2255     __ vmulsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2256   %}
2257   ins_pipe(pipe_slow);
2258 %}
2259 
2260 instruct divF_reg(regF dst, regF src) %{
2261   predicate((UseSSE>=1) && (UseAVX == 0));
2262   match(Set dst (DivF dst src));
2263 
2264   format %{ "divss   $dst, $src" %}
2265   ins_cost(150);
2266   ins_encode %{
2267     __ divss($dst$$XMMRegister, $src$$XMMRegister);
2268   %}
2269   ins_pipe(pipe_slow);
2270 %}
2271 
2272 instruct divF_mem(regF dst, memory src) %{
2273   predicate((UseSSE>=1) && (UseAVX == 0));
2274   match(Set dst (DivF dst (LoadF src)));
2275 
2276   format %{ "divss   $dst, $src" %}
2277   ins_cost(150);
2278   ins_encode %{
2279     __ divss($dst$$XMMRegister, $src$$Address);
2280   %}
2281   ins_pipe(pipe_slow);
2282 %}
2283 
2284 instruct divF_imm(regF dst, immF con) %{
2285   predicate((UseSSE>=1) && (UseAVX == 0));
2286   match(Set dst (DivF dst con));
2287   format %{ "divss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2288   ins_cost(150);
2289   ins_encode %{
2290     __ divss($dst$$XMMRegister, $constantaddress($con));
2291   %}
2292   ins_pipe(pipe_slow);
2293 %}
2294 
2295 instruct divF_reg_reg(regF dst, regF src1, regF src2) %{
2296   predicate(UseAVX > 0);
2297   match(Set dst (DivF src1 src2));
2298 
2299   format %{ "vdivss  $dst, $src1, $src2" %}
2300   ins_cost(150);
2301   ins_encode %{
2302     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2303   %}
2304   ins_pipe(pipe_slow);
2305 %}
2306 
2307 instruct divF_reg_mem(regF dst, regF src1, memory src2) %{
2308   predicate(UseAVX > 0);
2309   match(Set dst (DivF src1 (LoadF src2)));
2310 
2311   format %{ "vdivss  $dst, $src1, $src2" %}
2312   ins_cost(150);
2313   ins_encode %{
2314     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2315   %}
2316   ins_pipe(pipe_slow);
2317 %}
2318 
2319 instruct divF_reg_imm(regF dst, regF src, immF con) %{
2320   predicate(UseAVX > 0);
2321   match(Set dst (DivF src con));
2322 
2323   format %{ "vdivss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2324   ins_cost(150);
2325   ins_encode %{
2326     __ vdivss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2327   %}
2328   ins_pipe(pipe_slow);
2329 %}
2330 
2331 instruct divD_reg(regD dst, regD src) %{
2332   predicate((UseSSE>=2) && (UseAVX == 0));
2333   match(Set dst (DivD dst src));
2334 
2335   format %{ "divsd   $dst, $src" %}
2336   ins_cost(150);
2337   ins_encode %{
2338     __ divsd($dst$$XMMRegister, $src$$XMMRegister);
2339   %}
2340   ins_pipe(pipe_slow);
2341 %}
2342 
2343 instruct divD_mem(regD dst, memory src) %{
2344   predicate((UseSSE>=2) && (UseAVX == 0));
2345   match(Set dst (DivD dst (LoadD src)));
2346 
2347   format %{ "divsd   $dst, $src" %}
2348   ins_cost(150);
2349   ins_encode %{
2350     __ divsd($dst$$XMMRegister, $src$$Address);
2351   %}
2352   ins_pipe(pipe_slow);
2353 %}
2354 
2355 instruct divD_imm(regD dst, immD con) %{
2356   predicate((UseSSE>=2) && (UseAVX == 0));
2357   match(Set dst (DivD dst con));
2358   format %{ "divsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2359   ins_cost(150);
2360   ins_encode %{
2361     __ divsd($dst$$XMMRegister, $constantaddress($con));
2362   %}
2363   ins_pipe(pipe_slow);
2364 %}
2365 
2366 instruct divD_reg_reg(regD dst, regD src1, regD src2) %{
2367   predicate(UseAVX > 0);
2368   match(Set dst (DivD src1 src2));
2369 
2370   format %{ "vdivsd  $dst, $src1, $src2" %}
2371   ins_cost(150);
2372   ins_encode %{
2373     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2374   %}
2375   ins_pipe(pipe_slow);
2376 %}
2377 
2378 instruct divD_reg_mem(regD dst, regD src1, memory src2) %{
2379   predicate(UseAVX > 0);
2380   match(Set dst (DivD src1 (LoadD src2)));
2381 
2382   format %{ "vdivsd  $dst, $src1, $src2" %}
2383   ins_cost(150);
2384   ins_encode %{
2385     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2386   %}
2387   ins_pipe(pipe_slow);
2388 %}
2389 
2390 instruct divD_reg_imm(regD dst, regD src, immD con) %{
2391   predicate(UseAVX > 0);
2392   match(Set dst (DivD src con));
2393 
2394   format %{ "vdivsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2395   ins_cost(150);
2396   ins_encode %{
2397     __ vdivsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2398   %}
2399   ins_pipe(pipe_slow);
2400 %}
2401 
2402 instruct absF_reg(regF dst) %{
2403   predicate((UseSSE>=1) && (UseAVX == 0));
2404   match(Set dst (AbsF dst));
2405   ins_cost(150);
2406   format %{ "andps   $dst, [0x7fffffff]\t# abs float by sign masking" %}
2407   ins_encode %{
2408     __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
2409   %}
2410   ins_pipe(pipe_slow);
2411 %}
2412 
2413 instruct absF_reg_reg(regF dst, regF src) %{
2414   predicate(VM_Version::supports_avxonly());
2415   match(Set dst (AbsF src));
2416   ins_cost(150);
2417   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
2418   ins_encode %{
2419     int vector_len = 0;
2420     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
2421               ExternalAddress(float_signmask()), vector_len);
2422   %}
2423   ins_pipe(pipe_slow);
2424 %}
2425 
2426 #ifdef _LP64
2427 instruct absF_reg_reg_evex(regF dst, regF src) %{
2428   predicate(UseAVX > 2 && VM_Version::supports_avx512vl());
2429   match(Set dst (AbsF src));
2430   ins_cost(150);
2431   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
2432   ins_encode %{
2433     int vector_len = 0;
2434     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
2435               ExternalAddress(float_signmask()), vector_len);
2436   %}
2437   ins_pipe(pipe_slow);
2438 %}
2439 
2440 instruct absF_reg_reg_evex_special(regF dst, regF src1, regF src2) %{
2441   predicate(VM_Version::supports_avx512novl());
2442   match(Set dst (AbsF src1));
2443   effect(TEMP src2);
2444   ins_cost(150);
2445   format %{ "vabsss  $dst, $src1, $src2, [0x7fffffff]\t# abs float by sign masking" %}
2446   ins_encode %{
2447     int vector_len = 0;
2448     __ vabsss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister,
2449               ExternalAddress(float_signmask()), vector_len);
2450   %}
2451   ins_pipe(pipe_slow);
2452 %}
2453 #else // _LP64
2454 instruct absF_reg_reg_evex(regF dst, regF src) %{
2455   predicate(UseAVX > 2);
2456   match(Set dst (AbsF src));
2457   ins_cost(150);
2458   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
2459   ins_encode %{
2460     int vector_len = 0;
2461     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
2462               ExternalAddress(float_signmask()), vector_len);
2463   %}
2464   ins_pipe(pipe_slow);
2465 %}
2466 #endif
2467 
2468 instruct absD_reg(regD dst) %{
2469   predicate((UseSSE>=2) && (UseAVX == 0));
2470   match(Set dst (AbsD dst));
2471   ins_cost(150);
2472   format %{ "andpd   $dst, [0x7fffffffffffffff]\t"
2473             "# abs double by sign masking" %}
2474   ins_encode %{
2475     __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
2476   %}
2477   ins_pipe(pipe_slow);
2478 %}
2479 
2480 instruct absD_reg_reg(regD dst, regD src) %{
2481   predicate(VM_Version::supports_avxonly());
2482   match(Set dst (AbsD src));
2483   ins_cost(150);
2484   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
2485             "# abs double by sign masking" %}
2486   ins_encode %{
2487     int vector_len = 0;
2488     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
2489               ExternalAddress(double_signmask()), vector_len);
2490   %}
2491   ins_pipe(pipe_slow);
2492 %}
2493 
2494 #ifdef _LP64
2495 instruct absD_reg_reg_evex(regD dst, regD src) %{
2496   predicate(UseAVX > 2 && VM_Version::supports_avx512vl());
2497   match(Set dst (AbsD src));
2498   ins_cost(150);
2499   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
2500             "# abs double by sign masking" %}
2501   ins_encode %{
2502     int vector_len = 0;
2503     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
2504               ExternalAddress(double_signmask()), vector_len);
2505   %}
2506   ins_pipe(pipe_slow);
2507 %}
2508 
2509 instruct absD_reg_reg_evex_special(regD dst, regD src1, regD src2) %{
2510   predicate(VM_Version::supports_avx512novl());
2511   match(Set dst (AbsD src1));
2512   effect(TEMP src2);
2513   ins_cost(150);
2514   format %{ "vabssd  $dst, $src1, $src2, [0x7fffffffffffffff]\t# abs float by sign masking" %}
2515   ins_encode %{
2516     int vector_len = 0;
2517     __ vabssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister,
2518               ExternalAddress(double_signmask()), vector_len);
2519   %}
2520   ins_pipe(pipe_slow);
2521 %}
2522 #else // _LP64
2523 instruct absD_reg_reg_evex(regD dst, regD src) %{
2524   predicate(UseAVX > 2);
2525   match(Set dst (AbsD src));
2526   ins_cost(150);
2527   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
2528             "# abs double by sign masking" %}
2529   ins_encode %{
2530     int vector_len = 0;
2531     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
2532               ExternalAddress(double_signmask()), vector_len);
2533   %}
2534   ins_pipe(pipe_slow);
2535 %}
2536 #endif
2537 
2538 instruct negF_reg(regF dst) %{
2539   predicate((UseSSE>=1) && (UseAVX == 0));
2540   match(Set dst (NegF dst));
2541   ins_cost(150);
2542   format %{ "xorps   $dst, [0x80000000]\t# neg float by sign flipping" %}
2543   ins_encode %{
2544     __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
2545   %}
2546   ins_pipe(pipe_slow);
2547 %}
2548 
2549 instruct negF_reg_reg(regF dst, regF src) %{
2550   predicate(UseAVX > 0);
2551   match(Set dst (NegF src));
2552   ins_cost(150);
2553   format %{ "vnegatess  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
2554   ins_encode %{
2555     __ vnegatess($dst$$XMMRegister, $src$$XMMRegister,
2556                  ExternalAddress(float_signflip()));
2557   %}
2558   ins_pipe(pipe_slow);
2559 %}
2560 
2561 instruct negD_reg(regD dst) %{
2562   predicate((UseSSE>=2) && (UseAVX == 0));
2563   match(Set dst (NegD dst));
2564   ins_cost(150);
2565   format %{ "xorpd   $dst, [0x8000000000000000]\t"
2566             "# neg double by sign flipping" %}
2567   ins_encode %{
2568     __ xorpd($dst$$XMMRegister, ExternalAddress(double_signflip()));
2569   %}
2570   ins_pipe(pipe_slow);
2571 %}
2572 
2573 instruct negD_reg_reg(regD dst, regD src) %{
2574   predicate(UseAVX > 0);
2575   match(Set dst (NegD src));
2576   ins_cost(150);
2577   format %{ "vnegatess  $dst, $src, [0x8000000000000000]\t"
2578             "# neg double by sign flipping" %}
2579   ins_encode %{
2580     __ vnegatesd($dst$$XMMRegister, $src$$XMMRegister,
2581                  ExternalAddress(double_signflip()));
2582   %}
2583   ins_pipe(pipe_slow);
2584 %}
2585 
2586 instruct sqrtF_reg(regF dst, regF src) %{
2587   predicate(UseSSE>=1);
2588   match(Set dst (SqrtF src));
2589 
2590   format %{ "sqrtss  $dst, $src" %}
2591   ins_cost(150);
2592   ins_encode %{
2593     __ sqrtss($dst$$XMMRegister, $src$$XMMRegister);
2594   %}
2595   ins_pipe(pipe_slow);
2596 %}
2597 
2598 instruct sqrtF_mem(regF dst, memory src) %{
2599   predicate(UseSSE>=1);
2600   match(Set dst (SqrtF (LoadF src)));
2601 
2602   format %{ "sqrtss  $dst, $src" %}
2603   ins_cost(150);
2604   ins_encode %{
2605     __ sqrtss($dst$$XMMRegister, $src$$Address);
2606   %}
2607   ins_pipe(pipe_slow);
2608 %}
2609 
2610 instruct sqrtF_imm(regF dst, immF con) %{
2611   predicate(UseSSE>=1);
2612   match(Set dst (SqrtF con));
2613 
2614   format %{ "sqrtss  $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2615   ins_cost(150);
2616   ins_encode %{
2617     __ sqrtss($dst$$XMMRegister, $constantaddress($con));
2618   %}
2619   ins_pipe(pipe_slow);
2620 %}
2621 
2622 instruct sqrtD_reg(regD dst, regD src) %{
2623   predicate(UseSSE>=2);
2624   match(Set dst (SqrtD src));
2625 
2626   format %{ "sqrtsd  $dst, $src" %}
2627   ins_cost(150);
2628   ins_encode %{
2629     __ sqrtsd($dst$$XMMRegister, $src$$XMMRegister);
2630   %}
2631   ins_pipe(pipe_slow);
2632 %}
2633 
2634 instruct sqrtD_mem(regD dst, memory src) %{
2635   predicate(UseSSE>=2);
2636   match(Set dst (SqrtD (LoadD src)));
2637 
2638   format %{ "sqrtsd  $dst, $src" %}
2639   ins_cost(150);
2640   ins_encode %{
2641     __ sqrtsd($dst$$XMMRegister, $src$$Address);
2642   %}
2643   ins_pipe(pipe_slow);
2644 %}
2645 
2646 instruct sqrtD_imm(regD dst, immD con) %{
2647   predicate(UseSSE>=2);
2648   match(Set dst (SqrtD con));
2649   format %{ "sqrtsd  $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2650   ins_cost(150);
2651   ins_encode %{
2652     __ sqrtsd($dst$$XMMRegister, $constantaddress($con));
2653   %}
2654   ins_pipe(pipe_slow);
2655 %}
2656 
2657 instruct onspinwait() %{
2658   match(OnSpinWait);
2659   ins_cost(200);
2660 
2661   format %{
2662     $$template
2663     if (os::is_MP()) {
2664       $$emit$$"pause\t! membar_onspinwait"
2665     } else {
2666       $$emit$$"MEMBAR-onspinwait ! (empty encoding)"
2667     }
2668   %}
2669   ins_encode %{
2670     __ pause();
2671   %}
2672   ins_pipe(pipe_slow);
2673 %}
2674 
2675 // a * b + c
2676 instruct fmaD_reg(regD a, regD b, regD c) %{
2677   predicate(UseFMA);
2678   match(Set c (FmaD  c (Binary a b)));
2679   format %{ "fmasd $a,$b,$c\t# $c = $a * $b + $c" %}
2680   ins_cost(150);
2681   ins_encode %{
2682     __ fmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
2683   %}
2684   ins_pipe( pipe_slow );
2685 %}
2686 
2687 // a * b + c
2688 instruct fmaF_reg(regF a, regF b, regF c) %{
2689   predicate(UseFMA);
2690   match(Set c (FmaF  c (Binary a b)));
2691   format %{ "fmass $a,$b,$c\t# $c = $a * $b + $c" %}
2692   ins_cost(150);
2693   ins_encode %{
2694     __ fmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
2695   %}
2696   ins_pipe( pipe_slow );
2697 %}
2698 
2699 // ====================VECTOR INSTRUCTIONS=====================================
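//
// Note on the vector_len argument used by the AVX/EVEX encodings in this file: by
// convention it selects the operand width (0 = 128-bit XMM, 1 = 256-bit YMM,
// 2 = 512-bit ZMM), matching the assembler's AVX_128bit/AVX_256bit/AVX_512bit values.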
2700 
2701 // Load vectors (4 bytes long)
2702 instruct loadV4(vecS dst, memory mem) %{
2703   predicate(n->as_LoadVector()->memory_size() == 4);
2704   match(Set dst (LoadVector mem));
2705   ins_cost(125);
2706   format %{ "movd    $dst,$mem\t! load vector (4 bytes)" %}
2707   ins_encode %{
2708     __ movdl($dst$$XMMRegister, $mem$$Address);
2709   %}
2710   ins_pipe( pipe_slow );
2711 %}
2712 
2713 // Load vectors (8 bytes long)
2714 instruct loadV8(vecD dst, memory mem) %{
2715   predicate(n->as_LoadVector()->memory_size() == 8);
2716   match(Set dst (LoadVector mem));
2717   ins_cost(125);
2718   format %{ "movq    $dst,$mem\t! load vector (8 bytes)" %}
2719   ins_encode %{
2720     __ movq($dst$$XMMRegister, $mem$$Address);
2721   %}
2722   ins_pipe( pipe_slow );
2723 %}
2724 
2725 // Load vectors (16 bytes long)
2726 instruct loadV16(vecX dst, memory mem) %{
2727   predicate(n->as_LoadVector()->memory_size() == 16);
2728   match(Set dst (LoadVector mem));
2729   ins_cost(125);
2730   format %{ "movdqu  $dst,$mem\t! load vector (16 bytes)" %}
2731   ins_encode %{
2732     __ movdqu($dst$$XMMRegister, $mem$$Address);
2733   %}
2734   ins_pipe( pipe_slow );
2735 %}
2736 
2737 // Load vectors (32 bytes long)
2738 instruct loadV32(vecY dst, memory mem) %{
2739   predicate(n->as_LoadVector()->memory_size() == 32);
2740   match(Set dst (LoadVector mem));
2741   ins_cost(125);
2742   format %{ "vmovdqu $dst,$mem\t! load vector (32 bytes)" %}
2743   ins_encode %{
2744     __ vmovdqu($dst$$XMMRegister, $mem$$Address);
2745   %}
2746   ins_pipe( pipe_slow );
2747 %}
2748 
2749 // Load vectors (64 bytes long)
2750 instruct loadV64_dword(vecZ dst, memory mem) %{
2751   predicate(n->as_LoadVector()->memory_size() == 64 && n->as_LoadVector()->element_size() <= 4);
2752   match(Set dst (LoadVector mem));
2753   ins_cost(125);
2754   format %{ "vmovdqul $dst k0,$mem\t! load vector (64 bytes)" %}
2755   ins_encode %{
2756     int vector_len = 2;
2757     __ evmovdqul($dst$$XMMRegister, $mem$$Address, vector_len);
2758   %}
2759   ins_pipe( pipe_slow );
2760 %}
2761 
2762 // Load vectors (64 bytes long)
2763 instruct loadV64_qword(vecZ dst, memory mem) %{
2764   predicate(n->as_LoadVector()->memory_size() == 64 && n->as_LoadVector()->element_size() > 4);
2765   match(Set dst (LoadVector mem));
2766   ins_cost(125);
2767   format %{ "vmovdquq $dst k0,$mem\t! load vector (64 bytes)" %}
2768   ins_encode %{
2769     int vector_len = 2;
2770     __ evmovdquq($dst$$XMMRegister, $mem$$Address, vector_len);
2771   %}
2772   ins_pipe( pipe_slow );
2773 %}
2774 
2775 // Store vectors
2776 instruct storeV4(memory mem, vecS src) %{
2777   predicate(n->as_StoreVector()->memory_size() == 4);
2778   match(Set mem (StoreVector mem src));
2779   ins_cost(145);
2780   format %{ "movd    $mem,$src\t! store vector (4 bytes)" %}
2781   ins_encode %{
2782     __ movdl($mem$$Address, $src$$XMMRegister);
2783   %}
2784   ins_pipe( pipe_slow );
2785 %}
2786 
2787 instruct storeV8(memory mem, vecD src) %{
2788   predicate(n->as_StoreVector()->memory_size() == 8);
2789   match(Set mem (StoreVector mem src));
2790   ins_cost(145);
2791   format %{ "movq    $mem,$src\t! store vector (8 bytes)" %}
2792   ins_encode %{
2793     __ movq($mem$$Address, $src$$XMMRegister);
2794   %}
2795   ins_pipe( pipe_slow );
2796 %}
2797 
2798 instruct storeV16(memory mem, vecX src) %{
2799   predicate(n->as_StoreVector()->memory_size() == 16);
2800   match(Set mem (StoreVector mem src));
2801   ins_cost(145);
2802   format %{ "movdqu  $mem,$src\t! store vector (16 bytes)" %}
2803   ins_encode %{
2804     __ movdqu($mem$$Address, $src$$XMMRegister);
2805   %}
2806   ins_pipe( pipe_slow );
2807 %}
2808 
2809 instruct storeV32(memory mem, vecY src) %{
2810   predicate(n->as_StoreVector()->memory_size() == 32);
2811   match(Set mem (StoreVector mem src));
2812   ins_cost(145);
2813   format %{ "vmovdqu $mem,$src\t! store vector (32 bytes)" %}
2814   ins_encode %{
2815     __ vmovdqu($mem$$Address, $src$$XMMRegister);
2816   %}
2817   ins_pipe( pipe_slow );
2818 %}
2819 
2820 instruct storeV64_dword(memory mem, vecZ src) %{
2821   predicate(n->as_StoreVector()->memory_size() == 64 && n->as_StoreVector()->element_size() <= 4);
2822   match(Set mem (StoreVector mem src));
2823   ins_cost(145);
2824   format %{ "vmovdqul $mem k0,$src\t! store vector (64 bytes)" %}
2825   ins_encode %{
2826     int vector_len = 2;
2827     __ evmovdqul($mem$$Address, $src$$XMMRegister, vector_len);
2828   %}
2829   ins_pipe( pipe_slow );
2830 %}
2831 
2832 instruct storeV64_qword(memory mem, vecZ src) %{
2833   predicate(n->as_StoreVector()->memory_size() == 64 && n->as_StoreVector()->element_size() > 4);
2834   match(Set mem (StoreVector mem src));
2835   ins_cost(145);
2836   format %{ "vmovdquq $mem k0,$src\t! store vector (64 bytes)" %}
2837   ins_encode %{
2838     int vector_len = 2;
2839     __ evmovdquq($mem$$Address, $src$$XMMRegister, vector_len);
2840   %}
2841   ins_pipe( pipe_slow );
2842 %}
2843 
2844 // ====================LEGACY REPLICATE=======================================
2845 
2846 instruct Repl4B_mem(vecS dst, memory mem) %{
2847   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
2848   match(Set dst (ReplicateB (LoadB mem)));
2849   format %{ "punpcklbw $dst,$mem\n\t"
2850             "pshuflw $dst,$dst,0x00\t! replicate4B" %}
2851   ins_encode %{
2852     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
2853     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2854   %}
2855   ins_pipe( pipe_slow );
2856 %}
2857 
2858 instruct Repl8B_mem(vecD dst, memory mem) %{
2859   predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
2860   match(Set dst (ReplicateB (LoadB mem)));
2861   format %{ "punpcklbw $dst,$mem\n\t"
2862             "pshuflw $dst,$dst,0x00\t! replicate8B" %}
2863   ins_encode %{
2864     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
2865     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2866   %}
2867   ins_pipe( pipe_slow );
2868 %}
2869 
2870 instruct Repl16B(vecX dst, rRegI src) %{
2871   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
2872   match(Set dst (ReplicateB src));
2873   format %{ "movd    $dst,$src\n\t"
2874             "punpcklbw $dst,$dst\n\t"
2875             "pshuflw $dst,$dst,0x00\n\t"
2876             "punpcklqdq $dst,$dst\t! replicate16B" %}
2877   ins_encode %{
2878     __ movdl($dst$$XMMRegister, $src$$Register);
2879     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
2880     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2881     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2882   %}
2883   ins_pipe( pipe_slow );
2884 %}
2885 
2886 instruct Repl16B_mem(vecX dst, memory mem) %{
2887   predicate(n->as_Vector()->length() == 16 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
2888   match(Set dst (ReplicateB (LoadB mem)));
2889   format %{ "punpcklbw $dst,$mem\n\t"
2890             "pshuflw $dst,$dst,0x00\n\t"
2891             "punpcklqdq $dst,$dst\t! replicate16B" %}
2892   ins_encode %{
2893     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
2894     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2895     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2896   %}
2897   ins_pipe( pipe_slow );
2898 %}
2899 
2900 instruct Repl32B(vecY dst, rRegI src) %{
2901   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
2902   match(Set dst (ReplicateB src));
2903   format %{ "movd    $dst,$src\n\t"
2904             "punpcklbw $dst,$dst\n\t"
2905             "pshuflw $dst,$dst,0x00\n\t"
2906             "punpcklqdq $dst,$dst\n\t"
2907             "vinserti128_high $dst,$dst\t! replicate32B" %}
2908   ins_encode %{
2909     __ movdl($dst$$XMMRegister, $src$$Register);
2910     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
2911     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2912     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2913     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
2914   %}
2915   ins_pipe( pipe_slow );
2916 %}
2917 
2918 instruct Repl32B_mem(vecY dst, memory mem) %{
2919   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
2920   match(Set dst (ReplicateB (LoadB mem)));
2921   format %{ "punpcklbw $dst,$mem\n\t"
2922             "pshuflw $dst,$dst,0x00\n\t"
2923             "punpcklqdq $dst,$dst\n\t"
2924             "vinserti128_high $dst,$dst\t! replicate32B" %}
2925   ins_encode %{
2926     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
2927     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2928     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2929     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
2930   %}
2931   ins_pipe( pipe_slow );
2932 %}
2933 
2934 instruct Repl16B_imm(vecX dst, immI con) %{
2935   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
2936   match(Set dst (ReplicateB con));
2937   format %{ "movq    $dst,[$constantaddress]\n\t"
2938             "punpcklqdq $dst,$dst\t! replicate16B($con)" %}
2939   ins_encode %{
2940     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
2941     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2942   %}
2943   ins_pipe( pipe_slow );
2944 %}
2945 
2946 instruct Repl32B_imm(vecY dst, immI con) %{
2947   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
2948   match(Set dst (ReplicateB con));
2949   format %{ "movq    $dst,[$constantaddress]\n\t"
2950             "punpcklqdq $dst,$dst\n\t"
2951             "vinserti128_high $dst,$dst\t! lreplicate32B($con)" %}
2952   ins_encode %{
2953     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
2954     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2955     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
2956   %}
2957   ins_pipe( pipe_slow );
2958 %}
2959 
2960 instruct Repl4S(vecD dst, rRegI src) %{
2961   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vlbw());
2962   match(Set dst (ReplicateS src));
2963   format %{ "movd    $dst,$src\n\t"
2964             "pshuflw $dst,$dst,0x00\t! replicate4S" %}
2965   ins_encode %{
2966     __ movdl($dst$$XMMRegister, $src$$Register);
2967     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2968   %}
2969   ins_pipe( pipe_slow );
2970 %}
2971 
2972 instruct Repl4S_mem(vecD dst, memory mem) %{
2973   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
2974   match(Set dst (ReplicateS (LoadS mem)));
2975   format %{ "pshuflw $dst,$mem,0x00\t! replicate4S" %}
2976   ins_encode %{
2977     __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
2978   %}
2979   ins_pipe( pipe_slow );
2980 %}
2981 
2982 instruct Repl8S(vecX dst, rRegI src) %{
2983   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vlbw());
2984   match(Set dst (ReplicateS src));
2985   format %{ "movd    $dst,$src\n\t"
2986             "pshuflw $dst,$dst,0x00\n\t"
2987             "punpcklqdq $dst,$dst\t! replicate8S" %}
2988   ins_encode %{
2989     __ movdl($dst$$XMMRegister, $src$$Register);
2990     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2991     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2992   %}
2993   ins_pipe( pipe_slow );
2994 %}
2995 
2996 instruct Repl8S_mem(vecX dst, memory mem) %{
2997   predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
2998   match(Set dst (ReplicateS (LoadS mem)));
2999   format %{ "pshuflw $dst,$mem,0x00\n\t"
3000             "punpcklqdq $dst,$dst\t! replicate8S" %}
3001   ins_encode %{
3002     __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
3003     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3004   %}
3005   ins_pipe( pipe_slow );
3006 %}
3007 
3008 instruct Repl8S_imm(vecX dst, immI con) %{
3009   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vlbw());
3010   match(Set dst (ReplicateS con));
3011   format %{ "movq    $dst,[$constantaddress]\n\t"
3012             "punpcklqdq $dst,$dst\t! replicate8S($con)" %}
3013   ins_encode %{
3014     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3015     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3016   %}
3017   ins_pipe( pipe_slow );
3018 %}
3019 
3020 instruct Repl16S(vecY dst, rRegI src) %{
3021   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3022   match(Set dst (ReplicateS src));
3023   format %{ "movd    $dst,$src\n\t"
3024             "pshuflw $dst,$dst,0x00\n\t"
3025             "punpcklqdq $dst,$dst\n\t"
3026             "vinserti128_high $dst,$dst\t! replicate16S" %}
3027   ins_encode %{
3028     __ movdl($dst$$XMMRegister, $src$$Register);
3029     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3030     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3031     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3032   %}
3033   ins_pipe( pipe_slow );
3034 %}
3035 
3036 instruct Repl16S_mem(vecY dst, memory mem) %{
3037   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3038   match(Set dst (ReplicateS (LoadS mem)));
3039   format %{ "pshuflw $dst,$mem,0x00\n\t"
3040             "punpcklqdq $dst,$dst\n\t"
3041             "vinserti128_high $dst,$dst\t! replicate16S" %}
3042   ins_encode %{
3043     __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
3044     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3045     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3046   %}
3047   ins_pipe( pipe_slow );
3048 %}
3049 
3050 instruct Repl16S_imm(vecY dst, immI con) %{
3051   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3052   match(Set dst (ReplicateS con));
3053   format %{ "movq    $dst,[$constantaddress]\n\t"
3054             "punpcklqdq $dst,$dst\n\t"
3055             "vinserti128_high $dst,$dst\t! replicate16S($con)" %}
3056   ins_encode %{
3057     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3058     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3059     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3060   %}
3061   ins_pipe( pipe_slow );
3062 %}
3063 
3064 instruct Repl4I(vecX dst, rRegI src) %{
3065   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3066   match(Set dst (ReplicateI src));
3067   format %{ "movd    $dst,$src\n\t"
3068             "pshufd  $dst,$dst,0x00\t! replicate4I" %}
3069   ins_encode %{
3070     __ movdl($dst$$XMMRegister, $src$$Register);
3071     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3072   %}
3073   ins_pipe( pipe_slow );
3074 %}
3075 
3076 instruct Repl4I_mem(vecX dst, memory mem) %{
3077   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3078   match(Set dst (ReplicateI (LoadI mem)));
3079   format %{ "pshufd  $dst,$mem,0x00\t! replicate4I" %}
3080   ins_encode %{
3081     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3082   %}
3083   ins_pipe( pipe_slow );
3084 %}
3085 
3086 instruct Repl8I(vecY dst, rRegI src) %{
3087   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3088   match(Set dst (ReplicateI src));
3089   format %{ "movd    $dst,$src\n\t"
3090             "pshufd  $dst,$dst,0x00\n\t"
3091             "vinserti128_high $dst,$dst\t! replicate8I" %}
3092   ins_encode %{
3093     __ movdl($dst$$XMMRegister, $src$$Register);
3094     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3095     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3096   %}
3097   ins_pipe( pipe_slow );
3098 %}
3099 
3100 instruct Repl8I_mem(vecY dst, memory mem) %{
3101   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3102   match(Set dst (ReplicateI (LoadI mem)));
3103   format %{ "pshufd  $dst,$mem,0x00\n\t"
3104             "vinserti128_high $dst,$dst\t! replicate8I" %}
3105   ins_encode %{
3106     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3107     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3108   %}
3109   ins_pipe( pipe_slow );
3110 %}
3111 
3112 instruct Repl4I_imm(vecX dst, immI con) %{
3113   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3114   match(Set dst (ReplicateI con));
3115   format %{ "movq    $dst,[$constantaddress]\t! replicate4I($con)\n\t"
3116             "punpcklqdq $dst,$dst" %}
3117   ins_encode %{
3118     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
3119     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3120   %}
3121   ins_pipe( pipe_slow );
3122 %}
3123 
3124 instruct Repl8I_imm(vecY dst, immI con) %{
3125   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3126   match(Set dst (ReplicateI con));
3127   format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
3128             "punpcklqdq $dst,$dst\n\t"
3129             "vinserti128_high $dst,$dst" %}
3130   ins_encode %{
3131     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
3132     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3133     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3134   %}
3135   ins_pipe( pipe_slow );
3136 %}
3137 
// A long can be loaded into an XMM register directly from memory.
3139 instruct Repl2L_mem(vecX dst, memory mem) %{
3140   predicate(n->as_Vector()->length() == 2 && !VM_Version::supports_avx512vlbw());
3141   match(Set dst (ReplicateL (LoadL mem)));
3142   format %{ "movq    $dst,$mem\n\t"
3143             "punpcklqdq $dst,$dst\t! replicate2L" %}
3144   ins_encode %{
3145     __ movq($dst$$XMMRegister, $mem$$Address);
3146     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3147   %}
3148   ins_pipe( pipe_slow );
3149 %}
3150 
3151 // Replicate long (8 byte) scalar to be vector
3152 #ifdef _LP64
3153 instruct Repl4L(vecY dst, rRegL src) %{
3154   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3155   match(Set dst (ReplicateL src));
3156   format %{ "movdq   $dst,$src\n\t"
3157             "punpcklqdq $dst,$dst\n\t"
3158             "vinserti128_high $dst,$dst\t! replicate4L" %}
3159   ins_encode %{
3160     __ movdq($dst$$XMMRegister, $src$$Register);
3161     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3162     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3163   %}
3164   ins_pipe( pipe_slow );
3165 %}
3166 #else // _LP64
3167 instruct Repl4L(vecY dst, eRegL src, regD tmp) %{
3168   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3169   match(Set dst (ReplicateL src));
3170   effect(TEMP dst, USE src, TEMP tmp);
3171   format %{ "movdl   $dst,$src.lo\n\t"
3172             "movdl   $tmp,$src.hi\n\t"
3173             "punpckldq $dst,$tmp\n\t"
3174             "punpcklqdq $dst,$dst\n\t"
3175             "vinserti128_high $dst,$dst\t! replicate4L" %}
3176   ins_encode %{
3177     __ movdl($dst$$XMMRegister, $src$$Register);
3178     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3179     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
3180     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3181     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3182   %}
3183   ins_pipe( pipe_slow );
3184 %}
3185 #endif // _LP64
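
// Note on the 32-bit (#else) variant above: the long scalar arrives as a register pair, so
// HIGH_FROM_LOW($src$$Register) is used to name the paired high half; movdl/punpckldq
// assemble the 64-bit value in the low lane and punpcklqdq/vinserti128_high replicate it.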
3186 
3187 instruct Repl4L_imm(vecY dst, immL con) %{
3188   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3189   match(Set dst (ReplicateL con));
3190   format %{ "movq    $dst,[$constantaddress]\n\t"
3191             "punpcklqdq $dst,$dst\n\t"
3192             "vinserti128_high $dst,$dst\t! replicate4L($con)" %}
3193   ins_encode %{
3194     __ movq($dst$$XMMRegister, $constantaddress($con));
3195     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3196     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3197   %}
3198   ins_pipe( pipe_slow );
3199 %}
3200 
3201 instruct Repl4L_mem(vecY dst, memory mem) %{
3202   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3203   match(Set dst (ReplicateL (LoadL mem)));
3204   format %{ "movq    $dst,$mem\n\t"
3205             "punpcklqdq $dst,$dst\n\t"
3206             "vinserti128_high $dst,$dst\t! replicate4L" %}
3207   ins_encode %{
3208     __ movq($dst$$XMMRegister, $mem$$Address);
3209     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3210     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3211   %}
3212   ins_pipe( pipe_slow );
3213 %}
3214 
3215 instruct Repl2F_mem(vecD dst, memory mem) %{
3216   predicate(n->as_Vector()->length() == 2 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3217   match(Set dst (ReplicateF (LoadF mem)));
3218   format %{ "pshufd  $dst,$mem,0x00\t! replicate2F" %}
3219   ins_encode %{
3220     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3221   %}
3222   ins_pipe( pipe_slow );
3223 %}
3224 
3225 instruct Repl4F_mem(vecX dst, memory mem) %{
3226   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3227   match(Set dst (ReplicateF (LoadF mem)));
3228   format %{ "pshufd  $dst,$mem,0x00\t! replicate4F" %}
3229   ins_encode %{
3230     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3231   %}
3232   ins_pipe( pipe_slow );
3233 %}
3234 
3235 instruct Repl8F(vecY dst, regF src) %{
3236   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3237   match(Set dst (ReplicateF src));
3238   format %{ "pshufd  $dst,$src,0x00\n\t"
3239             "vinsertf128_high $dst,$dst\t! replicate8F" %}
3240   ins_encode %{
3241     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
3242     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3243   %}
3244   ins_pipe( pipe_slow );
3245 %}
3246 
3247 instruct Repl8F_mem(vecY dst, memory mem) %{
3248   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3249   match(Set dst (ReplicateF (LoadF mem)));
3250   format %{ "pshufd  $dst,$mem,0x00\n\t"
3251             "vinsertf128_high $dst,$dst\t! replicate8F" %}
3252   ins_encode %{
3253     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3254     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3255   %}
3256   ins_pipe( pipe_slow );
3257 %}
3258 
3259 instruct Repl2F_zero(vecD dst, immF0 zero) %{
3260   predicate(n->as_Vector()->length() == 2 && UseAVX < 3);
3261   match(Set dst (ReplicateF zero));
3262   format %{ "xorps   $dst,$dst\t! replicate2F zero" %}
3263   ins_encode %{
3264     __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
3265   %}
3266   ins_pipe( fpu_reg_reg );
3267 %}
3268 
3269 instruct Repl4F_zero(vecX dst, immF0 zero) %{
3270   predicate(n->as_Vector()->length() == 4 && UseAVX < 3);
3271   match(Set dst (ReplicateF zero));
3272   format %{ "xorps   $dst,$dst\t! replicate4F zero" %}
3273   ins_encode %{
3274     __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
3275   %}
3276   ins_pipe( fpu_reg_reg );
3277 %}
3278 
3279 instruct Repl8F_zero(vecY dst, immF0 zero) %{
3280   predicate(n->as_Vector()->length() == 8 && UseAVX < 3);
3281   match(Set dst (ReplicateF zero));
3282   format %{ "vxorps  $dst,$dst,$dst\t! replicate8F zero" %}
3283   ins_encode %{
3284     int vector_len = 1;
3285     __ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3286   %}
3287   ins_pipe( fpu_reg_reg );
3288 %}
3289 
3290 instruct Repl2D_mem(vecX dst, memory mem) %{
3291   predicate(n->as_Vector()->length() == 2 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3292   match(Set dst (ReplicateD (LoadD mem)));
3293   format %{ "pshufd  $dst,$mem,0x44\t! replicate2D" %}
3294   ins_encode %{
3295     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
3296   %}
3297   ins_pipe( pipe_slow );
3298 %}
3299 
3300 instruct Repl4D(vecY dst, regD src) %{
3301   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3302   match(Set dst (ReplicateD src));
3303   format %{ "pshufd  $dst,$src,0x44\n\t"
3304             "vinsertf128_high $dst,$dst\t! replicate4D" %}
3305   ins_encode %{
3306     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
3307     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3308   %}
3309   ins_pipe( pipe_slow );
3310 %}
3311 
3312 instruct Repl4D_mem(vecY dst, memory mem) %{
3313   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3314   match(Set dst (ReplicateD (LoadD mem)));
3315   format %{ "pshufd  $dst,$mem,0x44\n\t"
3316             "vinsertf128_high $dst,$dst\t! replicate4D" %}
3317   ins_encode %{
3318     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
3319     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3320   %}
3321   ins_pipe( pipe_slow );
3322 %}
3323 
3324 // Replicate double (8 byte) scalar zero to be vector
3325 instruct Repl2D_zero(vecX dst, immD0 zero) %{
3326   predicate(n->as_Vector()->length() == 2 && UseAVX < 3);
3327   match(Set dst (ReplicateD zero));
3328   format %{ "xorpd   $dst,$dst\t! replicate2D zero" %}
3329   ins_encode %{
3330     __ xorpd($dst$$XMMRegister, $dst$$XMMRegister);
3331   %}
3332   ins_pipe( fpu_reg_reg );
3333 %}
3334 
3335 instruct Repl4D_zero(vecY dst, immD0 zero) %{
3336   predicate(n->as_Vector()->length() == 4 && UseAVX < 3);
3337   match(Set dst (ReplicateD zero));
3338   format %{ "vxorpd  $dst,$dst,$dst,vect256\t! replicate4D zero" %}
3339   ins_encode %{
3340     int vector_len = 1;
3341     __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3342   %}
3343   ins_pipe( fpu_reg_reg );
3344 %}
3345 
3346 // ====================GENERIC REPLICATE==========================================
3347 
3348 // Replicate byte scalar to be vector
3349 instruct Repl4B(vecS dst, rRegI src) %{
3350   predicate(n->as_Vector()->length() == 4);
3351   match(Set dst (ReplicateB src));
3352   format %{ "movd    $dst,$src\n\t"
3353             "punpcklbw $dst,$dst\n\t"
3354             "pshuflw $dst,$dst,0x00\t! replicate4B" %}
3355   ins_encode %{
3356     __ movdl($dst$$XMMRegister, $src$$Register);
3357     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3358     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3359   %}
3360   ins_pipe( pipe_slow );
3361 %}
3362 
3363 instruct Repl8B(vecD dst, rRegI src) %{
3364   predicate(n->as_Vector()->length() == 8);
3365   match(Set dst (ReplicateB src));
3366   format %{ "movd    $dst,$src\n\t"
3367             "punpcklbw $dst,$dst\n\t"
3368             "pshuflw $dst,$dst,0x00\t! replicate8B" %}
3369   ins_encode %{
3370     __ movdl($dst$$XMMRegister, $src$$Register);
3371     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3372     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3373   %}
3374   ins_pipe( pipe_slow );
3375 %}
3376 
3377 // Replicate byte scalar immediate to be vector by loading from const table.
3378 instruct Repl4B_imm(vecS dst, immI con) %{
3379   predicate(n->as_Vector()->length() == 4);
3380   match(Set dst (ReplicateB con));
3381   format %{ "movdl   $dst,[$constantaddress]\t! replicate4B($con)" %}
3382   ins_encode %{
3383     __ movdl($dst$$XMMRegister, $constantaddress(replicate4_imm($con$$constant, 1)));
3384   %}
3385   ins_pipe( pipe_slow );
3386 %}
3387 
3388 instruct Repl8B_imm(vecD dst, immI con) %{
3389   predicate(n->as_Vector()->length() == 8);
3390   match(Set dst (ReplicateB con));
3391   format %{ "movq    $dst,[$constantaddress]\t! replicate8B($con)" %}
3392   ins_encode %{
3393     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3394   %}
3395   ins_pipe( pipe_slow );
3396 %}
3397 
3398 // Replicate byte scalar zero to be vector
3399 instruct Repl4B_zero(vecS dst, immI0 zero) %{
3400   predicate(n->as_Vector()->length() == 4);
3401   match(Set dst (ReplicateB zero));
3402   format %{ "pxor    $dst,$dst\t! replicate4B zero" %}
3403   ins_encode %{
3404     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3405   %}
3406   ins_pipe( fpu_reg_reg );
3407 %}
3408 
3409 instruct Repl8B_zero(vecD dst, immI0 zero) %{
3410   predicate(n->as_Vector()->length() == 8);
3411   match(Set dst (ReplicateB zero));
3412   format %{ "pxor    $dst,$dst\t! replicate8B zero" %}
3413   ins_encode %{
3414     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3415   %}
3416   ins_pipe( fpu_reg_reg );
3417 %}
3418 
3419 instruct Repl16B_zero(vecX dst, immI0 zero) %{
3420   predicate(n->as_Vector()->length() == 16);
3421   match(Set dst (ReplicateB zero));
3422   format %{ "pxor    $dst,$dst\t! replicate16B zero" %}
3423   ins_encode %{
3424     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3425   %}
3426   ins_pipe( fpu_reg_reg );
3427 %}
3428 
3429 instruct Repl32B_zero(vecY dst, immI0 zero) %{
3430   predicate(n->as_Vector()->length() == 32);
3431   match(Set dst (ReplicateB zero));
3432   format %{ "vpxor   $dst,$dst,$dst\t! replicate32B zero" %}
3433   ins_encode %{
    // The vpxor call may expand to vxorpd: plain AVX has no 256-bit vpxor (AVX2 adds it).
3435     int vector_len = 1;
3436     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3437   %}
3438   ins_pipe( fpu_reg_reg );
3439 %}
3440 
3441 // Replicate char/short (2 byte) scalar to be vector
3442 instruct Repl2S(vecS dst, rRegI src) %{
3443   predicate(n->as_Vector()->length() == 2);
3444   match(Set dst (ReplicateS src));
3445   format %{ "movd    $dst,$src\n\t"
3446             "pshuflw $dst,$dst,0x00\t! replicate2S" %}
3447   ins_encode %{
3448     __ movdl($dst$$XMMRegister, $src$$Register);
3449     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3450   %}
3451   ins_pipe( fpu_reg_reg );
3452 %}
3453 
3454 // Replicate char/short (2 byte) scalar immediate to be vector by loading from const table.
3455 instruct Repl2S_imm(vecS dst, immI con) %{
3456   predicate(n->as_Vector()->length() == 2);
3457   match(Set dst (ReplicateS con));
3458   format %{ "movdl   $dst,[$constantaddress]\t! replicate2S($con)" %}
3459   ins_encode %{
3460     __ movdl($dst$$XMMRegister, $constantaddress(replicate4_imm($con$$constant, 2)));
3461   %}
3462   ins_pipe( fpu_reg_reg );
3463 %}
3464 
3465 instruct Repl4S_imm(vecD dst, immI con) %{
3466   predicate(n->as_Vector()->length() == 4);
3467   match(Set dst (ReplicateS con));
3468   format %{ "movq    $dst,[$constantaddress]\t! replicate4S($con)" %}
3469   ins_encode %{
3470     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3471   %}
3472   ins_pipe( fpu_reg_reg );
3473 %}
3474 
3475 // Replicate char/short (2 byte) scalar zero to be vector
3476 instruct Repl2S_zero(vecS dst, immI0 zero) %{
3477   predicate(n->as_Vector()->length() == 2);
3478   match(Set dst (ReplicateS zero));
3479   format %{ "pxor    $dst,$dst\t! replicate2S zero" %}
3480   ins_encode %{
3481     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3482   %}
3483   ins_pipe( fpu_reg_reg );
3484 %}
3485 
3486 instruct Repl4S_zero(vecD dst, immI0 zero) %{
3487   predicate(n->as_Vector()->length() == 4);
3488   match(Set dst (ReplicateS zero));
3489   format %{ "pxor    $dst,$dst\t! replicate4S zero" %}
3490   ins_encode %{
3491     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3492   %}
3493   ins_pipe( fpu_reg_reg );
3494 %}
3495 
3496 instruct Repl8S_zero(vecX dst, immI0 zero) %{
3497   predicate(n->as_Vector()->length() == 8);
3498   match(Set dst (ReplicateS zero));
3499   format %{ "pxor    $dst,$dst\t! replicate8S zero" %}
3500   ins_encode %{
3501     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3502   %}
3503   ins_pipe( fpu_reg_reg );
3504 %}
3505 
3506 instruct Repl16S_zero(vecY dst, immI0 zero) %{
3507   predicate(n->as_Vector()->length() == 16);
3508   match(Set dst (ReplicateS zero));
3509   format %{ "vpxor   $dst,$dst,$dst\t! replicate16S zero" %}
3510   ins_encode %{
    // The vpxor call may expand to vxorpd: plain AVX has no 256-bit vpxor (AVX2 adds it).
3512     int vector_len = 1;
3513     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3514   %}
3515   ins_pipe( fpu_reg_reg );
3516 %}
3517 
3518 // Replicate integer (4 byte) scalar to be vector
3519 instruct Repl2I(vecD dst, rRegI src) %{
3520   predicate(n->as_Vector()->length() == 2);
3521   match(Set dst (ReplicateI src));
3522   format %{ "movd    $dst,$src\n\t"
3523             "pshufd  $dst,$dst,0x00\t! replicate2I" %}
3524   ins_encode %{
3525     __ movdl($dst$$XMMRegister, $src$$Register);
3526     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3527   %}
3528   ins_pipe( fpu_reg_reg );
3529 %}
3530 
// An integer can be loaded into an XMM register directly from memory.
3532 instruct Repl2I_mem(vecD dst, memory mem) %{
3533   predicate(n->as_Vector()->length() == 2);
3534   match(Set dst (ReplicateI (LoadI mem)));
3535   format %{ "movd    $dst,$mem\n\t"
3536             "pshufd  $dst,$dst,0x00\t! replicate2I" %}
3537   ins_encode %{
3538     __ movdl($dst$$XMMRegister, $mem$$Address);
3539     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3540   %}
3541   ins_pipe( fpu_reg_reg );
3542 %}
3543 
3544 // Replicate integer (4 byte) scalar immediate to be vector by loading from const table.
3545 instruct Repl2I_imm(vecD dst, immI con) %{
3546   predicate(n->as_Vector()->length() == 2);
3547   match(Set dst (ReplicateI con));
3548   format %{ "movq    $dst,[$constantaddress]\t! replicate2I($con)" %}
3549   ins_encode %{
3550     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
3551   %}
3552   ins_pipe( fpu_reg_reg );
3553 %}
3554 
3555 // Replicate integer (4 byte) scalar zero to be vector
3556 instruct Repl2I_zero(vecD dst, immI0 zero) %{
3557   predicate(n->as_Vector()->length() == 2);
3558   match(Set dst (ReplicateI zero));
3559   format %{ "pxor    $dst,$dst\t! replicate2I" %}
3560   ins_encode %{
3561     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3562   %}
3563   ins_pipe( fpu_reg_reg );
3564 %}
3565 
3566 instruct Repl4I_zero(vecX dst, immI0 zero) %{
3567   predicate(n->as_Vector()->length() == 4);
3568   match(Set dst (ReplicateI zero));
3569   format %{ "pxor    $dst,$dst\t! replicate4I zero)" %}
3570   ins_encode %{
3571     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3572   %}
3573   ins_pipe( fpu_reg_reg );
3574 %}
3575 
3576 instruct Repl8I_zero(vecY dst, immI0 zero) %{
3577   predicate(n->as_Vector()->length() == 8);
3578   match(Set dst (ReplicateI zero));
3579   format %{ "vpxor   $dst,$dst,$dst\t! replicate8I zero" %}
3580   ins_encode %{
    // The vpxor call may expand to vxorpd: plain AVX has no 256-bit vpxor (AVX2 adds it).
3582     int vector_len = 1;
3583     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3584   %}
3585   ins_pipe( fpu_reg_reg );
3586 %}
3587 
3588 // Replicate long (8 byte) scalar to be vector
3589 #ifdef _LP64
3590 instruct Repl2L(vecX dst, rRegL src) %{
3591   predicate(n->as_Vector()->length() == 2);
3592   match(Set dst (ReplicateL src));
3593   format %{ "movdq   $dst,$src\n\t"
3594             "punpcklqdq $dst,$dst\t! replicate2L" %}
3595   ins_encode %{
3596     __ movdq($dst$$XMMRegister, $src$$Register);
3597     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3598   %}
3599   ins_pipe( pipe_slow );
3600 %}
3601 #else // _LP64
3602 instruct Repl2L(vecX dst, eRegL src, regD tmp) %{
3603   predicate(n->as_Vector()->length() == 2);
3604   match(Set dst (ReplicateL src));
3605   effect(TEMP dst, USE src, TEMP tmp);
3606   format %{ "movdl   $dst,$src.lo\n\t"
3607             "movdl   $tmp,$src.hi\n\t"
3608             "punpckldq $dst,$tmp\n\t"
3609             "punpcklqdq $dst,$dst\t! replicate2L"%}
3610   ins_encode %{
3611     __ movdl($dst$$XMMRegister, $src$$Register);
3612     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3613     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
3614     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3615   %}
3616   ins_pipe( pipe_slow );
3617 %}
3618 #endif // _LP64
3619 
3620 // Replicate long (8 byte) scalar immediate to be vector by loading from const table.
3621 instruct Repl2L_imm(vecX dst, immL con) %{
3622   predicate(n->as_Vector()->length() == 2);
3623   match(Set dst (ReplicateL con));
3624   format %{ "movq    $dst,[$constantaddress]\n\t"
3625             "punpcklqdq $dst,$dst\t! replicate2L($con)" %}
3626   ins_encode %{
3627     __ movq($dst$$XMMRegister, $constantaddress($con));
3628     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3629   %}
3630   ins_pipe( pipe_slow );
3631 %}
3632 
3633 // Replicate long (8 byte) scalar zero to be vector
3634 instruct Repl2L_zero(vecX dst, immL0 zero) %{
3635   predicate(n->as_Vector()->length() == 2);
3636   match(Set dst (ReplicateL zero));
3637   format %{ "pxor    $dst,$dst\t! replicate2L zero" %}
3638   ins_encode %{
3639     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3640   %}
3641   ins_pipe( fpu_reg_reg );
3642 %}
3643 
3644 instruct Repl4L_zero(vecY dst, immL0 zero) %{
3645   predicate(n->as_Vector()->length() == 4);
3646   match(Set dst (ReplicateL zero));
3647   format %{ "vpxor   $dst,$dst,$dst\t! replicate4L zero" %}
3648   ins_encode %{
    // The vpxor call may expand to vxorpd: plain AVX has no 256-bit vpxor (AVX2 adds it).
3650     int vector_len = 1;
3651     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3652   %}
3653   ins_pipe( fpu_reg_reg );
3654 %}
3655 
3656 // Replicate float (4 byte) scalar to be vector
3657 instruct Repl2F(vecD dst, regF src) %{
3658   predicate(n->as_Vector()->length() == 2);
3659   match(Set dst (ReplicateF src));
3660   format %{ "pshufd  $dst,$dst,0x00\t! replicate2F" %}
3661   ins_encode %{
3662     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
3663   %}
3664   ins_pipe( fpu_reg_reg );
3665 %}
3666 
3667 instruct Repl4F(vecX dst, regF src) %{
3668   predicate(n->as_Vector()->length() == 4);
3669   match(Set dst (ReplicateF src));
3670   format %{ "pshufd  $dst,$dst,0x00\t! replicate4F" %}
3671   ins_encode %{
3672     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
3673   %}
3674   ins_pipe( pipe_slow );
3675 %}
3676 
3677 // Replicate double (8 bytes) scalar to be vector
3678 instruct Repl2D(vecX dst, regD src) %{
3679   predicate(n->as_Vector()->length() == 2);
3680   match(Set dst (ReplicateD src));
3681   format %{ "pshufd  $dst,$src,0x44\t! replicate2D" %}
3682   ins_encode %{
3683     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
3684   %}
3685   ins_pipe( pipe_slow );
3686 %}
3687 
3688 // ====================EVEX REPLICATE=============================================
3689 
3690 instruct Repl4B_mem_evex(vecS dst, memory mem) %{
3691   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vlbw());
3692   match(Set dst (ReplicateB (LoadB mem)));
3693   format %{ "vpbroadcastb  $dst,$mem\t! replicate4B" %}
3694   ins_encode %{
3695     int vector_len = 0;
3696     __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
3697   %}
3698   ins_pipe( pipe_slow );
3699 %}
3700 
3701 instruct Repl8B_mem_evex(vecD dst, memory mem) %{
3702   predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vlbw());
3703   match(Set dst (ReplicateB (LoadB mem)));
3704   format %{ "vpbroadcastb  $dst,$mem\t! replicate8B" %}
3705   ins_encode %{
3706     int vector_len = 0;
3707     __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
3708   %}
3709   ins_pipe( pipe_slow );
3710 %}
3711 
3712 instruct Repl16B_evex(vecX dst, rRegI src) %{
3713   predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
3714   match(Set dst (ReplicateB src));
3715   format %{ "vpbroadcastb $dst,$src\t! replicate16B" %}
3716   ins_encode %{
    int vector_len = 0;
3718     __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
3719   %}
3720   ins_pipe( pipe_slow );
3721 %}
3722 
3723 instruct Repl16B_mem_evex(vecX dst, memory mem) %{
3724   predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
3725   match(Set dst (ReplicateB (LoadB mem)));
3726   format %{ "vpbroadcastb  $dst,$mem\t! replicate16B" %}
3727   ins_encode %{
3728     int vector_len = 0;
3729     __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
3730   %}
3731   ins_pipe( pipe_slow );
3732 %}
3733 
3734 instruct Repl32B_evex(vecY dst, rRegI src) %{
3735   predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512vlbw());
3736   match(Set dst (ReplicateB src));
3737   format %{ "vpbroadcastb $dst,$src\t! replicate32B" %}
3738   ins_encode %{
    int vector_len = 1;
3740     __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
3741   %}
3742   ins_pipe( pipe_slow );
3743 %}
3744 
3745 instruct Repl32B_mem_evex(vecY dst, memory mem) %{
3746   predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512vlbw());
3747   match(Set dst (ReplicateB (LoadB mem)));
3748   format %{ "vpbroadcastb  $dst,$mem\t! replicate32B" %}
3749   ins_encode %{
3750     int vector_len = 1;
3751     __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
3752   %}
3753   ins_pipe( pipe_slow );
3754 %}
3755 
3756 instruct Repl64B_evex(vecZ dst, rRegI src) %{
3757   predicate(n->as_Vector()->length() == 64 && VM_Version::supports_avx512bw());
3758   match(Set dst (ReplicateB src));
3759   format %{ "vpbroadcastb $dst,$src\t! upper replicate64B" %}
3760   ins_encode %{
    int vector_len = 2;
3762     __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
3763   %}
3764   ins_pipe( pipe_slow );
3765 %}
3766 
3767 instruct Repl64B_mem_evex(vecZ dst, memory mem) %{
3768   predicate(n->as_Vector()->length() == 64 && VM_Version::supports_avx512bw());
3769   match(Set dst (ReplicateB (LoadB mem)));
3770   format %{ "vpbroadcastb  $dst,$mem\t! replicate64B" %}
3771   ins_encode %{
3772     int vector_len = 2;
3773     __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
3774   %}
3775   ins_pipe( pipe_slow );
3776 %}
3777 
3778 instruct Repl16B_imm_evex(vecX dst, immI con) %{
3779   predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
3780   match(Set dst (ReplicateB con));
3781   format %{ "movq    $dst,[$constantaddress]\n\t"
3782             "vpbroadcastb $dst,$dst\t! replicate16B" %}
3783   ins_encode %{
    int vector_len = 0;
3785     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3786     __ evpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3787   %}
3788   ins_pipe( pipe_slow );
3789 %}
3790 
3791 instruct Repl32B_imm_evex(vecY dst, immI con) %{
3792   predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512vlbw());
3793   match(Set dst (ReplicateB con));
3794   format %{ "movq    $dst,[$constantaddress]\n\t"
3795             "vpbroadcastb $dst,$dst\t! replicate32B" %}
3796   ins_encode %{
    int vector_len = 1;
3798     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3799     __ evpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3800   %}
3801   ins_pipe( pipe_slow );
3802 %}
3803 
3804 instruct Repl64B_imm_evex(vecZ dst, immI con) %{
3805   predicate(n->as_Vector()->length() == 64 && VM_Version::supports_avx512bw());
3806   match(Set dst (ReplicateB con));
3807   format %{ "movq    $dst,[$constantaddress]\n\t"
3808             "vpbroadcastb $dst,$dst\t! upper replicate64B" %}
3809   ins_encode %{
    int vector_len = 2;
3811     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3812     __ evpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3813   %}
3814   ins_pipe( pipe_slow );
3815 %}
3816 
3817 instruct Repl64B_zero_evex(vecZ dst, immI0 zero) %{
3818   predicate(n->as_Vector()->length() == 64 && UseAVX > 2);
3819   match(Set dst (ReplicateB zero));
3820   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate64B zero" %}
3821   ins_encode %{
    // 512-bit vpxor needs the EVEX encoding (AVX-512), which the UseAVX > 2 predicate guarantees.
3823     int vector_len = 2;
3824     __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3825   %}
3826   ins_pipe( fpu_reg_reg );
3827 %}
3828 
3829 instruct Repl4S_evex(vecD dst, rRegI src) %{
3830   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vlbw());
3831   match(Set dst (ReplicateS src));
3832   format %{ "vpbroadcastw $dst,$src\t! replicate4S" %}
3833   ins_encode %{
    int vector_len = 0;
3835     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
3836   %}
3837   ins_pipe( pipe_slow );
3838 %}
3839 
3840 instruct Repl4S_mem_evex(vecD dst, memory mem) %{
3841   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vlbw());
3842   match(Set dst (ReplicateS (LoadS mem)));
3843   format %{ "vpbroadcastw  $dst,$mem\t! replicate4S" %}
3844   ins_encode %{
3845     int vector_len = 0;
3846     __ evpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
3847   %}
3848   ins_pipe( pipe_slow );
3849 %}
3850 
3851 instruct Repl8S_evex(vecX dst, rRegI src) %{
3852   predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vlbw());
3853   match(Set dst (ReplicateS src));
3854   format %{ "vpbroadcastw $dst,$src\t! replicate8S" %}
3855   ins_encode %{
    int vector_len = 0;
3857     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
3858   %}
3859   ins_pipe( pipe_slow );
3860 %}
3861 
3862 instruct Repl8S_mem_evex(vecX dst, memory mem) %{
3863   predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vlbw());
3864   match(Set dst (ReplicateS (LoadS mem)));
3865   format %{ "vpbroadcastw  $dst,$mem\t! replicate8S" %}
3866   ins_encode %{
3867     int vector_len = 0;
3868     __ evpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
3869   %}
3870   ins_pipe( pipe_slow );
3871 %}
3872 
3873 instruct Repl16S_evex(vecY dst, rRegI src) %{
3874   predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
3875   match(Set dst (ReplicateS src));
3876   format %{ "vpbroadcastw $dst,$src\t! replicate16S" %}
3877   ins_encode %{
    int vector_len = 1;
3879     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
3880   %}
3881   ins_pipe( pipe_slow );
3882 %}
3883 
3884 instruct Repl16S_mem_evex(vecY dst, memory mem) %{
3885   predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
3886   match(Set dst (ReplicateS (LoadS mem)));
3887   format %{ "vpbroadcastw  $dst,$mem\t! replicate16S" %}
3888   ins_encode %{
3889     int vector_len = 1;
3890     __ evpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
3891   %}
3892   ins_pipe( pipe_slow );
3893 %}
3894 
3895 instruct Repl32S_evex(vecZ dst, rRegI src) %{
3896   predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512bw());
3897   match(Set dst (ReplicateS src));
3898   format %{ "vpbroadcastw $dst,$src\t! replicate32S" %}
3899   ins_encode %{
    int vector_len = 2;
3901     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
3902   %}
3903   ins_pipe( pipe_slow );
3904 %}
3905 
3906 instruct Repl32S_mem_evex(vecZ dst, memory mem) %{
3907   predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512bw());
3908   match(Set dst (ReplicateS (LoadS mem)));
3909   format %{ "vpbroadcastw  $dst,$mem\t! replicate32S" %}
3910   ins_encode %{
3911     int vector_len = 2;
3912     __ evpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
3913   %}
3914   ins_pipe( pipe_slow );
3915 %}
3916 
3917 instruct Repl8S_imm_evex(vecX dst, immI con) %{
3918   predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vlbw());
3919   match(Set dst (ReplicateS con));
3920   format %{ "movq    $dst,[$constantaddress]\n\t"
3921             "vpbroadcastw $dst,$dst\t! replicate8S" %}
3922   ins_encode %{
    int vector_len = 0;
3924     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3925     __ evpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3926   %}
3927   ins_pipe( pipe_slow );
3928 %}
3929 
3930 instruct Repl16S_imm_evex(vecY dst, immI con) %{
3931   predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
3932   match(Set dst (ReplicateS con));
3933   format %{ "movq    $dst,[$constantaddress]\n\t"
3934             "vpbroadcastw $dst,$dst\t! replicate16S" %}
3935   ins_encode %{
    int vector_len = 1;
3937     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3938     __ evpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3939   %}
3940   ins_pipe( pipe_slow );
3941 %}
3942 
3943 instruct Repl32S_imm_evex(vecZ dst, immI con) %{
3944   predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512bw());
3945   match(Set dst (ReplicateS con));
3946   format %{ "movq    $dst,[$constantaddress]\n\t"
3947             "vpbroadcastw $dst,$dst\t! replicate32S" %}
3948   ins_encode %{
    int vector_len = 2;
3950     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3951     __ evpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3952   %}
3953   ins_pipe( pipe_slow );
3954 %}
3955 
3956 instruct Repl32S_zero_evex(vecZ dst, immI0 zero) %{
3957   predicate(n->as_Vector()->length() == 32 && UseAVX > 2);
3958   match(Set dst (ReplicateS zero));
3959   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate32S zero" %}
3960   ins_encode %{
    // Use vpxor: pre-EVEX AVX has no 512-bit vpxor, but the EVEX encoding (AVX-512) provides it.
3962     int vector_len = 2;
3963     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3964   %}
3965   ins_pipe( fpu_reg_reg );
3966 %}
3967 
3968 instruct Repl4I_evex(vecX dst, rRegI src) %{
3969   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
3970   match(Set dst (ReplicateI src));
3971   format %{ "vpbroadcastd  $dst,$src\t! replicate4I" %}
3972   ins_encode %{
3973     int vector_len = 0;
3974     __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
3975   %}
3976   ins_pipe( pipe_slow );
3977 %}
3978 
3979 instruct Repl4I_mem_evex(vecX dst, memory mem) %{
3980   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
3981   match(Set dst (ReplicateI (LoadI mem)));
3982   format %{ "vpbroadcastd  $dst,$mem\t! replicate4I" %}
3983   ins_encode %{
3984     int vector_len = 0;
3985     __ evpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
3986   %}
3987   ins_pipe( pipe_slow );
3988 %}
3989 
3990 instruct Repl8I_evex(vecY dst, rRegI src) %{
3991   predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
3992   match(Set dst (ReplicateI src));
3993   format %{ "vpbroadcastd  $dst,$src\t! replicate8I" %}
3994   ins_encode %{
3995     int vector_len = 1;
3996     __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
3997   %}
3998   ins_pipe( pipe_slow );
3999 %}
4000 
4001 instruct Repl8I_mem_evex(vecY dst, memory mem) %{
4002   predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
4003   match(Set dst (ReplicateI (LoadI mem)));
4004   format %{ "vpbroadcastd  $dst,$mem\t! replicate8I" %}
4005   ins_encode %{
4006     int vector_len = 1;
4007     __ evpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
4008   %}
4009   ins_pipe( pipe_slow );
4010 %}
4011 
4012 instruct Repl16I_evex(vecZ dst, rRegI src) %{
4013   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4014   match(Set dst (ReplicateI src));
4015   format %{ "vpbroadcastd  $dst,$src\t! replicate16I" %}
4016   ins_encode %{
4017     int vector_len = 2;
4018     __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
4019   %}
4020   ins_pipe( pipe_slow );
4021 %}
4022 
4023 instruct Repl16I_mem_evex(vecZ dst, memory mem) %{
4024   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4025   match(Set dst (ReplicateI (LoadI mem)));
4026   format %{ "vpbroadcastd  $dst,$mem\t! replicate16I" %}
4027   ins_encode %{
4028     int vector_len = 2;
4029     __ evpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
4030   %}
4031   ins_pipe( pipe_slow );
4032 %}
4033 
4034 instruct Repl4I_imm_evex(vecX dst, immI con) %{
4035   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
4036   match(Set dst (ReplicateI con));
4037   format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
4038             "vpbroadcastd  $dst,$dst\t! replicate4I" %}
4039   ins_encode %{
4040     int vector_len = 0;
4041     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
4042     __ evpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4043   %}
4044   ins_pipe( pipe_slow );
4045 %}
4046 
4047 instruct Repl8I_imm_evex(vecY dst, immI con) %{
4048   predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
4049   match(Set dst (ReplicateI con));
4050   format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
4051             "vpbroadcastd  $dst,$dst\t! replicate8I" %}
4052   ins_encode %{
4053     int vector_len = 1;
4054     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
4055     __ evpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4056   %}
4057   ins_pipe( pipe_slow );
4058 %}
4059 
4060 instruct Repl16I_imm_evex(vecZ dst, immI con) %{
4061   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4062   match(Set dst (ReplicateI con));
4063   format %{ "movq    $dst,[$constantaddress]\t! replicate16I($con)\n\t"
4064             "vpbroadcastd  $dst,$dst\t! replicate16I" %}
4065   ins_encode %{
4066     int vector_len = 2;
4067     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
4068     __ evpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4069   %}
4070   ins_pipe( pipe_slow );
4071 %}
4072 
4073 instruct Repl16I_zero_evex(vecZ dst, immI0 zero) %{
4074   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4075   match(Set dst (ReplicateI zero));
4076   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate16I zero" %}
4077   ins_encode %{
    // Use vpxor: pre-EVEX AVX has no 512-bit vpxor, but the EVEX encoding (AVX-512) provides it.
4079     int vector_len = 2;
4080     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4081   %}
4082   ins_pipe( fpu_reg_reg );
4083 %}
4084 
4085 // Replicate long (8 byte) scalar to be vector
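// On LP64 the long value sits in a single general register and can be broadcast directly.
// On 32-bit it is split across a lo/hi register pair, so the halves are first assembled
// into an XMM register with movdl/punpckldq and the broadcast is done register-to-register.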
4086 #ifdef _LP64
4087 instruct Repl4L_evex(vecY dst, rRegL src) %{
4088   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
4089   match(Set dst (ReplicateL src));
4090   format %{ "vpbroadcastq  $dst,$src\t! replicate4L" %}
4091   ins_encode %{
4092     int vector_len = 1;
4093     __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vector_len);
4094   %}
4095   ins_pipe( pipe_slow );
4096 %}
4097 
4098 instruct Repl8L_evex(vecZ dst, rRegL src) %{
4099   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4100   match(Set dst (ReplicateL src));
4101   format %{ "vpbroadcastq  $dst,$src\t! replicate8L" %}
4102   ins_encode %{
4103     int vector_len = 2;
4104     __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vector_len);
4105   %}
4106   ins_pipe( pipe_slow );
4107 %}
4108 #else // _LP64
4109 instruct Repl4L_evex(vecY dst, eRegL src, regD tmp) %{
4110   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
4111   match(Set dst (ReplicateL src));
4112   effect(TEMP dst, USE src, TEMP tmp);
4113   format %{ "movdl   $dst,$src.lo\n\t"
4114             "movdl   $tmp,$src.hi\n\t"
4115             "punpckldq $dst,$tmp\n\t"
4116             "vpbroadcastq  $dst,$dst\t! replicate4L" %}
4117   ins_encode %{
4118     int vector_len = 1;
4119     __ movdl($dst$$XMMRegister, $src$$Register);
4120     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4121     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4122     __ evpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4123   %}
4124   ins_pipe( pipe_slow );
4125 %}
4126 
4127 instruct Repl8L_evex(vecZ dst, eRegL src, regD tmp) %{
4128   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4129   match(Set dst (ReplicateL src));
4130   effect(TEMP dst, USE src, TEMP tmp);
4131   format %{ "movdl   $dst,$src.lo\n\t"
4132             "movdl   $tmp,$src.hi\n\t"
4133             "punpckldq $dst,$tmp\n\t"
4134             "vpbroadcastq  $dst,$dst\t! replicate8L" %}
4135   ins_encode %{
4136     int vector_len = 2;
4137     __ movdl($dst$$XMMRegister, $src$$Register);
4138     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4139     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4140     __ evpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4141   %}
4142   ins_pipe( pipe_slow );
4143 %}
4144 #endif // _LP64
4145 
4146 instruct Repl4L_imm_evex(vecY dst, immL con) %{
4147   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
4148   match(Set dst (ReplicateL con));
4149   format %{ "movq    $dst,[$constantaddress]\n\t"
4150             "vpbroadcastq  $dst,$dst\t! replicate4L" %}
4151   ins_encode %{
4152     int vector_len = 1;
4153     __ movq($dst$$XMMRegister, $constantaddress($con));
4154     __ evpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4155   %}
4156   ins_pipe( pipe_slow );
4157 %}
4158 
4159 instruct Repl8L_imm_evex(vecZ dst, immL con) %{
4160   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4161   match(Set dst (ReplicateL con));
4162   format %{ "movq    $dst,[$constantaddress]\n\t"
4163             "vpbroadcastq  $dst,$dst\t! replicate8L" %}
4164   ins_encode %{
4165     int vector_len = 2;
4166     __ movq($dst$$XMMRegister, $constantaddress($con));
4167     __ evpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4168   %}
4169   ins_pipe( pipe_slow );
4170 %}
4171 
4172 instruct Repl2L_mem_evex(vecX dst, memory mem) %{
4173   predicate(n->as_Vector()->length() == 2 && VM_Version::supports_avx512vl());
4174   match(Set dst (ReplicateL (LoadL mem)));
4175   format %{ "vpbroadcastd  $dst,$mem\t! replicate2L" %}
4176   ins_encode %{
4177     int vector_len = 0;
4178     __ evpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
4179   %}
4180   ins_pipe( pipe_slow );
4181 %}
4182 
4183 instruct Repl4L_mem_evex(vecY dst, memory mem) %{
4184   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
4185   match(Set dst (ReplicateL (LoadL mem)));
4186   format %{ "vpbroadcastd  $dst,$mem\t! replicate4L" %}
4187   ins_encode %{
4188     int vector_len = 1;
4189     __ evpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
4190   %}
4191   ins_pipe( pipe_slow );
4192 %}
4193 
4194 instruct Repl8L_mem_evex(vecZ dst, memory mem) %{
4195   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4196   match(Set dst (ReplicateL (LoadL mem)));
4197   format %{ "vpbroadcastd  $dst,$mem\t! replicate8L" %}
4198   ins_encode %{
4199     int vector_len = 2;
4200     __ evpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
4201   %}
4202   ins_pipe( pipe_slow );
4203 %}
4204 
4205 instruct Repl8L_zero_evex(vecZ dst, immL0 zero) %{
4206   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4207   match(Set dst (ReplicateL zero));
4208   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate8L zero" %}
4209   ins_encode %{
    // Use vpxor: pre-EVEX AVX has no 512-bit vpxor, but the EVEX encoding (AVX-512) provides it.
4211     int vector_len = 2;
4212     __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4213   %}
4214   ins_pipe( fpu_reg_reg );
4215 %}
4216 
4217 instruct Repl8F_evex(vecY dst, regF src) %{
4218   predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
4219   match(Set dst (ReplicateF src));
4220   format %{ "vbroadcastss $dst,$src\t! replicate8F" %}
4221   ins_encode %{
4222     int vector_len = 1;
4223     __ evpbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vector_len);
4224   %}
4225   ins_pipe( pipe_slow );
4226 %}
4227 
4228 instruct Repl8F_mem_evex(vecY dst, memory mem) %{
4229   predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
4230   match(Set dst (ReplicateF (LoadF mem)));
4231   format %{ "vbroadcastss  $dst,$mem\t! replicate8F" %}
4232   ins_encode %{
4233     int vector_len = 1;
4234     __ evpbroadcastss($dst$$XMMRegister, $mem$$Address, vector_len);
4235   %}
4236   ins_pipe( pipe_slow );
4237 %}
4238 
4239 instruct Repl16F_evex(vecZ dst, regF src) %{
4240   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4241   match(Set dst (ReplicateF src));
4242   format %{ "vbroadcastss $dst,$src\t! replicate16F" %}
4243   ins_encode %{
4244     int vector_len = 2;
4245     __ evpbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vector_len);
4246   %}
4247   ins_pipe( pipe_slow );
4248 %}
4249 
4250 instruct Repl16F_mem_evex(vecZ dst, memory mem) %{
4251   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4252   match(Set dst (ReplicateF (LoadF mem)));
4253   format %{ "vbroadcastss  $dst,$mem\t! replicate16F" %}
4254   ins_encode %{
4255     int vector_len = 2;
4256     __ evpbroadcastss($dst$$XMMRegister, $mem$$Address, vector_len);
4257   %}
4258   ins_pipe( pipe_slow );
4259 %}
4260 
4261 instruct Repl2F_zero_evex(vecD dst, immF0 zero) %{
4262   predicate(n->as_Vector()->length() == 2 && UseAVX > 2);
4263   match(Set dst (ReplicateF zero));
4264   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate2F zero" %}
4265   ins_encode %{
    // Use vpxor in place of vxorps: the EVEX-encoded 512-bit vxorps requires AVX512DQ, and this is a 512-bit operation
4267     int vector_len = 2;
4268     __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4269   %}
4270   ins_pipe( fpu_reg_reg );
4271 %}
4272 
4273 instruct Repl4F_zero_evex(vecX dst, immF0 zero) %{
4274   predicate(n->as_Vector()->length() == 4 && UseAVX > 2);
4275   match(Set dst (ReplicateF zero));
4276   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate4F zero" %}
4277   ins_encode %{
    // Use vpxor in place of vxorps: the EVEX-encoded 512-bit vxorps requires AVX512DQ, and this is a 512-bit operation
4279     int vector_len = 2;
4280     __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4281   %}
4282   ins_pipe( fpu_reg_reg );
4283 %}
4284 
4285 instruct Repl8F_zero_evex(vecY dst, immF0 zero) %{
4286   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4287   match(Set dst (ReplicateF zero));
4288   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate8F zero" %}
4289   ins_encode %{
    // Use vpxor in place of vxorps: the EVEX-encoded 512-bit vxorps requires AVX512DQ, and this is a 512-bit operation
4291     int vector_len = 2;
4292     __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4293   %}
4294   ins_pipe( fpu_reg_reg );
4295 %}
4296 
4297 instruct Repl16F_zero_evex(vecZ dst, immF0 zero) %{
4298   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4299   match(Set dst (ReplicateF zero));
4300   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate16F zero" %}
4301   ins_encode %{
    // Use vpxor in place of vxorps: the EVEX-encoded 512-bit vxorps requires AVX512DQ, and this is a 512-bit operation
4303     int vector_len = 2;
4304     __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4305   %}
4306   ins_pipe( fpu_reg_reg );
4307 %}
4308 
4309 instruct Repl4D_evex(vecY dst, regD src) %{
4310   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
4311   match(Set dst (ReplicateD src));
4312   format %{ "vbroadcastsd $dst,$src\t! replicate4D" %}
4313   ins_encode %{
4314     int vector_len = 1;
4315     __ evpbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
4316   %}
4317   ins_pipe( pipe_slow );
4318 %}
4319 
4320 instruct Repl4D_mem_evex(vecY dst, memory mem) %{
4321   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
4322   match(Set dst (ReplicateD (LoadD mem)));
4323   format %{ "vbroadcastsd  $dst,$mem\t! replicate4D" %}
4324   ins_encode %{
4325     int vector_len = 1;
4326     __ evpbroadcastsd($dst$$XMMRegister, $mem$$Address, vector_len);
4327   %}
4328   ins_pipe( pipe_slow );
4329 %}
4330 
4331 instruct Repl8D_evex(vecZ dst, regD src) %{
4332   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4333   match(Set dst (ReplicateD src));
4334   format %{ "vbroadcastsd $dst,$src\t! replicate8D" %}
4335   ins_encode %{
4336     int vector_len = 2;
4337     __ evpbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
4338   %}
4339   ins_pipe( pipe_slow );
4340 %}
4341 
4342 instruct Repl8D_mem_evex(vecZ dst, memory mem) %{
4343   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4344   match(Set dst (ReplicateD (LoadD mem)));
4345   format %{ "vbroadcastsd  $dst,$mem\t! replicate8D" %}
4346   ins_encode %{
4347     int vector_len = 2;
4348     __ evpbroadcastsd($dst$$XMMRegister, $mem$$Address, vector_len);
4349   %}
4350   ins_pipe( pipe_slow );
4351 %}
4352 
4353 instruct Repl2D_zero_evex(vecX dst, immD0 zero) %{
4354   predicate(n->as_Vector()->length() == 2 && UseAVX > 2);
4355   match(Set dst (ReplicateD zero));
4356   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate2D zero" %}
4357   ins_encode %{
    // Use vpxor in place of vxorpd: the EVEX-encoded 512-bit vxorpd requires AVX512DQ, and this is a 512-bit operation
4359     int vector_len = 2;
4360     __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4361   %}
4362   ins_pipe( fpu_reg_reg );
4363 %}
4364 
4365 instruct Repl4D_zero_evex(vecY dst, immD0 zero) %{
4366   predicate(n->as_Vector()->length() == 4 && UseAVX > 2);
4367   match(Set dst (ReplicateD zero));
4368   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate4D zero" %}
4369   ins_encode %{
    // Use vpxor in place of vxorpd: the EVEX-encoded 512-bit vxorpd requires AVX512DQ, and this is a 512-bit operation
4371     int vector_len = 2;
4372     __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4373   %}
4374   ins_pipe( fpu_reg_reg );
4375 %}
4376 
4377 instruct Repl8D_zero_evex(vecZ dst, immD0 zero) %{
4378   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4379   match(Set dst (ReplicateD zero));
4380   format %{ "vpxor  $dst k0,$dst,$dst,vect512\t! replicate8D zero" %}
4381   ins_encode %{
    // Use vpxor in place of vxorpd: the EVEX-encoded 512-bit vxorpd requires AVX512DQ, and this is a 512-bit operation
4383     int vector_len = 2;
4384     __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4385   %}
4386   ins_pipe( fpu_reg_reg );
4387 %}
4388 
4389 // ====================REDUCTION ARITHMETIC=======================================
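// Reduction instructs fold all lanes of the vector input (src2) into one scalar result.
// Integer and long reductions combine the lanes pairwise and then fold in the scalar src1;
// float and double reductions accumulate into dst one lane at a time, since FP addition
// and multiplication are not associative and the lane order must be preserved.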
4390 
4391 instruct rsadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
4392   predicate(UseSSE > 2 && UseAVX == 0);
4393   match(Set dst (AddReductionVI src1 src2));
4394   effect(TEMP tmp2, TEMP tmp);
4395   format %{ "movdqu  $tmp2,$src2\n\t"
4396             "phaddd  $tmp2,$tmp2\n\t"
4397             "movd    $tmp,$src1\n\t"
4398             "paddd   $tmp,$tmp2\n\t"
4399             "movd    $dst,$tmp\t! add reduction2I" %}
4400   ins_encode %{
4401     __ movdqu($tmp2$$XMMRegister, $src2$$XMMRegister);
4402     __ phaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister);
4403     __ movdl($tmp$$XMMRegister, $src1$$Register);
4404     __ paddd($tmp$$XMMRegister, $tmp2$$XMMRegister);
4405     __ movdl($dst$$Register, $tmp$$XMMRegister);
4406   %}
4407   ins_pipe( pipe_slow );
4408 %}
4409 
4410 instruct rvadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
4411   predicate(VM_Version::supports_avxonly());
4412   match(Set dst (AddReductionVI src1 src2));
4413   effect(TEMP tmp, TEMP tmp2);
4414   format %{ "vphaddd  $tmp,$src2,$src2\n\t"
4415             "movd     $tmp2,$src1\n\t"
4416             "vpaddd   $tmp2,$tmp2,$tmp\n\t"
4417             "movd     $dst,$tmp2\t! add reduction2I" %}
4418   ins_encode %{
4419     int vector_len = 0;
4420     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
4421     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4422     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
4423     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4424   %}
4425   ins_pipe( pipe_slow );
4426 %}
4427 
4428 instruct rvadd2I_reduction_reg_evex(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
4429   predicate(UseAVX > 2);
4430   match(Set dst (AddReductionVI src1 src2));
4431   effect(TEMP tmp, TEMP tmp2);
4432   format %{ "pshufd  $tmp2,$src2,0x1\n\t"
4433             "vpaddd  $tmp,$src2,$tmp2\n\t"
4434             "movd    $tmp2,$src1\n\t"
4435             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
4436             "movd    $dst,$tmp2\t! add reduction2I" %}
4437   ins_encode %{
4438     int vector_len = 0;
4439     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
4440     __ vpaddd($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4441     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4442     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4443     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4444   %}
4445   ins_pipe( pipe_slow );
4446 %}
4447 
4448 instruct rsadd4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
4449   predicate(UseSSE > 2 && UseAVX == 0);
4450   match(Set dst (AddReductionVI src1 src2));
4451   effect(TEMP tmp, TEMP tmp2);
4452   format %{ "movdqu  $tmp,$src2\n\t"
4453             "phaddd  $tmp,$tmp\n\t"
4454             "phaddd  $tmp,$tmp\n\t"
4455             "movd    $tmp2,$src1\n\t"
4456             "paddd   $tmp2,$tmp\n\t"
4457             "movd    $dst,$tmp2\t! add reduction4I" %}
4458   ins_encode %{
4459     __ movdqu($tmp$$XMMRegister, $src2$$XMMRegister);
4460     __ phaddd($tmp$$XMMRegister, $tmp$$XMMRegister);
4461     __ phaddd($tmp$$XMMRegister, $tmp$$XMMRegister);
4462     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4463     __ paddd($tmp2$$XMMRegister, $tmp$$XMMRegister);
4464     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4465   %}
4466   ins_pipe( pipe_slow );
4467 %}
4468 
4469 instruct rvadd4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
4470   predicate(VM_Version::supports_avxonly());
4471   match(Set dst (AddReductionVI src1 src2));
4472   effect(TEMP tmp, TEMP tmp2);
4473   format %{ "vphaddd  $tmp,$src2,$src2\n\t"
4474             "vphaddd  $tmp,$tmp,$tmp\n\t"
4475             "movd     $tmp2,$src1\n\t"
4476             "vpaddd   $tmp2,$tmp2,$tmp\n\t"
4477             "movd     $dst,$tmp2\t! add reduction4I" %}
4478   ins_encode %{
4479     int vector_len = 0;
4480     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
4481     __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
4482     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4483     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
4484     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4485   %}
4486   ins_pipe( pipe_slow );
4487 %}
4488 
4489 instruct rvadd4I_reduction_reg_evex(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
4490   predicate(UseAVX > 2);
4491   match(Set dst (AddReductionVI src1 src2));
4492   effect(TEMP tmp, TEMP tmp2);
4493   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
4494             "vpaddd  $tmp,$src2,$tmp2\n\t"
4495             "pshufd  $tmp2,$tmp,0x1\n\t"
4496             "vpaddd  $tmp,$tmp,$tmp2\n\t"
4497             "movd    $tmp2,$src1\n\t"
4498             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
4499             "movd    $dst,$tmp2\t! add reduction4I" %}
4500   ins_encode %{
4501     int vector_len = 0;
4502     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
4503     __ vpaddd($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4504     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
4505     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4506     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4507     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4508     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4509   %}
4510   ins_pipe( pipe_slow );
4511 %}
4512 
4513 instruct rvadd8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
4514   predicate(VM_Version::supports_avxonly());
4515   match(Set dst (AddReductionVI src1 src2));
4516   effect(TEMP tmp, TEMP tmp2);
4517   format %{ "vphaddd  $tmp,$src2,$src2\n\t"
4518             "vphaddd  $tmp,$tmp,$tmp2\n\t"
4519             "vextracti128_high  $tmp2,$tmp\n\t"
4520             "vpaddd   $tmp,$tmp,$tmp2\n\t"
4521             "movd     $tmp2,$src1\n\t"
4522             "vpaddd   $tmp2,$tmp2,$tmp\n\t"
4523             "movd     $dst,$tmp2\t! add reduction8I" %}
4524   ins_encode %{
4525     int vector_len = 1;
4526     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
4527     __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4528     __ vextracti128_high($tmp2$$XMMRegister, $tmp$$XMMRegister);
4529     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
4530     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4531     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4532     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4533   %}
4534   ins_pipe( pipe_slow );
4535 %}
4536 
4537 instruct rvadd8I_reduction_reg_evex(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
4538   predicate(UseAVX > 2);
4539   match(Set dst (AddReductionVI src1 src2));
4540   effect(TEMP tmp, TEMP tmp2);
4541   format %{ "vextracti128_high  $tmp,$src2\n\t"
4542             "vpaddd  $tmp,$tmp,$src2\n\t"
4543             "pshufd  $tmp2,$tmp,0xE\n\t"
4544             "vpaddd  $tmp,$tmp,$tmp2\n\t"
4545             "pshufd  $tmp2,$tmp,0x1\n\t"
4546             "vpaddd  $tmp,$tmp,$tmp2\n\t"
4547             "movd    $tmp2,$src1\n\t"
4548             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
4549             "movd    $dst,$tmp2\t! add reduction8I" %}
4550   ins_encode %{
4551     int vector_len = 0;
4552     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
4553     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
4554     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
4555     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4556     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
4557     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4558     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4559     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4560     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4561   %}
4562   ins_pipe( pipe_slow );
4563 %}
4564 
4565 instruct rvadd16I_reduction_reg_evex(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, regF tmp3) %{
4566   predicate(UseAVX > 2);
4567   match(Set dst (AddReductionVI src1 src2));
4568   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
4569   format %{ "vextracti64x4_high  $tmp3,$src2\n\t"
4570             "vpaddd  $tmp3,$tmp3,$src2\n\t"
4571             "vextracti128_high  $tmp,$tmp3\n\t"
4572             "vpaddd  $tmp,$tmp,$tmp3\n\t"
4573             "pshufd  $tmp2,$tmp,0xE\n\t"
4574             "vpaddd  $tmp,$tmp,$tmp2\n\t"
4575             "pshufd  $tmp2,$tmp,0x1\n\t"
4576             "vpaddd  $tmp,$tmp,$tmp2\n\t"
4577             "movd    $tmp2,$src1\n\t"
4578             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
4579             "movd    $dst,$tmp2\t! mul reduction16I" %}
4580   ins_encode %{
4581     __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
4582     __ vpaddd($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
4583     __ vextracti128_high($tmp$$XMMRegister, $tmp3$$XMMRegister);
4584     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
4585     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
4586     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
4587     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
4588     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
4589     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4590     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
4591     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4592   %}
4593   ins_pipe( pipe_slow );
4594 %}
4595 
4596 #ifdef _LP64
4597 instruct rvadd2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, regF tmp, regF tmp2) %{
4598   predicate(UseAVX > 2);
4599   match(Set dst (AddReductionVL src1 src2));
4600   effect(TEMP tmp, TEMP tmp2);
4601   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
4602             "vpaddq  $tmp,$src2,$tmp2\n\t"
4603             "movdq   $tmp2,$src1\n\t"
4604             "vpaddq  $tmp2,$tmp,$tmp2\n\t"
4605             "movdq   $dst,$tmp2\t! add reduction2L" %}
4606   ins_encode %{
4607     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
4608     __ vpaddq($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, 0);
4609     __ movdq($tmp2$$XMMRegister, $src1$$Register);
4610     __ vpaddq($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
4611     __ movdq($dst$$Register, $tmp2$$XMMRegister);
4612   %}
4613   ins_pipe( pipe_slow );
4614 %}
4615 
4616 instruct rvadd4L_reduction_reg(rRegL dst, rRegL src1, vecY src2, regF tmp, regF tmp2) %{
4617   predicate(UseAVX > 2);
4618   match(Set dst (AddReductionVL src1 src2));
4619   effect(TEMP tmp, TEMP tmp2);
4620   format %{ "vextracti128_high  $tmp,$src2\n\t"
4621             "vpaddq  $tmp2,$tmp,$src2\n\t"
4622             "pshufd  $tmp,$tmp2,0xE\n\t"
4623             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
4624             "movdq   $tmp,$src1\n\t"
4625             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
4626             "movdq   $dst,$tmp2\t! add reduction4L" %}
4627   ins_encode %{
4628     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
4629     __ vpaddq($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
4630     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
4631     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4632     __ movdq($tmp$$XMMRegister, $src1$$Register);
4633     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4634     __ movdq($dst$$Register, $tmp2$$XMMRegister);
4635   %}
4636   ins_pipe( pipe_slow );
4637 %}
4638 
4639 instruct rvadd8L_reduction_reg(rRegL dst, rRegL src1, vecZ src2, regF tmp, regF tmp2) %{
4640   predicate(UseAVX > 2);
4641   match(Set dst (AddReductionVL src1 src2));
4642   effect(TEMP tmp, TEMP tmp2);
4643   format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
4644             "vpaddq  $tmp2,$tmp2,$src2\n\t"
4645             "vextracti128_high  $tmp,$tmp2\n\t"
4646             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
4647             "pshufd  $tmp,$tmp2,0xE\n\t"
4648             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
4649             "movdq   $tmp,$src1\n\t"
4650             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
4651             "movdq   $dst,$tmp2\t! add reduction8L" %}
4652   ins_encode %{
4653     __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
4654     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
4655     __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
4656     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4657     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
4658     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4659     __ movdq($tmp$$XMMRegister, $src1$$Register);
4660     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4661     __ movdq($dst$$Register, $tmp2$$XMMRegister);
4662   %}
4663   ins_pipe( pipe_slow );
4664 %}
4665 #endif
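
// Float reductions accumulate into dst sequentially: pshufd brings each successive lane
// into element 0 (immediates 0x01, 0x02, 0x03) and a scalar addss/vaddss folds it in;
// the 256/512-bit forms first extract the upper 128-bit lanes with vextractf*.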
4666 
4667 instruct rsadd2F_reduction_reg(regF dst, vecD src2, regF tmp) %{
4668   predicate(UseSSE >= 1 && UseAVX == 0);
4669   match(Set dst (AddReductionVF dst src2));
4670   effect(TEMP dst, TEMP tmp);
4671   format %{ "addss   $dst,$src2\n\t"
4672             "pshufd  $tmp,$src2,0x01\n\t"
4673             "addss   $dst,$tmp\t! add reduction2F" %}
4674   ins_encode %{
4675     __ addss($dst$$XMMRegister, $src2$$XMMRegister);
4676     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
4677     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
4678   %}
4679   ins_pipe( pipe_slow );
4680 %}
4681 
4682 instruct rvadd2F_reduction_reg(regF dst, vecD src2, regF tmp) %{
4683   predicate(UseAVX > 0);
4684   match(Set dst (AddReductionVF dst src2));
4685   effect(TEMP dst, TEMP tmp);
4686   format %{ "vaddss  $dst,$dst,$src2\n\t"
4687             "pshufd  $tmp,$src2,0x01\n\t"
4688             "vaddss  $dst,$dst,$tmp\t! add reduction2F" %}
4689   ins_encode %{
4690     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
4691     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
4692     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4693   %}
4694   ins_pipe( pipe_slow );
4695 %}
4696 
4697 instruct rsadd4F_reduction_reg(regF dst, vecX src2, regF tmp) %{
4698   predicate(UseSSE >= 1 && UseAVX == 0);
4699   match(Set dst (AddReductionVF dst src2));
4700   effect(TEMP dst, TEMP tmp);
4701   format %{ "addss   $dst,$src2\n\t"
4702             "pshufd  $tmp,$src2,0x01\n\t"
4703             "addss   $dst,$tmp\n\t"
4704             "pshufd  $tmp,$src2,0x02\n\t"
4705             "addss   $dst,$tmp\n\t"
4706             "pshufd  $tmp,$src2,0x03\n\t"
4707             "addss   $dst,$tmp\t! add reduction4F" %}
4708   ins_encode %{
4709     __ addss($dst$$XMMRegister, $src2$$XMMRegister);
4710     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
4711     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
4712     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
4713     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
4714     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
4715     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
4716   %}
4717   ins_pipe( pipe_slow );
4718 %}
4719 
4720 instruct rvadd4F_reduction_reg(regF dst, vecX src2, regF tmp) %{
4721   predicate(UseAVX > 0);
4722   match(Set dst (AddReductionVF dst src2));
4723   effect(TEMP tmp, TEMP dst);
4724   format %{ "vaddss  $dst,dst,$src2\n\t"
4725             "pshufd  $tmp,$src2,0x01\n\t"
4726             "vaddss  $dst,$dst,$tmp\n\t"
4727             "pshufd  $tmp,$src2,0x02\n\t"
4728             "vaddss  $dst,$dst,$tmp\n\t"
4729             "pshufd  $tmp,$src2,0x03\n\t"
4730             "vaddss  $dst,$dst,$tmp\t! add reduction4F" %}
4731   ins_encode %{
4732     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
4733     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
4734     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4735     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
4736     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4737     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
4738     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4739   %}
4740   ins_pipe( pipe_slow );
4741 %}
4742 
4743 instruct radd8F_reduction_reg(regF dst, vecY src2, regF tmp, regF tmp2) %{
4744   predicate(UseAVX > 0);
4745   match(Set dst (AddReductionVF dst src2));
4746   effect(TEMP tmp, TEMP dst, TEMP tmp2);
4747   format %{ "vaddss  $dst,$dst,$src2\n\t"
4748             "pshufd  $tmp,$src2,0x01\n\t"
4749             "vaddss  $dst,$dst,$tmp\n\t"
4750             "pshufd  $tmp,$src2,0x02\n\t"
4751             "vaddss  $dst,$dst,$tmp\n\t"
4752             "pshufd  $tmp,$src2,0x03\n\t"
4753             "vaddss  $dst,$dst,$tmp\n\t"
4754             "vextractf128_high  $tmp2,$src2\n\t"
4755             "vaddss  $dst,$dst,$tmp2\n\t"
4756             "pshufd  $tmp,$tmp2,0x01\n\t"
4757             "vaddss  $dst,$dst,$tmp\n\t"
4758             "pshufd  $tmp,$tmp2,0x02\n\t"
4759             "vaddss  $dst,$dst,$tmp\n\t"
4760             "pshufd  $tmp,$tmp2,0x03\n\t"
4761             "vaddss  $dst,$dst,$tmp\t! add reduction8F" %}
4762   ins_encode %{
4763     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
4764     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
4765     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4766     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
4767     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4768     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
4769     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4770     __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
4771     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
4772     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
4773     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4774     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
4775     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4776     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
4777     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4778   %}
4779   ins_pipe( pipe_slow );
4780 %}
4781 
4782 instruct radd16F_reduction_reg(regF dst, vecZ src2, regF tmp, regF tmp2) %{
4783   predicate(UseAVX > 2);
4784   match(Set dst (AddReductionVF dst src2));
4785   effect(TEMP tmp, TEMP dst, TEMP tmp2);
4786   format %{ "vaddss  $dst,$dst,$src2\n\t"
4787             "pshufd  $tmp,$src2,0x01\n\t"
4788             "vaddss  $dst,$dst,$tmp\n\t"
4789             "pshufd  $tmp,$src2,0x02\n\t"
4790             "vaddss  $dst,$dst,$tmp\n\t"
4791             "pshufd  $tmp,$src2,0x03\n\t"
4792             "vaddss  $dst,$dst,$tmp\n\t"
4793             "vextractf32x4  $tmp2,$src2,0x1\n\t"
4794             "vaddss  $dst,$dst,$tmp2\n\t"
4795             "pshufd  $tmp,$tmp2,0x01\n\t"
4796             "vaddss  $dst,$dst,$tmp\n\t"
4797             "pshufd  $tmp,$tmp2,0x02\n\t"
4798             "vaddss  $dst,$dst,$tmp\n\t"
4799             "pshufd  $tmp,$tmp2,0x03\n\t"
4800             "vaddss  $dst,$dst,$tmp\n\t"
4801             "vextractf32x4  $tmp2,$src2,0x2\n\t"
4802             "vaddss  $dst,$dst,$tmp2\n\t"
4803             "pshufd  $tmp,$tmp2,0x01\n\t"
4804             "vaddss  $dst,$dst,$tmp\n\t"
4805             "pshufd  $tmp,$tmp2,0x02\n\t"
4806             "vaddss  $dst,$dst,$tmp\n\t"
4807             "pshufd  $tmp,$tmp2,0x03\n\t"
4808             "vaddss  $dst,$dst,$tmp\n\t"
4809             "vextractf32x4  $tmp2,$src2,0x3\n\t"
4810             "vaddss  $dst,$dst,$tmp2\n\t"
4811             "pshufd  $tmp,$tmp2,0x01\n\t"
4812             "vaddss  $dst,$dst,$tmp\n\t"
4813             "pshufd  $tmp,$tmp2,0x02\n\t"
4814             "vaddss  $dst,$dst,$tmp\n\t"
4815             "pshufd  $tmp,$tmp2,0x03\n\t"
4816             "vaddss  $dst,$dst,$tmp\t! add reduction16F" %}
4817   ins_encode %{
4818     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
4819     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
4820     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4821     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
4822     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4823     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
4824     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4825     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
4826     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
4827     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
4828     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4829     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
4830     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4831     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
4832     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4833     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
4834     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
4835     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
4836     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4837     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
4838     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4839     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
4840     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4841     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
4842     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
4843     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
4844     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4845     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
4846     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4847     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
4848     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4849   %}
4850   ins_pipe( pipe_slow );
4851 %}
4852 
4853 instruct rsadd2D_reduction_reg(regD dst, vecX src2, regD tmp) %{
4854   predicate(UseSSE >= 1 && UseAVX == 0);
4855   match(Set dst (AddReductionVD dst src2));
4856   effect(TEMP tmp, TEMP dst);
4857   format %{ "addsd   $dst,$src2\n\t"
4858             "pshufd  $tmp,$src2,0xE\n\t"
4859             "addsd   $dst,$tmp\t! add reduction2D" %}
4860   ins_encode %{
4861     __ addsd($dst$$XMMRegister, $src2$$XMMRegister);
4862     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
4863     __ addsd($dst$$XMMRegister, $tmp$$XMMRegister);
4864   %}
4865   ins_pipe( pipe_slow );
4866 %}
4867 
4868 instruct rvadd2D_reduction_reg(regD dst, vecX src2, regD tmp) %{
4869   predicate(UseAVX > 0);
4870   match(Set dst (AddReductionVD dst src2));
4871   effect(TEMP tmp, TEMP dst);
4872   format %{ "vaddsd  $dst,$dst,$src2\n\t"
4873             "pshufd  $tmp,$src2,0xE\n\t"
4874             "vaddsd  $dst,$dst,$tmp\t! add reduction2D" %}
4875   ins_encode %{
4876     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
4877     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
4878     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4879   %}
4880   ins_pipe( pipe_slow );
4881 %}
4882 
4883 instruct rvadd4D_reduction_reg(regD dst, vecY src2, regD tmp, regD tmp2) %{
4884   predicate(UseAVX > 0);
4885   match(Set dst (AddReductionVD dst src2));
4886   effect(TEMP tmp, TEMP dst, TEMP tmp2);
4887   format %{ "vaddsd  $dst,$dst,$src2\n\t"
4888             "pshufd  $tmp,$src2,0xE\n\t"
4889             "vaddsd  $dst,$dst,$tmp\n\t"
4890             "vextractf32x4  $tmp2,$src2,0x1\n\t"
4891             "vaddsd  $dst,$dst,$tmp2\n\t"
4892             "pshufd  $tmp,$tmp2,0xE\n\t"
4893             "vaddsd  $dst,$dst,$tmp\t! add reduction4D" %}
4894   ins_encode %{
4895     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
4896     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
4897     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4898     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
4899     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
4900     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
4901     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4902   %}
4903   ins_pipe( pipe_slow );
4904 %}
4905 
4906 instruct rvadd8D_reduction_reg(regD dst, vecZ src2, regD tmp, regD tmp2) %{
4907   predicate(UseAVX > 2);
4908   match(Set dst (AddReductionVD dst src2));
4909   effect(TEMP tmp, TEMP dst, TEMP tmp2);
4910   format %{ "vaddsd  $dst,$dst,$src2\n\t"
4911             "pshufd  $tmp,$src2,0xE\n\t"
4912             "vaddsd  $dst,$dst,$tmp\n\t"
4913             "vextractf32x4  $tmp2,$src2,0x1\n\t"
4914             "vaddsd  $dst,$dst,$tmp2\n\t"
4915             "pshufd  $tmp,$tmp2,0xE\n\t"
4916             "vaddsd  $dst,$dst,$tmp\n\t"
4917             "vextractf32x4  $tmp2,$src2,0x2\n\t"
4918             "vaddsd  $dst,$dst,$tmp2\n\t"
4919             "pshufd  $tmp,$tmp2,0xE\n\t"
4920             "vaddsd  $dst,$dst,$tmp\n\t"
4921             "vextractf32x4  $tmp2,$src2,0x3\n\t"
4922             "vaddsd  $dst,$dst,$tmp2\n\t"
4923             "pshufd  $tmp,$tmp2,0xE\n\t"
4924             "vaddsd  $dst,$dst,$tmp\t! add reduction8D" %}
4925   ins_encode %{
4926     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
4927     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
4928     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4929     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
4930     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
4931     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
4932     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4933     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
4934     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
4935     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
4936     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4937     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
4938     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
4939     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
4940     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4941   %}
4942   ins_pipe( pipe_slow );
4943 %}
4944 
4945 instruct rsmul2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
4946   predicate(UseSSE > 3 && UseAVX == 0);
4947   match(Set dst (MulReductionVI src1 src2));
4948   effect(TEMP tmp, TEMP tmp2);
4949   format %{ "pshufd  $tmp2,$src2,0x1\n\t"
4950             "pmulld  $tmp2,$src2\n\t"
4951             "movd    $tmp,$src1\n\t"
4952             "pmulld  $tmp2,$tmp\n\t"
4953             "movd    $dst,$tmp2\t! mul reduction2I" %}
4954   ins_encode %{
4955     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
4956     __ pmulld($tmp2$$XMMRegister, $src2$$XMMRegister);
4957     __ movdl($tmp$$XMMRegister, $src1$$Register);
4958     __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
4959     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4960   %}
4961   ins_pipe( pipe_slow );
4962 %}
4963 
4964 instruct rvmul2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
4965   predicate(UseAVX > 0);
4966   match(Set dst (MulReductionVI src1 src2));
4967   effect(TEMP tmp, TEMP tmp2);
4968   format %{ "pshufd   $tmp2,$src2,0x1\n\t"
4969             "vpmulld  $tmp,$src2,$tmp2\n\t"
4970             "movd     $tmp2,$src1\n\t"
4971             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
4972             "movd     $dst,$tmp2\t! mul reduction2I" %}
4973   ins_encode %{
4974     int vector_len = 0;
4975     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
4976     __ vpmulld($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4977     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4978     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4979     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4980   %}
4981   ins_pipe( pipe_slow );
4982 %}
4983 
4984 instruct rsmul4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
4985   predicate(UseSSE > 3 && UseAVX == 0);
4986   match(Set dst (MulReductionVI src1 src2));
4987   effect(TEMP tmp, TEMP tmp2);
4988   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
4989             "pmulld  $tmp2,$src2\n\t"
4990             "pshufd  $tmp,$tmp2,0x1\n\t"
4991             "pmulld  $tmp2,$tmp\n\t"
4992             "movd    $tmp,$src1\n\t"
4993             "pmulld  $tmp2,$tmp\n\t"
4994             "movd    $dst,$tmp2\t! mul reduction4I" %}
4995   ins_encode %{
4996     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
4997     __ pmulld($tmp2$$XMMRegister, $src2$$XMMRegister);
4998     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x1);
4999     __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
5000     __ movdl($tmp$$XMMRegister, $src1$$Register);
5001     __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
5002     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5003   %}
5004   ins_pipe( pipe_slow );
5005 %}
5006 
5007 instruct rvmul4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
5008   predicate(UseAVX > 0);
5009   match(Set dst (MulReductionVI src1 src2));
5010   effect(TEMP tmp, TEMP tmp2);
5011   format %{ "pshufd   $tmp2,$src2,0xE\n\t"
5012             "vpmulld  $tmp,$src2,$tmp2\n\t"
5013             "pshufd   $tmp2,$tmp,0x1\n\t"
5014             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5015             "movd     $tmp2,$src1\n\t"
5016             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
5017             "movd     $dst,$tmp2\t! mul reduction4I" %}
5018   ins_encode %{
5019     int vector_len = 0;
5020     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5021     __ vpmulld($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5022     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5023     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5024     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5025     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5026     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5027   %}
5028   ins_pipe( pipe_slow );
5029 %}
5030 
5031 instruct rvmul8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
5032   predicate(UseAVX > 0);
5033   match(Set dst (MulReductionVI src1 src2));
5034   effect(TEMP tmp, TEMP tmp2);
5035   format %{ "vextracti128_high  $tmp,$src2\n\t"
5036             "vpmulld  $tmp,$tmp,$src2\n\t"
5037             "pshufd   $tmp2,$tmp,0xE\n\t"
5038             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5039             "pshufd   $tmp2,$tmp,0x1\n\t"
5040             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5041             "movd     $tmp2,$src1\n\t"
5042             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
5043             "movd     $dst,$tmp2\t! mul reduction8I" %}
5044   ins_encode %{
5045     int vector_len = 0;
5046     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
5047     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
5048     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
5049     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5050     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5051     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5052     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5053     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5054     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5055   %}
5056   ins_pipe( pipe_slow );
5057 %}
5058 
5059 instruct rvmul16I_reduction_reg(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, regF tmp3) %{
5060   predicate(UseAVX > 2);
5061   match(Set dst (MulReductionVI src1 src2));
5062   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
5063   format %{ "vextracti64x4_high  $tmp3,$src2\n\t"
5064             "vpmulld  $tmp3,$tmp3,$src2\n\t"
5065             "vextracti128_high  $tmp,$tmp3\n\t"
5066             "vpmulld  $tmp,$tmp,$src2\n\t"
5067             "pshufd   $tmp2,$tmp,0xE\n\t"
5068             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5069             "pshufd   $tmp2,$tmp,0x1\n\t"
5070             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5071             "movd     $tmp2,$src1\n\t"
5072             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
5073             "movd     $dst,$tmp2\t! mul reduction16I" %}
5074   ins_encode %{
5075     __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
5076     __ vpmulld($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
5077     __ vextracti128_high($tmp$$XMMRegister, $tmp3$$XMMRegister);
5078     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
5079     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
5080     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5081     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5082     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5083     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5084     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5085     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5086   %}
5087   ins_pipe( pipe_slow );
5088 %}
5089 
5090 #ifdef _LP64
5091 instruct rvmul2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, regF tmp, regF tmp2) %{
5092   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
5093   match(Set dst (MulReductionVL src1 src2));
5094   effect(TEMP tmp, TEMP tmp2);
5095   format %{ "pshufd   $tmp2,$src2,0xE\n\t"
5096             "vpmullq  $tmp,$src2,$tmp2\n\t"
5097             "movdq    $tmp2,$src1\n\t"
5098             "vpmullq  $tmp2,$tmp,$tmp2\n\t"
5099             "movdq    $dst,$tmp2\t! mul reduction2L" %}
5100   ins_encode %{
5101     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5102     __ vpmullq($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, 0);
5103     __ movdq($tmp2$$XMMRegister, $src1$$Register);
5104     __ vpmullq($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5105     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5106   %}
5107   ins_pipe( pipe_slow );
5108 %}
5109 
5110 instruct rvmul4L_reduction_reg(rRegL dst, rRegL src1, vecY src2, regF tmp, regF tmp2) %{
5111   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
5112   match(Set dst (MulReductionVL src1 src2));
5113   effect(TEMP tmp, TEMP tmp2);
5114   format %{ "vextracti128_high  $tmp,$src2\n\t"
5115             "vpmullq  $tmp2,$tmp,$src2\n\t"
5116             "pshufd   $tmp,$tmp2,0xE\n\t"
5117             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5118             "movdq    $tmp,$src1\n\t"
5119             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5120             "movdq    $dst,$tmp2\t! mul reduction4L" %}
5121   ins_encode %{
5122     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
5123     __ vpmullq($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
5124     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5125     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5126     __ movdq($tmp$$XMMRegister, $src1$$Register);
5127     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5128     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5129   %}
5130   ins_pipe( pipe_slow );
5131 %}
5132 
5133 instruct rvmul8L_reduction_reg(rRegL dst, rRegL src1, vecZ src2, regF tmp, regF tmp2) %{
5134   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
5135   match(Set dst (MulReductionVL src1 src2));
5136   effect(TEMP tmp, TEMP tmp2);
5137   format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
5138             "vpmullq  $tmp2,$tmp2,$src2\n\t"
5139             "vextracti128_high  $tmp,$tmp2\n\t"
5140             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5141             "pshufd   $tmp,$tmp2,0xE\n\t"
5142             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5143             "movdq    $tmp,$src1\n\t"
5144             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5145             "movdq    $dst,$tmp2\t! mul reduction8L" %}
5146   ins_encode %{
5147     __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
5148     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
5149     __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
5150     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5151     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5152     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5153     __ movdq($tmp$$XMMRegister, $src1$$Register);
5154     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5155     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5156   %}
5157   ins_pipe( pipe_slow );
5158 %}
5159 #endif
5160 
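// Float/double multiply reductions. The scalar accumulator in $dst is multiplied
// by each vector lane in order (one mulss/mulsd per lane), which preserves the
// strict left-to-right evaluation order; FP multiplication is not associative, so
// a packed tree reduction would not be equivalent. For example, the 4F rules
// compute dst = dst * src2[0] * src2[1] * src2[2] * src2[3].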
5161 instruct rsmul2F_reduction(regF dst, vecD src2, regF tmp) %{
5162   predicate(UseSSE >= 1 && UseAVX == 0);
5163   match(Set dst (MulReductionVF dst src2));
5164   effect(TEMP dst, TEMP tmp);
5165   format %{ "mulss   $dst,$src2\n\t"
5166             "pshufd  $tmp,$src2,0x01\n\t"
5167             "mulss   $dst,$tmp\t! mul reduction2F" %}
5168   ins_encode %{
5169     __ mulss($dst$$XMMRegister, $src2$$XMMRegister);
5170     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5171     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
5172   %}
5173   ins_pipe( pipe_slow );
5174 %}
5175 
5176 instruct rvmul2F_reduction_reg(regF dst, vecD src2, regF tmp) %{
5177   predicate(UseAVX > 0);
5178   match(Set dst (MulReductionVF dst src2));
5179   effect(TEMP tmp, TEMP dst);
5180   format %{ "vmulss  $dst,$dst,$src2\n\t"
5181             "pshufd  $tmp,$src2,0x01\n\t"
5182             "vmulss  $dst,$dst,$tmp\t! mul reduction2F" %}
5183   ins_encode %{
5184     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5185     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5186     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5187   %}
5188   ins_pipe( pipe_slow );
5189 %}
5190 
5191 instruct rsmul4F_reduction_reg(regF dst, vecX src2, regF tmp) %{
5192   predicate(UseSSE >= 1 && UseAVX == 0);
5193   match(Set dst (MulReductionVF dst src2));
5194   effect(TEMP dst, TEMP tmp);
5195   format %{ "mulss   $dst,$src2\n\t"
5196             "pshufd  $tmp,$src2,0x01\n\t"
5197             "mulss   $dst,$tmp\n\t"
5198             "pshufd  $tmp,$src2,0x02\n\t"
5199             "mulss   $dst,$tmp\n\t"
5200             "pshufd  $tmp,$src2,0x03\n\t"
5201             "mulss   $dst,$tmp\t! mul reduction4F" %}
5202   ins_encode %{
5203     __ mulss($dst$$XMMRegister, $src2$$XMMRegister);
5204     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5205     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
5206     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5207     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
5208     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5209     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
5210   %}
5211   ins_pipe( pipe_slow );
5212 %}
5213 
5214 instruct rvmul4F_reduction_reg(regF dst, vecX src2, regF tmp) %{
5215   predicate(UseAVX > 0);
5216   match(Set dst (MulReductionVF dst src2));
5217   effect(TEMP tmp, TEMP dst);
5218   format %{ "vmulss  $dst,$dst,$src2\n\t"
5219             "pshufd  $tmp,$src2,0x01\n\t"
5220             "vmulss  $dst,$dst,$tmp\n\t"
5221             "pshufd  $tmp,$src2,0x02\n\t"
5222             "vmulss  $dst,$dst,$tmp\n\t"
5223             "pshufd  $tmp,$src2,0x03\n\t"
5224             "vmulss  $dst,$dst,$tmp\t! mul reduction4F" %}
5225   ins_encode %{
5226     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5227     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5228     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5229     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5230     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5231     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5232     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5233   %}
5234   ins_pipe( pipe_slow );
5235 %}
5236 
5237 instruct rvmul8F_reduction_reg(regF dst, vecY src2, regF tmp, regF tmp2) %{
5238   predicate(UseAVX > 0);
5239   match(Set dst (MulReductionVF dst src2));
5240   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5241   format %{ "vmulss  $dst,$dst,$src2\n\t"
5242             "pshufd  $tmp,$src2,0x01\n\t"
5243             "vmulss  $dst,$dst,$tmp\n\t"
5244             "pshufd  $tmp,$src2,0x02\n\t"
5245             "vmulss  $dst,$dst,$tmp\n\t"
5246             "pshufd  $tmp,$src2,0x03\n\t"
5247             "vmulss  $dst,$dst,$tmp\n\t"
5248             "vextractf128_high  $tmp2,$src2\n\t"
5249             "vmulss  $dst,$dst,$tmp2\n\t"
5250             "pshufd  $tmp,$tmp2,0x01\n\t"
5251             "vmulss  $dst,$dst,$tmp\n\t"
5252             "pshufd  $tmp,$tmp2,0x02\n\t"
5253             "vmulss  $dst,$dst,$tmp\n\t"
5254             "pshufd  $tmp,$tmp2,0x03\n\t"
5255             "vmulss  $dst,$dst,$tmp\t! mul reduction8F" %}
5256   ins_encode %{
5257     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5258     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5259     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5260     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5261     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5262     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5263     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5264     __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
5265     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5266     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5267     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5268     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5269     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5270     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5271     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5272   %}
5273   ins_pipe( pipe_slow );
5274 %}
5275 
5276 instruct rvmul16F_reduction_reg(regF dst, vecZ src2, regF tmp, regF tmp2) %{
5277   predicate(UseAVX > 2);
5278   match(Set dst (MulReductionVF dst src2));
5279   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5280   format %{ "vmulss  $dst,$dst,$src2\n\t"
5281             "pshufd  $tmp,$src2,0x01\n\t"
5282             "vmulss  $dst,$dst,$tmp\n\t"
5283             "pshufd  $tmp,$src2,0x02\n\t"
5284             "vmulss  $dst,$dst,$tmp\n\t"
5285             "pshufd  $tmp,$src2,0x03\n\t"
5286             "vmulss  $dst,$dst,$tmp\n\t"
5287             "vextractf32x4  $tmp2,$src2,0x1\n\t"
5288             "vmulss  $dst,$dst,$tmp2\n\t"
5289             "pshufd  $tmp,$tmp2,0x01\n\t"
5290             "vmulss  $dst,$dst,$tmp\n\t"
5291             "pshufd  $tmp,$tmp2,0x02\n\t"
5292             "vmulss  $dst,$dst,$tmp\n\t"
5293             "pshufd  $tmp,$tmp2,0x03\n\t"
5294             "vmulss  $dst,$dst,$tmp\n\t"
5295             "vextractf32x4  $tmp2,$src2,0x2\n\t"
5296             "vmulss  $dst,$dst,$tmp2\n\t"
5297             "pshufd  $tmp,$tmp2,0x01\n\t"
5298             "vmulss  $dst,$dst,$tmp\n\t"
5299             "pshufd  $tmp,$tmp2,0x02\n\t"
5300             "vmulss  $dst,$dst,$tmp\n\t"
5301             "pshufd  $tmp,$tmp2,0x03\n\t"
5302             "vmulss  $dst,$dst,$tmp\n\t"
5303             "vextractf32x4  $tmp2,$src2,0x3\n\t"
5304             "vmulss  $dst,$dst,$tmp2\n\t"
5305             "pshufd  $tmp,$tmp2,0x01\n\t"
5306             "vmulss  $dst,$dst,$tmp\n\t"
5307             "pshufd  $tmp,$tmp2,0x02\n\t"
5308             "vmulss  $dst,$dst,$tmp\n\t"
5309             "pshufd  $tmp,$tmp2,0x03\n\t"
5310             "vmulss  $dst,$dst,$tmp\t! mul reduction16F" %}
5311   ins_encode %{
5312     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5313     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5314     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5315     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5316     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5317     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5318     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5319     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5320     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5321     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5322     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5323     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5324     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5325     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5326     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5327     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5328     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5329     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5330     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5331     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5332     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5333     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5334     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5335     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5336     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5337     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5338     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5339     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5340     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5341     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5342     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5343   %}
5344   ins_pipe( pipe_slow );
5345 %}
5346 
5347 instruct rsmul2D_reduction_reg(regD dst, vecX src2, regD tmp) %{
5348   predicate(UseSSE >= 1 && UseAVX == 0);
5349   match(Set dst (MulReductionVD dst src2));
5350   effect(TEMP dst, TEMP tmp);
5351   format %{ "mulsd   $dst,$src2\n\t"
5352             "pshufd  $tmp,$src2,0xE\n\t"
5353             "mulsd   $dst,$tmp\t! mul reduction2D" %}
5354   ins_encode %{
5355     __ mulsd($dst$$XMMRegister, $src2$$XMMRegister);
5356     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5357     __ mulsd($dst$$XMMRegister, $tmp$$XMMRegister);
5358   %}
5359   ins_pipe( pipe_slow );
5360 %}
5361 
5362 instruct rvmul2D_reduction_reg(regD dst, vecX src2, regD tmp) %{
5363   predicate(UseAVX > 0);
5364   match(Set dst (MulReductionVD dst src2));
5365   effect(TEMP tmp, TEMP dst);
5366   format %{ "vmulsd  $dst,$dst,$src2\n\t"
5367             "pshufd  $tmp,$src2,0xE\n\t"
5368             "vmulsd  $dst,$dst,$tmp\t! mul reduction2D" %}
5369   ins_encode %{
5370     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5371     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5372     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5373   %}
5374   ins_pipe( pipe_slow );
5375 %}
5376 
5377 instruct rvmul4D_reduction_reg(regD dst, vecY src2, regD tmp, regD tmp2) %{
5378   predicate(UseAVX > 0);
5379   match(Set dst (MulReductionVD dst src2));
5380   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5381   format %{ "vmulsd  $dst,$dst,$src2\n\t"
5382             "pshufd  $tmp,$src2,0xE\n\t"
5383             "vmulsd  $dst,$dst,$tmp\n\t"
5384             "vextractf128_high  $tmp2,$src2\n\t"
5385             "vmulsd  $dst,$dst,$tmp2\n\t"
5386             "pshufd  $tmp,$tmp2,0xE\n\t"
5387             "vmulsd  $dst,$dst,$tmp\t! mul reduction4D" %}
5388   ins_encode %{
5389     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5390     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5391     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5392     __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
5393     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5394     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5395     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5396   %}
5397   ins_pipe( pipe_slow );
5398 %}
5399 
5400 instruct rvmul8D_reduction_reg(regD dst, vecZ src2, regD tmp, regD tmp2) %{
5401   predicate(UseAVX > 2);
5402   match(Set dst (MulReductionVD dst src2));
5403   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5404   format %{ "vmulsd  $dst,$dst,$src2\n\t"
5405             "pshufd  $tmp,$src2,0xE\n\t"
5406             "vmulsd  $dst,$dst,$tmp\n\t"
5407             "vextractf32x4  $tmp2,$src2,0x1\n\t"
5408             "vmulsd  $dst,$dst,$tmp2\n\t"
5409             "pshufd  $tmp,$tmp2,0xE\n\t"
5410             "vmulsd  $dst,$dst,$tmp\n\t"
5411             "vextractf32x4  $tmp2,$src2,0x2\n\t"
5412             "vmulsd  $dst,$dst,$tmp2\n\t"
5413             "pshufd  $tmp,$tmp2,0xE\n\t"
5414             "vmulsd  $dst,$dst,$tmp\n\t"
5415             "vextractf32x4  $tmp2,$src2,0x3\n\t"
5416             "vmulsd  $dst,$dst,$tmp2\n\t"
5417             "pshufd  $tmp,$tmp2,0xE\n\t"
5418             "vmulsd  $dst,$dst,$tmp\t! mul reduction8D" %}
5419   ins_encode %{
5420     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5421     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5422     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5423     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5424     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5425     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5426     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5427     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5428     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5429     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5430     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5431     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5432     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5433     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5434     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5435   %}
5436   ins_pipe( pipe_slow );
5437 %}
5438 
5439 // ====================VECTOR ARITHMETIC=======================================
5440 
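// Naming and predicate conventions for the packed add/sub rules below:
//   v<op><N><T>                  SSE two-operand form (dst = dst <op> src), UseAVX == 0
//   ..._reg_avx / ..._mem_avx    VEX three-operand form for AVX without AVX-512
//   ..._reg_evex / ..._mem_evex  EVEX form; byte/short ops additionally need AVX512BW
//   ..._evex_special             AVX-512 without BW: the destination doubles as an input
// The vector_len argument passed to the assembler selects the encoded width:
// 0 = 128 bits, 1 = 256 bits, 2 = 512 bits.
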
5441 // --------------------------------- ADD --------------------------------------
5442 
5443 // Bytes vector add
5444 instruct vadd4B(vecS dst, vecS src) %{
5445   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
5446   match(Set dst (AddVB dst src));
5447   format %{ "paddb   $dst,$src\t! add packed4B" %}
5448   ins_encode %{
5449     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
5450   %}
5451   ins_pipe( pipe_slow );
5452 %}
5453 
5454 instruct vadd4B_reg_avx(vecS dst, vecS src1, vecS src2) %{
5455   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
5456   match(Set dst (AddVB src1 src2));
5457   format %{ "vpaddb  $dst,$src1,$src2\t! add packed4B" %}
5458   ins_encode %{
5459     int vector_len = 0;
5460     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5461   %}
5462   ins_pipe( pipe_slow );
5463 %}
5464 
5465 instruct vadd4B_reg_evex(vecS dst, vecS src1, vecS src2) %{
5466   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
5467   match(Set dst (AddVB src1 src2));
5468   format %{ "vpaddb  $dst,$src1,$src2\t! add packed4B" %}
5469   ins_encode %{
5470     int vector_len = 0;
5471     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5472   %}
5473   ins_pipe( pipe_slow );
5474 %}
5475 
5476 instruct vadd4B_reg_evex_special(vecS dst, vecS src1, vecS src2) %{
5477   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
5478   match(Set dst (AddVB dst src2));
5479   effect(TEMP src1);
5480   format %{ "vpaddb  $dst,$dst,$src2\t! add packed4B" %}
5481   ins_encode %{
5482     int vector_len = 0;
5483     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5484   %}
5485   ins_pipe( pipe_slow );
5486 %}
5487 
5488 instruct vadd4B_mem_avx(vecS dst, vecS src, memory mem) %{
5489   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
5490   match(Set dst (AddVB src (LoadVector mem)));
5491   format %{ "vpaddb  $dst,$src,$mem\t! add packed4B" %}
5492   ins_encode %{
5493     int vector_len = 0;
5494     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5495   %}
5496   ins_pipe( pipe_slow );
5497 %}
5498 
5499 instruct vadd4B_mem_evex(vecS dst, vecS src, memory mem) %{
5500   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
5501   match(Set dst (AddVB src (LoadVector mem)));
5502   format %{ "vpaddb  $dst,$src,$mem\t! add packed4B" %}
5503   ins_encode %{
5504     int vector_len = 0;
5505     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5506   %}
5507   ins_pipe( pipe_slow );
5508 %}
5509 
5510 instruct vadd4B_mem_evex_special(vecS dst, vecS src, memory mem) %{
5511   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
5512   match(Set dst (AddVB dst (LoadVector mem)));
5513   effect(TEMP src);
5514   format %{ "vpaddb  $dst,$src,$mem\t! add packed4B" %}
5515   ins_encode %{
5516     int vector_len = 0;
5517     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5518   %}
5519   ins_pipe( pipe_slow );
5520 %}
5521 
5522 instruct vadd8B(vecD dst, vecD src) %{
5523   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
5524   match(Set dst (AddVB dst src));
5525   format %{ "paddb   $dst,$src\t! add packed8B" %}
5526   ins_encode %{
5527     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
5528   %}
5529   ins_pipe( pipe_slow );
5530 %}
5531 
5532 instruct vadd8B_reg_avx(vecD dst, vecD src1, vecD src2) %{
5533   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
5534   match(Set dst (AddVB src1 src2));
5535   format %{ "vpaddb  $dst,$src1,$src2\t! add packed8B" %}
5536   ins_encode %{
5537     int vector_len = 0;
5538     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5539   %}
5540   ins_pipe( pipe_slow );
5541 %}
5542 
5543 instruct vadd8B_reg_evex(vecD dst, vecD src1, vecD src2) %{
5544   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
5545   match(Set dst (AddVB src1 src2));
5546   format %{ "vpaddb  $dst,$src1,$src2\t! add packed8B" %}
5547   ins_encode %{
5548     int vector_len = 0;
5549     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5550   %}
5551   ins_pipe( pipe_slow );
5552 %}
5553 
5554 instruct vadd8B_reg_evex_special(vecD dst, vecD src1, vecD src2) %{
5555   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
5556   match(Set dst (AddVB dst src2));
5557   effect(TEMP src1);
5558   format %{ "vpaddb  $dst,$dst,$src2\t! add packed8B" %}
5559   ins_encode %{
5560     int vector_len = 0;
5561     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5562   %}
5563   ins_pipe( pipe_slow );
5564 %}
5565 
5566 instruct vadd8B_mem_avx(vecD dst, vecD src, memory mem) %{
5567   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
5568   match(Set dst (AddVB src (LoadVector mem)));
5569   format %{ "vpaddb  $dst,$src,$mem\t! add packed8B" %}
5570   ins_encode %{
5571     int vector_len = 0;
5572     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5573   %}
5574   ins_pipe( pipe_slow );
5575 %}
5576 
5577 instruct vadd8B_mem_evex(vecD dst, vecD src, memory mem) %{
5578   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
5579   match(Set dst (AddVB src (LoadVector mem)));
5580   format %{ "vpaddb  $dst,$src,$mem\t! add packed8B" %}
5581   ins_encode %{
5582     int vector_len = 0;
5583     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5584   %}
5585   ins_pipe( pipe_slow );
5586 %}
5587 
5588 instruct vadd8B_mem_evex_special(vecD dst, vecD src, memory mem) %{
5589   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
5590   match(Set dst (AddVB dst (LoadVector mem)));
5591   effect(TEMP src);
5592   format %{ "vpaddb  $dst,$src,$mem\t! add packed8B" %}
5593   ins_encode %{
5594     int vector_len = 0;
5595     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5596   %}
5597   ins_pipe( pipe_slow );
5598 %}
5599 
5600 instruct vadd16B(vecX dst, vecX src) %{
5601   predicate(UseAVX == 0 && n->as_Vector()->length() == 16);
5602   match(Set dst (AddVB dst src));
5603   format %{ "paddb   $dst,$src\t! add packed16B" %}
5604   ins_encode %{
5605     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
5606   %}
5607   ins_pipe( pipe_slow );
5608 %}
5609 
5610 instruct vadd16B_reg_avx(vecX dst, vecX src1, vecX src2) %{
5611   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 16);
5612   match(Set dst (AddVB src1 src2));
5613   format %{ "vpaddb  $dst,$src1,$src2\t! add packed16B" %}
5614   ins_encode %{
5615     int vector_len = 0;
5616     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5617   %}
5618   ins_pipe( pipe_slow );
5619 %}
5620 
5621 instruct vadd16B_reg_evex(vecX dst, vecX src1, vecX src2) %{
5622   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
5623   match(Set dst (AddVB src1 src2));
5624   format %{ "vpaddb  $dst,$src1,$src2\t! add packed16B" %}
5625   ins_encode %{
5626     int vector_len = 0;
5627     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5628   %}
5629   ins_pipe( pipe_slow );
5630 %}
5631 
5632 instruct vadd16B_reg_evex_special(vecX dst, vecX src1, vecX src2) %{
5633   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
5634   match(Set dst (AddVB dst src2));
5635   effect(TEMP src1);
5636   format %{ "vpaddb  $dst,$dst,$src2\t! add packed16B" %}
5637   ins_encode %{
5638     int vector_len = 0;
5639     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5640   %}
5641   ins_pipe( pipe_slow );
5642 %}
5643 
5644 instruct vadd16B_mem_avx(vecX dst, vecX src, memory mem) %{
5645   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 16);
5646   match(Set dst (AddVB src (LoadVector mem)));
5647   format %{ "vpaddb  $dst,$src,$mem\t! add packed16B" %}
5648   ins_encode %{
5649     int vector_len = 0;
5650     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5651   %}
5652   ins_pipe( pipe_slow );
5653 %}
5654 
5655 instruct vadd16B_mem_evex(vecX dst, vecX src, memory mem) %{
5656   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
5657   match(Set dst (AddVB src (LoadVector mem)));
5658   format %{ "vpaddb  $dst,$src,$mem\t! add packed16B" %}
5659   ins_encode %{
5660     int vector_len = 0;
5661     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5662   %}
5663   ins_pipe( pipe_slow );
5664 %}
5665 
5666 instruct vadd16B_mem_evex_special(vecX dst, vecX src, memory mem) %{
5667   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
5668   match(Set dst (AddVB dst (LoadVector mem)));
5669   effect(TEMP src);
5670   format %{ "vpaddb  $dst,$src,$mem\t! add packed16B" %}
5671   ins_encode %{
5672     int vector_len = 0;
5673     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5674   %}
5675   ins_pipe( pipe_slow );
5676 %}
5677 
5678 instruct vadd32B_reg_avx(vecY dst, vecY src1, vecY src2) %{
5679   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 32);
5680   match(Set dst (AddVB src1 src2));
5681   format %{ "vpaddb  $dst,$src1,$src2\t! add packed32B" %}
5682   ins_encode %{
5683     int vector_len = 1;
5684     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5685   %}
5686   ins_pipe( pipe_slow );
5687 %}
5688 
5689 instruct vadd32B_reg_evex(vecY dst, vecY src1, vecY src2) %{
5690   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
5691   match(Set dst (AddVB src1 src2));
5692   format %{ "vpaddb  $dst,$src1,$src2\t! add packed32B" %}
5693   ins_encode %{
5694     int vector_len = 1;
5695     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5696   %}
5697   ins_pipe( pipe_slow );
5698 %}
5699 
5700 instruct vadd32B_reg_evex_special(vecY dst, vecY src1, vecY src2) %{
5701   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 32);
5702   match(Set dst (AddVB dst src2));
5703   effect(TEMP src1);
5704   format %{ "vpaddb  $dst,$dst,$src2\t! add packed32B" %}
5705   ins_encode %{
5706     int vector_len = 1;
5707     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5708   %}
5709   ins_pipe( pipe_slow );
5710 %}
5711 
5712 instruct vadd32B_mem_avx(vecY dst, vecY src, memory mem) %{
5713   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 32);
5714   match(Set dst (AddVB src (LoadVector mem)));
5715   format %{ "vpaddb  $dst,$src,$mem\t! add packed32B" %}
5716   ins_encode %{
5717     int vector_len = 1;
5718     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5719   %}
5720   ins_pipe( pipe_slow );
5721 %}
5722 
5723 instruct vadd32B_mem_evex(vecY dst, vecY src, memory mem) %{
5724   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
5725   match(Set dst (AddVB src (LoadVector mem)));
5726   format %{ "vpaddb  $dst,$src,$mem\t! add packed32B" %}
5727   ins_encode %{
5728     int vector_len = 1;
5729     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5730   %}
5731   ins_pipe( pipe_slow );
5732 %}
5733 
5734 instruct vadd32B_mem_evex_special(vecY dst, vecY src, memory mem) %{
5735   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 32);
5736   match(Set dst (AddVB dst (LoadVector mem)));
5737   effect(TEMP src);
5738   format %{ "vpaddb  $dst,$src,$mem\t! add packed32B" %}
5739   ins_encode %{
5740     int vector_len = 1;
5741     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5742   %}
5743   ins_pipe( pipe_slow );
5744 %}
5745 
5746 instruct vadd64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
5747   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
5748   match(Set dst (AddVB src1 src2));
5749   format %{ "vpaddb  $dst,$src1,$src2\t! add packed64B" %}
5750   ins_encode %{
5751     int vector_len = 2;
5752     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5753   %}
5754   ins_pipe( pipe_slow );
5755 %}
5756 
5757 instruct vadd64B_mem(vecZ dst, vecZ src, memory mem) %{
5758   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
5759   match(Set dst (AddVB src (LoadVector mem)));
5760   format %{ "vpaddb  $dst,$src,$mem\t! add packed64B" %}
5761   ins_encode %{
5762     int vector_len = 2;
5763     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5764   %}
5765   ins_pipe( pipe_slow );
5766 %}
5767 
5768 // Shorts/Chars vector add
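// Word adds follow the same variant structure as the byte adds above, using
// paddw/vpaddw; the EVEX forms likewise require AVX512BW.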
5769 instruct vadd2S(vecS dst, vecS src) %{
5770   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
5771   match(Set dst (AddVS dst src));
5772   format %{ "paddw   $dst,$src\t! add packed2S" %}
5773   ins_encode %{
5774     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
5775   %}
5776   ins_pipe( pipe_slow );
5777 %}
5778 
5779 instruct vadd2S_reg_avx(vecS dst, vecS src1, vecS src2) %{
5780   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
5781   match(Set dst (AddVS src1 src2));
5782   format %{ "vpaddw  $dst,$src1,$src2\t! add packed2S" %}
5783   ins_encode %{
5784     int vector_len = 0;
5785     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5786   %}
5787   ins_pipe( pipe_slow );
5788 %}
5789 
5790 instruct vadd2S_reg_evex(vecS dst, vecS src1, vecS src2) %{
5791   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
5792   match(Set dst (AddVS src1 src2));
5793   format %{ "vpaddw  $dst,$src1,$src2\t! add packed2S" %}
5794   ins_encode %{
5795     int vector_len = 0;
5796     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5797   %}
5798   ins_pipe( pipe_slow );
5799 %}
5800 
5801 instruct vadd2S_reg_evex_special(vecS dst, vecS src1, vecS src2) %{
5802   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
5803   match(Set dst (AddVS dst src2));
5804   effect(TEMP src1);
5805   format %{ "vpaddw  $dst,$dst,$src2\t! add packed2S" %}
5806   ins_encode %{
5807     int vector_len = 0;
5808     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5809   %}
5810   ins_pipe( pipe_slow );
5811 %}
5812 
5813 instruct vadd2S_mem_avx(vecS dst, vecS src, memory mem) %{
5814   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
5815   match(Set dst (AddVS src (LoadVector mem)));
5816   format %{ "vpaddw  $dst,$src,$mem\t! add packed2S" %}
5817   ins_encode %{
5818     int vector_len = 0;
5819     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5820   %}
5821   ins_pipe( pipe_slow );
5822 %}
5823 
5824 instruct vadd2S_mem_evex(vecS dst, vecS src, memory mem) %{
5825   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
5826   match(Set dst (AddVS src (LoadVector mem)));
5827   format %{ "vpaddw  $dst,$src,$mem\t! add packed2S" %}
5828   ins_encode %{
5829     int vector_len = 0;
5830     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5831   %}
5832   ins_pipe( pipe_slow );
5833 %}
5834 
5835 instruct vadd2S_mem_evex_special(vecS dst, vecS src, memory mem) %{
5836   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
5837   match(Set dst (AddVS dst (LoadVector mem)));
5838   effect(TEMP src);
5839   format %{ "vpaddw  $dst,$src,$mem\t! add packed2S" %}
5840   ins_encode %{
5841     int vector_len = 0;
5842     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5843   %}
5844   ins_pipe( pipe_slow );
5845 %}
5846 
5847 instruct vadd4S(vecD dst, vecD src) %{
5848   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
5849   match(Set dst (AddVS dst src));
5850   format %{ "paddw   $dst,$src\t! add packed4S" %}
5851   ins_encode %{
5852     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
5853   %}
5854   ins_pipe( pipe_slow );
5855 %}
5856 
5857 instruct vadd4S_reg_avx(vecD dst, vecD src1, vecD src2) %{
5858   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
5859   match(Set dst (AddVS src1 src2));
5860   format %{ "vpaddw  $dst,$src1,$src2\t! add packed4S" %}
5861   ins_encode %{
5862     int vector_len = 0;
5863     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5864   %}
5865   ins_pipe( pipe_slow );
5866 %}
5867 
5868 instruct vadd4S_reg_evex(vecD dst, vecD src1, vecD src2) %{
5869   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
5870   match(Set dst (AddVS src1 src2));
5871   format %{ "vpaddw  $dst,$src1,$src2\t! add packed4S" %}
5872   ins_encode %{
5873     int vector_len = 0;
5874     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5875   %}
5876   ins_pipe( pipe_slow );
5877 %}
5878 
5879 instruct vadd4S_reg_evex_special(vecD dst, vecD src1, vecD src2) %{
5880   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
5881   match(Set dst (AddVS dst src2));
5882   effect(TEMP src1);
5883   format %{ "vpaddw  $dst,$dst,$src2\t! add packed4S" %}
5884   ins_encode %{
5885     int vector_len = 0;
5886     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5887   %}
5888   ins_pipe( pipe_slow );
5889 %}
5890 
5891 instruct vadd4S_mem_avx(vecD dst, vecD src, memory mem) %{
5892   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
5893   match(Set dst (AddVS src (LoadVector mem)));
5894   format %{ "vpaddw  $dst,$src,$mem\t! add packed4S" %}
5895   ins_encode %{
5896     int vector_len = 0;
5897     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5898   %}
5899   ins_pipe( pipe_slow );
5900 %}
5901 
5902 instruct vadd4S_mem_evex(vecD dst, vecD src, memory mem) %{
5903   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
5904   match(Set dst (AddVS src (LoadVector mem)));
5905   format %{ "vpaddw  $dst,$src,$mem\t! add packed4S" %}
5906   ins_encode %{
5907     int vector_len = 0;
5908     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5909   %}
5910   ins_pipe( pipe_slow );
5911 %}
5912 
5913 instruct vadd4S_mem_evex_special(vecD dst, vecD src, memory mem) %{
5914   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
5915   match(Set dst (AddVS dst (LoadVector mem)));
5916   effect(TEMP src);
5917   format %{ "vpaddw  $dst,$src,$mem\t! add packed4S" %}
5918   ins_encode %{
5919     int vector_len = 0;
5920     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5921   %}
5922   ins_pipe( pipe_slow );
5923 %}
5924 
5925 instruct vadd8S(vecX dst, vecX src) %{
5926   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
5927   match(Set dst (AddVS dst src));
5928   format %{ "paddw   $dst,$src\t! add packed8S" %}
5929   ins_encode %{
5930     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
5931   %}
5932   ins_pipe( pipe_slow );
5933 %}
5934 
5935 instruct vadd8S_reg_avx(vecX dst, vecX src1, vecX src2) %{
5936   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
5937   match(Set dst (AddVS src1 src2));
5938   format %{ "vpaddw  $dst,$src1,$src2\t! add packed8S" %}
5939   ins_encode %{
5940     int vector_len = 0;
5941     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5942   %}
5943   ins_pipe( pipe_slow );
5944 %}
5945 
5946 instruct vadd8S_reg_evex(vecX dst, vecX src1, vecX src2) %{
5947   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
5948   match(Set dst (AddVS src1 src2));
5949   format %{ "vpaddw  $dst,$src1,$src2\t! add packed8S" %}
5950   ins_encode %{
5951     int vector_len = 0;
5952     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5953   %}
5954   ins_pipe( pipe_slow );
5955 %}
5956 
5957 instruct vadd8S_reg_evex_special(vecX dst, vecX src1, vecX src2) %{
5958   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
5959   match(Set dst (AddVS dst src2));
5960   effect(TEMP src1);
5961   format %{ "vpaddw  $dst,$dst,$src2\t! add packed8S" %}
5962   ins_encode %{
5963     int vector_len = 0;
5964     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5965   %}
5966   ins_pipe( pipe_slow );
5967 %}
5968 
5969 instruct vadd8S_mem_avx(vecX dst, vecX src, memory mem) %{
5970   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
5971   match(Set dst (AddVS src (LoadVector mem)));
5972   format %{ "vpaddw  $dst,$src,$mem\t! add packed8S" %}
5973   ins_encode %{
5974     int vector_len = 0;
5975     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5976   %}
5977   ins_pipe( pipe_slow );
5978 %}
5979 
5980 instruct vadd8S_mem_evex(vecX dst, vecX src, memory mem) %{
5981   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
5982   match(Set dst (AddVS src (LoadVector mem)));
5983   format %{ "vpaddw  $dst,$src,$mem\t! add packed8S" %}
5984   ins_encode %{
5985     int vector_len = 0;
5986     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5987   %}
5988   ins_pipe( pipe_slow );
5989 %}
5990 
5991 instruct vadd8S_mem_evex_special(vecX dst, vecX src, memory mem) %{
5992   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
5993   match(Set dst (AddVS dst (LoadVector mem)));
5994   effect(TEMP src);
5995   format %{ "vpaddw  $dst,$src,$mem\t! add packed8S" %}
5996   ins_encode %{
5997     int vector_len = 0;
5998     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5999   %}
6000   ins_pipe( pipe_slow );
6001 %}
6002 
6003 instruct vadd16S_reg_avx(vecY dst, vecY src1, vecY src2) %{
6004   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
6005   match(Set dst (AddVS src1 src2));
6006   format %{ "vpaddw  $dst,$src1,$src2\t! add packed16S" %}
6007   ins_encode %{
6008     int vector_len = 1;
6009     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6010   %}
6011   ins_pipe( pipe_slow );
6012 %}
6013 
6014 instruct vadd16S_reg_evex(vecY dst, vecY src1, vecY src2) %{
6015   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
6016   match(Set dst (AddVS src1 src2));
6017   format %{ "vpaddw  $dst,$src1,$src2\t! add packed16S" %}
6018   ins_encode %{
6019     int vector_len = 1;
6020     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6021   %}
6022   ins_pipe( pipe_slow );
6023 %}
6024 
6025 instruct vadd16S_reg_evex_special(vecY dst, vecY src1, vecY src2) %{
6026   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
6027   match(Set dst (AddVS dst src2));
6028   effect(TEMP src1);
6029   format %{ "vpaddw  $dst,$dst,$src2\t! add packed16S" %}
6030   ins_encode %{
6031     int vector_len = 1;
6032     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6033   %}
6034   ins_pipe( pipe_slow );
6035 %}
6036 
6037 instruct vadd16S_mem_avx(vecY dst, vecY src, memory mem) %{
6038   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
6039   match(Set dst (AddVS src (LoadVector mem)));
6040   format %{ "vpaddw  $dst,$src,$mem\t! add packed16S" %}
6041   ins_encode %{
6042     int vector_len = 1;
6043     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6044   %}
6045   ins_pipe( pipe_slow );
6046 %}
6047 
6048 instruct vadd16S_mem_evex(vecY dst, vecY src, memory mem) %{
6049   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
6050   match(Set dst (AddVS src (LoadVector mem)));
6051   format %{ "vpaddw  $dst,$src,$mem\t! add packed16S" %}
6052   ins_encode %{
6053     int vector_len = 1;
6054     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6055   %}
6056   ins_pipe( pipe_slow );
6057 %}
6058 
6059 instruct vadd16S_mem_evex_special(vecY dst, vecY src, memory mem) %{
6060   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
6061   match(Set dst (AddVS dst (LoadVector mem)));
6062   effect(TEMP src);
6063   format %{ "vpaddw  $dst,$src,$mem\t! add packed16S" %}
6064   ins_encode %{
6065     int vector_len = 1;
6066     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6067   %}
6068   ins_pipe( pipe_slow );
6069 %}
6070 
6071 instruct vadd32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
6072   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
6073   match(Set dst (AddVS src1 src2));
6074   format %{ "vpaddw  $dst,$src1,$src2\t! add packed32S" %}
6075   ins_encode %{
6076     int vector_len = 2;
6077     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6078   %}
6079   ins_pipe( pipe_slow );
6080 %}
6081 
6082 instruct vadd32S_mem(vecZ dst, vecZ src, memory mem) %{
6083   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
6084   match(Set dst (AddVS src (LoadVector mem)));
6085   format %{ "vpaddw  $dst,$src,$mem\t! add packed32S" %}
6086   ins_encode %{
6087     int vector_len = 2;
6088     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6089   %}
6090   ins_pipe( pipe_slow );
6091 %}
6092 
6093 // Integers vector add
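// Dword adds do not need AVX512BW, so only the UseAVX level selects the
// 128/256/512-bit forms.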
6094 instruct vadd2I(vecD dst, vecD src) %{
6095   predicate(n->as_Vector()->length() == 2);
6096   match(Set dst (AddVI dst src));
6097   format %{ "paddd   $dst,$src\t! add packed2I" %}
6098   ins_encode %{
6099     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
6100   %}
6101   ins_pipe( pipe_slow );
6102 %}
6103 
6104 instruct vadd2I_reg(vecD dst, vecD src1, vecD src2) %{
6105   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6106   match(Set dst (AddVI src1 src2));
6107   format %{ "vpaddd  $dst,$src1,$src2\t! add packed2I" %}
6108   ins_encode %{
6109     int vector_len = 0;
6110     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6111   %}
6112   ins_pipe( pipe_slow );
6113 %}
6114 
6115 instruct vadd2I_mem(vecD dst, vecD src, memory mem) %{
6116   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6117   match(Set dst (AddVI src (LoadVector mem)));
6118   format %{ "vpaddd  $dst,$src,$mem\t! add packed2I" %}
6119   ins_encode %{
6120     int vector_len = 0;
6121     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6122   %}
6123   ins_pipe( pipe_slow );
6124 %}
6125 
6126 instruct vadd4I(vecX dst, vecX src) %{
6127   predicate(n->as_Vector()->length() == 4);
6128   match(Set dst (AddVI dst src));
6129   format %{ "paddd   $dst,$src\t! add packed4I" %}
6130   ins_encode %{
6131     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
6132   %}
6133   ins_pipe( pipe_slow );
6134 %}
6135 
6136 instruct vadd4I_reg(vecX dst, vecX src1, vecX src2) %{
6137   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6138   match(Set dst (AddVI src1 src2));
6139   format %{ "vpaddd  $dst,$src1,$src2\t! add packed4I" %}
6140   ins_encode %{
6141     int vector_len = 0;
6142     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6143   %}
6144   ins_pipe( pipe_slow );
6145 %}
6146 
6147 instruct vadd4I_mem(vecX dst, vecX src, memory mem) %{
6148   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6149   match(Set dst (AddVI src (LoadVector mem)));
6150   format %{ "vpaddd  $dst,$src,$mem\t! add packed4I" %}
6151   ins_encode %{
6152     int vector_len = 0;
6153     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6154   %}
6155   ins_pipe( pipe_slow );
6156 %}
6157 
6158 instruct vadd8I_reg(vecY dst, vecY src1, vecY src2) %{
6159   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
6160   match(Set dst (AddVI src1 src2));
6161   format %{ "vpaddd  $dst,$src1,$src2\t! add packed8I" %}
6162   ins_encode %{
6163     int vector_len = 1;
6164     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6165   %}
6166   ins_pipe( pipe_slow );
6167 %}
6168 
6169 instruct vadd8I_mem(vecY dst, vecY src, memory mem) %{
6170   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
6171   match(Set dst (AddVI src (LoadVector mem)));
6172   format %{ "vpaddd  $dst,$src,$mem\t! add packed8I" %}
6173   ins_encode %{
6174     int vector_len = 1;
6175     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6176   %}
6177   ins_pipe( pipe_slow );
6178 %}
6179 
6180 instruct vadd16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
6181   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6182   match(Set dst (AddVI src1 src2));
6183   format %{ "vpaddd  $dst,$src1,$src2\t! add packed16I" %}
6184   ins_encode %{
6185     int vector_len = 2;
6186     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6187   %}
6188   ins_pipe( pipe_slow );
6189 %}
6190 
6191 instruct vadd16I_mem(vecZ dst, vecZ src, memory mem) %{
6192   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6193   match(Set dst (AddVI src (LoadVector mem)));
6194   format %{ "vpaddd  $dst,$src,$mem\t! add packed16I" %}
6195   ins_encode %{
6196     int vector_len = 2;
6197     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6198   %}
6199   ins_pipe( pipe_slow );
6200 %}
6201 
6202 // Longs vector add
6203 instruct vadd2L(vecX dst, vecX src) %{
6204   predicate(n->as_Vector()->length() == 2);
6205   match(Set dst (AddVL dst src));
6206   format %{ "paddq   $dst,$src\t! add packed2L" %}
6207   ins_encode %{
6208     __ paddq($dst$$XMMRegister, $src$$XMMRegister);
6209   %}
6210   ins_pipe( pipe_slow );
6211 %}
6212 
6213 instruct vadd2L_reg(vecX dst, vecX src1, vecX src2) %{
6214   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6215   match(Set dst (AddVL src1 src2));
6216   format %{ "vpaddq  $dst,$src1,$src2\t! add packed2L" %}
6217   ins_encode %{
6218     int vector_len = 0;
6219     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6220   %}
6221   ins_pipe( pipe_slow );
6222 %}
6223 
6224 instruct vadd2L_mem(vecX dst, vecX src, memory mem) %{
6225   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6226   match(Set dst (AddVL src (LoadVector mem)));
6227   format %{ "vpaddq  $dst,$src,$mem\t! add packed2L" %}
6228   ins_encode %{
6229     int vector_len = 0;
6230     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6231   %}
6232   ins_pipe( pipe_slow );
6233 %}
6234 
6235 instruct vadd4L_reg(vecY dst, vecY src1, vecY src2) %{
6236   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
6237   match(Set dst (AddVL src1 src2));
6238   format %{ "vpaddq  $dst,$src1,$src2\t! add packed4L" %}
6239   ins_encode %{
6240     int vector_len = 1;
6241     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6242   %}
6243   ins_pipe( pipe_slow );
6244 %}
6245 
6246 instruct vadd4L_mem(vecY dst, vecY src, memory mem) %{
6247   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
6248   match(Set dst (AddVL src (LoadVector mem)));
6249   format %{ "vpaddq  $dst,$src,$mem\t! add packed4L" %}
6250   ins_encode %{
6251     int vector_len = 1;
6252     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6253   %}
6254   ins_pipe( pipe_slow );
6255 %}
6256 
6257 instruct vadd8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
6258   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6259   match(Set dst (AddVL src1 src2));
6260   format %{ "vpaddq  $dst,$src1,$src2\t! add packed8L" %}
6261   ins_encode %{
6262     int vector_len = 2;
6263     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6264   %}
6265   ins_pipe( pipe_slow );
6266 %}
6267 
6268 instruct vadd8L_mem(vecZ dst, vecZ src, memory mem) %{
6269   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6270   match(Set dst (AddVL src (LoadVector mem)));
6271   format %{ "vpaddq  $dst,$src,$mem\t! add packed8L" %}
6272   ins_encode %{
6273     int vector_len = 2;
6274     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6275   %}
6276   ins_pipe( pipe_slow );
6277 %}
6278 
6279 // Floats vector add
6280 instruct vadd2F(vecD dst, vecD src) %{
6281   predicate(n->as_Vector()->length() == 2);
6282   match(Set dst (AddVF dst src));
6283   format %{ "addps   $dst,$src\t! add packed2F" %}
6284   ins_encode %{
6285     __ addps($dst$$XMMRegister, $src$$XMMRegister);
6286   %}
6287   ins_pipe( pipe_slow );
6288 %}
6289 
6290 instruct vadd2F_reg(vecD dst, vecD src1, vecD src2) %{
6291   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6292   match(Set dst (AddVF src1 src2));
6293   format %{ "vaddps  $dst,$src1,$src2\t! add packed2F" %}
6294   ins_encode %{
6295     int vector_len = 0;
6296     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6297   %}
6298   ins_pipe( pipe_slow );
6299 %}
6300 
6301 instruct vadd2F_mem(vecD dst, vecD src, memory mem) %{
6302   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6303   match(Set dst (AddVF src (LoadVector mem)));
6304   format %{ "vaddps  $dst,$src,$mem\t! add packed2F" %}
6305   ins_encode %{
6306     int vector_len = 0;
6307     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6308   %}
6309   ins_pipe( pipe_slow );
6310 %}
6311 
6312 instruct vadd4F(vecX dst, vecX src) %{
6313   predicate(n->as_Vector()->length() == 4);
6314   match(Set dst (AddVF dst src));
6315   format %{ "addps   $dst,$src\t! add packed4F" %}
6316   ins_encode %{
6317     __ addps($dst$$XMMRegister, $src$$XMMRegister);
6318   %}
6319   ins_pipe( pipe_slow );
6320 %}
6321 
6322 instruct vadd4F_reg(vecX dst, vecX src1, vecX src2) %{
6323   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6324   match(Set dst (AddVF src1 src2));
6325   format %{ "vaddps  $dst,$src1,$src2\t! add packed4F" %}
6326   ins_encode %{
6327     int vector_len = 0;
6328     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6329   %}
6330   ins_pipe( pipe_slow );
6331 %}
6332 
6333 instruct vadd4F_mem(vecX dst, vecX src, memory mem) %{
6334   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6335   match(Set dst (AddVF src (LoadVector mem)));
6336   format %{ "vaddps  $dst,$src,$mem\t! add packed4F" %}
6337   ins_encode %{
6338     int vector_len = 0;
6339     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6340   %}
6341   ins_pipe( pipe_slow );
6342 %}
6343 
6344 instruct vadd8F_reg(vecY dst, vecY src1, vecY src2) %{
6345   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6346   match(Set dst (AddVF src1 src2));
6347   format %{ "vaddps  $dst,$src1,$src2\t! add packed8F" %}
6348   ins_encode %{
6349     int vector_len = 1;
6350     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6351   %}
6352   ins_pipe( pipe_slow );
6353 %}
6354 
6355 instruct vadd8F_mem(vecY dst, vecY src, memory mem) %{
6356   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6357   match(Set dst (AddVF src (LoadVector mem)));
6358   format %{ "vaddps  $dst,$src,$mem\t! add packed8F" %}
6359   ins_encode %{
6360     int vector_len = 1;
6361     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6362   %}
6363   ins_pipe( pipe_slow );
6364 %}
6365 
6366 instruct vadd16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
6367   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6368   match(Set dst (AddVF src1 src2));
6369   format %{ "vaddps  $dst,$src1,$src2\t! add packed16F" %}
6370   ins_encode %{
6371     int vector_len = 2;
6372     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6373   %}
6374   ins_pipe( pipe_slow );
6375 %}
6376 
6377 instruct vadd16F_mem(vecZ dst, vecZ src, memory mem) %{
6378   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6379   match(Set dst (AddVF src (LoadVector mem)));
6380   format %{ "vaddps  $dst,$src,$mem\t! add packed16F" %}
6381   ins_encode %{
6382     int vector_len = 2;
6383     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6384   %}
6385   ins_pipe( pipe_slow );
6386 %}
6387 
6388 // Doubles vector add
6389 instruct vadd2D(vecX dst, vecX src) %{
6390   predicate(n->as_Vector()->length() == 2);
6391   match(Set dst (AddVD dst src));
6392   format %{ "addpd   $dst,$src\t! add packed2D" %}
6393   ins_encode %{
6394     __ addpd($dst$$XMMRegister, $src$$XMMRegister);
6395   %}
6396   ins_pipe( pipe_slow );
6397 %}
6398 
6399 instruct vadd2D_reg(vecX dst, vecX src1, vecX src2) %{
6400   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6401   match(Set dst (AddVD src1 src2));
6402   format %{ "vaddpd  $dst,$src1,$src2\t! add packed2D" %}
6403   ins_encode %{
6404     int vector_len = 0;
6405     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6406   %}
6407   ins_pipe( pipe_slow );
6408 %}
6409 
6410 instruct vadd2D_mem(vecX dst, vecX src, memory mem) %{
6411   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6412   match(Set dst (AddVD src (LoadVector mem)));
6413   format %{ "vaddpd  $dst,$src,$mem\t! add packed2D" %}
6414   ins_encode %{
6415     int vector_len = 0;
6416     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6417   %}
6418   ins_pipe( pipe_slow );
6419 %}
6420 
6421 instruct vadd4D_reg(vecY dst, vecY src1, vecY src2) %{
6422   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6423   match(Set dst (AddVD src1 src2));
6424   format %{ "vaddpd  $dst,$src1,$src2\t! add packed4D" %}
6425   ins_encode %{
6426     int vector_len = 1;
6427     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6428   %}
6429   ins_pipe( pipe_slow );
6430 %}
6431 
6432 instruct vadd4D_mem(vecY dst, vecY src, memory mem) %{
6433   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6434   match(Set dst (AddVD src (LoadVector mem)));
6435   format %{ "vaddpd  $dst,$src,$mem\t! add packed4D" %}
6436   ins_encode %{
6437     int vector_len = 1;
6438     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6439   %}
6440   ins_pipe( pipe_slow );
6441 %}
6442 
6443 instruct vadd8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
6444   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6445   match(Set dst (AddVD src1 src2));
6446   format %{ "vaddpd  $dst,$src1,$src2\t! add packed8D" %}
6447   ins_encode %{
6448     int vector_len = 2;
6449     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6450   %}
6451   ins_pipe( pipe_slow );
6452 %}
6453 
6454 instruct vadd8D_mem(vecZ dst, vecZ src, memory mem) %{
6455   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6456   match(Set dst (AddVD src (LoadVector mem)));
6457   format %{ "vaddpd  $dst,$src,$mem\t! add packed8D" %}
6458   ins_encode %{
6459     int vector_len = 2;
6460     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6461   %}
6462   ins_pipe( pipe_slow );
6463 %}
6464 
6465 // --------------------------------- SUB --------------------------------------
6466 
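// The packed subtract rules mirror the corresponding add rules above, with
// psub*/vpsub* in place of padd*/vpadd* and the same AVX/EVEX variant structure.
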
6467 // Bytes vector sub
6468 instruct vsub4B(vecS dst, vecS src) %{
6469   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
6470   match(Set dst (SubVB dst src));
6471   format %{ "psubb   $dst,$src\t! sub packed4B" %}
6472   ins_encode %{
6473     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
6474   %}
6475   ins_pipe( pipe_slow );
6476 %}
6477 
6478 instruct vsub4B_reg_avx(vecS dst, vecS src1, vecS src2) %{
6479   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
6480   match(Set dst (SubVB src1 src2));
6481   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed4B" %}
6482   ins_encode %{
6483     int vector_len = 0;
6484     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6485   %}
6486   ins_pipe( pipe_slow );
6487 %}
6488 
6489 instruct vsub4B_reg_evex(vecS dst, vecS src1, vecS src2) %{
6490   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
6491   match(Set dst (SubVB src1 src2));
6492   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed4B" %}
6493   ins_encode %{
6494     int vector_len = 0;
6495     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6496   %}
6497   ins_pipe( pipe_slow );
6498 %}
6499 
6500 instruct vsub4B_reg_evex_special(vecS dst, vecS src1, vecS src2) %{
6501   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
6502   match(Set dst (SubVB dst src2));
6503   effect(TEMP src1);
6504   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed4B" %}
6505   ins_encode %{
6506     int vector_len = 0;
6507     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6508   %}
6509   ins_pipe( pipe_slow );
6510 %}
6511 
6512 instruct vsub4B_mem_avx(vecS dst, vecS src, memory mem) %{
6513   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
6514   match(Set dst (SubVB src (LoadVector mem)));
6515   format %{ "vpsubb  $dst,$src,$mem\t! sub packed4B" %}
6516   ins_encode %{
6517     int vector_len = 0;
6518     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6519   %}
6520   ins_pipe( pipe_slow );
6521 %}
6522 
6523 instruct vsub4B_mem_evex(vecS dst, vecS src, memory mem) %{
6524   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
6525   match(Set dst (SubVB src (LoadVector mem)));
6526   format %{ "vpsubb  $dst,$src,$mem\t! sub packed4B" %}
6527   ins_encode %{
6528     int vector_len = 0;
6529     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6530   %}
6531   ins_pipe( pipe_slow );
6532 %}
6533 
6534 instruct vsub4B_mem_evex_special(vecS dst, vecS src, memory mem) %{
6535   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
6536   match(Set dst (SubVB dst (LoadVector mem)));
6537   effect(TEMP src);
6538   format %{ "vpsubb  $dst,$src,$mem\t! sub packed4B" %}
6539   ins_encode %{
6540     int vector_len = 0;
6541     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6542   %}
6543   ins_pipe( pipe_slow );
6544 %}
6545 
6546 instruct vsub8B(vecD dst, vecD src) %{
6547   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
6548   match(Set dst (SubVB dst src));
6549   format %{ "psubb   $dst,$src\t! sub packed8B" %}
6550   ins_encode %{
6551     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
6552   %}
6553   ins_pipe( pipe_slow );
6554 %}
6555 
6556 instruct vsub8B_reg_avx(vecD dst, vecD src1, vecD src2) %{
6557   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
6558   match(Set dst (SubVB src1 src2));
6559   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed8B" %}
6560   ins_encode %{
6561     int vector_len = 0;
6562     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6563   %}
6564   ins_pipe( pipe_slow );
6565 %}
6566 
6567 instruct vsub8B_reg_evex(vecD dst, vecD src1, vecD src2) %{
6568   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
6569   match(Set dst (SubVB src1 src2));
6570   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed8B" %}
6571   ins_encode %{
6572     int vector_len = 0;
6573     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6574   %}
6575   ins_pipe( pipe_slow );
6576 %}
6577 
6578 instruct vsub8B_reg_evex_special(vecD dst, vecD src1, vecD src2) %{
6579   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
6580   match(Set dst (SubVB dst src2));
6581   effect(TEMP src1);
6582   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed8B" %}
6583   ins_encode %{
6584     int vector_len = 0;
6585     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6586   %}
6587   ins_pipe( pipe_slow );
6588 %}
6589 
6590 instruct vsub8B_mem_avx(vecD dst, vecD src, memory mem) %{
6591   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
6592   match(Set dst (SubVB src (LoadVector mem)));
6593   format %{ "vpsubb  $dst,$src,$mem\t! sub packed8B" %}
6594   ins_encode %{
6595     int vector_len = 0;
6596     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6597   %}
6598   ins_pipe( pipe_slow );
6599 %}
6600 
6601 instruct vsub8B_mem_evex(vecD dst, vecD src, memory mem) %{
6602   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
6603   match(Set dst (SubVB src (LoadVector mem)));
6604   format %{ "vpsubb  $dst,$src,$mem\t! sub packed8B" %}
6605   ins_encode %{
6606     int vector_len = 0;
6607     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6608   %}
6609   ins_pipe( pipe_slow );
6610 %}
6611 
6612 instruct vsub8B_mem_evex_special(vecD dst, vecD src, memory mem) %{
6613   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
6614   match(Set dst (SubVB dst (LoadVector mem)));
6615   effect(TEMP src);
6616   format %{ "vpsubb  $dst,$src,$mem\t! sub packed8B" %}
6617   ins_encode %{
6618     int vector_len = 0;
6619     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6620   %}
6621   ins_pipe( pipe_slow );
6622 %}
6623 
6624 instruct vsub16B(vecX dst, vecX src) %{
6625   predicate(UseAVX == 0 && n->as_Vector()->length() == 16);
6626   match(Set dst (SubVB dst src));
6627   format %{ "psubb   $dst,$src\t! sub packed16B" %}
6628   ins_encode %{
6629     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
6630   %}
6631   ins_pipe( pipe_slow );
6632 %}
6633 
6634 instruct vsub16B_reg_avx(vecX dst, vecX src1, vecX src2) %{
6635   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 16);
6636   match(Set dst (SubVB src1 src2));
6637   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed16B" %}
6638   ins_encode %{
6639     int vector_len = 0;
6640     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6641   %}
6642   ins_pipe( pipe_slow );
6643 %}
6644 
6645 instruct vsub16B_reg_evex(vecX dst, vecX src1, vecX src2) %{
6646   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
6647   match(Set dst (SubVB src1 src2));
6648   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed16B" %}
6649   ins_encode %{
6650     int vector_len = 0;
6651     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6652   %}
6653   ins_pipe( pipe_slow );
6654 %}
6655 
6656 instruct vsub16B_reg_evex_special(vecX dst, vecX src1, vecX src2) %{
6657   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
6658   match(Set dst (SubVB dst src2));
6659   effect(TEMP src1);
6660   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed16B" %}
6661   ins_encode %{
6662     int vector_len = 0;
6663     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6664   %}
6665   ins_pipe( pipe_slow );
6666 %}
6667 
6668 instruct vsub16B_mem_avx(vecX dst, vecX src, memory mem) %{
6669   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 16);
6670   match(Set dst (SubVB src (LoadVector mem)));
6671   format %{ "vpsubb  $dst,$src,$mem\t! sub packed16B" %}
6672   ins_encode %{
6673     int vector_len = 0;
6674     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6675   %}
6676   ins_pipe( pipe_slow );
6677 %}
6678 
6679 instruct vsub16B_mem_evex(vecX dst, vecX src, memory mem) %{
6680   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
6681   match(Set dst (SubVB src (LoadVector mem)));
6682   format %{ "vpsubb  $dst,$src,$mem\t! sub packed16B" %}
6683   ins_encode %{
6684     int vector_len = 0;
6685     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6686   %}
6687   ins_pipe( pipe_slow );
6688 %}
6689 
6690 instruct vsub16B_mem_evex_special(vecX dst, vecX src, memory mem) %{
6691   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
6692   match(Set dst (SubVB dst (LoadVector mem)));
6693   effect(TEMP src);
6694   format %{ "vpsubb  $dst,$src,$mem\t! sub packed16B" %}
6695   ins_encode %{
6696     int vector_len = 0;
6697     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6698   %}
6699   ins_pipe( pipe_slow );
6700 %}
6701 
6702 instruct vsub32B_reg_avx(vecY dst, vecY src1, vecY src2) %{
6703   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 32);
6704   match(Set dst (SubVB src1 src2));
6705   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed32B" %}
6706   ins_encode %{
6707     int vector_len = 1;
6708     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6709   %}
6710   ins_pipe( pipe_slow );
6711 %}
6712 
6713 instruct vsub32B_reg_evex(vecY dst, vecY src1, vecY src2) %{
6714   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
6715   match(Set dst (SubVB src1 src2));
6716   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed32B" %}
6717   ins_encode %{
6718     int vector_len = 1;
6719     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6720   %}
6721   ins_pipe( pipe_slow );
6722 %}
6723 
6724 instruct vsub32B_reg_evex_special(vecY dst, vecY src1, vecY src2) %{
6725   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 32);
6726   match(Set dst (SubVB dst src2));
6727   effect(TEMP src1);
6728   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed32B" %}
6729   ins_encode %{
6730     int vector_len = 1;
6731     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6732   %}
6733   ins_pipe( pipe_slow );
6734 %}
6735 
6736 instruct vsub32B_mem_avx(vecY dst, vecY src, memory mem) %{
6737   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 32);
6738   match(Set dst (SubVB src (LoadVector mem)));
6739   format %{ "vpsubb  $dst,$src,$mem\t! sub packed32B" %}
6740   ins_encode %{
6741     int vector_len = 1;
6742     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6743   %}
6744   ins_pipe( pipe_slow );
6745 %}
6746 
6747 instruct vsub32B_mem_evex(vecY dst, vecY src, memory mem) %{
6748   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
6749   match(Set dst (SubVB src (LoadVector mem)));
6750   format %{ "vpsubb  $dst,$src,$mem\t! sub packed32B" %}
6751   ins_encode %{
6752     int vector_len = 1;
6753     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6754   %}
6755   ins_pipe( pipe_slow );
6756 %}
6757 
6758 instruct vsub32B_mem_evex_special(vecY dst, vecY src, memory mem) %{
6759   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 32);
6760   match(Set dst (SubVB dst (LoadVector mem)));
6761   effect(TEMP src);
6762   format %{ "vpsubb  $dst,$src,$mem\t! sub packed32B" %}
6763   ins_encode %{
6764     int vector_len = 1;
6765     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6766   %}
6767   ins_pipe( pipe_slow );
6768 %}
6769 
6770 instruct vsub64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
6771   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
6772   match(Set dst (SubVB src1 src2));
6773   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed64B" %}
6774   ins_encode %{
6775     int vector_len = 2;
6776     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6777   %}
6778   ins_pipe( pipe_slow );
6779 %}
6780 
6781 instruct vsub64B_mem(vecZ dst, vecZ src, memory mem) %{
6782   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
6783   match(Set dst (SubVB src (LoadVector mem)));
6784   format %{ "vpsubb  $dst,$src,$mem\t! sub packed64B" %}
6785   ins_encode %{
6786     int vector_len = 2;
6787     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6788   %}
6789   ins_pipe( pipe_slow );
6790 %}
6791 
6792 // Shorts/Chars vector sub
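// Note: the SubVS rules serve both short and char vectors; 16-bit two's
// complement subtraction with wrap-around produces the same bit pattern for
// the signed and unsigned interpretations, so a single psubw/vpsubw form
// suffices for both element types.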
6793 instruct vsub2S(vecS dst, vecS src) %{
6794   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
6795   match(Set dst (SubVS dst src));
6796   format %{ "psubw   $dst,$src\t! sub packed2S" %}
6797   ins_encode %{
6798     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
6799   %}
6800   ins_pipe( pipe_slow );
6801 %}
6802 
6803 instruct vsub2S_reg_avx(vecS dst, vecS src1, vecS src2) %{
6804   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
6805   match(Set dst (SubVS src1 src2));
6806   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed2S" %}
6807   ins_encode %{
6808     int vector_len = 0;
6809     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6810   %}
6811   ins_pipe( pipe_slow );
6812 %}
6813 
6814 instruct vsub2S_reg_evex(vecS dst, vecS src1, vecS src2) %{
6815   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
6816   match(Set dst (SubVS src1 src2));
6817   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed2S" %}
6818   ins_encode %{
6819     int vector_len = 0;
6820     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6821   %}
6822   ins_pipe( pipe_slow );
6823 %}
6824 
6825 instruct vsub2S_reg_evex_special(vecS dst, vecS src1, vecS src2) %{
6826   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
6827   match(Set dst (SubVS dst src2));
6828   effect(TEMP src1);
6829   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed2S" %}
6830   ins_encode %{
6831     int vector_len = 0;
6832     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6833   %}
6834   ins_pipe( pipe_slow );
6835 %}
6836 
6837 instruct vsub2S_mem_avx(vecS dst, vecS src, memory mem) %{
6838   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
6839   match(Set dst (SubVS src (LoadVector mem)));
6840   format %{ "vpsubw  $dst,$src,$mem\t! sub packed2S" %}
6841   ins_encode %{
6842     int vector_len = 0;
6843     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6844   %}
6845   ins_pipe( pipe_slow );
6846 %}
6847 
6848 instruct vsub2S_mem_evex(vecS dst, vecS src, memory mem) %{
6849   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
6850   match(Set dst (SubVS src (LoadVector mem)));
6851   format %{ "vpsubw  $dst,$src,$mem\t! sub packed2S" %}
6852   ins_encode %{
6853     int vector_len = 0;
6854     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6855   %}
6856   ins_pipe( pipe_slow );
6857 %}
6858 
6859 instruct vsub2S_mem_evex_special(vecS dst, vecS src, memory mem) %{
6860   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
6861   match(Set dst (SubVS dst (LoadVector mem)));
6862   effect(TEMP src);
6863   format %{ "vpsubw  $dst,$src,$mem\t! sub packed2S" %}
6864   ins_encode %{
6865     int vector_len = 0;
6866     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6867   %}
6868   ins_pipe( pipe_slow );
6869 %}
6870 
6871 instruct vsub4S(vecD dst, vecD src) %{
6872   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
6873   match(Set dst (SubVS dst src));
6874   format %{ "psubw   $dst,$src\t! sub packed4S" %}
6875   ins_encode %{
6876     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
6877   %}
6878   ins_pipe( pipe_slow );
6879 %}
6880 
6881 instruct vsub4S_reg_avx(vecD dst, vecD src1, vecD src2) %{
6882   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
6883   match(Set dst (SubVS src1 src2));
6884   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed4S" %}
6885   ins_encode %{
6886     int vector_len = 0;
6887     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6888   %}
6889   ins_pipe( pipe_slow );
6890 %}
6891 
6892 instruct vsub4S_reg_evex(vecD dst, vecD src1, vecD src2) %{
6893   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
6894   match(Set dst (SubVS src1 src2));
6895   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed4S" %}
6896   ins_encode %{
6897     int vector_len = 0;
6898     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6899   %}
6900   ins_pipe( pipe_slow );
6901 %}
6902 
6903 instruct vsub4S_reg_evex_special(vecD dst, vecD src1, vecD src2) %{
6904   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
6905   match(Set dst (SubVS dst src2));
6906   effect(TEMP src1);
6907   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed4S" %}
6908   ins_encode %{
6909     int vector_len = 0;
6910     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6911   %}
6912   ins_pipe( pipe_slow );
6913 %}
6914 
6915 instruct vsub4S_mem_avx(vecD dst, vecD src, memory mem) %{
6916   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
6917   match(Set dst (SubVS src (LoadVector mem)));
6918   format %{ "vpsubw  $dst,$src,$mem\t! sub packed4S" %}
6919   ins_encode %{
6920     int vector_len = 0;
6921     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6922   %}
6923   ins_pipe( pipe_slow );
6924 %}
6925 
6926 instruct vsub4S_mem_evex(vecD dst, vecD src, memory mem) %{
6927   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
6928   match(Set dst (SubVS src (LoadVector mem)));
6929   format %{ "vpsubw  $dst,$src,$mem\t! sub packed4S" %}
6930   ins_encode %{
6931     int vector_len = 0;
6932     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6933   %}
6934   ins_pipe( pipe_slow );
6935 %}
6936 
6937 instruct vsub4S_mem_evex_special(vecD dst, vecD src, memory mem) %{
6938   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
6939   match(Set dst (SubVS dst (LoadVector mem)));
6940   effect(TEMP src);
6941   format %{ "vpsubw  $dst,$src,$mem\t! sub packed4S" %}
6942   ins_encode %{
6943     int vector_len = 0;
6944     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6945   %}
6946   ins_pipe( pipe_slow );
6947 %}
6948 
6949 instruct vsub8S(vecX dst, vecX src) %{
6950   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
6951   match(Set dst (SubVS dst src));
6952   format %{ "psubw   $dst,$src\t! sub packed8S" %}
6953   ins_encode %{
6954     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
6955   %}
6956   ins_pipe( pipe_slow );
6957 %}
6958 
6959 instruct vsub8S_reg_avx(vecX dst, vecX src1, vecX src2) %{
6960   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
6961   match(Set dst (SubVS src1 src2));
6962   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed8S" %}
6963   ins_encode %{
6964     int vector_len = 0;
6965     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6966   %}
6967   ins_pipe( pipe_slow );
6968 %}
6969 
6970 instruct vsub8S_reg_evex(vecX dst, vecX src1, vecX src2) %{
6971   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
6972   match(Set dst (SubVS src1 src2));
6973   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed8S" %}
6974   ins_encode %{
6975     int vector_len = 0;
6976     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6977   %}
6978   ins_pipe( pipe_slow );
6979 %}
6980 
6981 instruct vsub8S_reg_evex_special(vecX dst, vecX src1, vecX src2) %{
6982   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
6983   match(Set dst (SubVS dst src2));
6984   effect(TEMP src1);
6985   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed8S" %}
6986   ins_encode %{
6987     int vector_len = 0;
6988     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6989   %}
6990   ins_pipe( pipe_slow );
6991 %}
6992 
6993 instruct vsub8S_mem_avx(vecX dst, vecX src, memory mem) %{
6994   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
6995   match(Set dst (SubVS src (LoadVector mem)));
6996   format %{ "vpsubw  $dst,$src,$mem\t! sub packed8S" %}
6997   ins_encode %{
6998     int vector_len = 0;
6999     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7000   %}
7001   ins_pipe( pipe_slow );
7002 %}
7003 
7004 instruct vsub8S_mem_evex(vecX dst, vecX src, memory mem) %{
7005   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
7006   match(Set dst (SubVS src (LoadVector mem)));
7007   format %{ "vpsubw  $dst,$src,$mem\t! sub packed8S" %}
7008   ins_encode %{
7009     int vector_len = 0;
7010     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7011   %}
7012   ins_pipe( pipe_slow );
7013 %}
7014 
7015 instruct vsub8S_mem_evex_special(vecX dst, vecX src, memory mem) %{
7016   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
7017   match(Set dst (SubVS dst (LoadVector mem)));
7018   effect(TEMP src);
7019   format %{ "vpsubw  $dst,$src,$mem\t! sub packed8S" %}
7020   ins_encode %{
7021     int vector_len = 0;
7022     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7023   %}
7024   ins_pipe( pipe_slow );
7025 %}
7026 
7027 instruct vsub16S_reg_avx(vecY dst, vecY src1, vecY src2) %{
7028   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
7029   match(Set dst (SubVS src1 src2));
7030   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed16S" %}
7031   ins_encode %{
7032     int vector_len = 1;
7033     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7034   %}
7035   ins_pipe( pipe_slow );
7036 %}
7037 
7038 instruct vsub16S_reg_evex(vecY dst, vecY src1, vecY src2) %{
7039   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
7040   match(Set dst (SubVS src1 src2));
7041   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed16S" %}
7042   ins_encode %{
7043     int vector_len = 1;
7044     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7045   %}
7046   ins_pipe( pipe_slow );
7047 %}
7048 
7049 instruct vsub16S_reg_evex_special(vecY dst, vecY src1, vecY src2) %{
7050   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
7051   match(Set dst (SubVS dst src2));
7052   effect(TEMP src1);
7053   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed16S" %}
7054   ins_encode %{
7055     int vector_len = 1;
7056     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7057   %}
7058   ins_pipe( pipe_slow );
7059 %}
7060 
7061 instruct vsub16S_mem_avx(vecY dst, vecY src, memory mem) %{
7062   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
7063   match(Set dst (SubVS src (LoadVector mem)));
7064   format %{ "vpsubw  $dst,$src,$mem\t! sub packed16S" %}
7065   ins_encode %{
7066     int vector_len = 1;
7067     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7068   %}
7069   ins_pipe( pipe_slow );
7070 %}
7071 
7072 instruct vsub16S_mem_evex(vecY dst, vecY src, memory mem) %{
7073   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
7074   match(Set dst (SubVS src (LoadVector mem)));
7075   format %{ "vpsubw  $dst,$src,$mem\t! sub packed16S" %}
7076   ins_encode %{
7077     int vector_len = 1;
7078     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7079   %}
7080   ins_pipe( pipe_slow );
7081 %}
7082 
7083 instruct vsub16S_mem_evex_special(vecY dst, vecY src, memory mem) %{
7084   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
7085   match(Set dst (SubVS dst (LoadVector mem)));
7086   effect(TEMP src);
7087   format %{ "vpsubw  $dst,$src,$mem\t! sub packed16S" %}
7088   ins_encode %{
7089     int vector_len = 1;
7090     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7091   %}
7092   ins_pipe( pipe_slow );
7093 %}
7094 
7095 instruct vsub32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
7096   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
7097   match(Set dst (SubVS src1 src2));
7098   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed32S" %}
7099   ins_encode %{
7100     int vector_len = 2;
7101     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7102   %}
7103   ins_pipe( pipe_slow );
7104 %}
7105 
7106 instruct vsub32S_mem(vecZ dst, vecZ src, memory mem) %{
7107   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
7108   match(Set dst (SubVS src (LoadVector mem)));
7109   format %{ "vpsubw  $dst,$src,$mem\t! sub packed32S" %}
7110   ins_encode %{
7111     int vector_len = 2;
7112     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7113   %}
7114   ins_pipe( pipe_slow );
7115 %}
7116 
7117 // Integers vector sub
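// Note: from 32-bit lanes onward the 512-bit forms need only AVX-512F
// (UseAVX > 2), so there are no *_avx / *_evex / *_evex_special splits for
// int, long, float or double subtraction; vpsubd, vpsubq, vsubps and vsubpd
// are available whenever EVEX encoding is.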
7118 instruct vsub2I(vecD dst, vecD src) %{
7119   predicate(n->as_Vector()->length() == 2);
7120   match(Set dst (SubVI dst src));
7121   format %{ "psubd   $dst,$src\t! sub packed2I" %}
7122   ins_encode %{
7123     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
7124   %}
7125   ins_pipe( pipe_slow );
7126 %}
7127 
7128 instruct vsub2I_reg(vecD dst, vecD src1, vecD src2) %{
7129   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7130   match(Set dst (SubVI src1 src2));
7131   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed2I" %}
7132   ins_encode %{
7133     int vector_len = 0;
7134     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7135   %}
7136   ins_pipe( pipe_slow );
7137 %}
7138 
7139 instruct vsub2I_mem(vecD dst, vecD src, memory mem) %{
7140   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7141   match(Set dst (SubVI src (LoadVector mem)));
7142   format %{ "vpsubd  $dst,$src,$mem\t! sub packed2I" %}
7143   ins_encode %{
7144     int vector_len = 0;
7145     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7146   %}
7147   ins_pipe( pipe_slow );
7148 %}
7149 
7150 instruct vsub4I(vecX dst, vecX src) %{
7151   predicate(n->as_Vector()->length() == 4);
7152   match(Set dst (SubVI dst src));
7153   format %{ "psubd   $dst,$src\t! sub packed4I" %}
7154   ins_encode %{
7155     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
7156   %}
7157   ins_pipe( pipe_slow );
7158 %}
7159 
7160 instruct vsub4I_reg(vecX dst, vecX src1, vecX src2) %{
7161   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7162   match(Set dst (SubVI src1 src2));
7163   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed4I" %}
7164   ins_encode %{
7165     int vector_len = 0;
7166     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7167   %}
7168   ins_pipe( pipe_slow );
7169 %}
7170 
7171 instruct vsub4I_mem(vecX dst, vecX src, memory mem) %{
7172   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7173   match(Set dst (SubVI src (LoadVector mem)));
7174   format %{ "vpsubd  $dst,$src,$mem\t! sub packed4I" %}
7175   ins_encode %{
7176     int vector_len = 0;
7177     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7178   %}
7179   ins_pipe( pipe_slow );
7180 %}
7181 
7182 instruct vsub8I_reg(vecY dst, vecY src1, vecY src2) %{
7183   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
7184   match(Set dst (SubVI src1 src2));
7185   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed8I" %}
7186   ins_encode %{
7187     int vector_len = 1;
7188     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7189   %}
7190   ins_pipe( pipe_slow );
7191 %}
7192 
7193 instruct vsub8I_mem(vecY dst, vecY src, memory mem) %{
7194   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
7195   match(Set dst (SubVI src (LoadVector mem)));
7196   format %{ "vpsubd  $dst,$src,$mem\t! sub packed8I" %}
7197   ins_encode %{
7198     int vector_len = 1;
7199     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7200   %}
7201   ins_pipe( pipe_slow );
7202 %}
7203 
7204 instruct vsub16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
7205   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7206   match(Set dst (SubVI src1 src2));
7207   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed16I" %}
7208   ins_encode %{
7209     int vector_len = 2;
7210     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7211   %}
7212   ins_pipe( pipe_slow );
7213 %}
7214 
7215 instruct vsub16I_mem(vecZ dst, vecZ src, memory mem) %{
7216   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7217   match(Set dst (SubVI src (LoadVector mem)));
7218   format %{ "vpsubd  $dst,$src,$mem\t! sub packed16I" %}
7219   ins_encode %{
7220     int vector_len = 2;
7221     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7222   %}
7223   ins_pipe( pipe_slow );
7224 %}
7225 
7226 // Longs vector sub
7227 instruct vsub2L(vecX dst, vecX src) %{
7228   predicate(n->as_Vector()->length() == 2);
7229   match(Set dst (SubVL dst src));
7230   format %{ "psubq   $dst,$src\t! sub packed2L" %}
7231   ins_encode %{
7232     __ psubq($dst$$XMMRegister, $src$$XMMRegister);
7233   %}
7234   ins_pipe( pipe_slow );
7235 %}
7236 
7237 instruct vsub2L_reg(vecX dst, vecX src1, vecX src2) %{
7238   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7239   match(Set dst (SubVL src1 src2));
7240   format %{ "vpsubq  $dst,$src1,$src2\t! sub packed2L" %}
7241   ins_encode %{
7242     int vector_len = 0;
7243     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7244   %}
7245   ins_pipe( pipe_slow );
7246 %}
7247 
7248 instruct vsub2L_mem(vecX dst, vecX src, memory mem) %{
7249   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7250   match(Set dst (SubVL src (LoadVector mem)));
7251   format %{ "vpsubq  $dst,$src,$mem\t! sub packed2L" %}
7252   ins_encode %{
7253     int vector_len = 0;
7254     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7255   %}
7256   ins_pipe( pipe_slow );
7257 %}
7258 
7259 instruct vsub4L_reg(vecY dst, vecY src1, vecY src2) %{
7260   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
7261   match(Set dst (SubVL src1 src2));
7262   format %{ "vpsubq  $dst,$src1,$src2\t! sub packed4L" %}
7263   ins_encode %{
7264     int vector_len = 1;
7265     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7266   %}
7267   ins_pipe( pipe_slow );
7268 %}
7269 
7270 instruct vsub4L_mem(vecY dst, vecY src, memory mem) %{
7271   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
7272   match(Set dst (SubVL src (LoadVector mem)));
7273   format %{ "vpsubq  $dst,$src,$mem\t! sub packed4L" %}
7274   ins_encode %{
7275     int vector_len = 1;
7276     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7277   %}
7278   ins_pipe( pipe_slow );
7279 %}
7280 
7281 instruct vsub8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
7282   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7283   match(Set dst (SubVL src1 src2));
7284   format %{ "vpsubq  $dst,$src1,$src2\t! sub packed8L" %}
7285   ins_encode %{
7286     int vector_len = 2;
7287     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7288   %}
7289   ins_pipe( pipe_slow );
7290 %}
7291 
7292 instruct vsub8L_mem(vecZ dst, vecZ src, memory mem) %{
7293   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7294   match(Set dst (SubVL src (LoadVector mem)));
7295   format %{ "vpsubq  $dst,$src,$mem\t! sub packed8L" %}
7296   ins_encode %{
7297     int vector_len = 2;
7298     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7299   %}
7300   ins_pipe( pipe_slow );
7301 %}
7302 
7303 // Floats vector sub
7304 instruct vsub2F(vecD dst, vecD src) %{
7305   predicate(n->as_Vector()->length() == 2);
7306   match(Set dst (SubVF dst src));
7307   format %{ "subps   $dst,$src\t! sub packed2F" %}
7308   ins_encode %{
7309     __ subps($dst$$XMMRegister, $src$$XMMRegister);
7310   %}
7311   ins_pipe( pipe_slow );
7312 %}
7313 
7314 instruct vsub2F_reg(vecD dst, vecD src1, vecD src2) %{
7315   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7316   match(Set dst (SubVF src1 src2));
7317   format %{ "vsubps  $dst,$src1,$src2\t! sub packed2F" %}
7318   ins_encode %{
7319     int vector_len = 0;
7320     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7321   %}
7322   ins_pipe( pipe_slow );
7323 %}
7324 
7325 instruct vsub2F_mem(vecD dst, vecD src, memory mem) %{
7326   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7327   match(Set dst (SubVF src (LoadVector mem)));
7328   format %{ "vsubps  $dst,$src,$mem\t! sub packed2F" %}
7329   ins_encode %{
7330     int vector_len = 0;
7331     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7332   %}
7333   ins_pipe( pipe_slow );
7334 %}
7335 
7336 instruct vsub4F(vecX dst, vecX src) %{
7337   predicate(n->as_Vector()->length() == 4);
7338   match(Set dst (SubVF dst src));
7339   format %{ "subps   $dst,$src\t! sub packed4F" %}
7340   ins_encode %{
7341     __ subps($dst$$XMMRegister, $src$$XMMRegister);
7342   %}
7343   ins_pipe( pipe_slow );
7344 %}
7345 
7346 instruct vsub4F_reg(vecX dst, vecX src1, vecX src2) %{
7347   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7348   match(Set dst (SubVF src1 src2));
7349   format %{ "vsubps  $dst,$src1,$src2\t! sub packed4F" %}
7350   ins_encode %{
7351     int vector_len = 0;
7352     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7353   %}
7354   ins_pipe( pipe_slow );
7355 %}
7356 
7357 instruct vsub4F_mem(vecX dst, vecX src, memory mem) %{
7358   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7359   match(Set dst (SubVF src (LoadVector mem)));
7360   format %{ "vsubps  $dst,$src,$mem\t! sub packed4F" %}
7361   ins_encode %{
7362     int vector_len = 0;
7363     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7364   %}
7365   ins_pipe( pipe_slow );
7366 %}
7367 
7368 instruct vsub8F_reg(vecY dst, vecY src1, vecY src2) %{
7369   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7370   match(Set dst (SubVF src1 src2));
7371   format %{ "vsubps  $dst,$src1,$src2\t! sub packed8F" %}
7372   ins_encode %{
7373     int vector_len = 1;
7374     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7375   %}
7376   ins_pipe( pipe_slow );
7377 %}
7378 
7379 instruct vsub8F_mem(vecY dst, vecY src, memory mem) %{
7380   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7381   match(Set dst (SubVF src (LoadVector mem)));
7382   format %{ "vsubps  $dst,$src,$mem\t! sub packed8F" %}
7383   ins_encode %{
7384     int vector_len = 1;
7385     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7386   %}
7387   ins_pipe( pipe_slow );
7388 %}
7389 
7390 instruct vsub16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
7391   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7392   match(Set dst (SubVF src1 src2));
7393   format %{ "vsubps  $dst,$src1,$src2\t! sub packed16F" %}
7394   ins_encode %{
7395     int vector_len = 2;
7396     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7397   %}
7398   ins_pipe( pipe_slow );
7399 %}
7400 
7401 instruct vsub16F_mem(vecZ dst, vecZ src, memory mem) %{
7402   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7403   match(Set dst (SubVF src (LoadVector mem)));
7404   format %{ "vsubps  $dst,$src,$mem\t! sub packed16F" %}
7405   ins_encode %{
7406     int vector_len = 2;
7407     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7408   %}
7409   ins_pipe( pipe_slow );
7410 %}
7411 
7412 // Doubles vector sub
7413 instruct vsub2D(vecX dst, vecX src) %{
7414   predicate(n->as_Vector()->length() == 2);
7415   match(Set dst (SubVD dst src));
7416   format %{ "subpd   $dst,$src\t! sub packed2D" %}
7417   ins_encode %{
7418     __ subpd($dst$$XMMRegister, $src$$XMMRegister);
7419   %}
7420   ins_pipe( pipe_slow );
7421 %}
7422 
7423 instruct vsub2D_reg(vecX dst, vecX src1, vecX src2) %{
7424   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7425   match(Set dst (SubVD src1 src2));
7426   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed2D" %}
7427   ins_encode %{
7428     int vector_len = 0;
7429     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7430   %}
7431   ins_pipe( pipe_slow );
7432 %}
7433 
7434 instruct vsub2D_mem(vecX dst, vecX src, memory mem) %{
7435   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7436   match(Set dst (SubVD src (LoadVector mem)));
7437   format %{ "vsubpd  $dst,$src,$mem\t! sub packed2D" %}
7438   ins_encode %{
7439     int vector_len = 0;
7440     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7441   %}
7442   ins_pipe( pipe_slow );
7443 %}
7444 
7445 instruct vsub4D_reg(vecY dst, vecY src1, vecY src2) %{
7446   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7447   match(Set dst (SubVD src1 src2));
7448   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed4D" %}
7449   ins_encode %{
7450     int vector_len = 1;
7451     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7452   %}
7453   ins_pipe( pipe_slow );
7454 %}
7455 
7456 instruct vsub4D_mem(vecY dst, vecY src, memory mem) %{
7457   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7458   match(Set dst (SubVD src (LoadVector mem)));
7459   format %{ "vsubpd  $dst,$src,$mem\t! sub packed4D" %}
7460   ins_encode %{
7461     int vector_len = 1;
7462     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7463   %}
7464   ins_pipe( pipe_slow );
7465 %}
7466 
7467 instruct vsub8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
7468   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7469   match(Set dst (SubVD src1 src2));
7470   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed8D" %}
7471   ins_encode %{
7472     int vector_len = 2;
7473     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7474   %}
7475   ins_pipe( pipe_slow );
7476 %}
7477 
7478 instruct vsub8D_mem(vecZ dst, vecZ src, memory mem) %{
7479   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7480   match(Set dst (SubVD src (LoadVector mem)));
7481   format %{ "vsubpd  $dst,$src,$mem\t! sub packed8D" %}
7482   ins_encode %{
7483     int vector_len = 2;
7484     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7485   %}
7486   ins_pipe( pipe_slow );
7487 %}
7488 
7489 // --------------------------------- MUL --------------------------------------
7490 
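// Note: the integer multiply rules keep only the low half of each lane
// product (pmullw: low 16 bits of a 16x16 product, pmulld: low 32 bits of a
// 32x32 product, vpmullq: low 64 bits of a 64x64 product), which matches
// Java's wrap-around multiply semantics per element.  A scalar C sketch of
// what "mul packed4I" computes (hypothetical helper, for illustration only):
//
//   void mul_packed4I(int32_t dst[4], const int32_t src[4]) {
//     for (int i = 0; i < 4; i++) {
//       dst[i] = (int32_t)((int64_t)dst[i] * src[i]);  // low 32 bits kept
//     }
//   }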
7491 // Shorts/Chars vector mul
7492 instruct vmul2S(vecS dst, vecS src) %{
7493   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
7494   match(Set dst (MulVS dst src));
7495   format %{ "pmullw $dst,$src\t! mul packed2S" %}
7496   ins_encode %{
7497     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
7498   %}
7499   ins_pipe( pipe_slow );
7500 %}
7501 
7502 instruct vmul2S_reg_avx(vecS dst, vecS src1, vecS src2) %{
7503   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
7504   match(Set dst (MulVS src1 src2));
7505   format %{ "vpmullw $dst,$src1,$src2\t! mul packed2S" %}
7506   ins_encode %{
7507     int vector_len = 0;
7508     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7509   %}
7510   ins_pipe( pipe_slow );
7511 %}
7512 
7513 instruct vmul2S_reg_evex(vecS dst, vecS src1, vecS src2) %{
7514   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
7515   match(Set dst (MulVS src1 src2));
7516   format %{ "vpmullw $dst,$src1,$src2\t! mul packed2S" %}
7517   ins_encode %{
7518     int vector_len = 0;
7519     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7520   %}
7521   ins_pipe( pipe_slow );
7522 %}
7523 
7524 instruct vmul2S_reg_evex_special(vecS dst, vecS src1, vecS src2) %{
7525   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
7526   match(Set dst (MulVS dst src2));
7527   effect(TEMP src1);
7528   format %{ "vpmullw $dst,$src1,$src2\t! mul packed2S" %}
7529   ins_encode %{
7530     int vector_len = 0;
7531     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7532   %}
7533   ins_pipe( pipe_slow );
7534 %}
7535 
7536 instruct vmul2S_mem_avx(vecS dst, vecS src, memory mem) %{
7537   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
7538   match(Set dst (MulVS src (LoadVector mem)));
7539   format %{ "vpmullw $dst,$src,$mem\t! mul packed2S" %}
7540   ins_encode %{
7541     int vector_len = 0;
7542     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7543   %}
7544   ins_pipe( pipe_slow );
7545 %}
7546 
7547 instruct vmul2S_mem_evex(vecS dst, vecS src, memory mem) %{
7548   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
7549   match(Set dst (MulVS src (LoadVector mem)));
7550   format %{ "vpmullw $dst,$src,$mem\t! mul packed2S" %}
7551   ins_encode %{
7552     int vector_len = 0;
7553     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7554   %}
7555   ins_pipe( pipe_slow );
7556 %}
7557 
7558 instruct vmul2S_mem_evex_special(vecS dst, vecS src, memory mem) %{
7559   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
7560   match(Set dst (MulVS dst (LoadVector mem)));
7561   effect(TEMP src);
7562   format %{ "vpmullw $dst,$src,$mem\t! mul packed2S" %}
7563   ins_encode %{
7564     int vector_len = 0;
7565     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7566   %}
7567   ins_pipe( pipe_slow );
7568 %}
7569 
7570 instruct vmul4S(vecD dst, vecD src) %{
7571   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
7572   match(Set dst (MulVS dst src));
7573   format %{ "pmullw  $dst,$src\t! mul packed4S" %}
7574   ins_encode %{
7575     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
7576   %}
7577   ins_pipe( pipe_slow );
7578 %}
7579 
7580 instruct vmul4S_reg_avx(vecD dst, vecD src1, vecD src2) %{
7581   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
7582   match(Set dst (MulVS src1 src2));
7583   format %{ "vpmullw $dst,$src1,$src2\t! mul packed4S" %}
7584   ins_encode %{
7585     int vector_len = 0;
7586     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7587   %}
7588   ins_pipe( pipe_slow );
7589 %}
7590 
7591 instruct vmul4S_reg_evex(vecD dst, vecD src1, vecD src2) %{
7592   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
7593   match(Set dst (MulVS src1 src2));
7594   format %{ "vpmullw $dst,$src1,$src2\t! mul packed4S" %}
7595   ins_encode %{
7596     int vector_len = 0;
7597     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7598   %}
7599   ins_pipe( pipe_slow );
7600 %}
7601 
7602 instruct vmul4S_reg_evex_special(vecD dst, vecD src1, vecD src2) %{
7603   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
7604   match(Set dst (MulVS dst src2));
7605   effect(TEMP src1);
7606   format %{ "vpmullw $dst,$src1,$src2\t! mul packed4S" %}
7607   ins_encode %{
7608     int vector_len = 0;
7609     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7610   %}
7611   ins_pipe( pipe_slow );
7612 %}
7613 
7614 instruct vmul4S_mem_avx(vecD dst, vecD src, memory mem) %{
7615   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
7616   match(Set dst (MulVS src (LoadVector mem)));
7617   format %{ "vpmullw $dst,$src,$mem\t! mul packed4S" %}
7618   ins_encode %{
7619     int vector_len = 0;
7620     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7621   %}
7622   ins_pipe( pipe_slow );
7623 %}
7624 
7625 instruct vmul4S_mem_evex(vecD dst, vecD src, memory mem) %{
7626   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
7627   match(Set dst (MulVS src (LoadVector mem)));
7628   format %{ "vpmullw $dst,$src,$mem\t! mul packed4S" %}
7629   ins_encode %{
7630     int vector_len = 0;
7631     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7632   %}
7633   ins_pipe( pipe_slow );
7634 %}
7635 
7636 instruct vmul4S_mem_evex_special(vecD dst, vecD src, memory mem) %{
7637   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
7638   match(Set dst (MulVS dst (LoadVector mem)));
7639   effect(TEMP src);
7640   format %{ "vpmullw $dst,$src,$mem\t! mul packed4S" %}
7641   ins_encode %{
7642     int vector_len = 0;
7643     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7644   %}
7645   ins_pipe( pipe_slow );
7646 %}
7647 
7648 instruct vmul8S(vecX dst, vecX src) %{
7649   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
7650   match(Set dst (MulVS dst src));
7651   format %{ "pmullw  $dst,$src\t! mul packed8S" %}
7652   ins_encode %{
7653     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
7654   %}
7655   ins_pipe( pipe_slow );
7656 %}
7657 
7658 instruct vmul8S_reg_avx(vecX dst, vecX src1, vecX src2) %{
7659   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
7660   match(Set dst (MulVS src1 src2));
7661   format %{ "vpmullw $dst,$src1,$src2\t! mul packed8S" %}
7662   ins_encode %{
7663     int vector_len = 0;
7664     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7665   %}
7666   ins_pipe( pipe_slow );
7667 %}
7668 
7669 instruct vmul8S_reg_evex(vecX dst, vecX src1, vecX src2) %{
7670   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
7671   match(Set dst (MulVS src1 src2));
7672   format %{ "vpmullw $dst,$src1,$src2\t! mul packed8S" %}
7673   ins_encode %{
7674     int vector_len = 0;
7675     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7676   %}
7677   ins_pipe( pipe_slow );
7678 %}
7679 
7680 instruct vmul8S_reg_evex_special(vecX dst, vecX src1, vecX src2) %{
7681   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
7682   match(Set dst (MulVS dst src2));
7683   effect(TEMP src1);
7684   format %{ "vpmullw $dst,$src1,$src2\t! mul packed8S" %}
7685   ins_encode %{
7686     int vector_len = 0;
7687     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7688   %}
7689   ins_pipe( pipe_slow );
7690 %}
7691 
7692 instruct vmul8S_mem_avx(vecX dst, vecX src, memory mem) %{
7693   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
7694   match(Set dst (MulVS src (LoadVector mem)));
7695   format %{ "vpmullw $dst,$src,$mem\t! mul packed8S" %}
7696   ins_encode %{
7697     int vector_len = 0;
7698     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7699   %}
7700   ins_pipe( pipe_slow );
7701 %}
7702 
7703 instruct vmul8S_mem_evex(vecX dst, vecX src, memory mem) %{
7704   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
7705   match(Set dst (MulVS src (LoadVector mem)));
7706   format %{ "vpmullw $dst,$src,$mem\t! mul packed8S" %}
7707   ins_encode %{
7708     int vector_len = 0;
7709     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7710   %}
7711   ins_pipe( pipe_slow );
7712 %}
7713 
7714 instruct vmul8S_mem_evex_special(vecX dst, vecX src, memory mem) %{
7715   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
7716   match(Set dst (MulVS dst (LoadVector mem)));
7717   effect(TEMP src);
7718   format %{ "vpmullw $dst,$src,$mem\t! mul packed8S" %}
7719   ins_encode %{
7720     int vector_len = 0;
7721     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7722   %}
7723   ins_pipe( pipe_slow );
7724 %}
7725 
7726 instruct vmul16S_reg_avx(vecY dst, vecY src1, vecY src2) %{
7727   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
7728   match(Set dst (MulVS src1 src2));
7729   format %{ "vpmullw $dst,$src1,$src2\t! mul packed16S" %}
7730   ins_encode %{
7731     int vector_len = 1;
7732     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7733   %}
7734   ins_pipe( pipe_slow );
7735 %}
7736 
7737 instruct vmul16S_reg_evex(vecY dst, vecY src1, vecY src2) %{
7738   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
7739   match(Set dst (MulVS src1 src2));
7740   format %{ "vpmullw $dst,$src1,$src2\t! mul packed16S" %}
7741   ins_encode %{
7742     int vector_len = 1;
7743     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7744   %}
7745   ins_pipe( pipe_slow );
7746 %}
7747 
7748 instruct vmul16S_reg_evex_special(vecY dst, vecY src1, vecY src2) %{
7749   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
7750   match(Set dst (MulVS dst src2));
7751   effect(TEMP src1);
7752   format %{ "vpmullw $dst,$src1,$src2\t! mul packed16S" %}
7753   ins_encode %{
7754     int vector_len = 1;
7755     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7756   %}
7757   ins_pipe( pipe_slow );
7758 %}
7759 
7760 instruct vmul16S_mem_avx(vecY dst, vecY src, memory mem) %{
7761   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
7762   match(Set dst (MulVS src (LoadVector mem)));
7763   format %{ "vpmullw $dst,$src,$mem\t! mul packed16S" %}
7764   ins_encode %{
7765     int vector_len = 1;
7766     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7767   %}
7768   ins_pipe( pipe_slow );
7769 %}
7770 
7771 instruct vmul16S_mem_evex(vecY dst, vecY src, memory mem) %{
7772   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
7773   match(Set dst (MulVS src (LoadVector mem)));
7774   format %{ "vpmullw $dst,$src,$mem\t! mul packed16S" %}
7775   ins_encode %{
7776     int vector_len = 1;
7777     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7778   %}
7779   ins_pipe( pipe_slow );
7780 %}
7781 
7782 instruct vmul16S_mem_evex_special(vecY dst, vecY src, memory mem) %{
7783   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
7784   match(Set dst (MulVS dst (LoadVector mem)));
7785   effect(TEMP src);
7786   format %{ "vpmullw $dst,$src,$mem\t! mul packed16S" %}
7787   ins_encode %{
7788     int vector_len = 1;
7789     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7790   %}
7791   ins_pipe( pipe_slow );
7792 %}
7793 
7794 instruct vmul32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
7795   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
7796   match(Set dst (MulVS src1 src2));
7797   format %{ "vpmullw $dst,$src1,$src2\t! mul packed32S" %}
7798   ins_encode %{
7799     int vector_len = 2;
7800     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7801   %}
7802   ins_pipe( pipe_slow );
7803 %}
7804 
7805 instruct vmul32S_mem(vecZ dst, vecZ src, memory mem) %{
7806   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
7807   match(Set dst (MulVS src (LoadVector mem)));
7808   format %{ "vpmullw $dst,$src,$mem\t! mul packed32S" %}
7809   ins_encode %{
7810     int vector_len = 2;
7811     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7812   %}
7813   ins_pipe( pipe_slow );
7814 %}
7815 
7816 // Integers vector mul (sse4_1)
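// Note: pmulld was introduced with SSE4.1, hence the UseSSE > 3 guard on the
// non-AVX forms below (pmullw above only needs SSE2, so the short-multiply
// forms carry no UseSSE check).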
7817 instruct vmul2I(vecD dst, vecD src) %{
7818   predicate(UseSSE > 3 && n->as_Vector()->length() == 2);
7819   match(Set dst (MulVI dst src));
7820   format %{ "pmulld  $dst,$src\t! mul packed2I" %}
7821   ins_encode %{
7822     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
7823   %}
7824   ins_pipe( pipe_slow );
7825 %}
7826 
7827 instruct vmul2I_reg(vecD dst, vecD src1, vecD src2) %{
7828   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7829   match(Set dst (MulVI src1 src2));
7830   format %{ "vpmulld $dst,$src1,$src2\t! mul packed2I" %}
7831   ins_encode %{
7832     int vector_len = 0;
7833     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7834   %}
7835   ins_pipe( pipe_slow );
7836 %}
7837 
7838 instruct vmul2I_mem(vecD dst, vecD src, memory mem) %{
7839   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7840   match(Set dst (MulVI src (LoadVector mem)));
7841   format %{ "vpmulld $dst,$src,$mem\t! mul packed2I" %}
7842   ins_encode %{
7843     int vector_len = 0;
7844     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7845   %}
7846   ins_pipe( pipe_slow );
7847 %}
7848 
7849 instruct vmul4I(vecX dst, vecX src) %{
7850   predicate(UseSSE > 3 && n->as_Vector()->length() == 4);
7851   match(Set dst (MulVI dst src));
7852   format %{ "pmulld  $dst,$src\t! mul packed4I" %}
7853   ins_encode %{
7854     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
7855   %}
7856   ins_pipe( pipe_slow );
7857 %}
7858 
7859 instruct vmul4I_reg(vecX dst, vecX src1, vecX src2) %{
7860   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7861   match(Set dst (MulVI src1 src2));
7862   format %{ "vpmulld $dst,$src1,$src2\t! mul packed4I" %}
7863   ins_encode %{
7864     int vector_len = 0;
7865     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7866   %}
7867   ins_pipe( pipe_slow );
7868 %}
7869 
7870 instruct vmul4I_mem(vecX dst, vecX src, memory mem) %{
7871   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7872   match(Set dst (MulVI src (LoadVector mem)));
7873   format %{ "vpmulld $dst,$src,$mem\t! mul packed4I" %}
7874   ins_encode %{
7875     int vector_len = 0;
7876     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7877   %}
7878   ins_pipe( pipe_slow );
7879 %}
7880 
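// Longs vector mul
// Note: a packed 64x64 -> low-64 multiply (vpmullq) exists only with
// AVX-512DQ, so every MulVL rule below requires supports_avx512dq().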
7881 instruct vmul2L_reg(vecX dst, vecX src1, vecX src2) %{
7882   predicate(UseAVX > 2 && n->as_Vector()->length() == 2 && VM_Version::supports_avx512dq());
7883   match(Set dst (MulVL src1 src2));
7884   format %{ "vpmullq $dst,$src1,$src2\t! mul packed2L" %}
7885   ins_encode %{
7886     int vector_len = 0;
7887     __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7888   %}
7889   ins_pipe( pipe_slow );
7890 %}
7891 
7892 instruct vmul2L_mem(vecX dst, vecX src, memory mem) %{
7893   predicate(UseAVX > 2 && n->as_Vector()->length() == 2 && VM_Version::supports_avx512dq());
7894   match(Set dst (MulVL src (LoadVector mem)));
7895   format %{ "vpmullq $dst,$src,$mem\t! mul packed2L" %}
7896   ins_encode %{
7897     int vector_len = 0;
7898     __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7899   %}
7900   ins_pipe( pipe_slow );
7901 %}
7902 
7903 instruct vmul4L_reg(vecY dst, vecY src1, vecY src2) %{
7904   predicate(UseAVX > 2 && n->as_Vector()->length() == 4 && VM_Version::supports_avx512dq());
7905   match(Set dst (MulVL src1 src2));
7906   format %{ "vpmullq $dst,$src1,$src2\t! mul packed4L" %}
7907   ins_encode %{
7908     int vector_len = 1;
7909     __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7910   %}
7911   ins_pipe( pipe_slow );
7912 %}
7913 
7914 instruct vmul4L_mem(vecY dst, vecY src, memory mem) %{
7915   predicate(UseAVX > 2 && n->as_Vector()->length() == 4 && VM_Version::supports_avx512dq());
7916   match(Set dst (MulVL src (LoadVector mem)));
7917   format %{ "vpmullq $dst,$src,$mem\t! mul packed4L" %}
7918   ins_encode %{
7919     int vector_len = 1;
7920     __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7921   %}
7922   ins_pipe( pipe_slow );
7923 %}
7924 
7925 instruct vmul8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
7926   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && VM_Version::supports_avx512dq());
7927   match(Set dst (MulVL src1 src2));
7928   format %{ "vpmullq $dst,$src1,$src2\t! mul packed8L" %}
7929   ins_encode %{
7930     int vector_len = 2;
7931     __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7932   %}
7933   ins_pipe( pipe_slow );
7934 %}
7935 
7936 instruct vmul8L_mem(vecZ dst, vecZ src, memory mem) %{
7937   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && VM_Version::supports_avx512dq());
7938   match(Set dst (MulVL src (LoadVector mem)));
7939   format %{ "vpmullq $dst,$src,$mem\t! mul packed8L" %}
7940   ins_encode %{
7941     int vector_len = 2;
7942     __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7943   %}
7944   ins_pipe( pipe_slow );
7945 %}
7946 
7947 instruct vmul8I_reg(vecY dst, vecY src1, vecY src2) %{
7948   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
7949   match(Set dst (MulVI src1 src2));
7950   format %{ "vpmulld $dst,$src1,$src2\t! mul packed8I" %}
7951   ins_encode %{
7952     int vector_len = 1;
7953     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7954   %}
7955   ins_pipe( pipe_slow );
7956 %}
7957 
7958 instruct vmul8I_mem(vecY dst, vecY src, memory mem) %{
7959   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
7960   match(Set dst (MulVI src (LoadVector mem)));
7961   format %{ "vpmulld $dst,$src,$mem\t! mul packed8I" %}
7962   ins_encode %{
7963     int vector_len = 1;
7964     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7965   %}
7966   ins_pipe( pipe_slow );
7967 %}
7968 
7969 instruct vmul16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
7970   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7971   match(Set dst (MulVI src1 src2));
7972   format %{ "vpmulld $dst,$src1,$src2\t! mul packed16I" %}
7973   ins_encode %{
7974     int vector_len = 2;
7975     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7976   %}
7977   ins_pipe( pipe_slow );
7978 %}
7979 
7980 instruct vmul16I_mem(vecZ dst, vecZ src, memory mem) %{
7981   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7982   match(Set dst (MulVI src (LoadVector mem)));
7983   format %{ "vpmulld $dst,$src,$mem\t! mul packed16I" %}
7984   ins_encode %{
7985     int vector_len = 2;
7986     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7987   %}
7988   ins_pipe( pipe_slow );
7989 %}
7990 
7991 // Floats vector mul
7992 instruct vmul2F(vecD dst, vecD src) %{
7993   predicate(n->as_Vector()->length() == 2);
7994   match(Set dst (MulVF dst src));
7995   format %{ "mulps   $dst,$src\t! mul packed2F" %}
7996   ins_encode %{
7997     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
7998   %}
7999   ins_pipe( pipe_slow );
8000 %}
8001 
8002 instruct vmul2F_reg(vecD dst, vecD src1, vecD src2) %{
8003   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8004   match(Set dst (MulVF src1 src2));
8005   format %{ "vmulps  $dst,$src1,$src2\t! mul packed2F" %}
8006   ins_encode %{
8007     int vector_len = 0;
8008     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8009   %}
8010   ins_pipe( pipe_slow );
8011 %}
8012 
8013 instruct vmul2F_mem(vecD dst, vecD src, memory mem) %{
8014   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8015   match(Set dst (MulVF src (LoadVector mem)));
8016   format %{ "vmulps  $dst,$src,$mem\t! mul packed2F" %}
8017   ins_encode %{
8018     int vector_len = 0;
8019     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8020   %}
8021   ins_pipe( pipe_slow );
8022 %}
8023 
8024 instruct vmul4F(vecX dst, vecX src) %{
8025   predicate(n->as_Vector()->length() == 4);
8026   match(Set dst (MulVF dst src));
8027   format %{ "mulps   $dst,$src\t! mul packed4F" %}
8028   ins_encode %{
8029     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
8030   %}
8031   ins_pipe( pipe_slow );
8032 %}
8033 
8034 instruct vmul4F_reg(vecX dst, vecX src1, vecX src2) %{
8035   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8036   match(Set dst (MulVF src1 src2));
8037   format %{ "vmulps  $dst,$src1,$src2\t! mul packed4F" %}
8038   ins_encode %{
8039     int vector_len = 0;
8040     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8041   %}
8042   ins_pipe( pipe_slow );
8043 %}
8044 
8045 instruct vmul4F_mem(vecX dst, vecX src, memory mem) %{
8046   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8047   match(Set dst (MulVF src (LoadVector mem)));
8048   format %{ "vmulps  $dst,$src,$mem\t! mul packed4F" %}
8049   ins_encode %{
8050     int vector_len = 0;
8051     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8052   %}
8053   ins_pipe( pipe_slow );
8054 %}
8055 
8056 instruct vmul8F_reg(vecY dst, vecY src1, vecY src2) %{
8057   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8058   match(Set dst (MulVF src1 src2));
8059   format %{ "vmulps  $dst,$src1,$src2\t! mul packed8F" %}
8060   ins_encode %{
8061     int vector_len = 1;
8062     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8063   %}
8064   ins_pipe( pipe_slow );
8065 %}
8066 
8067 instruct vmul8F_mem(vecY dst, vecY src, memory mem) %{
8068   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8069   match(Set dst (MulVF src (LoadVector mem)));
8070   format %{ "vmulps  $dst,$src,$mem\t! mul packed8F" %}
8071   ins_encode %{
8072     int vector_len = 1;
8073     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8074   %}
8075   ins_pipe( pipe_slow );
8076 %}
8077 
8078 instruct vmul16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
8079   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8080   match(Set dst (MulVF src1 src2));
8081   format %{ "vmulps  $dst,$src1,$src2\t! mul packed16F" %}
8082   ins_encode %{
8083     int vector_len = 2;
8084     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8085   %}
8086   ins_pipe( pipe_slow );
8087 %}
8088 
8089 instruct vmul16F_mem(vecZ dst, vecZ src, memory mem) %{
8090   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8091   match(Set dst (MulVF src (LoadVector mem)));
8092   format %{ "vmulps  $dst,$src,$mem\t! mul packed16F" %}
8093   ins_encode %{
8094     int vector_len = 2;
8095     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8096   %}
8097   ins_pipe( pipe_slow );
8098 %}
8099 
8100 // Doubles vector mul
8101 instruct vmul2D(vecX dst, vecX src) %{
8102   predicate(n->as_Vector()->length() == 2);
8103   match(Set dst (MulVD dst src));
8104   format %{ "mulpd   $dst,$src\t! mul packed2D" %}
8105   ins_encode %{
8106     __ mulpd($dst$$XMMRegister, $src$$XMMRegister);
8107   %}
8108   ins_pipe( pipe_slow );
8109 %}
8110 
8111 instruct vmul2D_reg(vecX dst, vecX src1, vecX src2) %{
8112   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8113   match(Set dst (MulVD src1 src2));
8114   format %{ "vmulpd  $dst,$src1,$src2\t! mul packed2D" %}
8115   ins_encode %{
8116     int vector_len = 0;
8117     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8118   %}
8119   ins_pipe( pipe_slow );
8120 %}
8121 
8122 instruct vmul2D_mem(vecX dst, vecX src, memory mem) %{
8123   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8124   match(Set dst (MulVD src (LoadVector mem)));
8125   format %{ "vmulpd  $dst,$src,$mem\t! mul packed2D" %}
8126   ins_encode %{
8127     int vector_len = 0;
8128     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8129   %}
8130   ins_pipe( pipe_slow );
8131 %}
8132 
8133 instruct vmul4D_reg(vecY dst, vecY src1, vecY src2) %{
8134   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8135   match(Set dst (MulVD src1 src2));
8136   format %{ "vmulpd  $dst,$src1,$src2\t! mul packed4D" %}
8137   ins_encode %{
8138     int vector_len = 1;
8139     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8140   %}
8141   ins_pipe( pipe_slow );
8142 %}
8143 
8144 instruct vmul4D_mem(vecY dst, vecY src, memory mem) %{
8145   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8146   match(Set dst (MulVD src (LoadVector mem)));
8147   format %{ "vmulpd  $dst,$src,$mem\t! mul packed4D" %}
8148   ins_encode %{
8149     int vector_len = 1;
8150     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8151   %}
8152   ins_pipe( pipe_slow );
8153 %}
8154 
8155 instruct vmul8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
8156   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8157   match(Set dst (MulVD src1 src2));
8158   format %{ "vmulpd  $dst,$src1,$src2\t! mul packed8D" %}
8159   ins_encode %{
8160     int vector_len = 2;
8161     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8162   %}
8163   ins_pipe( pipe_slow );
8164 %}
8165 
8166 instruct vmul8D_mem(vecZ dst, vecZ src, memory mem) %{
8167   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8168   match(Set dst (MulVD src (LoadVector mem)));
8169   format %{ "vmulpd  $dst,$src,$mem\t! mul packed8D" %}
8170   ins_encode %{
8171     int vector_len = 2;
8172     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8173   %}
8174   ins_pipe( pipe_slow );
8175 %}
8176 
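// Vector conditional move (CMoveVF/CMoveVD): no single AVX/AVX2 instruction
// performs a per-lane compare-and-select, so the rules below (limited to
// UseAVX 1-2 by their predicates) first build a per-lane mask with
// cmpps/cmppd and then pick lanes from src1 or src2 with blendvps/blendvpd,
// using $dst as the temporary mask register (hence the TEMP dst effect).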
8177 instruct vcmov8F_reg(vecY dst, vecY src1, vecY src2, immI8 cop, cmpOp_vcmppd copnd) %{
8178   predicate(UseAVX > 0 && UseAVX < 3 && n->as_Vector()->length() == 8);
8179   match(Set dst (CMoveVF (Binary copnd cop) (Binary src1 src2)));
8180   effect(TEMP dst, USE src1, USE src2);
8181   format %{ "cmpps.$copnd  $dst, $src1, $src2  ! vcmovevf, cond=$cop\n\t"
8182             "blendvps $dst,$src1,$src2,$dst ! vcmovevf\n\t"
8183          %}
8184   ins_encode %{
8185     int vector_len = 1;
8186     int cond = (Assembler::Condition)($copnd$$cmpcode);
8187     __ cmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vector_len);
8188     __ blendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vector_len);
8189   %}
8190   ins_pipe( pipe_slow );
8191 %}
8192 
8193 instruct vcmov4D_reg(vecY dst, vecY src1, vecY src2, immI8 cop, cmpOp_vcmppd copnd) %{
8194   predicate(UseAVX > 0 && UseAVX < 3 && n->as_Vector()->length() == 4);
8195   match(Set dst (CMoveVD (Binary copnd cop) (Binary src1 src2)));
8196   effect(TEMP dst, USE src1, USE src2);
8197   format %{ "cmppd.$copnd  $dst, $src1, $src2  ! vcmovevd, cond=$cop\n\t"
8198             "blendvpd $dst,$src1,$src2,$dst ! vcmovevd\n\t"
8199          %}
8200   ins_encode %{
8201     int vector_len = 1;
8202     int cond = (Assembler::Condition)($copnd$$cmpcode);
8203     __ cmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vector_len);
8204     __ blendvpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vector_len);
8205   %}
8206   ins_pipe( pipe_slow );
8207 %}
8208 
8209 // --------------------------------- DIV --------------------------------------
8210 
8211 // Floats vector div
8212 instruct vdiv2F(vecD dst, vecD src) %{
8213   predicate(n->as_Vector()->length() == 2);
8214   match(Set dst (DivVF dst src));
8215   format %{ "divps   $dst,$src\t! div packed2F" %}
8216   ins_encode %{
8217     __ divps($dst$$XMMRegister, $src$$XMMRegister);
8218   %}
8219   ins_pipe( pipe_slow );
8220 %}
8221 
8222 instruct vdiv2F_reg(vecD dst, vecD src1, vecD src2) %{
8223   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8224   match(Set dst (DivVF src1 src2));
8225   format %{ "vdivps  $dst,$src1,$src2\t! div packed2F" %}
8226   ins_encode %{
8227     int vector_len = 0;
8228     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8229   %}
8230   ins_pipe( pipe_slow );
8231 %}
8232 
8233 instruct vdiv2F_mem(vecD dst, vecD src, memory mem) %{
8234   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8235   match(Set dst (DivVF src (LoadVector mem)));
8236   format %{ "vdivps  $dst,$src,$mem\t! div packed2F" %}
8237   ins_encode %{
8238     int vector_len = 0;
8239     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8240   %}
8241   ins_pipe( pipe_slow );
8242 %}
8243 
8244 instruct vdiv4F(vecX dst, vecX src) %{
8245   predicate(n->as_Vector()->length() == 4);
8246   match(Set dst (DivVF dst src));
8247   format %{ "divps   $dst,$src\t! div packed4F" %}
8248   ins_encode %{
8249     __ divps($dst$$XMMRegister, $src$$XMMRegister);
8250   %}
8251   ins_pipe( pipe_slow );
8252 %}
8253 
8254 instruct vdiv4F_reg(vecX dst, vecX src1, vecX src2) %{
8255   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8256   match(Set dst (DivVF src1 src2));
8257   format %{ "vdivps  $dst,$src1,$src2\t! div packed4F" %}
8258   ins_encode %{
8259     int vector_len = 0;
8260     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8261   %}
8262   ins_pipe( pipe_slow );
8263 %}
8264 
8265 instruct vdiv4F_mem(vecX dst, vecX src, memory mem) %{
8266   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8267   match(Set dst (DivVF src (LoadVector mem)));
8268   format %{ "vdivps  $dst,$src,$mem\t! div packed4F" %}
8269   ins_encode %{
8270     int vector_len = 0;
8271     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8272   %}
8273   ins_pipe( pipe_slow );
8274 %}
8275 
8276 instruct vdiv8F_reg(vecY dst, vecY src1, vecY src2) %{
8277   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8278   match(Set dst (DivVF src1 src2));
8279   format %{ "vdivps  $dst,$src1,$src2\t! div packed8F" %}
8280   ins_encode %{
8281     int vector_len = 1;
8282     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8283   %}
8284   ins_pipe( pipe_slow );
8285 %}
8286 
8287 instruct vdiv8F_mem(vecY dst, vecY src, memory mem) %{
8288   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8289   match(Set dst (DivVF src (LoadVector mem)));
8290   format %{ "vdivps  $dst,$src,$mem\t! div packed8F" %}
8291   ins_encode %{
8292     int vector_len = 1;
8293     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8294   %}
8295   ins_pipe( pipe_slow );
8296 %}
8297 
8298 instruct vdiv16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
8299   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8300   match(Set dst (DivVF src1 src2));
8301   format %{ "vdivps  $dst,$src1,$src2\t! div packed16F" %}
8302   ins_encode %{
8303     int vector_len = 2;
8304     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8305   %}
8306   ins_pipe( pipe_slow );
8307 %}
8308 
8309 instruct vdiv16F_mem(vecZ dst, vecZ src, memory mem) %{
8310   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8311   match(Set dst (DivVF src (LoadVector mem)));
8312   format %{ "vdivps  $dst,$src,$mem\t! div packed16F" %}
8313   ins_encode %{
8314     int vector_len = 2;
8315     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8316   %}
8317   ins_pipe( pipe_slow );
8318 %}
8319 
8320 // Doubles vector div
8321 instruct vdiv2D(vecX dst, vecX src) %{
8322   predicate(n->as_Vector()->length() == 2);
8323   match(Set dst (DivVD dst src));
8324   format %{ "divpd   $dst,$src\t! div packed2D" %}
8325   ins_encode %{
8326     __ divpd($dst$$XMMRegister, $src$$XMMRegister);
8327   %}
8328   ins_pipe( pipe_slow );
8329 %}
8330 
8331 instruct vdiv2D_reg(vecX dst, vecX src1, vecX src2) %{
8332   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8333   match(Set dst (DivVD src1 src2));
8334   format %{ "vdivpd  $dst,$src1,$src2\t! div packed2D" %}
8335   ins_encode %{
8336     int vector_len = 0;
8337     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8338   %}
8339   ins_pipe( pipe_slow );
8340 %}
8341 
8342 instruct vdiv2D_mem(vecX dst, vecX src, memory mem) %{
8343   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8344   match(Set dst (DivVD src (LoadVector mem)));
8345   format %{ "vdivpd  $dst,$src,$mem\t! div packed2D" %}
8346   ins_encode %{
8347     int vector_len = 0;
8348     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8349   %}
8350   ins_pipe( pipe_slow );
8351 %}
8352 
8353 instruct vdiv4D_reg(vecY dst, vecY src1, vecY src2) %{
8354   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8355   match(Set dst (DivVD src1 src2));
8356   format %{ "vdivpd  $dst,$src1,$src2\t! div packed4D" %}
8357   ins_encode %{
8358     int vector_len = 1;
8359     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8360   %}
8361   ins_pipe( pipe_slow );
8362 %}
8363 
8364 instruct vdiv4D_mem(vecY dst, vecY src, memory mem) %{
8365   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8366   match(Set dst (DivVD src (LoadVector mem)));
8367   format %{ "vdivpd  $dst,$src,$mem\t! div packed4D" %}
8368   ins_encode %{
8369     int vector_len = 1;
8370     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8371   %}
8372   ins_pipe( pipe_slow );
8373 %}
8374 
8375 instruct vdiv8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
8376   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8377   match(Set dst (DivVD src1 src2));
8378   format %{ "vdivpd  $dst,$src1,$src2\t! div packed8D" %}
8379   ins_encode %{
8380     int vector_len = 2;
8381     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8382   %}
8383   ins_pipe( pipe_slow );
8384 %}
8385 
8386 instruct vdiv8D_mem(vecZ dst, vecZ src, memory mem) %{
8387   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8388   match(Set dst (DivVD src (LoadVector mem)));
8389   format %{ "vdivpd  $dst,$src,$mem\t! div packed8D" %}
8390   ins_encode %{
8391     int vector_len = 2;
8392     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8393   %}
8394   ins_pipe( pipe_slow );
8395 %}
8396 
8397 // ------------------------------ Shift ---------------------------------------
8398 
8399 // Left and right shift count vectors are the same on x86
8400 // (only the lowest bits of the xmm register are used for the count).
8401 instruct vshiftcnt(vecS dst, rRegI cnt) %{
8402   match(Set dst (LShiftCntV cnt));
8403   match(Set dst (RShiftCntV cnt));
8404   format %{ "movd    $dst,$cnt\t! load shift count" %}
8405   ins_encode %{
8406     __ movdl($dst$$XMMRegister, $cnt$$Register);
8407   %}
8408   ins_pipe( pipe_slow );
8409 %}
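//
// Illustrative use (not part of the matched rules): for a Java loop such as
//
//   int s = ...;                                     // loop-invariant shift count
//   for (int i = 0; i < a.length; i++) a[i] <<= s;
//
// the compiler represents s with a single LShiftCntV/RShiftCntV node; the rule
// above moves s into the low bits of an xmm register once, and that same
// register then serves as the count operand for the vector shift rules further
// below, since the hardware reads only the low bits for the count.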
8410 
8411 // --------------------------------- Sqrt --------------------------------------
8412 
8413 // Floating point vector sqrt
8414 instruct vsqrt2D_reg(vecX dst, vecX src) %{
8415   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8416   match(Set dst (SqrtVD src));
8417   format %{ "vsqrtpd  $dst,$src\t! sqrt packed2D" %}
8418   ins_encode %{
8419     int vector_len = 0;
8420     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8421   %}
8422   ins_pipe( pipe_slow );
8423 %}
8424 
8425 instruct vsqrt2D_mem(vecX dst, memory mem) %{
8426   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8427   match(Set dst (SqrtVD (LoadVector mem)));
8428   format %{ "vsqrtpd  $dst,$mem\t! sqrt packed2D" %}
8429   ins_encode %{
8430     int vector_len = 0;
8431     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
8432   %}
8433   ins_pipe( pipe_slow );
8434 %}
8435 
8436 instruct vsqrt4D_reg(vecY dst, vecY src) %{
8437   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8438   match(Set dst (SqrtVD src));
8439   format %{ "vsqrtpd  $dst,$src\t! sqrt packed4D" %}
8440   ins_encode %{
8441     int vector_len = 1;
8442     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8443   %}
8444   ins_pipe( pipe_slow );
8445 %}
8446 
8447 instruct vsqrt4D_mem(vecY dst, memory mem) %{
8448   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8449   match(Set dst (SqrtVD (LoadVector mem)));
8450   format %{ "vsqrtpd  $dst,$mem\t! sqrt packed4D" %}
8451   ins_encode %{
8452     int vector_len = 1;
8453     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
8454   %}
8455   ins_pipe( pipe_slow );
8456 %}
8457 
8458 instruct vsqrt8D_reg(vecZ dst, vecZ src) %{
8459   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8460   match(Set dst (SqrtVD src));
8461   format %{ "vsqrtpd  $dst,$src\t! sqrt packed8D" %}
8462   ins_encode %{
8463     int vector_len = 2;
8464     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8465   %}
8466   ins_pipe( pipe_slow );
8467 %}
8468 
8469 instruct vsqrt8D_mem(vecZ dst, memory mem) %{
8470   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8471   match(Set dst (SqrtVD (LoadVector mem)));
8472   format %{ "vsqrtpd  $dst,$mem\t! sqrt packed8D" %}
8473   ins_encode %{
8474     int vector_len = 2;
8475     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
8476   %}
8477   ins_pipe( pipe_slow );
8478 %}
8479 
8480 instruct vsqrt2F_reg(vecD dst, vecD src) %{
8481   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8482   match(Set dst (SqrtVF src));
8483   format %{ "vsqrtps  $dst,$src\t! sqrt packed2F" %}
8484   ins_encode %{
8485     int vector_len = 0;
8486     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8487   %}
8488   ins_pipe( pipe_slow );
8489 %}
8490 
8491 instruct vsqrt2F_mem(vecD dst, memory mem) %{
8492   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8493   match(Set dst (SqrtVF (LoadVector mem)));
8494   format %{ "vsqrtps  $dst,$mem\t! sqrt packed2F" %}
8495   ins_encode %{
8496     int vector_len = 0;
8497     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vector_len);
8498   %}
8499   ins_pipe( pipe_slow );
8500 %}
8501 
8502 instruct vsqrt4F_reg(vecX dst, vecX src) %{
8503   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8504   match(Set dst (SqrtVF src));
8505   format %{ "vsqrtps  $dst,$src\t! sqrt packed4F" %}
8506   ins_encode %{
8507     int vector_len = 0;
8508     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8509   %}
8510   ins_pipe( pipe_slow );
8511 %}
8512 
8513 instruct vsqrt4F_mem(vecX dst, memory mem) %{
8514   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8515   match(Set dst (SqrtVF (LoadVector mem)));
8516   format %{ "vsqrtps  $dst,$mem\t! sqrt packed4F" %}
8517   ins_encode %{
8518     int vector_len = 0;
8519     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vector_len);
8520   %}
8521   ins_pipe( pipe_slow );
8522 %}
8523 
8524 instruct vsqrt8F_reg(vecY dst, vecY src) %{
8525   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8526   match(Set dst (SqrtVF src));
8527   format %{ "vsqrtps  $dst,$src\t! sqrt packed8F" %}
8528   ins_encode %{
8529     int vector_len = 1;
8530     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8531   %}
8532   ins_pipe( pipe_slow );
8533 %}
8534 
8535 instruct vsqrt8F_mem(vecY dst, memory mem) %{
8536   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8537   match(Set dst (SqrtVF (LoadVector mem)));
8538   format %{ "vsqrtps  $dst,$mem\t! sqrt packed8F" %}
8539   ins_encode %{
8540     int vector_len = 1;
8541     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vector_len);
8542   %}
8543   ins_pipe( pipe_slow );
8544 %}
8545 
8546 instruct vsqrt16F_reg(vecZ dst, vecZ src) %{
8547   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8548   match(Set dst (SqrtVF src));
8549   format %{ "vsqrtps  $dst,$src\t! sqrt packed16F" %}
8550   ins_encode %{
8551     int vector_len = 2;
8552     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8553   %}
8554   ins_pipe( pipe_slow );
8555 %}
8556 
8557 instruct vsqrt16F_mem(vecZ dst, memory mem) %{
8558   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8559   match(Set dst (SqrtVF (LoadVector mem)));
8560   format %{ "vsqrtps  $dst,$mem\t! sqrt packed16F" %}
8561   ins_encode %{
8562     int vector_len = 2;
8563     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vector_len);
8564   %}
8565   ins_pipe( pipe_slow );
8566 %}
8567 
8568 // ------------------------------ LeftShift -----------------------------------
8569 
8570 // Shorts/Chars vector left shift
8571 instruct vsll2S(vecS dst, vecS shift) %{
8572   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8573   match(Set dst (LShiftVS dst shift));
8574   format %{ "psllw   $dst,$shift\t! left shift packed2S" %}
8575   ins_encode %{
8576     __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
8577   %}
8578   ins_pipe( pipe_slow );
8579 %}
8580 
8581 instruct vsll2S_imm(vecS dst, immI8 shift) %{
8582   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8583   match(Set dst (LShiftVS dst shift));
8584   format %{ "psllw   $dst,$shift\t! left shift packed2S" %}
8585   ins_encode %{
8586     __ psllw($dst$$XMMRegister, (int)$shift$$constant);
8587   %}
8588   ins_pipe( pipe_slow );
8589 %}
8590 
8591 instruct vsll2S_reg_avx(vecS dst, vecS src, vecS shift) %{
8592   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
8593   match(Set dst (LShiftVS src shift));
8594   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
8595   ins_encode %{
8596     int vector_len = 0;
8597     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8598   %}
8599   ins_pipe( pipe_slow );
8600 %}
8601 
8602 instruct vsll2S_reg_evex(vecS dst, vecS src, vecS shift) %{
8603   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
8604   match(Set dst (LShiftVS src shift));
8605   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
8606   ins_encode %{
8607     int vector_len = 0;
8608     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8609   %}
8610   ins_pipe( pipe_slow );
8611 %}
8612 
8613 instruct vsll2S_reg_evex_special(vecS dst, vecS src, vecS shift) %{
8614   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
8615   match(Set dst (LShiftVS dst shift));
8616   effect(TEMP src);
8617   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
8618   ins_encode %{
8619     int vector_len = 0;
8620     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8621   %}
8622   ins_pipe( pipe_slow );
8623 %}
8624 
8625 instruct vsll2S_reg_imm_avx(vecS dst, vecS src, immI8 shift) %{
8626   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
8627   match(Set dst (LShiftVS src shift));
8628   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
8629   ins_encode %{
8630     int vector_len = 0;
8631     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8632   %}
8633   ins_pipe( pipe_slow );
8634 %}
8635 
8636 instruct vsll2S_reg_imm_evex(vecS dst, vecS src, immI8 shift) %{
8637   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
8638   match(Set dst (LShiftVS src shift));
8639   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
8640   ins_encode %{
8641     int vector_len = 0;
8642     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8643   %}
8644   ins_pipe( pipe_slow );
8645 %}
8646 
8647 instruct vsll2S_reg_imm_evex_special(vecS dst, vecS src, immI8 shift) %{
8648   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
8649   match(Set dst (LShiftVS dst shift));
8650   effect(TEMP src);
8651   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
8652   ins_encode %{
8653     int vector_len = 0;
8654     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8655   %}
8656   ins_pipe( pipe_slow );
8657 %}
8658 
8659 instruct vsll4S(vecD dst, vecS shift) %{
8660   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
8661   match(Set dst (LShiftVS dst shift));
8662   format %{ "psllw   $dst,$shift\t! left shift packed4S" %}
8663   ins_encode %{
8664     __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
8665   %}
8666   ins_pipe( pipe_slow );
8667 %}
8668 
8669 instruct vsll4S_imm(vecD dst, immI8 shift) %{
8670   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
8671   match(Set dst (LShiftVS dst shift));
8672   format %{ "psllw   $dst,$shift\t! left shift packed4S" %}
8673   ins_encode %{
8674     __ psllw($dst$$XMMRegister, (int)$shift$$constant);
8675   %}
8676   ins_pipe( pipe_slow );
8677 %}
8678 
8679 instruct vsll4S_reg_avx(vecD dst, vecD src, vecS shift) %{
8680   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
8681   match(Set dst (LShiftVS src shift));
8682   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
8683   ins_encode %{
8684     int vector_len = 0;
8685     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8686   %}
8687   ins_pipe( pipe_slow );
8688 %}
8689 
8690 instruct vsll4S_reg_evex(vecD dst, vecD src, vecS shift) %{
8691   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
8692   match(Set dst (LShiftVS src shift));
8693   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
8694   ins_encode %{
8695     int vector_len = 0;
8696     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8697   %}
8698   ins_pipe( pipe_slow );
8699 %}
8700 
8701 instruct vsll4S_reg_evex_special(vecD dst, vecD src, vecS shift) %{
8702   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
8703   match(Set dst (LShiftVS dst shift));
8704   effect(TEMP src);
8705   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
8706   ins_encode %{
8707     int vector_len = 0;
8708     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8709   %}
8710   ins_pipe( pipe_slow );
8711 %}
8712 
8713 instruct vsll4S_reg_imm_avx(vecD dst, vecD src, immI8 shift) %{
8714   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
8715   match(Set dst (LShiftVS src shift));
8716   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
8717   ins_encode %{
8718     int vector_len = 0;
8719     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8720   %}
8721   ins_pipe( pipe_slow );
8722 %}
8723 
8724 instruct vsll4S_reg_imm_evex(vecD dst, vecD src, immI8 shift) %{
8725   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
8726   match(Set dst (LShiftVS src shift));
8727   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
8728   ins_encode %{
8729     int vector_len = 0;
8730     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8731   %}
8732   ins_pipe( pipe_slow );
8733 %}
8734 
8735 instruct vsll4S_reg_imm_evex_special(vecD dst, vecD src, immI8 shift) %{
8736   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
8737   match(Set dst (LShiftVS dst shift));
8738   effect(TEMP src);
8739   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
8740   ins_encode %{
8741     int vector_len = 0;
8742     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8743   %}
8744   ins_pipe( pipe_slow );
8745 %}
8746 
8747 instruct vsll8S(vecX dst, vecS shift) %{
8748   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
8749   match(Set dst (LShiftVS dst shift));
8750   format %{ "psllw   $dst,$shift\t! left shift packed8S" %}
8751   ins_encode %{
8752     __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
8753   %}
8754   ins_pipe( pipe_slow );
8755 %}
8756 
8757 instruct vsll8S_imm(vecX dst, immI8 shift) %{
8758   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
8759   match(Set dst (LShiftVS dst shift));
8760   format %{ "psllw   $dst,$shift\t! left shift packed8S" %}
8761   ins_encode %{
8762     __ psllw($dst$$XMMRegister, (int)$shift$$constant);
8763   %}
8764   ins_pipe( pipe_slow );
8765 %}
8766 
8767 instruct vsll8S_reg_avx(vecX dst, vecX src, vecS shift) %{
8768   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
8769   match(Set dst (LShiftVS src shift));
8770   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
8771   ins_encode %{
8772     int vector_len = 0;
8773     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8774   %}
8775   ins_pipe( pipe_slow );
8776 %}
8777 
8778 instruct vsll8S_reg_evex(vecX dst, vecX src, vecS shift) %{
8779   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
8780   match(Set dst (LShiftVS src shift));
8781   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
8782   ins_encode %{
8783     int vector_len = 0;
8784     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8785   %}
8786   ins_pipe( pipe_slow );
8787 %}
8788 
8789 instruct vsll8S_reg_evex_special(vecX dst, vecX src, vecS shift) %{
8790   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
8791   match(Set dst (LShiftVS dst shift));
8792   effect(TEMP src);
8793   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
8794   ins_encode %{
8795     int vector_len = 0;
8796     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8797   %}
8798   ins_pipe( pipe_slow );
8799 %}
8800 
8801 instruct vsll8S_reg_imm_avx(vecX dst, vecX src, immI8 shift) %{
8802   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
8803   match(Set dst (LShiftVS src shift));
8804   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
8805   ins_encode %{
8806     int vector_len = 0;
8807     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8808   %}
8809   ins_pipe( pipe_slow );
8810 %}
8811 
8812 instruct vsll8S_reg_imm_evex(vecX dst, vecX src, immI8 shift) %{
8813   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
8814   match(Set dst (LShiftVS src shift));
8815   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
8816   ins_encode %{
8817     int vector_len = 0;
8818     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8819   %}
8820   ins_pipe( pipe_slow );
8821 %}
8822 
8823 instruct vsll8S_reg_imm_evex_special(vecX dst, vecX src, immI8 shift) %{
8824   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
8825   match(Set dst (LShiftVS dst shift));
8826   effect(TEMP src);
8827   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
8828   ins_encode %{
8829     int vector_len = 0;
8830     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8831   %}
8832   ins_pipe( pipe_slow );
8833 %}
8834 
8835 instruct vsll16S_reg_avx(vecY dst, vecY src, vecS shift) %{
8836   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
8837   match(Set dst (LShiftVS src shift));
8838   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
8839   ins_encode %{
8840     int vector_len = 1;
8841     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8842   %}
8843   ins_pipe( pipe_slow );
8844 %}
8845 
8846 instruct vsll16S_reg_evex(vecY dst, vecY src, vecS shift) %{
8847   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
8848   match(Set dst (LShiftVS src shift));
8849   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
8850   ins_encode %{
8851     int vector_len = 1;
8852     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8853   %}
8854   ins_pipe( pipe_slow );
8855 %}
8856 
8857 instruct vsll16S_reg_evex_special(vecY dst, vecY src, vecS shift) %{
8858   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
8859   match(Set dst (LShiftVS dst shift));
8860   effect(TEMP src);
8861   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
8862   ins_encode %{
8863     int vector_len = 1;
8864     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8865   %}
8866   ins_pipe( pipe_slow );
8867 %}
8868 
8869 instruct vsll16S_reg_imm_avx(vecY dst, vecY src, immI8 shift) %{
8870   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
8871   match(Set dst (LShiftVS src shift));
8872   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
8873   ins_encode %{
8874     int vector_len = 1;
8875     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8876   %}
8877   ins_pipe( pipe_slow );
8878 %}
8879 
8880 instruct vsll16S_reg_imm_evex(vecY dst, vecY src, immI8 shift) %{
8881   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
8882   match(Set dst (LShiftVS src shift));
8883   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
8884   ins_encode %{
8885     int vector_len = 1;
8886     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8887   %}
8888   ins_pipe( pipe_slow );
8889 %}
8890 
8891 instruct vsll16S_reg_imm_evex_special(vecY dst, vecY src, immI8 shift) %{
8892   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
8893   match(Set dst (LShiftVS dst shift));
8894   effect(TEMP src);
8895   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
8896   ins_encode %{
8897     int vector_len = 1;
8898     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8899   %}
8900   ins_pipe( pipe_slow );
8901 %}
8902 
8903 instruct vsll32S_reg(vecZ dst, vecZ src, vecS shift) %{
8904   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
8905   match(Set dst (LShiftVS src shift));
8906   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed32S" %}
8907   ins_encode %{
8908     int vector_len = 2;
8909     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8910   %}
8911   ins_pipe( pipe_slow );
8912 %}
8913 
8914 instruct vsll32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
8915   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
8916   match(Set dst (LShiftVS src shift));
8917   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed32S" %}
8918   ins_encode %{
8919     int vector_len = 2;
8920     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8921   %}
8922   ins_pipe( pipe_slow );
8923 %}
8924 
8925 // Integers vector left shift
8926 instruct vsll2I(vecD dst, vecS shift) %{
8927   predicate(n->as_Vector()->length() == 2);
8928   match(Set dst (LShiftVI dst shift));
8929   format %{ "pslld   $dst,$shift\t! left shift packed2I" %}
8930   ins_encode %{
8931     __ pslld($dst$$XMMRegister, $shift$$XMMRegister);
8932   %}
8933   ins_pipe( pipe_slow );
8934 %}
8935 
8936 instruct vsll2I_imm(vecD dst, immI8 shift) %{
8937   predicate(n->as_Vector()->length() == 2);
8938   match(Set dst (LShiftVI dst shift));
8939   format %{ "pslld   $dst,$shift\t! left shift packed2I" %}
8940   ins_encode %{
8941     __ pslld($dst$$XMMRegister, (int)$shift$$constant);
8942   %}
8943   ins_pipe( pipe_slow );
8944 %}
8945 
8946 instruct vsll2I_reg(vecD dst, vecD src, vecS shift) %{
8947   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8948   match(Set dst (LShiftVI src shift));
8949   format %{ "vpslld  $dst,$src,$shift\t! left shift packed2I" %}
8950   ins_encode %{
8951     int vector_len = 0;
8952     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8953   %}
8954   ins_pipe( pipe_slow );
8955 %}
8956 
8957 instruct vsll2I_reg_imm(vecD dst, vecD src, immI8 shift) %{
8958   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8959   match(Set dst (LShiftVI src shift));
8960   format %{ "vpslld  $dst,$src,$shift\t! left shift packed2I" %}
8961   ins_encode %{
8962     int vector_len = 0;
8963     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8964   %}
8965   ins_pipe( pipe_slow );
8966 %}
8967 
8968 instruct vsll4I(vecX dst, vecS shift) %{
8969   predicate(n->as_Vector()->length() == 4);
8970   match(Set dst (LShiftVI dst shift));
8971   format %{ "pslld   $dst,$shift\t! left shift packed4I" %}
8972   ins_encode %{
8973     __ pslld($dst$$XMMRegister, $shift$$XMMRegister);
8974   %}
8975   ins_pipe( pipe_slow );
8976 %}
8977 
8978 instruct vsll4I_imm(vecX dst, immI8 shift) %{
8979   predicate(n->as_Vector()->length() == 4);
8980   match(Set dst (LShiftVI dst shift));
8981   format %{ "pslld   $dst,$shift\t! left shift packed4I" %}
8982   ins_encode %{
8983     __ pslld($dst$$XMMRegister, (int)$shift$$constant);
8984   %}
8985   ins_pipe( pipe_slow );
8986 %}
8987 
8988 instruct vsll4I_reg(vecX dst, vecX src, vecS shift) %{
8989   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8990   match(Set dst (LShiftVI src shift));
8991   format %{ "vpslld  $dst,$src,$shift\t! left shift packed4I" %}
8992   ins_encode %{
8993     int vector_len = 0;
8994     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8995   %}
8996   ins_pipe( pipe_slow );
8997 %}
8998 
8999 instruct vsll4I_reg_imm(vecX dst, vecX src, immI8 shift) %{
9000   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
9001   match(Set dst (LShiftVI src shift));
9002   format %{ "vpslld  $dst,$src,$shift\t! left shift packed4I" %}
9003   ins_encode %{
9004     int vector_len = 0;
9005     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9006   %}
9007   ins_pipe( pipe_slow );
9008 %}
9009 
9010 instruct vsll8I_reg(vecY dst, vecY src, vecS shift) %{
9011   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
9012   match(Set dst (LShiftVI src shift));
9013   format %{ "vpslld  $dst,$src,$shift\t! left shift packed8I" %}
9014   ins_encode %{
9015     int vector_len = 1;
9016     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9017   %}
9018   ins_pipe( pipe_slow );
9019 %}
9020 
9021 instruct vsll8I_reg_imm(vecY dst, vecY src, immI8 shift) %{
9022   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
9023   match(Set dst (LShiftVI src shift));
9024   format %{ "vpslld  $dst,$src,$shift\t! left shift packed8I" %}
9025   ins_encode %{
9026     int vector_len = 1;
9027     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9028   %}
9029   ins_pipe( pipe_slow );
9030 %}
9031 
9032 instruct vsll16I_reg(vecZ dst, vecZ src, vecS shift) %{
9033   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
9034   match(Set dst (LShiftVI src shift));
9035   format %{ "vpslld  $dst,$src,$shift\t! left shift packed16I" %}
9036   ins_encode %{
9037     int vector_len = 2;
9038     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9039   %}
9040   ins_pipe( pipe_slow );
9041 %}
9042 
9043 instruct vsll16I_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
9044   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
9045   match(Set dst (LShiftVI src shift));
9046   format %{ "vpslld  $dst,$src,$shift\t! left shift packed16I" %}
9047   ins_encode %{
9048     int vector_len = 2;
9049     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9050   %}
9051   ins_pipe( pipe_slow );
9052 %}
9053 
9054 // Longs vector left shift
9055 instruct vsll2L(vecX dst, vecS shift) %{
9056   predicate(n->as_Vector()->length() == 2);
9057   match(Set dst (LShiftVL dst shift));
9058   format %{ "psllq   $dst,$shift\t! left shift packed2L" %}
9059   ins_encode %{
9060     __ psllq($dst$$XMMRegister, $shift$$XMMRegister);
9061   %}
9062   ins_pipe( pipe_slow );
9063 %}
9064 
9065 instruct vsll2L_imm(vecX dst, immI8 shift) %{
9066   predicate(n->as_Vector()->length() == 2);
9067   match(Set dst (LShiftVL dst shift));
9068   format %{ "psllq   $dst,$shift\t! left shift packed2L" %}
9069   ins_encode %{
9070     __ psllq($dst$$XMMRegister, (int)$shift$$constant);
9071   %}
9072   ins_pipe( pipe_slow );
9073 %}
9074 
9075 instruct vsll2L_reg(vecX dst, vecX src, vecS shift) %{
9076   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9077   match(Set dst (LShiftVL src shift));
9078   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed2L" %}
9079   ins_encode %{
9080     int vector_len = 0;
9081     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9082   %}
9083   ins_pipe( pipe_slow );
9084 %}
9085 
9086 instruct vsll2L_reg_imm(vecX dst, vecX src, immI8 shift) %{
9087   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9088   match(Set dst (LShiftVL src shift));
9089   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed2L" %}
9090   ins_encode %{
9091     int vector_len = 0;
9092     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9093   %}
9094   ins_pipe( pipe_slow );
9095 %}
9096 
9097 instruct vsll4L_reg(vecY dst, vecY src, vecS shift) %{
9098   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
9099   match(Set dst (LShiftVL src shift));
9100   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed4L" %}
9101   ins_encode %{
9102     int vector_len = 1;
9103     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9104   %}
9105   ins_pipe( pipe_slow );
9106 %}
9107 
9108 instruct vsll4L_reg_imm(vecY dst, vecY src, immI8 shift) %{
9109   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
9110   match(Set dst (LShiftVL src shift));
9111   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed4L" %}
9112   ins_encode %{
9113     int vector_len = 1;
9114     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9115   %}
9116   ins_pipe( pipe_slow );
9117 %}
9118 
9119 instruct vsll8L_reg(vecZ dst, vecZ src, vecS shift) %{
9120   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
9121   match(Set dst (LShiftVL src shift));
9122   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed8L" %}
9123   ins_encode %{
9124     int vector_len = 2;
9125     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9126   %}
9127   ins_pipe( pipe_slow );
9128 %}
9129 
9130 instruct vsll8L_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
9131   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
9132   match(Set dst (LShiftVL src shift));
9133   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed8L" %}
9134   ins_encode %{
9135     int vector_len = 2;
9136     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9137   %}
9138   ins_pipe( pipe_slow );
9139 %}
9140 
9141 // ----------------------- LogicalRightShift -----------------------------------
9142 
9143 // Shorts vector logical right shift produces an incorrect Java result
9144 // for negative data because Java code converts a short value into an int with
9145 // sign extension before the shift. But char vectors are fine since chars are
9146 // unsigned values.
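// For example (illustrative Java, not from this file):
//   short s = (short)0x8000;          // -32768
//   int   r = s >>> 2;                // s is sign-extended to 0xFFFF8000 first,
//                                     // so r == 0x3FFFE000
// A per-lane 16-bit psrlw would instead produce 0x2000 for that lane, which is
// why these URShiftVS rules are only safe when the inputs are chars (unsigned).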
9147 
9148 instruct vsrl2S(vecS dst, vecS shift) %{
9149   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
9150   match(Set dst (URShiftVS dst shift));
9151   format %{ "psrlw   $dst,$shift\t! logical right shift packed2S" %}
9152   ins_encode %{
9153     __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
9154   %}
9155   ins_pipe( pipe_slow );
9156 %}
9157 
9158 instruct vsrl2S_imm(vecS dst, immI8 shift) %{
9159   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
9160   match(Set dst (URShiftVS dst shift));
9161   format %{ "psrlw   $dst,$shift\t! logical right shift packed2S" %}
9162   ins_encode %{
9163     __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
9164   %}
9165   ins_pipe( pipe_slow );
9166 %}
9167 
9168 instruct vsrl2S_reg_avx(vecS dst, vecS src, vecS shift) %{
9169   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
9170   match(Set dst (URShiftVS src shift));
9171   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
9172   ins_encode %{
9173     int vector_len = 0;
9174     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9175   %}
9176   ins_pipe( pipe_slow );
9177 %}
9178 
9179 instruct vsrl2S_reg_evex(vecS dst, vecS src, vecS shift) %{
9180   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
9181   match(Set dst (URShiftVS src shift));
9182   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
9183   ins_encode %{
9184     int vector_len = 0;
9185     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9186   %}
9187   ins_pipe( pipe_slow );
9188 %}
9189 
9190 instruct vsrl2S_reg_evex_special(vecS dst, vecS src, vecS shift) %{
9191   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
9192   match(Set dst (URShiftVS dst shift));
9193   effect(TEMP src);
9194   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
9195   ins_encode %{
9196     int vector_len = 0;
9197     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9198   %}
9199   ins_pipe( pipe_slow );
9200 %}
9201 
9202 instruct vsrl2S_reg_imm_avx(vecS dst, vecS src, immI8 shift) %{
9203   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
9204   match(Set dst (URShiftVS src shift));
9205   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
9206   ins_encode %{
9207     int vector_len = 0;
9208     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9209   %}
9210   ins_pipe( pipe_slow );
9211 %}
9212 
9213 instruct vsrl2S_reg_imm_evex(vecS dst, vecS src, immI8 shift) %{
9214   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
9215   match(Set dst (URShiftVS src shift));
9216   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
9217   ins_encode %{
9218     int vector_len = 0;
9219     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9220   %}
9221   ins_pipe( pipe_slow );
9222 %}
9223 
9224 instruct vsrl2S_reg_imm_evex_special(vecS dst, vecS src, immI8 shift) %{
9225   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
9226   match(Set dst (URShiftVS dst shift));
9227   effect(TEMP src);
9228   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
9229   ins_encode %{
9230     int vector_len = 0;
9231     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9232   %}
9233   ins_pipe( pipe_slow );
9234 %}
9235 
9236 instruct vsrl4S(vecD dst, vecS shift) %{
9237   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
9238   match(Set dst (URShiftVS dst shift));
9239   format %{ "psrlw   $dst,$shift\t! logical right shift packed4S" %}
9240   ins_encode %{
9241     __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
9242   %}
9243   ins_pipe( pipe_slow );
9244 %}
9245 
9246 instruct vsrl4S_imm(vecD dst, immI8 shift) %{
9247   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
9248   match(Set dst (URShiftVS dst shift));
9249   format %{ "psrlw   $dst,$shift\t! logical right shift packed4S" %}
9250   ins_encode %{
9251     __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
9252   %}
9253   ins_pipe( pipe_slow );
9254 %}
9255 
9256 instruct vsrl4S_reg_avx(vecD dst, vecD src, vecS shift) %{
9257   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
9258   match(Set dst (URShiftVS src shift));
9259   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
9260   ins_encode %{
9261     int vector_len = 0;
9262     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9263   %}
9264   ins_pipe( pipe_slow );
9265 %}
9266 
9267 instruct vsrl4S_reg_evex(vecD dst, vecD src, vecS shift) %{
9268   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
9269   match(Set dst (URShiftVS src shift));
9270   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
9271   ins_encode %{
9272     int vector_len = 0;
9273     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9274   %}
9275   ins_pipe( pipe_slow );
9276 %}
9277 
9278 instruct vsrl4S_reg_evex_special(vecD dst, vecD src, vecS shift) %{
9279   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
9280   match(Set dst (URShiftVS dst shift));
9281   effect(TEMP src);
9282   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
9283   ins_encode %{
9284     int vector_len = 0;
9285     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9286   %}
9287   ins_pipe( pipe_slow );
9288 %}
9289 
9290 instruct vsrl4S_reg_imm_avx(vecD dst, vecD src, immI8 shift) %{
9291   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
9292   match(Set dst (URShiftVS src shift));
9293   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
9294   ins_encode %{
9295     int vector_len = 0;
9296     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9297   %}
9298   ins_pipe( pipe_slow );
9299 %}
9300 
9301 instruct vsrl4S_reg_imm_evex(vecD dst, vecD src, immI8 shift) %{
9302   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
9303   match(Set dst (URShiftVS src shift));
9304   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
9305   ins_encode %{
9306     int vector_len = 0;
9307     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9308   %}
9309   ins_pipe( pipe_slow );
9310 %}
9311 
9312 instruct vsrl4S_reg_imm_evex_special(vecD dst, vecD src, immI8 shift) %{
9313   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
9314   match(Set dst (URShiftVS dst shift));
9315   effect(TEMP src);
9316   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
9317   ins_encode %{
9318     int vector_len = 0;
9319     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9320   %}
9321   ins_pipe( pipe_slow );
9322 %}
9323 
9324 instruct vsrl8S(vecX dst, vecS shift) %{
9325   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
9326   match(Set dst (URShiftVS dst shift));
9327   format %{ "psrlw   $dst,$shift\t! logical right shift packed8S" %}
9328   ins_encode %{
9329     __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
9330   %}
9331   ins_pipe( pipe_slow );
9332 %}
9333 
9334 instruct vsrl8S_imm(vecX dst, immI8 shift) %{
9335   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
9336   match(Set dst (URShiftVS dst shift));
9337   format %{ "psrlw   $dst,$shift\t! logical right shift packed8S" %}
9338   ins_encode %{
9339     __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
9340   %}
9341   ins_pipe( pipe_slow );
9342 %}
9343 
9344 instruct vsrl8S_reg_avx(vecX dst, vecX src, vecS shift) %{
9345   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
9346   match(Set dst (URShiftVS src shift));
9347   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
9348   ins_encode %{
9349     int vector_len = 0;
9350     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9351   %}
9352   ins_pipe( pipe_slow );
9353 %}
9354 
9355 instruct vsrl8S_reg_evex(vecX dst, vecX src, vecS shift) %{
9356   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
9357   match(Set dst (URShiftVS src shift));
9358   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
9359   ins_encode %{
9360     int vector_len = 0;
9361     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9362   %}
9363   ins_pipe( pipe_slow );
9364 %}
9365 
9366 instruct vsrl8S_reg_evex_special(vecX dst, vecX src, vecS shift) %{
9367   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
9368   match(Set dst (URShiftVS dst shift));
9369   effect(TEMP src);
9370   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
9371   ins_encode %{
9372     int vector_len = 0;
9373     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9374   %}
9375   ins_pipe( pipe_slow );
9376 %}
9377 
9378 instruct vsrl8S_reg_imm_avx(vecX dst, vecX src, immI8 shift) %{
9379   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
9380   match(Set dst (URShiftVS src shift));
9381   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
9382   ins_encode %{
9383     int vector_len = 0;
9384     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9385   %}
9386   ins_pipe( pipe_slow );
9387 %}
9388 
9389 instruct vsrl8S_reg_imm_evex(vecX dst, vecX src, immI8 shift) %{
9390   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
9391   match(Set dst (URShiftVS src shift));
9392   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
9393   ins_encode %{
9394     int vector_len = 0;
9395     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9396   %}
9397   ins_pipe( pipe_slow );
9398 %}
9399 
9400 instruct vsrl8S_reg_imm_evex_special(vecX dst, vecX src, immI8 shift) %{
9401   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
9402   match(Set dst (URShiftVS dst shift));
9403   effect(TEMP src);
9404   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
9405   ins_encode %{
9406     int vector_len = 0;
9407     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9408   %}
9409   ins_pipe( pipe_slow );
9410 %}
9411 
9412 instruct vsrl16S_reg_avx(vecY dst, vecY src, vecS shift) %{
9413   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
9414   match(Set dst (URShiftVS src shift));
9415   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
9416   ins_encode %{
9417     int vector_len = 1;
9418     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9419   %}
9420   ins_pipe( pipe_slow );
9421 %}
9422 
9423 instruct vsrl16S_reg_evex(vecY dst, vecY src, vecS shift) %{
9424   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
9425   match(Set dst (URShiftVS src shift));
9426   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
9427   ins_encode %{
9428     int vector_len = 1;
9429     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9430   %}
9431   ins_pipe( pipe_slow );
9432 %}
9433 
9434 instruct vsrl16S_reg_evex_special(vecY dst, vecY src, vecS shift) %{
9435   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
9436   match(Set dst (URShiftVS dst shift));
9437   effect(TEMP src);
9438   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
9439   ins_encode %{
9440     int vector_len = 1;
9441     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9442   %}
9443   ins_pipe( pipe_slow );
9444 %}
9445 
9446 instruct vsrl16S_reg_imm_avx(vecY dst, vecY src, immI8 shift) %{
9447   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
9448   match(Set dst (URShiftVS src shift));
9449   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
9450   ins_encode %{
9451     int vector_len = 1;
9452     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9453   %}
9454   ins_pipe( pipe_slow );
9455 %}
9456 
9457 instruct vsrl16S_reg_imm_evex(vecY dst, vecY src, immI8 shift) %{
9458   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
9459   match(Set dst (URShiftVS src shift));
9460   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
9461   ins_encode %{
9462     int vector_len = 1;
9463     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9464   %}
9465   ins_pipe( pipe_slow );
9466 %}
9467 
9468 instruct vsrl16S_reg_imm_evex_special(vecY dst, vecY src, immI8 shift) %{
9469   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
9470   match(Set dst (URShiftVS dst shift));
9471   effect(TEMP src);
9472   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
9473   ins_encode %{
9474     int vector_len = 1;
9475     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9476   %}
9477   ins_pipe( pipe_slow );
9478 %}
9479 
9480 instruct vsrl32S_reg(vecZ dst, vecZ src, vecS shift) %{
9481   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
9482   match(Set dst (URShiftVS src shift));
9483   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed32S" %}
9484   ins_encode %{
9485     int vector_len = 2;
9486     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9487   %}
9488   ins_pipe( pipe_slow );
9489 %}
9490 
9491 instruct vsrl32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
9492   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
9493   match(Set dst (URShiftVS src shift));
9494   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed32S" %}
9495   ins_encode %{
9496     int vector_len = 2;
9497     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9498   %}
9499   ins_pipe( pipe_slow );
9500 %}
9501 
9502 // Integers vector logical right shift
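// For illustration only (hypothetical Java source, not part of this file):
// the URShiftVI rules below are typically reached when SuperWord vectorizes
// a loop that uses the unsigned shift operator, e.g.
//
//   for (int i = 0; i < a.length; i++) {
//     a[i] = a[i] >>> k;        // becomes URShiftVI, matched to psrld/vpsrld
//   }
//
// The vecS shift operand carries the scalar shift count in an XMM register;
// the immI8 forms encode the count directly in the instruction.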
9503 instruct vsrl2I(vecD dst, vecS shift) %{
9504   predicate(n->as_Vector()->length() == 2);
9505   match(Set dst (URShiftVI dst shift));
9506   format %{ "psrld   $dst,$shift\t! logical right shift packed2I" %}
9507   ins_encode %{
9508     __ psrld($dst$$XMMRegister, $shift$$XMMRegister);
9509   %}
9510   ins_pipe( pipe_slow );
9511 %}
9512 
9513 instruct vsrl2I_imm(vecD dst, immI8 shift) %{
9514   predicate(n->as_Vector()->length() == 2);
9515   match(Set dst (URShiftVI dst shift));
9516   format %{ "psrld   $dst,$shift\t! logical right shift packed2I" %}
9517   ins_encode %{
9518     __ psrld($dst$$XMMRegister, (int)$shift$$constant);
9519   %}
9520   ins_pipe( pipe_slow );
9521 %}
9522 
9523 instruct vsrl2I_reg(vecD dst, vecD src, vecS shift) %{
9524   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9525   match(Set dst (URShiftVI src shift));
9526   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed2I" %}
9527   ins_encode %{
9528     int vector_len = 0;
9529     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9530   %}
9531   ins_pipe( pipe_slow );
9532 %}
9533 
9534 instruct vsrl2I_reg_imm(vecD dst, vecD src, immI8 shift) %{
9535   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9536   match(Set dst (URShiftVI src shift));
9537   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed2I" %}
9538   ins_encode %{
9539     int vector_len = 0;
9540     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9541   %}
9542   ins_pipe( pipe_slow );
9543 %}
9544 
9545 instruct vsrl4I(vecX dst, vecS shift) %{
9546   predicate(n->as_Vector()->length() == 4);
9547   match(Set dst (URShiftVI dst shift));
9548   format %{ "psrld   $dst,$shift\t! logical right shift packed4I" %}
9549   ins_encode %{
9550     __ psrld($dst$$XMMRegister, $shift$$XMMRegister);
9551   %}
9552   ins_pipe( pipe_slow );
9553 %}
9554 
9555 instruct vsrl4I_imm(vecX dst, immI8 shift) %{
9556   predicate(n->as_Vector()->length() == 4);
9557   match(Set dst (URShiftVI dst shift));
9558   format %{ "psrld   $dst,$shift\t! logical right shift packed4I" %}
9559   ins_encode %{
9560     __ psrld($dst$$XMMRegister, (int)$shift$$constant);
9561   %}
9562   ins_pipe( pipe_slow );
9563 %}
9564 
9565 instruct vsrl4I_reg(vecX dst, vecX src, vecS shift) %{
9566   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
9567   match(Set dst (URShiftVI src shift));
9568   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed4I" %}
9569   ins_encode %{
9570     int vector_len = 0;
9571     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9572   %}
9573   ins_pipe( pipe_slow );
9574 %}
9575 
9576 instruct vsrl4I_reg_imm(vecX dst, vecX src, immI8 shift) %{
9577   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
9578   match(Set dst (URShiftVI src shift));
9579   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed4I" %}
9580   ins_encode %{
9581     int vector_len = 0;
9582     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9583   %}
9584   ins_pipe( pipe_slow );
9585 %}
9586 
9587 instruct vsrl8I_reg(vecY dst, vecY src, vecS shift) %{
9588   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
9589   match(Set dst (URShiftVI src shift));
9590   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed8I" %}
9591   ins_encode %{
9592     int vector_len = 1;
9593     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9594   %}
9595   ins_pipe( pipe_slow );
9596 %}
9597 
9598 instruct vsrl8I_reg_imm(vecY dst, vecY src, immI8 shift) %{
9599   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
9600   match(Set dst (URShiftVI src shift));
9601   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed8I" %}
9602   ins_encode %{
9603     int vector_len = 1;
9604     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9605   %}
9606   ins_pipe( pipe_slow );
9607 %}
9608 
9609 instruct vsrl16I_reg(vecZ dst, vecZ src, vecS shift) %{
9610   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
9611   match(Set dst (URShiftVI src shift));
9612   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed16I" %}
9613   ins_encode %{
9614     int vector_len = 2;
9615     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9616   %}
9617   ins_pipe( pipe_slow );
9618 %}
9619 
9620 instruct vsrl16I_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
9621   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
9622   match(Set dst (URShiftVI src shift));
9623   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed16I" %}
9624   ins_encode %{
9625     int vector_len = 2;
9626     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9627   %}
9628   ins_pipe( pipe_slow );
9629 %}
9630 
9631 // Longs vector logical right shift
9632 instruct vsrl2L(vecX dst, vecS shift) %{
9633   predicate(n->as_Vector()->length() == 2);
9634   match(Set dst (URShiftVL dst shift));
9635   format %{ "psrlq   $dst,$shift\t! logical right shift packed2L" %}
9636   ins_encode %{
9637     __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
9638   %}
9639   ins_pipe( pipe_slow );
9640 %}
9641 
9642 instruct vsrl2L_imm(vecX dst, immI8 shift) %{
9643   predicate(n->as_Vector()->length() == 2);
9644   match(Set dst (URShiftVL dst shift));
9645   format %{ "psrlq   $dst,$shift\t! logical right shift packed2L" %}
9646   ins_encode %{
9647     __ psrlq($dst$$XMMRegister, (int)$shift$$constant);
9648   %}
9649   ins_pipe( pipe_slow );
9650 %}
9651 
9652 instruct vsrl2L_reg(vecX dst, vecX src, vecS shift) %{
9653   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9654   match(Set dst (URShiftVL src shift));
9655   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed2L" %}
9656   ins_encode %{
9657     int vector_len = 0;
9658     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9659   %}
9660   ins_pipe( pipe_slow );
9661 %}
9662 
9663 instruct vsrl2L_reg_imm(vecX dst, vecX src, immI8 shift) %{
9664   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9665   match(Set dst (URShiftVL src shift));
9666   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed2L" %}
9667   ins_encode %{
9668     int vector_len = 0;
9669     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9670   %}
9671   ins_pipe( pipe_slow );
9672 %}
9673 
9674 instruct vsrl4L_reg(vecY dst, vecY src, vecS shift) %{
9675   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
9676   match(Set dst (URShiftVL src shift));
9677   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed4L" %}
9678   ins_encode %{
9679     int vector_len = 1;
9680     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9681   %}
9682   ins_pipe( pipe_slow );
9683 %}
9684 
9685 instruct vsrl4L_reg_imm(vecY dst, vecY src, immI8 shift) %{
9686   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
9687   match(Set dst (URShiftVL src shift));
9688   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed4L" %}
9689   ins_encode %{
9690     int vector_len = 1;
9691     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9692   %}
9693   ins_pipe( pipe_slow );
9694 %}
9695 
9696 instruct vsrl8L_reg(vecZ dst, vecZ src, vecS shift) %{
9697   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
9698   match(Set dst (URShiftVL src shift));
9699   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed8L" %}
9700   ins_encode %{
9701     int vector_len = 2;
9702     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9703   %}
9704   ins_pipe( pipe_slow );
9705 %}
9706 
9707 instruct vsrl8L_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
9708   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
9709   match(Set dst (URShiftVL src shift));
9710   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed8L" %}
9711   ins_encode %{
9712     int vector_len = 2;
9713     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9714   %}
9715   ins_pipe( pipe_slow );
9716 %}
9717 
9718 // ------------------- ArithmeticRightShift -----------------------------------
9719 
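// Unlike the logical shifts above, arithmetic right shifts replicate the
// sign bit.  For illustration only (hypothetical Java source, not part of
// this file), a loop using the signed shift operator produces the
// RShiftVS/RShiftVI nodes matched in this section:
//
//   for (int i = 0; i < a.length; i++) {
//     a[i] = (short)(a[i] >> k);   // signed shift -> RShiftVS -> psraw/vpsraw
//   }
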
9720 // Shorts/Chars vector arithmetic right shift
9721 instruct vsra2S(vecS dst, vecS shift) %{
9722   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
9723   match(Set dst (RShiftVS dst shift));
9724   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed2S" %}
9725   ins_encode %{
9726     __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
9727   %}
9728   ins_pipe( pipe_slow );
9729 %}
9730 
9731 instruct vsra2S_imm(vecS dst, immI8 shift) %{
9732   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
9733   match(Set dst (RShiftVS dst shift));
9734   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed2S" %}
9735   ins_encode %{
9736     __ psraw($dst$$XMMRegister, (int)$shift$$constant);
9737   %}
9738   ins_pipe( pipe_slow );
9739 %}
9740 
9741 instruct vsra2S_reg_avx(vecS dst, vecS src, vecS shift) %{
9742   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
9743   match(Set dst (RShiftVS src shift));
9744   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
9745   ins_encode %{
9746     int vector_len = 0;
9747     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9748   %}
9749   ins_pipe( pipe_slow );
9750 %}
9751 
9752 instruct vsra2S_reg_evex(vecS dst, vecS src, vecS shift) %{
9753   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
9754   match(Set dst (RShiftVS src shift));
9755   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
9756   ins_encode %{
9757     int vector_len = 0;
9758     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9759   %}
9760   ins_pipe( pipe_slow );
9761 %}
9762 
9763 instruct vsra2S_reg_evex_special(vecS dst, vecS src, vecS shift) %{
9764   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
9765   match(Set dst (RShiftVS dst shift));
9766   effect(TEMP src);
9767   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
9768   ins_encode %{
9769     int vector_len = 0;
9770     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9771   %}
9772   ins_pipe( pipe_slow );
9773 %}
9774 
9775 instruct vsra2S_reg_imm_avx(vecS dst, vecS src, immI8 shift) %{
9776   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
9777   match(Set dst (RShiftVS src shift));
9778   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
9779   ins_encode %{
9780     int vector_len = 0;
9781     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9782   %}
9783   ins_pipe( pipe_slow );
9784 %}
9785 
9786 instruct vsra2S_reg_imm_evex(vecS dst, vecS src, immI8 shift) %{
9787   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
9788   match(Set dst (RShiftVS src shift));
9789   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
9790   ins_encode %{
9791     int vector_len = 0;
9792     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9793   %}
9794   ins_pipe( pipe_slow );
9795 %}
9796 
9797 instruct vsra2S_reg_imm_evex_special(vecS dst, vecS src, immI8 shift) %{
9798   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
9799   match(Set dst (RShiftVS dst shift));
9800   effect(TEMP src);
9801   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
9802   ins_encode %{
9803     int vector_len = 0;
9804     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9805   %}
9806   ins_pipe( pipe_slow );
9807 %}
9808 
9809 instruct vsra4S(vecD dst, vecS shift) %{
9810   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
9811   match(Set dst (RShiftVS dst shift));
9812   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed4S" %}
9813   ins_encode %{
9814     __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
9815   %}
9816   ins_pipe( pipe_slow );
9817 %}
9818 
9819 instruct vsra4S_imm(vecD dst, immI8 shift) %{
9820   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
9821   match(Set dst (RShiftVS dst shift));
9822   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed4S" %}
9823   ins_encode %{
9824     __ psraw($dst$$XMMRegister, (int)$shift$$constant);
9825   %}
9826   ins_pipe( pipe_slow );
9827 %}
9828 
9829 instruct vsra4S_reg_avx(vecD dst, vecD src, vecS shift) %{
9830   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
9831   match(Set dst (RShiftVS src shift));
9832   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
9833   ins_encode %{
9834     int vector_len = 0;
9835     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9836   %}
9837   ins_pipe( pipe_slow );
9838 %}
9839 
9840 instruct vsra4S_reg_evex(vecD dst, vecD src, vecS shift) %{
9841   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
9842   match(Set dst (RShiftVS src shift));
9843   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
9844   ins_encode %{
9845     int vector_len = 0;
9846     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9847   %}
9848   ins_pipe( pipe_slow );
9849 %}
9850 
9851 instruct vsra4S_reg_evex_special(vecD dst, vecD src, vecS shift) %{
9852   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
9853   match(Set dst (RShiftVS dst shift));
9854   effect(TEMP src);
9855   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
9856   ins_encode %{
9857     int vector_len = 0;
9858     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9859   %}
9860   ins_pipe( pipe_slow );
9861 %}
9862 
9863 instruct vsra4S_reg_imm_avx(vecD dst, vecD src, immI8 shift) %{
9864   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
9865   match(Set dst (RShiftVS src shift));
9866   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
9867   ins_encode %{
9868     int vector_len = 0;
9869     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9870   %}
9871   ins_pipe( pipe_slow );
9872 %}
9873 
9874 instruct vsra4S_reg_imm_evex(vecD dst, vecD src, immI8 shift) %{
9875   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
9876   match(Set dst (RShiftVS src shift));
9877   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
9878   ins_encode %{
9879     int vector_len = 0;
9880     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9881   %}
9882   ins_pipe( pipe_slow );
9883 %}
9884 
9885 instruct vsra4S_reg_imm_evex_special(vecD dst, vecD src, immI8 shift) %{
9886   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
9887   match(Set dst (RShiftVS dst shift));
9888   effect(TEMP src);
9889   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
9890   ins_encode %{
9891     int vector_len = 0;
9892     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9893   %}
9894   ins_pipe( pipe_slow );
9895 %}
9896 
9897 instruct vsra8S(vecX dst, vecS shift) %{
9898   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
9899   match(Set dst (RShiftVS dst shift));
9900   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed8S" %}
9901   ins_encode %{
9902     __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
9903   %}
9904   ins_pipe( pipe_slow );
9905 %}
9906 
9907 instruct vsra8S_imm(vecX dst, immI8 shift) %{
9908   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
9909   match(Set dst (RShiftVS dst shift));
9910   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed8S" %}
9911   ins_encode %{
9912     __ psraw($dst$$XMMRegister, (int)$shift$$constant);
9913   %}
9914   ins_pipe( pipe_slow );
9915 %}
9916 
9917 instruct vsra8S_reg_avx(vecX dst, vecX src, vecS shift) %{
9918   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
9919   match(Set dst (RShiftVS src shift));
9920   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
9921   ins_encode %{
9922     int vector_len = 0;
9923     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9924   %}
9925   ins_pipe( pipe_slow );
9926 %}
9927 
9928 instruct vsra8S_reg_evex(vecX dst, vecX src, vecS shift) %{
9929   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
9930   match(Set dst (RShiftVS src shift));
9931   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
9932   ins_encode %{
9933     int vector_len = 0;
9934     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9935   %}
9936   ins_pipe( pipe_slow );
9937 %}
9938 
9939 instruct vsra8S_reg_evex_special(vecX dst, vecX src, vecS shift) %{
9940   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
9941   match(Set dst (RShiftVS dst shift));
9942   effect(TEMP src);
9943   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
9944   ins_encode %{
9945     int vector_len = 0;
9946     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9947   %}
9948   ins_pipe( pipe_slow );
9949 %}
9950 
9951 instruct vsra8S_reg_imm_avx(vecX dst, vecX src, immI8 shift) %{
9952   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
9953   match(Set dst (RShiftVS src shift));
9954   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
9955   ins_encode %{
9956     int vector_len = 0;
9957     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9958   %}
9959   ins_pipe( pipe_slow );
9960 %}
9961 
9962 instruct vsra8S_reg_imm_evex(vecX dst, vecX src, immI8 shift) %{
9963   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
9964   match(Set dst (RShiftVS src shift));
9965   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
9966   ins_encode %{
9967     int vector_len = 0;
9968     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9969   %}
9970   ins_pipe( pipe_slow );
9971 %}
9972 
9973 instruct vsra8S_reg_imm_evex_special(vecX dst, vecX src, immI8 shift) %{
9974   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
9975   match(Set dst (RShiftVS dst shift));
9976   effect(TEMP src);
9977   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
9978   ins_encode %{
9979     int vector_len = 0;
9980     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9981   %}
9982   ins_pipe( pipe_slow );
9983 %}
9984 
9985 instruct vsra16S_reg_avx(vecY dst, vecY src, vecS shift) %{
9986   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
9987   match(Set dst (RShiftVS src shift));
9988   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
9989   ins_encode %{
9990     int vector_len = 1;
9991     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9992   %}
9993   ins_pipe( pipe_slow );
9994 %}
9995 
9996 instruct vsra16S_reg_evex(vecY dst, vecY src, vecS shift) %{
9997   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
9998   match(Set dst (RShiftVS src shift));
9999   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
10000   ins_encode %{
10001     int vector_len = 1;
10002     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10003   %}
10004   ins_pipe( pipe_slow );
10005 %}
10006 
10007 instruct vsra16S_reg_evex_special(vecY dst, vecY src, vecS shift) %{
10008   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
10009   match(Set dst (RShiftVS dst shift));
10010   effect(TEMP src);
10011   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
10012   ins_encode %{
10013     int vector_len = 1;
10014     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10015   %}
10016   ins_pipe( pipe_slow );
10017 %}
10018 
10019 instruct vsra16S_reg_imm_avx(vecY dst, vecY src, immI8 shift) %{
10020   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
10021   match(Set dst (RShiftVS src shift));
10022   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
10023   ins_encode %{
10024     int vector_len = 1;
10025     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10026   %}
10027   ins_pipe( pipe_slow );
10028 %}
10029 
10030 instruct vsra16S_reg_imm_evex(vecY dst, vecY src, immI8 shift) %{
10031   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
10032   match(Set dst (RShiftVS src shift));
10033   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
10034   ins_encode %{
10035     int vector_len = 1;
10036     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10037   %}
10038   ins_pipe( pipe_slow );
10039 %}
10040 
10041 instruct vsra16S_reg_imm_evex_special(vecY dst, vecY src, immI8 shift) %{
10042   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
10043   match(Set dst (RShiftVS dst shift));
10044   effect(TEMP src);
10045   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
10046   ins_encode %{
10047     int vector_len = 1;
10048     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10049   %}
10050   ins_pipe( pipe_slow );
10051 %}
10052 
10053 instruct vsra32S_reg(vecZ dst, vecZ src, vecS shift) %{
10054   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
10055   match(Set dst (RShiftVS src shift));
10056   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed32S" %}
10057   ins_encode %{
10058     int vector_len = 2;
10059     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10060   %}
10061   ins_pipe( pipe_slow );
10062 %}
10063 
10064 instruct vsra32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
10065   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
10066   match(Set dst (RShiftVS src shift));
10067   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed32S" %}
10068   ins_encode %{
10069     int vector_len = 2;
10070     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10071   %}
10072   ins_pipe( pipe_slow );
10073 %}
10074 
10075 // Integers vector arithmetic right shift
10076 instruct vsra2I(vecD dst, vecS shift) %{
10077   predicate(n->as_Vector()->length() == 2);
10078   match(Set dst (RShiftVI dst shift));
10079   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed2I" %}
10080   ins_encode %{
10081     __ psrad($dst$$XMMRegister, $shift$$XMMRegister);
10082   %}
10083   ins_pipe( pipe_slow );
10084 %}
10085 
10086 instruct vsra2I_imm(vecD dst, immI8 shift) %{
10087   predicate(n->as_Vector()->length() == 2);
10088   match(Set dst (RShiftVI dst shift));
10089   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed2I" %}
10090   ins_encode %{
10091     __ psrad($dst$$XMMRegister, (int)$shift$$constant);
10092   %}
10093   ins_pipe( pipe_slow );
10094 %}
10095 
10096 instruct vsra2I_reg(vecD dst, vecD src, vecS shift) %{
10097   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
10098   match(Set dst (RShiftVI src shift));
10099   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed2I" %}
10100   ins_encode %{
10101     int vector_len = 0;
10102     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10103   %}
10104   ins_pipe( pipe_slow );
10105 %}
10106 
10107 instruct vsra2I_reg_imm(vecD dst, vecD src, immI8 shift) %{
10108   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
10109   match(Set dst (RShiftVI src shift));
10110   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed2I" %}
10111   ins_encode %{
10112     int vector_len = 0;
10113     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10114   %}
10115   ins_pipe( pipe_slow );
10116 %}
10117 
10118 instruct vsra4I(vecX dst, vecS shift) %{
10119   predicate(n->as_Vector()->length() == 4);
10120   match(Set dst (RShiftVI dst shift));
10121   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed4I" %}
10122   ins_encode %{
10123     __ psrad($dst$$XMMRegister, $shift$$XMMRegister);
10124   %}
10125   ins_pipe( pipe_slow );
10126 %}
10127 
10128 instruct vsra4I_imm(vecX dst, immI8 shift) %{
10129   predicate(n->as_Vector()->length() == 4);
10130   match(Set dst (RShiftVI dst shift));
10131   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed4I" %}
10132   ins_encode %{
10133     __ psrad($dst$$XMMRegister, (int)$shift$$constant);
10134   %}
10135   ins_pipe( pipe_slow );
10136 %}
10137 
10138 instruct vsra4I_reg(vecX dst, vecX src, vecS shift) %{
10139   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
10140   match(Set dst (RShiftVI src shift));
10141   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed4I" %}
10142   ins_encode %{
10143     int vector_len = 0;
10144     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10145   %}
10146   ins_pipe( pipe_slow );
10147 %}
10148 
10149 instruct vsra4I_reg_imm(vecX dst, vecX src, immI8 shift) %{
10150   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
10151   match(Set dst (RShiftVI src shift));
10152   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed4I" %}
10153   ins_encode %{
10154     int vector_len = 0;
10155     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10156   %}
10157   ins_pipe( pipe_slow );
10158 %}
10159 
10160 instruct vsra8I_reg(vecY dst, vecY src, vecS shift) %{
10161   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
10162   match(Set dst (RShiftVI src shift));
10163   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed8I" %}
10164   ins_encode %{
10165     int vector_len = 1;
10166     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10167   %}
10168   ins_pipe( pipe_slow );
10169 %}
10170 
10171 instruct vsra8I_reg_imm(vecY dst, vecY src, immI8 shift) %{
10172   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
10173   match(Set dst (RShiftVI src shift));
10174   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed8I" %}
10175   ins_encode %{
10176     int vector_len = 1;
10177     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10178   %}
10179   ins_pipe( pipe_slow );
10180 %}
10181 
10182 instruct vsra16I_reg(vecZ dst, vecZ src, vecS shift) %{
10183   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
10184   match(Set dst (RShiftVI src shift));
10185   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed16I" %}
10186   ins_encode %{
10187     int vector_len = 2;
10188     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10189   %}
10190   ins_pipe( pipe_slow );
10191 %}
10192 
10193 instruct vsra16I_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
10194   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
10195   match(Set dst (RShiftVI src shift));
10196   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed16I" %}
10197   ins_encode %{
10198     int vector_len = 2;
10199     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10200   %}
10201   ins_pipe( pipe_slow );
10202 %}
10203 
10204 // There are no longs vector arithmetic right shift instructions.
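// (x86 SSE/AVX2 provide packed arithmetic shifts only for 16-bit and 32-bit
// elements (psraw/psrad); a packed 64-bit arithmetic shift (vpsraq) only
// exists in AVX-512, so no RShiftVL rules are defined here.)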
10205 
10206 
10207 // --------------------------------- AND --------------------------------------
10208 
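// The bitwise AndV/OrV/XorV rules in this and the next two sections are
// keyed on length_in_bytes() rather than element count, since the operation
// is identical for every element type.  For illustration only (hypothetical
// Java source, not part of this file), an auto-vectorized masking loop such
// as
//
//   for (int i = 0; i < a.length; i++) {
//     a[i] = a[i] & b[i];          // AndV, matched to pand/vpand
//   }
//
// reaches the rules below once SuperWord packs the loads and the and.
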
10209 instruct vand4B(vecS dst, vecS src) %{
10210   predicate(n->as_Vector()->length_in_bytes() == 4);
10211   match(Set dst (AndV dst src));
10212   format %{ "pand    $dst,$src\t! and vectors (4 bytes)" %}
10213   ins_encode %{
10214     __ pand($dst$$XMMRegister, $src$$XMMRegister);
10215   %}
10216   ins_pipe( pipe_slow );
10217 %}
10218 
10219 instruct vand4B_reg(vecS dst, vecS src1, vecS src2) %{
10220   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
10221   match(Set dst (AndV src1 src2));
10222   format %{ "vpand   $dst,$src1,$src2\t! and vectors (4 bytes)" %}
10223   ins_encode %{
10224     int vector_len = 0;
10225     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10226   %}
10227   ins_pipe( pipe_slow );
10228 %}
10229 
10230 instruct vand4B_mem(vecS dst, vecS src, memory mem) %{
10231   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
10232   match(Set dst (AndV src (LoadVector mem)));
10233   format %{ "vpand   $dst,$src,$mem\t! and vectors (4 bytes)" %}
10234   ins_encode %{
10235     int vector_len = 0;
10236     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10237   %}
10238   ins_pipe( pipe_slow );
10239 %}
10240 
10241 instruct vand8B(vecD dst, vecD src) %{
10242   predicate(n->as_Vector()->length_in_bytes() == 8);
10243   match(Set dst (AndV dst src));
10244   format %{ "pand    $dst,$src\t! and vectors (8 bytes)" %}
10245   ins_encode %{
10246     __ pand($dst$$XMMRegister, $src$$XMMRegister);
10247   %}
10248   ins_pipe( pipe_slow );
10249 %}
10250 
10251 instruct vand8B_reg(vecD dst, vecD src1, vecD src2) %{
10252   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
10253   match(Set dst (AndV src1 src2));
10254   format %{ "vpand   $dst,$src1,$src2\t! and vectors (8 bytes)" %}
10255   ins_encode %{
10256     int vector_len = 0;
10257     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10258   %}
10259   ins_pipe( pipe_slow );
10260 %}
10261 
10262 instruct vand8B_mem(vecD dst, vecD src, memory mem) %{
10263   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
10264   match(Set dst (AndV src (LoadVector mem)));
10265   format %{ "vpand   $dst,$src,$mem\t! and vectors (8 bytes)" %}
10266   ins_encode %{
10267     int vector_len = 0;
10268     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10269   %}
10270   ins_pipe( pipe_slow );
10271 %}
10272 
10273 instruct vand16B(vecX dst, vecX src) %{
10274   predicate(n->as_Vector()->length_in_bytes() == 16);
10275   match(Set dst (AndV dst src));
10276   format %{ "pand    $dst,$src\t! and vectors (16 bytes)" %}
10277   ins_encode %{
10278     __ pand($dst$$XMMRegister, $src$$XMMRegister);
10279   %}
10280   ins_pipe( pipe_slow );
10281 %}
10282 
10283 instruct vand16B_reg(vecX dst, vecX src1, vecX src2) %{
10284   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
10285   match(Set dst (AndV src1 src2));
10286   format %{ "vpand   $dst,$src1,$src2\t! and vectors (16 bytes)" %}
10287   ins_encode %{
10288     int vector_len = 0;
10289     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10290   %}
10291   ins_pipe( pipe_slow );
10292 %}
10293 
10294 instruct vand16B_mem(vecX dst, vecX src, memory mem) %{
10295   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
10296   match(Set dst (AndV src (LoadVector mem)));
10297   format %{ "vpand   $dst,$src,$mem\t! and vectors (16 bytes)" %}
10298   ins_encode %{
10299     int vector_len = 0;
10300     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10301   %}
10302   ins_pipe( pipe_slow );
10303 %}
10304 
10305 instruct vand32B_reg(vecY dst, vecY src1, vecY src2) %{
10306   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
10307   match(Set dst (AndV src1 src2));
10308   format %{ "vpand   $dst,$src1,$src2\t! and vectors (32 bytes)" %}
10309   ins_encode %{
10310     int vector_len = 1;
10311     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10312   %}
10313   ins_pipe( pipe_slow );
10314 %}
10315 
10316 instruct vand32B_mem(vecY dst, vecY src, memory mem) %{
10317   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
10318   match(Set dst (AndV src (LoadVector mem)));
10319   format %{ "vpand   $dst,$src,$mem\t! and vectors (32 bytes)" %}
10320   ins_encode %{
10321     int vector_len = 1;
10322     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10323   %}
10324   ins_pipe( pipe_slow );
10325 %}
10326 
10327 instruct vand64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
10328   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
10329   match(Set dst (AndV src1 src2));
10330   format %{ "vpand   $dst,$src1,$src2\t! and vectors (64 bytes)" %}
10331   ins_encode %{
10332     int vector_len = 2;
10333     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10334   %}
10335   ins_pipe( pipe_slow );
10336 %}
10337 
10338 instruct vand64B_mem(vecZ dst, vecZ src, memory mem) %{
10339   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
10340   match(Set dst (AndV src (LoadVector mem)));
10341   format %{ "vpand   $dst,$src,$mem\t! and vectors (64 bytes)" %}
10342   ins_encode %{
10343     int vector_len = 2;
10344     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10345   %}
10346   ins_pipe( pipe_slow );
10347 %}
10348 
10349 // --------------------------------- OR ---------------------------------------
10350 
10351 instruct vor4B(vecS dst, vecS src) %{
10352   predicate(n->as_Vector()->length_in_bytes() == 4);
10353   match(Set dst (OrV dst src));
10354   format %{ "por     $dst,$src\t! or vectors (4 bytes)" %}
10355   ins_encode %{
10356     __ por($dst$$XMMRegister, $src$$XMMRegister);
10357   %}
10358   ins_pipe( pipe_slow );
10359 %}
10360 
10361 instruct vor4B_reg(vecS dst, vecS src1, vecS src2) %{
10362   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
10363   match(Set dst (OrV src1 src2));
10364   format %{ "vpor    $dst,$src1,$src2\t! or vectors (4 bytes)" %}
10365   ins_encode %{
10366     int vector_len = 0;
10367     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10368   %}
10369   ins_pipe( pipe_slow );
10370 %}
10371 
10372 instruct vor4B_mem(vecS dst, vecS src, memory mem) %{
10373   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
10374   match(Set dst (OrV src (LoadVector mem)));
10375   format %{ "vpor    $dst,$src,$mem\t! or vectors (4 bytes)" %}
10376   ins_encode %{
10377     int vector_len = 0;
10378     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10379   %}
10380   ins_pipe( pipe_slow );
10381 %}
10382 
10383 instruct vor8B(vecD dst, vecD src) %{
10384   predicate(n->as_Vector()->length_in_bytes() == 8);
10385   match(Set dst (OrV dst src));
10386   format %{ "por     $dst,$src\t! or vectors (8 bytes)" %}
10387   ins_encode %{
10388     __ por($dst$$XMMRegister, $src$$XMMRegister);
10389   %}
10390   ins_pipe( pipe_slow );
10391 %}
10392 
10393 instruct vor8B_reg(vecD dst, vecD src1, vecD src2) %{
10394   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
10395   match(Set dst (OrV src1 src2));
10396   format %{ "vpor    $dst,$src1,$src2\t! or vectors (8 bytes)" %}
10397   ins_encode %{
10398     int vector_len = 0;
10399     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10400   %}
10401   ins_pipe( pipe_slow );
10402 %}
10403 
10404 instruct vor8B_mem(vecD dst, vecD src, memory mem) %{
10405   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
10406   match(Set dst (OrV src (LoadVector mem)));
10407   format %{ "vpor    $dst,$src,$mem\t! or vectors (8 bytes)" %}
10408   ins_encode %{
10409     int vector_len = 0;
10410     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10411   %}
10412   ins_pipe( pipe_slow );
10413 %}
10414 
10415 instruct vor16B(vecX dst, vecX src) %{
10416   predicate(n->as_Vector()->length_in_bytes() == 16);
10417   match(Set dst (OrV dst src));
10418   format %{ "por     $dst,$src\t! or vectors (16 bytes)" %}
10419   ins_encode %{
10420     __ por($dst$$XMMRegister, $src$$XMMRegister);
10421   %}
10422   ins_pipe( pipe_slow );
10423 %}
10424 
10425 instruct vor16B_reg(vecX dst, vecX src1, vecX src2) %{
10426   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
10427   match(Set dst (OrV src1 src2));
10428   format %{ "vpor    $dst,$src1,$src2\t! or vectors (16 bytes)" %}
10429   ins_encode %{
10430     int vector_len = 0;
10431     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10432   %}
10433   ins_pipe( pipe_slow );
10434 %}
10435 
10436 instruct vor16B_mem(vecX dst, vecX src, memory mem) %{
10437   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
10438   match(Set dst (OrV src (LoadVector mem)));
10439   format %{ "vpor    $dst,$src,$mem\t! or vectors (16 bytes)" %}
10440   ins_encode %{
10441     int vector_len = 0;
10442     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10443   %}
10444   ins_pipe( pipe_slow );
10445 %}
10446 
10447 instruct vor32B_reg(vecY dst, vecY src1, vecY src2) %{
10448   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
10449   match(Set dst (OrV src1 src2));
10450   format %{ "vpor    $dst,$src1,$src2\t! or vectors (32 bytes)" %}
10451   ins_encode %{
10452     int vector_len = 1;
10453     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10454   %}
10455   ins_pipe( pipe_slow );
10456 %}
10457 
10458 instruct vor32B_mem(vecY dst, vecY src, memory mem) %{
10459   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
10460   match(Set dst (OrV src (LoadVector mem)));
10461   format %{ "vpor    $dst,$src,$mem\t! or vectors (32 bytes)" %}
10462   ins_encode %{
10463     int vector_len = 1;
10464     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10465   %}
10466   ins_pipe( pipe_slow );
10467 %}
10468 
10469 instruct vor64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
10470   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
10471   match(Set dst (OrV src1 src2));
10472   format %{ "vpor    $dst,$src1,$src2\t! or vectors (64 bytes)" %}
10473   ins_encode %{
10474     int vector_len = 2;
10475     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10476   %}
10477   ins_pipe( pipe_slow );
10478 %}
10479 
10480 instruct vor64B_mem(vecZ dst, vecZ src, memory mem) %{
10481   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
10482   match(Set dst (OrV src (LoadVector mem)));
10483   format %{ "vpor    $dst,$src,$mem\t! or vectors (64 bytes)" %}
10484   ins_encode %{
10485     int vector_len = 2;
10486     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10487   %}
10488   ins_pipe( pipe_slow );
10489 %}
10490 
10491 // --------------------------------- XOR --------------------------------------
10492 
10493 instruct vxor4B(vecS dst, vecS src) %{
10494   predicate(n->as_Vector()->length_in_bytes() == 4);
10495   match(Set dst (XorV dst src));
10496   format %{ "pxor    $dst,$src\t! xor vectors (4 bytes)" %}
10497   ins_encode %{
10498     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
10499   %}
10500   ins_pipe( pipe_slow );
10501 %}
10502 
10503 instruct vxor4B_reg(vecS dst, vecS src1, vecS src2) %{
10504   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
10505   match(Set dst (XorV src1 src2));
10506   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (4 bytes)" %}
10507   ins_encode %{
10508     int vector_len = 0;
10509     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10510   %}
10511   ins_pipe( pipe_slow );
10512 %}
10513 
10514 instruct vxor4B_mem(vecS dst, vecS src, memory mem) %{
10515   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
10516   match(Set dst (XorV src (LoadVector mem)));
10517   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (4 bytes)" %}
10518   ins_encode %{
10519     int vector_len = 0;
10520     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10521   %}
10522   ins_pipe( pipe_slow );
10523 %}
10524 
10525 instruct vxor8B(vecD dst, vecD src) %{
10526   predicate(n->as_Vector()->length_in_bytes() == 8);
10527   match(Set dst (XorV dst src));
10528   format %{ "pxor    $dst,$src\t! xor vectors (8 bytes)" %}
10529   ins_encode %{
10530     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
10531   %}
10532   ins_pipe( pipe_slow );
10533 %}
10534 
10535 instruct vxor8B_reg(vecD dst, vecD src1, vecD src2) %{
10536   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
10537   match(Set dst (XorV src1 src2));
10538   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (8 bytes)" %}
10539   ins_encode %{
10540     int vector_len = 0;
10541     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10542   %}
10543   ins_pipe( pipe_slow );
10544 %}
10545 
10546 instruct vxor8B_mem(vecD dst, vecD src, memory mem) %{
10547   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
10548   match(Set dst (XorV src (LoadVector mem)));
10549   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (8 bytes)" %}
10550   ins_encode %{
10551     int vector_len = 0;
10552     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10553   %}
10554   ins_pipe( pipe_slow );
10555 %}
10556 
10557 instruct vxor16B(vecX dst, vecX src) %{
10558   predicate(n->as_Vector()->length_in_bytes() == 16);
10559   match(Set dst (XorV dst src));
10560   format %{ "pxor    $dst,$src\t! xor vectors (16 bytes)" %}
10561   ins_encode %{
10562     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
10563   %}
10564   ins_pipe( pipe_slow );
10565 %}
10566 
10567 instruct vxor16B_reg(vecX dst, vecX src1, vecX src2) %{
10568   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
10569   match(Set dst (XorV src1 src2));
10570   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (16 bytes)" %}
10571   ins_encode %{
10572     int vector_len = 0;
10573     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10574   %}
10575   ins_pipe( pipe_slow );
10576 %}
10577 
10578 instruct vxor16B_mem(vecX dst, vecX src, memory mem) %{
10579   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
10580   match(Set dst (XorV src (LoadVector mem)));
10581   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (16 bytes)" %}
10582   ins_encode %{
10583     int vector_len = 0;
10584     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10585   %}
10586   ins_pipe( pipe_slow );
10587 %}
10588 
10589 instruct vxor32B_reg(vecY dst, vecY src1, vecY src2) %{
10590   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
10591   match(Set dst (XorV src1 src2));
10592   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (32 bytes)" %}
10593   ins_encode %{
10594     int vector_len = 1;
10595     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10596   %}
10597   ins_pipe( pipe_slow );
10598 %}
10599 
10600 instruct vxor32B_mem(vecY dst, vecY src, memory mem) %{
10601   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
10602   match(Set dst (XorV src (LoadVector mem)));
10603   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (32 bytes)" %}
10604   ins_encode %{
10605     int vector_len = 1;
10606     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10607   %}
10608   ins_pipe( pipe_slow );
10609 %}
10610 
10611 instruct vxor64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
10612   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
10613   match(Set dst (XorV src1 src2));
10614   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (64 bytes)" %}
10615   ins_encode %{
10616     int vector_len = 2;
10617     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10618   %}
10619   ins_pipe( pipe_slow );
10620 %}
10621 
10622 instruct vxor64B_mem(vecZ dst, vecZ src, memory mem) %{
10623   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
10624   match(Set dst (XorV src (LoadVector mem)));
10625   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (64 bytes)" %}
10626   ins_encode %{
10627     int vector_len = 2;
10628     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10629   %}
10630   ins_pipe( pipe_slow );
10631 %}
10632 
10633 // --------------------------------- FMA --------------------------------------
10634 
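// For illustration only (hypothetical Java source, not part of this file),
// the FmaVD/FmaVF rules below are reached when a Math.fma call is
// auto-vectorized with UseFMA enabled, fusing the multiply and add into a
// single fused multiply-add instruction:
//
//   for (int i = 0; i < c.length; i++) {
//     c[i] = Math.fma(a[i], b[i], c[i]);   // FmaVD / FmaVF
//   }
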
10635 // a * b + c
10636 instruct vfma2D_reg(vecX a, vecX b, vecX c) %{
10637   predicate(UseFMA && n->as_Vector()->length() == 2);
10638   match(Set c (FmaVD  c (Binary a b)));
10639   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed2D" %}
10640   ins_cost(150);
10641   ins_encode %{
10642     int vector_len = 0;
10643     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
10644   %}
10645   ins_pipe( pipe_slow );
10646 %}
10647 
10648 // a * b + c
10649 instruct vfma2D_mem(vecX a, memory b, vecX c) %{
10650   predicate(UseFMA && n->as_Vector()->length() == 2);
10651   match(Set c (FmaVD  c (Binary a (LoadVector b))));
10652   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed2D" %}
10653   ins_cost(150);
10654   ins_encode %{
10655     int vector_len = 0;
10656     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
10657   %}
10658   ins_pipe( pipe_slow );
10659 %}
10660 
10661 
10662 // a * b + c
10663 instruct vfma4D_reg(vecY a, vecY b, vecY c) %{
10664   predicate(UseFMA && n->as_Vector()->length() == 4);
10665   match(Set c (FmaVD  c (Binary a b)));
10666   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed4D" %}
10667   ins_cost(150);
10668   ins_encode %{
10669     int vector_len = 1;
10670     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
10671   %}
10672   ins_pipe( pipe_slow );
10673 %}
10674 
10675 // a * b + c
10676 instruct vfma4D_mem(vecY a, memory b, vecY c) %{
10677   predicate(UseFMA && n->as_Vector()->length() == 4);
10678   match(Set c (FmaVD  c (Binary a (LoadVector b))));
10679   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed4D" %}
10680   ins_cost(150);
10681   ins_encode %{
10682     int vector_len = 1;
10683     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
10684   %}
10685   ins_pipe( pipe_slow );
10686 %}
10687 
10688 // a * b + c
10689 instruct vfma8D_reg(vecZ a, vecZ b, vecZ c) %{
10690   predicate(UseFMA && n->as_Vector()->length() == 8);
10691   match(Set c (FmaVD  c (Binary a b)));
10692   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed8D" %}
10693   ins_cost(150);
10694   ins_encode %{
10695     int vector_len = 2;
10696     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
10697   %}
10698   ins_pipe( pipe_slow );
10699 %}
10700 
10701 // a * b + c
10702 instruct vfma8D_mem(vecZ a, memory b, vecZ c) %{
10703   predicate(UseFMA && n->as_Vector()->length() == 8);
10704   match(Set c (FmaVD  c (Binary a (LoadVector b))));
10705   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed8D" %}
10706   ins_cost(150);
10707   ins_encode %{
10708     int vector_len = 2;
10709     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
10710   %}
10711   ins_pipe( pipe_slow );
10712 %}
10713 
10714 // a * b + c
10715 instruct vfma4F_reg(vecX a, vecX b, vecX c) %{
10716   predicate(UseFMA && n->as_Vector()->length() == 4);
10717   match(Set c (FmaVF  c (Binary a b)));
10718   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed4F" %}
10719   ins_cost(150);
10720   ins_encode %{
10721     int vector_len = 0;
10722     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
10723   %}
10724   ins_pipe( pipe_slow );
10725 %}
10726 
10727 // a * b + c
10728 instruct vfma4F_mem(vecX a, memory b, vecX c) %{
10729   predicate(UseFMA && n->as_Vector()->length() == 4);
10730   match(Set c (FmaVF  c (Binary a (LoadVector b))));
10731   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed4F" %}
10732   ins_cost(150);
10733   ins_encode %{
10734     int vector_len = 0;
10735     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
10736   %}
10737   ins_pipe( pipe_slow );
10738 %}
10739 
10740 // a * b + c
10741 instruct vfma8F_reg(vecY a, vecY b, vecY c) %{
10742   predicate(UseFMA && n->as_Vector()->length() == 8);
10743   match(Set c (FmaVF  c (Binary a b)));
10744   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed8F" %}
10745   ins_cost(150);
10746   ins_encode %{
10747     int vector_len = 1;
10748     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
10749   %}
10750   ins_pipe( pipe_slow );
10751 %}
10752 
10753 // a * b + c
10754 instruct vfma8F_mem(vecY a, memory b, vecY c) %{
10755   predicate(UseFMA && n->as_Vector()->length() == 8);
10756   match(Set c (FmaVF  c (Binary a (LoadVector b))));
10757   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed8F" %}
10758   ins_cost(150);
10759   ins_encode %{
10760     int vector_len = 1;
10761     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
10762   %}
10763   ins_pipe( pipe_slow );
10764 %}
10765 
10766 // a * b + c
10767 instruct vfma16F_reg(vecZ a, vecZ b, vecZ c) %{
10768   predicate(UseFMA && n->as_Vector()->length() == 16);
10769   match(Set c (FmaVF  c (Binary a b)));
10770   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed16F" %}
10771   ins_cost(150);
10772   ins_encode %{
10773     int vector_len = 2;
10774     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
10775   %}
10776   ins_pipe( pipe_slow );
10777 %}
10778 
10779 // a * b + c
10780 instruct vfma16F_mem(vecZ a, memory b, vecZ c) %{
10781   predicate(UseFMA && n->as_Vector()->length() == 16);
10782   match(Set c (FmaVF  c (Binary a (LoadVector b))));
10783   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed16F" %}
10784   ins_cost(150);
10785   ins_encode %{
10786     int vector_len = 2;
10787     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
10788   %}
10789   ins_pipe( pipe_slow );
10790 %}